# Dataset pre-processing

Use this notebook to preprocess the datasets.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root directory of the raw data; each dataset below is read from, and its
# discretized version written to, its own sub-directory of this path.
dataset_dir = '../datasets'

In [None]:
def get_num_unique(dataset):
    """Return the number of rows of `dataset` that occur exactly once.

    Rows are compared as a whole (`axis=0`): two rows count as duplicates
    only when all of their entries match.
    """
    occurrences = np.unique(dataset, axis=0, return_counts=True)[1]
    return int(np.count_nonzero(occurrences == 1))

In [None]:
def discretize_dataset(dataset, *, return_mapping=False):
    """Encode every column of `dataset` as integer codes.

    For each column, the distinct values are sorted and each value is replaced
    by its position in that sorted order (0 for the smallest value, 1 for the
    next one, and so on). The input frame is left untouched.

    Args:
        dataset: pandas DataFrame to encode. The distinct values of each
            column must be mutually comparable, since they are passed to
            `sorted`.
        return_mapping: when True, also return the per-column decoding
            tables. Defaults to False, in which case only the encoded frame
            is returned.

    Returns:
        The encoded DataFrame (same shape, index and column names as
        `dataset`). If `return_mapping` is True, a tuple
        `(encoded, value_mapping)` where `value_mapping[column]` is a dict
        mapping each integer code back to the original value.
    """
    discrete = dataset.copy()
    value_mapping = {}
    for column in dataset.columns:
        ordered_values = sorted(dataset[column].unique())
        # Encoding table: original value -> integer code.
        encoder = {value: code for code, value in enumerate(ordered_values)}
        # Decoding table: integer code -> original value.
        value_mapping[column] = dict(enumerate(ordered_values))
        # A plain list is assigned positionally, so the row labels of
        # `dataset` (even duplicated ones) do not affect the result.
        discrete[column] = [encoder[value] for value in dataset[column]]
    if return_mapping:
        return discrete, value_mapping
    return discrete

## Adults dataset

Link to the dataset: https://archive.ics.uci.edu/ml/datasets/adult. Download the dataset and save it under a directory called ```../datasets/adults```.

We load the train and test splits, combine them into a single dataset, discretize it, and save the result to disk.

In [None]:
# Names assigned, in order, to the columns of the adults data files.
columns = [
    "age",
    "workClass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

In [None]:
# Load the training split. No `header=` is passed, so with `names=` pandas
# reads the first line of the file as data and labels the columns from `columns`.
adults_train = pd.read_csv(f'{dataset_dir}/adults/adult.data', names=columns)

print(f'Size of adults (train): {len(adults_train)}')
#adults_train.nunique()

In [None]:
# Load the test split with the same column labels as the training split.
# As distributed by UCI, `adult.test` begins with a banner line
# ("|1x3 Cross validator") that is not a record. Without `comment='|'` it is
# parsed as a data row, which turns `age` into a non-numeric column and puts
# NaN in every other column of that row. `comment='|'` makes pandas skip any
# line starting with '|', and is a no-op if the banner was already removed.
adults_test = pd.read_csv(f'{dataset_dir}/adults/adult.test', names=columns, comment='|')

print(f'Size of adults (test): {len(adults_test)}')
#adults_test.nunique()

In [None]:
# Stack the train and test splits into one frame; `ignore_index=True`
# replaces the per-split row labels with a fresh 0..n-1 index.
adults = pd.concat((adults_train, adults_test), ignore_index=True)

print(f'Size of adults (train + test): {len(adults)}')
# Number of distinct values per column (last expression of the cell).
adults.nunique()

In [None]:
# Discretize every column except `fnlwgt`, which is removed beforehand.
# `drop` returns a new frame here, so `adults` itself keeps the column.
discrete_adults = discretize_dataset(adults.drop(columns=['fnlwgt']))

In [None]:
# Remove the (already discretized) `income` column in place, so that it is
# not part of the table saved below.
discrete_adults.drop(columns=['income'], inplace=True)

In [None]:
# Save the discretized table as CSV; `index=False` omits the row labels.
discrete_adults.to_csv(f'{dataset_dir}/adults/discrete.csv', index=False)

## Census Income dataset

Link to the dataset: https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29. Download the dataset and save it under a directory called ```../datasets/census```.

We load the train and test splits, combine them into a single dataset, discretize it, and save the result to disk.

In [None]:
# Names assigned, in order, to the columns of the census data files.
# Commented-out names are kept for reference only and are not part of the list.
columns = [
    'age',
    'class of worker',
    'industry code',
    'occupation code',
    # 'adjusted gross income',
    'education',
    'wage per hour',
    'enrolled in edu inst last wk',
    'marital status',
    'major industry code',
    'major occupation code',
    'race',
    'hispanic Origin',
    'sex',
    'member of a labor union',
    'reason for unemployment',
    'full or part time employment stat',
    'capital gains',
    'capital losses',
    'divdends from stocks',
    # 'federal income tax liability',
    'tax filer status',
    'region of previous residence',
    'state of previous residence',
    'detailed household and family stat',
    'detailed household summary in household',
    'instance weight',
    'migration code-change in msa',
    'migration code-change in reg',
    'migration code-move within reg',
    'live in this house 1 year ago',
    'migration prev res in sunbelt',
    'num persons worked for employer',
    'family members under 18',
    # 'total person earnings',
    'country of birth father',
    'country of birth mother',
    'country of birth self',
    'citizenship',
    # 'total person income',
    'own business or self employed',
    # 'taxable income amount',
    "fill inc questionnaire for veteran's admin",
    'veterans benefits',
    'weeks worked in year',
    'year',
    'income',
]

# Sanity check on the number of column names.
print(len(columns))

In [None]:
# Both splits are gzip-compressed and read without a header row
# (`header=None`); `names=columns` supplies the column labels.
census_train = pd.read_csv(f'{dataset_dir}/census/census-income.data.gz', compression='gzip', header=None, names=columns)
census_test = pd.read_csv(f'{dataset_dir}/census/census-income.test.gz', compression='gzip', header=None, names=columns)

print(f'Size of census train: {len(census_train)} and test: {len(census_test)}')

In [None]:
# Stack the train and test splits into one frame; `ignore_index=True`
# replaces the per-split row labels with a fresh 0..n-1 index.
census = pd.concat((census_train, census_test), ignore_index=True)

print(f'Size of census (train + test): {len(census)}')
# Number of distinct values per column (last expression of the cell).
census.nunique()

In [None]:
# Keep the instance weights in a separate Series, then remove them and the
# `income` column from the frame in place so that neither is discretized below.
census_weights = census['instance weight']
census.drop(columns=['instance weight', 'income'], inplace=True)

In [None]:
# Replace every remaining column's values by integer codes.
discrete_census = discretize_dataset(census)

In [None]:
# Save the discretized table as CSV; `index=False` omits the row labels.
discrete_census.to_csv(f'{dataset_dir}/census/discrete.csv', index=False)

## Car insurance policy dataset
Link to the dataset: https://archive.ics.uci.edu/ml/datasets/Insurance+Company+Benchmark+%28COIL+2000%29. Download the dataset and save it under a directory called ```../datasets/insurance```.


In [None]:
# Both files are read as tab-separated text without a header row, so the
# columns are labelled with integers 0, 1, 2, ...
insurance_train = pd.read_csv(f'{dataset_dir}/insurance/ticdata2000.txt', header=None, sep='\t')
print(insurance_train.shape)
insurance_eval = pd.read_csv(f'{dataset_dir}/insurance/ticeval2000.txt', header=None, sep='\t')
print(insurance_eval.shape)

In [None]:
# Stack the two splits row-wise after dropping the last column of the
# training split.
# NOTE(review): presumably that column is the target, which the evaluation
# file does not contain -- confirm against the dataset description.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index, like the
# other concatenations in this notebook, instead of repeating each split's
# row labels (which would make label-based row lookups ambiguous).
insurance = pd.concat((insurance_train[insurance_train.columns[:-1]], insurance_eval), axis=0, ignore_index=True)

In [None]:
# Only the first 43 columns are discretized and kept.
# NOTE(review): presumably these are the sociodemographic attributes of the
# COIL 2000 data -- confirm against the dataset description.
discrete_insurance = discretize_dataset(insurance[insurance.columns[:43]])

In [None]:
# Save the discretized table as CSV; `index=False` omits the row labels.
discrete_insurance.to_csv(f'{dataset_dir}/insurance/discrete.csv', index=False)