# Data preparation

## Download

In [1]:
import os
from tqdm import tqdm
import requests
import math

In [2]:
# Create the folder that will hold the raw downloads; exist_ok=True keeps the
# cell safe to re-run when the folder is already there.
os.makedirs('../data/original/', exist_ok=True)

In [3]:
# Source: https://stackoverflow.com/questions/37573483/progress-bar-while-download-file-over-http-with-requests
def download_data_with_progress(url, destination_filepath):
    """Stream the file at `url` to disk while showing a tqdm progress bar.

    Parameters
    ----------
    url : str
        Address of the file to download.
    destination_filepath : str
        Local path the response body is written to (overwritten if present).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never saved as if it were the requested data.
    """
    block_size = 1024

    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()

        # 0 means the server did not announce a size.
        total_size = int(r.headers.get('content-length', 0))
        wrote = 0

        # Progress is tracked in bytes (not blocks): tqdm then scales to
        # KB/MB itself instead of printing "kKB", and the total no longer
        # depends on rounding the block count. total=None gives an open-ended
        # counter when the size is unknown.
        with open(destination_filepath, 'wb') as f, tqdm(
            total=total_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in r.iter_content(block_size):
                f.write(data)
                wrote += len(data)
                progress.update(len(data))

    # Only reported, not raised: with a compressed transfer the announced
    # length can legitimately differ from the number of decoded bytes.
    if total_size != 0 and wrote != total_size:
        print("ERROR: wrote {} bytes from {} but the server announced {}".format(
            wrote, url, total_size
        ))

In [4]:
# Raw files to fetch: the training split, the dataset description and the test split.
filenames = ['adult.{}'.format(extension) for extension in ('data', 'names', 'test')]

In [5]:
# Fetch every listed file into the local raw-data folder, one at a time.
for filename in filenames:
    source_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/{}'.format(filename)
    target_path = '../data/original/{}'.format(filename)

    print("Downloading `{}`...".format(filename))
    download_data_with_progress(source_url, target_path)
    print("Downloaded!")
    print()

Downloading `adult.data`...


3.88kKB [00:03, 1.14kKB/s]                                                                                             


Downloaded!

Downloading `adult.names`...


6.00KB [00:00, ?KB/s]                                                                                                  


Downloaded!

Downloading `adult.test`...


1.96kKB [00:02, 900KB/s]                                                                                               


Downloaded!



## Load data

In [6]:
import numpy as np
import pandas as pd

In [7]:
def load_census_data(filepath, drop_first_row=False):
    """Read one raw census file into a DataFrame with named columns.

    The file is expected to have no header row and to separate fields with a
    comma followed by a space.

    Parameters
    ----------
    filepath : str
        Path of the file to read.
    drop_first_row : bool, default False
        When True the first line of the file is skipped before parsing.

    Returns
    -------
    pandas.DataFrame
        One row per record, 15 columns, with every '.' removed from the
        `income` label.
    """
    column_names = [
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'education_num',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital_gain',
        'capital_loss',
        'hours_per_week',
        'native_country',
        'income',
    ]

    df = pd.read_csv(
        filepath,
        sep=', ',
        names=column_names,
        # A multi-character separator is only supported by the python engine.
        engine='python',
        skiprows=1 if drop_first_row else None
    )

    # Special care is needed by the label: drop any '.' so the same class is
    # spelled identically everywhere (presumably one split writes `>50K.` -
    # confirm against the raw files). The vectorised string method replaces
    # the per-row lambda and leaves missing labels as NaN instead of raising;
    # regex=False makes '.' a literal dot on every pandas version.
    df['income'] = df['income'].str.replace('.', '', regex=False)

    return df

In [8]:
# Load both splits with the same column names. For the test file the first
# line is skipped (presumably a non-data line at the top of adult.test -
# confirm against the raw file).
train_df = load_census_data('../data/original/adult.data')
test_df = load_census_data('../data/original/adult.test', drop_first_row=True)

In [9]:
# Number of training rows; used later as the positional boundary that
# separates train rows from test rows once both are stacked together.
train_test_split = train_df.shape[0]

In [10]:
# Stack train on top of test so both splits go through the same encoding.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it the test rows would reuse the training rows' labels. Row order is
# unchanged, so the positional boundary in `train_test_split` stays valid.
original_df = pd.concat([train_df, test_df], ignore_index=True)

In [11]:
# Preview the first rows of the combined frame.
original_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## One-hot encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

In [13]:
# Split the columns by storage type: 'object' columns are treated as
# categorical, every other column as numerical.
df_dtypes = original_df.dtypes
is_categorical = (df_dtypes == 'object').values

numerical_features = df_dtypes.index[~is_categorical].values
categorical_features = df_dtypes.index[is_categorical].values

In [14]:
# Newer scikit-learn releases renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and eventually dropped the old name. Try the new spelling
# first and fall back to the old one, so the cell runs on both; either way
# the encoder is asked for a dense array.
try:
    dense_onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_onehot = OneHotEncoder(sparse=False)

# Standardise the numerical columns and one-hot encode the categorical ones.
transformer = make_column_transformer(
  (StandardScaler(), numerical_features),
  (dense_onehot, categorical_features)
)

In [15]:
# Fit the transformer and encode every row in one pass.
# NOTE(review): `original_df` holds the train and the test rows, so the
# scaler statistics and the category list are learned from both splits -
# confirm that this is intended.
encoded_array = transformer.fit_transform(original_df)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
# Sanity check: (rows, encoded columns).
encoded_array.shape

(48842, 110)

Get feature names for categoricals

In [17]:
# Fetch the fitted one-hot encoder back from the column transformer, where it
# is registered under the auto-generated name 'onehotencoder'.
onehot_encoder = transformer.named_transformers_['onehotencoder']

In [18]:
# Names of the one-hot columns, built as `<original column>_<category>`.
# `get_feature_names` was removed from newer scikit-learn releases in favour
# of `get_feature_names_out`; use whichever the installed version provides.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    encoded_categorical_features = onehot_encoder.get_feature_names_out(categorical_features)
else:
    encoded_categorical_features = onehot_encoder.get_feature_names(input_features=categorical_features)

In [19]:
# Column names for the encoded array: numerical names first, then the one-hot
# names. This relies on the column transformer emitting its outputs in the
# order the transformers were listed (scaler, then encoder).
new_features = np.concatenate([numerical_features, encoded_categorical_features])

In [20]:
# Wrap the encoded array in a DataFrame so every column carries its name.
prepared_df = pd.DataFrame(encoded_array, columns=new_features)

In [21]:
# Preview the first rows of the encoded frame.
prepared_df.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia,income_<=50K,income_>50K
0,0.025996,-1.061979,1.136512,0.146932,-0.217127,-0.034087,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.828308,-1.007104,1.136512,-0.144804,-0.217127,-2.213032,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.046942,0.246034,-0.419335,-0.144804,-0.217127,-0.034087,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.047121,0.426663,-1.197259,-0.144804,-0.217127,-0.034087,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.776316,1.40853,1.136512,-0.144804,-0.217127,-0.034087,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


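Drop the one-hot column that is redundant with the target variable: `income_<=50K` is the complement of `income_>50K`, so only `income_>50K` is kept as the label.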

In [22]:
# Remove the `income_<=50K` indicator; `income_>50K` stays in the frame.
final_df = prepared_df.drop('income_<=50K', axis=1)

## Save the final dataframe

In [23]:
# Undo the earlier stacking by position: the first `train_test_split` rows
# came from the training file, the remaining rows from the test file.
final_train_df = final_df.iloc[:train_test_split, :]
final_test_df = final_df.iloc[train_test_split:, :]

In [24]:
# Write both splits as CSV; index=False leaves the row index out of the files.
final_train_df.to_csv('../data/census_train.csv', index=False)
final_test_df.to_csv('../data/census_test.csv', index=False)