In [40]:
## Basics
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

## Sklearn Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.preprocessing import OneHotEncoder

**Data Preprocessing**

In [41]:
# ===== get the column names from the dataset description (.names) file ======
with open("split-data/adult.names", "r") as file:
    contents = file.read()

# The attribute list ("<name>: <allowed values>", one per line) is taken from the
# last 1375 characters of the description file.
# NOTE(review): this offset is tied to the exact contents of the file - re-check it
# if the file is ever replaced or re-encoded.
filtered_content = contents[-1375:].split('\n')

# Keep the text before the first ':' of every non-empty line, then add the label column.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
vars = [line.split(':')[0] for line in filtered_content if line != '']
vars = vars + ['target']

# ===== import training and testing datasets ======
# Because `skipinitialspace=True` strips the blank that follows each delimiter, a
# missing value reaches the NA comparison as '?' (not ' ?'). Declaring '?' here marks
# missing values while parsing, so no extra full-frame `.replace('?', np.nan)` pass
# is needed afterwards.
params_d = {
    'names': vars,
    'header': None,
    'na_values': '?',
    'skipinitialspace': True
}

data_train = pd.read_csv('split-data/adult.data', **params_d)
# The first line of the test file is skipped (it is not a data record).
# NOTE(review): in the public UCI copy of this dataset the test labels carry a
# trailing '.' - confirm the labels are normalised before they are compared with
# the training labels.
data_test = pd.read_csv('split-data/adult.test', skiprows=1, **params_d)

print(f'Training Data Dimensions: {data_train.shape}\nTesting Data Dimensions: {data_test.shape}')

Training Data Dimensions: (32561, 15)
Testing Data Dimensions: (16281, 15)


In [42]:
# ===== stack train and test into full_data for feature engineering; the 'set' column records each row's origin ======
data_train['set'], data_test['set'] = 'train', 'test'
full_data = pd.concat([data_train, data_test], ignore_index=True)

# ===== one-hot encode ======

def one_hot_encode(raw_data, col_name_to_encode):
    """One-hot encode a single column of ``raw_data``.

    Every distinct value ``v`` of the column becomes an indicator column named
    ``f'{col_name_to_encode.upper()}_{v}'``. A missing value (NaN) is formatted
    as the text ``nan`` and therefore gets its own ``..._nan`` column. The
    output columns are ordered by sorting these names as strings.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame that contains the column to encode.
    col_name_to_encode : str
        Name of the column to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        Float frame of 0.0 / 1.0 values with one row per row of ``raw_data``
        (same row order, same index) and one column per distinct value.
        Keeping the input index means the result lines up with ``raw_data``
        when both are concatenated column-wise, also for non-default or
        duplicated index labels.
    """
    n_obs = raw_data.shape[0]
    prefix = col_name_to_encode.upper()

    # Indicator-column name for every row, in row order.
    labels = [f'{prefix}_{value}' for value in raw_data[col_name_to_encode].tolist()]

    cols = sorted(set(labels))
    col_position = {name: j for j, name in enumerate(cols)}

    # Scatter the ones in a single vectorised assignment instead of a per-row
    # Python loop of scalar `.at` reads and writes.
    encoded = np.zeros(shape=(n_obs, len(cols)))
    # An explicit integer dtype keeps the fancy index valid for an empty frame.
    hot_cols = np.fromiter((col_position[name] for name in labels), dtype=np.intp, count=n_obs)
    encoded[np.arange(n_obs), hot_cols] = 1

    return pd.DataFrame(encoded, columns=cols, index=raw_data.index)

# Columns that are expanded into one indicator column per distinct value
list_col_names_to_encode = [
    'age', 'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country'
]

# Columns that are carried over unchanged (numeric features, label, train/test tag)
valid_cols = [
    'fnlwgt', 'education-num', 'capital-gain',
    'capital-loss', 'hours-per-week', 'target', 'set'
]

# Sanity check: the two lists together must account for every column of full_data
assert len(full_data.columns) == len(list_col_names_to_encode) + len(valid_cols), 'not true'

# Encode each column separately and keep the pieces keyed by source column
engineered_df_d = {}
for col in list_col_names_to_encode:
    engineered_df_d[col] = one_hot_encode(raw_data=full_data, col_name_to_encode=col)

# Glue the indicator blocks together side by side, in the order they were encoded
engineered_df = pd.concat(
    [engineered_df_d[name] for name in list_col_names_to_encode],
    axis=1,
)

# ===== split back into train / test ======
df = pd.concat([engineered_df, full_data[valid_cols]], axis=1)
y = df['target']
X = df.drop(columns='target')

# True for rows that came from the training file
bool_musk = df['set'].eq('train').to_numpy()

# The 'set' tag is only bookkeeping, so it is removed from the feature matrices
X_tr = X.loc[bool_musk].drop(columns='set')
X_ts = X.loc[~bool_musk].drop(columns='set')
y_tr = y[bool_musk]
y_ts = y[~bool_musk]

In [43]:
# Show the training feature matrix: the one-hot indicator columns followed by the
# numeric columns that were carried over unchanged ('target' and 'set' removed).
X_tr

Unnamed: 0,AGE_17,AGE_18,AGE_19,AGE_20,AGE_21,AGE_22,AGE_23,AGE_24,AGE_25,AGE_26,...,NATIVE-COUNTRY_Trinadad&Tobago,NATIVE-COUNTRY_United-States,NATIVE-COUNTRY_Vietnam,NATIVE-COUNTRY_Yugoslavia,NATIVE-COUNTRY_nan,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,77516,13,2174,0,40
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,83311,13,0,0,13
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,215646,9,0,0,40
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,234721,7,0,0,40
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,338409,13,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,257302,12,0,0,38
32557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,154374,9,0,0,40
32558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,151910,9,0,0,40
32559,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,201490,9,0,0,20
