In [20]:
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

**Load data + feature engineering**

In [2]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

dataset = dataset.reset_index(drop=True) # fix index

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

#### FEATUE ENGINEERING
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Sex
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

# Marital Status
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

# Education
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')
dataset = pd.concat([dataset, dummies_ed], axis=1)
dataset = dataset.drop('education', axis=1)

# Workclass
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')
dataset = pd.concat([dataset, dummies_w], axis=1)
dataset = dataset.drop('workclass', axis=1)

# Occupation
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values
dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')
dataset = pd.concat([dataset, dummies_o], axis=1)
dataset = dataset.drop('occupation', axis=1)

# Race 
dummies_r = pd.get_dummies(dataset['race'], prefix='race')
dataset = pd.concat([dataset, dummies_r], axis=1)
dataset = dataset.drop('race', axis=1)

# Relationship
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')
dataset = pd.concat([dataset, dummies_re], axis=1)
dataset = dataset.drop('relationship', axis=1)

# Native Country and fnlwgt dropped
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)

# Convert to bool
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Load Embeddings**

In [13]:
embeddings = pd.read_csv('data/embeddings_three.csv', index_col=0)

In [14]:
embeddings#.reset_index(drop=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.039026,0.083643,-0.006012,0.049184,-0.054871,0.039502,-0.003620,-0.018624,-0.099075,-0.019245,...,0.004741,-0.102041,0.059321,0.087003,-0.089279,-0.049207,0.040364,-0.099796,-0.026120,-0.016798
1,0.027534,0.081433,0.004280,0.060161,-0.051280,0.038296,-0.005563,-0.028956,-0.076627,-0.009940,...,0.003284,-0.094972,0.071254,0.055294,-0.119392,-0.022936,0.034595,-0.068931,-0.007616,-0.003190
2,0.028838,0.079131,0.005577,0.032449,-0.030661,0.045024,-0.012447,-0.018873,-0.073591,-0.021455,...,0.002749,-0.108626,0.059476,0.069770,-0.105638,-0.016599,0.064647,-0.070888,-0.025164,-0.001137
3,0.009379,0.068352,0.001628,0.064624,-0.041974,0.045018,0.004580,-0.041240,-0.089373,-0.006193,...,0.005060,-0.103620,0.054449,0.050977,-0.110134,-0.032914,0.068069,-0.068330,-0.049449,0.013280
4,0.007120,0.052828,-0.005047,0.117845,-0.032977,0.073406,-0.052051,-0.012753,-0.058291,-0.014553,...,-0.005530,-0.053696,0.116197,0.065556,-0.042030,-0.009490,-0.009398,-0.092325,-0.026194,-0.031214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.021136,0.045845,0.011391,0.065562,-0.041699,0.067231,-0.037025,0.005638,-0.063542,-0.020633,...,-0.035309,-0.057135,0.072141,0.035106,-0.100375,0.026807,0.041409,-0.076980,-0.019366,-0.013934
48838,0.026922,0.087038,-0.006390,0.034871,-0.023873,0.052667,-0.028780,-0.043497,-0.091849,-0.031310,...,-0.017029,-0.096382,0.042998,0.071217,-0.112285,-0.032941,0.094064,-0.069105,-0.035706,-0.007028
48839,0.042622,0.076069,-0.001209,0.057808,-0.046720,0.043923,-0.001243,-0.024602,-0.087601,-0.011226,...,-0.007089,-0.092548,0.063754,0.066648,-0.108461,-0.026214,0.041539,-0.073822,-0.036897,-0.003668
48840,0.042411,0.070807,0.000363,0.053638,-0.049871,0.036463,-0.015207,-0.027775,-0.062938,-0.018996,...,0.011251,-0.079640,0.075235,0.077276,-0.099512,-0.003458,0.042897,-0.086952,-0.030336,-0.007412


**Define grid search**

In [4]:
data_size = [100, 500, 1000, 5000, 10000, 32561]

In [5]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

XGB_grid = {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    }

skf = StratifiedKFold(n_splits=5, shuffle=True)

**Baseline**

In [6]:
# Run in 2:30 minutes
auc_baseline = {}
acc_baseline = {}

for size in data_size:
    ## Baseline model
    X_train, Y_train = dataset.drop(['income'], axis=1).iloc[:size], dataset[['income']].iloc[:size]
    X_test, Y_test = dataset.drop(['income'], axis=1).iloc[32561:], dataset[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_baseline[size] = auc
    acc_baseline[size] = acc

In [7]:
auc_baseline

{100: 0.7200236236228701,
 500: 0.7870222504919496,
 1000: 0.8022879660662904,
 5000: 0.8218933148158254,
 10000: 0.8151190245438528,
 32561: 0.8191613864795846}

**Tabtext**

In [35]:
auc_tabtext = {}
acc_tabtext = {}

pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)
embeddings_df = pd.DataFrame(embeddings_3d, columns=['x1', 'x2', 'x3'])

df_concat = pd.concat([dataset, embeddings_df], axis=1)

for size in data_size:
    ## Baseline model
    X_train, Y_train = df_concat.drop(['income'], axis=1).iloc[:size], df_concat[['income']].iloc[:size]
    X_test, Y_test = df_concat.drop(['income'], axis=1).iloc[32561:], df_concat[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_tabtext[size] = auc
    acc_tabtext[size] = acc

In [36]:
auc_tabtext

{100: 0.7238584163390661,
 500: 0.7955688143086641,
 1000: 0.7981608263124251,
 5000: 0.8108647023806164,
 10000: 0.8217381240484841,
 32561: 0.8304862351309493}

In [37]:
acc_tabtext = pd.DataFrame.from_dict(acc_tabtext, orient='index').rename({0: 'Accuracy - TabText'}, axis=1)
auc_tabtext = pd.DataFrame.from_dict(auc_tabtext, orient='index').rename({0: 'AUC - TabText'}, axis=1)

In [25]:
acc_baseline = pd.DataFrame.from_dict(acc_baseline, orient='index').rename({0: 'Accuracy - Baseline'}, axis=1)
auc_baseline = pd.DataFrame.from_dict(auc_baseline, orient='index').rename({0: 'AUC - Baseline'}, axis=1)
acc_tabtext = pd.DataFrame.from_dict(acc_tabtext, orient='index').rename({0: 'Accuracy - TabText'}, axis=1)
auc_tabtext = pd.DataFrame.from_dict(auc_tabtext, orient='index').rename({0: 'AUC - TabText'}, axis=1)

TypeError: 'numpy.ndarray' object is not callable

In [38]:
comparison = pd.concat([acc_baseline, acc_tabtext, auc_baseline, auc_tabtext], axis=1)

comparison['Accuracy change'] = comparison['Accuracy - TabText'] - comparison['Accuracy - Baseline']
comparison['AUC change'] = comparison['AUC - TabText'] - comparison['AUC - Baseline']

**3-dimensional PCA**

In [39]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.699711,0.720024,0.723858,-0.032,0.003835
1,500,0.81979,0.814876,0.787022,0.795569,-0.004914,0.008547
2,1000,0.832136,0.814446,0.802288,0.798161,-0.017689,-0.004127
3,5000,0.842332,0.849764,0.821893,0.810865,0.007432,-0.011029
4,10000,0.862846,0.851422,0.815119,0.821738,-0.011424,0.006619
5,32561,0.869295,0.863276,0.819161,0.830486,-0.006019,0.011325


**5-dimensional PCA**

In [34]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.709969,0.720024,0.720426,-0.021743,0.000403
1,500,0.81979,0.82802,0.787022,0.790435,0.00823,0.003413
2,1000,0.832136,0.834838,0.802288,0.792833,0.002703,-0.009455
3,5000,0.842332,0.850623,0.821893,0.80586,0.008292,-0.016033
4,10000,0.862846,0.86346,0.815119,0.809774,0.000614,-0.005345
5,32561,0.869295,0.86733,0.819161,0.825597,-0.001965,0.006436


**7-dimensional PCA**

In [29]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.730299,0.720024,0.685515,-0.001413,-0.034509
1,500,0.81979,0.80855,0.787022,0.783076,-0.01124,-0.003946
2,1000,0.832136,0.828266,0.802288,0.79742,-0.00387,-0.004868
3,5000,0.842332,0.855414,0.821893,0.811601,0.013083,-0.010293
4,10000,0.862846,0.859407,0.815119,0.816908,-0.00344,0.001789
5,32561,0.869295,0.863031,0.819161,0.825117,-0.006265,0.005956


**No PCA, Embeddings 3**

In [19]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.70094,0.720024,0.668181,-0.030772,-0.051843
1,500,0.81979,0.802592,0.787022,0.780523,-0.017198,-0.006499
2,1000,0.832136,0.802653,0.802288,0.796008,-0.029482,-0.00628
3,5000,0.842332,0.843314,0.821893,0.804847,0.000983,-0.017046
4,10000,0.862846,0.846385,0.815119,0.815747,-0.016461,0.000628
5,32561,0.869295,0.860512,0.819161,0.816554,-0.008783,-0.002607


**No PCA, Embeddings 2**

In [19]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.767582,0.719612,0.708036,0.711833,-0.04797,0.003797
1,500,0.81979,0.794976,0.787022,0.767186,-0.024814,-0.019836
2,1000,0.831153,0.82974,0.797604,0.78321,-0.001413,-0.014394
3,5000,0.847552,0.841533,0.818666,0.807183,-0.006019,-0.011483
4,10000,0.862846,0.857011,0.815119,0.810132,-0.005835,-0.004987
5,32561,0.869295,0.860451,0.819161,0.820735,-0.008845,0.001573
