In [44]:
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

**Load data + feature engineering**

In [46]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

dataset = dataset.reset_index(drop=True) # fix index

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

#### FEATUE ENGINEERING
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Sex
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

# Marital Status
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

# Education
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')
dataset = pd.concat([dataset, dummies_ed], axis=1)
dataset = dataset.drop('education', axis=1)

# Workclass
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')
dataset = pd.concat([dataset, dummies_w], axis=1)
dataset = dataset.drop('workclass', axis=1)

# Occupation
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values
dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')
dataset = pd.concat([dataset, dummies_o], axis=1)
dataset = dataset.drop('occupation', axis=1)

# Race 
dummies_r = pd.get_dummies(dataset['race'], prefix='race')
dataset = pd.concat([dataset, dummies_r], axis=1)
dataset = dataset.drop('race', axis=1)

# Relationship
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')
dataset = pd.concat([dataset, dummies_re], axis=1)
dataset = dataset.drop('relationship', axis=1)

# Native Country and fnlwgt dropped
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)

# Convert to bool
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Load Embeddings**

In [47]:
embeddings = pd.read_csv('data/embeddings-finetuned.csv', index_col=0)

In [48]:
embeddings#.reset_index(drop=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.035360,0.022549,-0.007589,0.050871,-0.060745,-0.035152,0.111855,-0.037031,0.009979,0.020047,...,-0.042485,-0.084843,0.015672,-0.073311,-0.005877,-0.047895,-0.026162,-0.042927,-0.003594,0.008632
1,-0.032857,-0.029700,0.016033,0.030637,-0.040461,-0.037377,0.117896,-0.048750,0.023549,0.038313,...,-0.053596,-0.080206,-0.004534,-0.057768,-0.012053,0.009787,-0.040216,-0.028040,0.003096,0.009196
2,-0.023483,0.010663,-0.006748,0.037087,-0.062510,-0.023429,0.115462,-0.039483,-0.008054,0.020186,...,-0.024188,-0.094730,0.015135,-0.069029,-0.026648,-0.041145,-0.014038,-0.023826,-0.004342,0.023390
3,-0.048346,0.025542,-0.013994,0.043534,-0.069787,-0.053106,0.110434,-0.032748,0.021770,0.017684,...,-0.034078,-0.084353,0.001684,-0.078801,-0.004006,-0.033225,-0.021182,-0.040656,-0.007733,0.022023
4,-0.030368,0.009924,0.007800,0.035145,-0.067556,-0.037239,0.114391,-0.056107,0.003597,0.008387,...,-0.033936,-0.090382,0.009133,-0.058547,-0.023332,-0.032942,-0.026733,-0.018526,-0.005428,0.032371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,-0.053574,-0.015847,-0.000676,0.041990,-0.051328,-0.055476,0.104646,-0.039574,0.041480,0.035466,...,-0.058882,-0.064229,-0.004421,-0.090264,-0.005488,0.006804,-0.040342,-0.057963,0.008182,0.002438
16277,-0.026639,0.007834,-0.003539,0.040561,-0.078775,-0.033048,0.118137,-0.043240,0.008379,0.018539,...,-0.036019,-0.090419,0.001768,-0.074043,-0.019900,-0.040379,-0.010911,-0.031792,-0.000406,0.026478
16278,-0.037011,-0.033163,0.014149,0.036309,-0.048313,-0.038713,0.117023,-0.046530,0.032015,0.022442,...,-0.049331,-0.073891,-0.008103,-0.068025,0.003834,0.000126,-0.062187,-0.031872,0.004026,-0.003032
16279,-0.020667,0.009221,0.008026,0.049759,-0.071663,-0.041197,0.121158,-0.060702,0.016001,-0.004056,...,-0.007961,-0.096071,-0.007033,-0.048922,0.004443,-0.028456,-0.046850,-0.001086,-0.014730,0.031963


**Define grid search**

In [4]:
data_size = [100, 500, 1000, 5000, 10000, 32561]

In [5]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

XGB_grid = {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    }

skf = StratifiedKFold(n_splits=5, shuffle=True)

**Baseline**

In [6]:
# Run in 2:30 minutes
auc_baseline = {}
acc_baseline = {}

for size in data_size:
    ## Baseline model
    X_train, Y_train = dataset.drop(['income'], axis=1).iloc[:size], dataset[['income']].iloc[:size]
    X_test, Y_test = dataset.drop(['income'], axis=1).iloc[32561:], dataset[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_baseline[size] = auc
    acc_baseline[size] = acc

In [49]:
auc_baseline

Unnamed: 0,AUC - Baseline
100,0.720024
500,0.787022
1000,0.802288
5000,0.821893
10000,0.815119
32561,0.819161


In [62]:
embeddings.reset_index(drop=True, inplace=True)

**Tabtext**

In [63]:
auc_tabtext = {}
acc_tabtext = {}

#pca = PCA(n_components=5)
#embeddings_3d = pca.fit_transform(embeddings)
#embeddings_df = pd.DataFrame(embeddings_3d, columns=['x1', 'x2', 'x3', 'x4', 'x5'])

df_concat = pd.concat([dataset, embeddings], axis=1)

for size in data_size:
    ## Baseline model
    X_train, Y_train = df_concat.drop(['income'], axis=1).iloc[:size], df_concat[['income']].iloc[:size]
    X_test, Y_test = df_concat.drop(['income'], axis=1).iloc[32561:], df_concat[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_tabtext[size] = auc
    acc_tabtext[size] = acc

In [56]:
auc_tabtext

{100: 0.6931390186849934,
 500: 0.796906754436643,
 1000: 0.7965745433194891,
 5000: 0.8226974965608997,
 10000: 0.8192320503435336,
 32561: 0.8294152369231078}

In [64]:
acc_tabtext = pd.DataFrame.from_dict(acc_tabtext, orient='index').rename({0: 'Accuracy - TabText'}, axis=1)
auc_tabtext = pd.DataFrame.from_dict(auc_tabtext, orient='index').rename({0: 'AUC - TabText'}, axis=1)

In [25]:
acc_baseline = pd.DataFrame.from_dict(acc_baseline, orient='index').rename({0: 'Accuracy - Baseline'}, axis=1)
auc_baseline = pd.DataFrame.from_dict(auc_baseline, orient='index').rename({0: 'AUC - Baseline'}, axis=1)
acc_tabtext = pd.DataFrame.from_dict(acc_tabtext, orient='index').rename({0: 'Accuracy - TabText'}, axis=1)
auc_tabtext = pd.DataFrame.from_dict(auc_tabtext, orient='index').rename({0: 'AUC - TabText'}, axis=1)

TypeError: 'numpy.ndarray' object is not callable

In [65]:
comparison = pd.concat([acc_baseline, acc_tabtext, auc_baseline, auc_tabtext], axis=1)

comparison['Accuracy change'] = comparison['Accuracy - TabText'] - comparison['Accuracy - Baseline']
comparison['AUC change'] = comparison['AUC - TabText'] - comparison['AUC - Baseline']

**3-dimensional PCA with finetuned embeddings**

In [54]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.720287,0.720024,0.677973,-0.011424,-0.04205
1,500,0.81979,0.838339,0.787022,0.79719,0.018549,0.010168
2,1000,0.832136,0.828512,0.802288,0.804944,-0.003624,0.002656
3,5000,0.842332,0.837111,0.821893,0.827006,-0.005221,0.005113
4,10000,0.862846,0.854739,0.815119,0.820048,-0.008108,0.004929
5,32561,0.869295,0.865856,0.819161,0.820322,-0.00344,0.001161


**5-dimensional PCA with finetuned embeddings**

In [59]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.708065,0.720024,0.693139,-0.023647,-0.026885
1,500,0.81979,0.827345,0.787022,0.796907,0.007555,0.009885
2,1000,0.832136,0.832873,0.802288,0.796575,0.000737,-0.005713
3,5000,0.842332,0.84356,0.821893,0.822697,0.001228,0.000804
4,10000,0.862846,0.857196,0.815119,0.819232,-0.005651,0.004113
5,32561,0.869295,0.858485,0.819161,0.829415,-0.01081,0.010254


**3-dimensional PCA with not-finetuned embeddings**

In [39]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.699711,0.720024,0.723858,-0.032,0.003835
1,500,0.81979,0.814876,0.787022,0.795569,-0.004914,0.008547
2,1000,0.832136,0.814446,0.802288,0.798161,-0.017689,-0.004127
3,5000,0.842332,0.849764,0.821893,0.810865,0.007432,-0.011029
4,10000,0.862846,0.851422,0.815119,0.821738,-0.011424,0.006619
5,32561,0.869295,0.863276,0.819161,0.830486,-0.006019,0.011325


**5-dimensional PCA with not-finetuned embeddings**

In [34]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.709969,0.720024,0.720426,-0.021743,0.000403
1,500,0.81979,0.82802,0.787022,0.790435,0.00823,0.003413
2,1000,0.832136,0.834838,0.802288,0.792833,0.002703,-0.009455
3,5000,0.842332,0.850623,0.821893,0.80586,0.008292,-0.016033
4,10000,0.862846,0.86346,0.815119,0.809774,0.000614,-0.005345
5,32561,0.869295,0.86733,0.819161,0.825597,-0.001965,0.006436


**7-dimensional PCA**

In [29]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.730299,0.720024,0.685515,-0.001413,-0.034509
1,500,0.81979,0.80855,0.787022,0.783076,-0.01124,-0.003946
2,1000,0.832136,0.828266,0.802288,0.79742,-0.00387,-0.004868
3,5000,0.842332,0.855414,0.821893,0.811601,0.013083,-0.010293
4,10000,0.862846,0.859407,0.815119,0.816908,-0.00344,0.001789
5,32561,0.869295,0.863031,0.819161,0.825117,-0.006265,0.005956


**No PCA, Embeddings with fine-tuning**

In [66]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.660832,0.720024,0.650096,-0.07088,-0.069928
1,500,0.81979,0.814385,0.787022,0.790039,-0.005405,0.003017
2,1000,0.832136,0.828512,0.802288,0.795246,-0.003624,-0.007042
3,5000,0.842332,0.832873,0.821893,0.819922,-0.009459,-0.001972
4,10000,0.862846,0.850992,0.815119,0.815889,-0.011854,0.00077
5,32561,0.869295,0.864075,0.819161,0.82167,-0.005221,0.002509


**No PCA, Embeddings 3**

In [19]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.731712,0.70094,0.720024,0.668181,-0.030772,-0.051843
1,500,0.81979,0.802592,0.787022,0.780523,-0.017198,-0.006499
2,1000,0.832136,0.802653,0.802288,0.796008,-0.029482,-0.00628
3,5000,0.842332,0.843314,0.821893,0.804847,0.000983,-0.017046
4,10000,0.862846,0.846385,0.815119,0.815747,-0.016461,0.000628
5,32561,0.869295,0.860512,0.819161,0.816554,-0.008783,-0.002607


**No PCA, Embeddings 2**

In [19]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.767582,0.719612,0.708036,0.711833,-0.04797,0.003797
1,500,0.81979,0.794976,0.787022,0.767186,-0.024814,-0.019836
2,1000,0.831153,0.82974,0.797604,0.78321,-0.001413,-0.014394
3,5000,0.847552,0.841533,0.818666,0.807183,-0.006019,-0.011483
4,10000,0.862846,0.857011,0.815119,0.810132,-0.005835,-0.004987
5,32561,0.869295,0.860451,0.819161,0.820735,-0.008845,0.001573


In [None]:
df