In [2]:
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

**Load data + feature engineering**

In [3]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

dataset = dataset.reset_index(drop=True) # fix index

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

#### FEATUE ENGINEERING
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Sex
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

# Marital Status
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

# Education
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')
dataset = pd.concat([dataset, dummies_ed], axis=1)
dataset = dataset.drop('education', axis=1)

# Workclass
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')
dataset = pd.concat([dataset, dummies_w], axis=1)
dataset = dataset.drop('workclass', axis=1)

# Occupation
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values
dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')
dataset = pd.concat([dataset, dummies_o], axis=1)
dataset = dataset.drop('occupation', axis=1)

# Race 
dummies_r = pd.get_dummies(dataset['race'], prefix='race')
dataset = pd.concat([dataset, dummies_r], axis=1)
dataset = dataset.drop('race', axis=1)

# Relationship
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')
dataset = pd.concat([dataset, dummies_re], axis=1)
dataset = dataset.drop('relationship', axis=1)

# Native Country and fnlwgt dropped
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)

# Convert to bool
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Load Embeddings**

In [4]:
embeddings = pd.read_csv('data/embeddings_two.csv', index_col=0)

**Define grid search**

In [5]:
data_size = [100, 500, 1000, 5000, 10000, 32561]

In [6]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

XGB_grid = {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    }

skf = StratifiedKFold(n_splits=5, shuffle=True)

**Baseline**

In [7]:
# Run in 2:30 minutes
auc_baseline = {}
acc_baseline = {}

for size in data_size:
    ## Baseline model
    X_train, Y_train = dataset.drop(['income'], axis=1).iloc[:size], dataset[['income']].iloc[:size]
    X_test, Y_test = dataset.drop(['income'], axis=1).iloc[32561:], dataset[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_baseline[size] = auc
    acc_baseline[size] = acc

In [8]:
auc_baseline

{100: 0.7080362659620981,
 500: 0.7870222504919496,
 1000: 0.797603795587288,
 5000: 0.8186661748737741,
 10000: 0.8151190245438528,
 32561: 0.8191613864795846}

**Tabtext**

In [9]:
auc_tabtext = {}
acc_tabtext = {}

df_concat = pd.concat([dataset, embeddings], axis=1)

for size in data_size:
    ## Baseline model
    X_train, Y_train = df_concat.drop(['income'], axis=1).iloc[:size], df_concat[['income']].iloc[:size]
    X_test, Y_test = df_concat.drop(['income'], axis=1).iloc[32561:], df_concat[['income']].iloc[32561:]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb = XGBClassifier()
    search = RandomizedSearchCV(xgb, 
                                XGB_grid, 
                                scoring='roc_auc', 
                                cv=skf.split(X_train, Y_train))
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    probas = best_model.predict_proba(X_test)[:, 1]
    best_threshold = find_best_threshold(Y_test, probas)

    preds = (best_model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    acc = accuracy_score(Y_test, preds)

    auc_tabtext[size] = auc
    acc_tabtext[size] = acc

In [10]:
auc_tabtext

{100: 0.7118330450950245,
 500: 0.7671859347232756,
 1000: 0.783209569637309,
 5000: 0.8071828108347494,
 10000: 0.8101318117863435,
 32561: 0.8207345591773008}

In [15]:
acc_baseline = pd.DataFrame.from_dict(acc_baseline, orient='index').rename({0: 'Accuracy - Baseline'}, axis=1)
auc_baseline = pd.DataFrame.from_dict(auc_baseline, orient='index').rename({0: 'AUC - Baseline'}, axis=1)
acc_tabtext = pd.DataFrame.from_dict(acc_tabtext, orient='index').rename({0: 'Accuracy - TabText'}, axis=1)
auc_tabtext = pd.DataFrame.from_dict(auc_tabtext, orient='index').rename({0: 'AUC - TabText'}, axis=1)

TypeError: 'numpy.ndarray' object is not callable

In [16]:
comparison = pd.concat([acc_baseline, acc_tabtext, auc_baseline, auc_tabtext], axis=1)

comparison['Accuracy change'] = comparison['Accuracy - TabText'] - comparison['Accuracy - Baseline']
comparison['AUC change'] = comparison['AUC - TabText'] - comparison['AUC - Baseline']

In [19]:
comparison.reset_index().rename({'index': 'Training set size'}, axis=1)

Unnamed: 0,Training set size,Accuracy - Baseline,Accuracy - TabText,AUC - Baseline,AUC - TabText,Accuracy change,AUC change
0,100,0.767582,0.719612,0.708036,0.711833,-0.04797,0.003797
1,500,0.81979,0.794976,0.787022,0.767186,-0.024814,-0.019836
2,1000,0.831153,0.82974,0.797604,0.78321,-0.001413,-0.014394
3,5000,0.847552,0.841533,0.818666,0.807183,-0.006019,-0.011483
4,10000,0.862846,0.857011,0.815119,0.810132,-0.005835,-0.004987
5,32561,0.869295,0.860451,0.819161,0.820735,-0.008845,0.001573
