In [None]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.preprocessing import QuantileTransformer
from copy import deepcopy
from lib.data import preprocess
from lib.model import Model

import pandas as pd
import numpy as np
# Seed NumPy's global RNG so draws made through np.random.* are repeatable across runs.
np.random.seed(0)


import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

# Download census-income dataset

In [None]:
# Remote location of the UCI "adult" data file and the name it is stored under locally.
dataset_name = 'census-income'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Local target: <current working directory>/data/<dataset_name>.csv
out = Path(os.getcwd()) / 'data' / f'{dataset_name}.csv'

In [None]:
# Make sure the destination directory exists, then fetch the file only if it
# is not already present on disk.
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

# Load data and split

In [None]:
# NOTE(review): this rebinds `out` from the download path built earlier to a local CSV,
# so the file read next is this one rather than the downloaded census-income data — confirm intended.
out = 'train_bench_bm.csv'

In [None]:
# Read the CSV at `out`; `target` holds the name of the target column.
df = pd.read_csv(out)
target = 'y'  # ' <=50K'

# If the file has no "Set" column, label each row "train" (p=0.8) or "test"
# (p=0.2) at random. No row is labelled "valid" on this path, so
# valid_indices is empty in that case.
if "Set" not in df.columns:
    df["Set"] = np.random.choice(["train", "test"], size=(len(df),), p=[.8, .2])

# Row labels belonging to each split, selected by the value in "Set".
train_indices, valid_indices, test_indices = (
    df.loc[df["Set"] == split_name].index
    for split_name in ("train", "valid", "test")
)

In [None]:
# Remove the helper "Set" column (split membership is already held in the
# *_indices variables) and replace missing values with the sentinel -1.
# errors='ignore' keeps this cell re-runnable: executing it a second time, when
# "Set" has already been dropped, would otherwise raise KeyError.
df = df.drop(columns=['Set'], errors='ignore').fillna(-1)

In [None]:
# train = df.loc[df.Set=="train"].reset_index(drop=True)
# test = df.loc[df.Set=="test"].reset_index(drop=True)

In [None]:
# Preview the first rows of df as a quick sanity check of the table.
df.head()

In [None]:
# Map each split name to the row labels computed for it.
split_indices = {
    "train": train_indices,
    "valid": valid_indices,
    "test": test_indices,
}

In [None]:
# Run the project's preprocessing on the full frame. The result is indexed
# later in this notebook by keys such as 'X_train', 'y_train', 'X_test', 'y_test'.
data = preprocess(
    df,
    target=target,
    split_indices=split_indices,
    quantile_transform=True,
)

# Define categorical features for categorical embeddings

# Training

In [None]:
def split_and_preprocess(X_train, y_train, n_splits=10, random_state=0):
    """Yield stratified K-fold train/validation splits of ``(X_train, y_train)``.

    Despite the name, no preprocessing is performed here: the inputs are only
    partitioned with a shuffled ``StratifiedKFold``.

    Rows are always selected by position. Objects that expose ``.iloc``
    (pandas DataFrame / Series) are indexed through it, because plain
    ``obj[int_array]`` is label-based there (it selects columns on a
    DataFrame and looks up index labels on a Series). Every other input is
    indexed with ``obj[positions]``.

    Parameters
    ----------
    X_train : array-like
        Feature rows; the first axis enumerates samples.
    y_train : array-like
        Class labels, one per row of ``X_train`` (used for stratification).
    n_splits : int, default 10
        Number of folds.
    random_state : int, default 0
        Seed controlling the shuffling of the folds.

    Yields
    ------
    tuple
        ``(X_fold_train, y_fold_train, X_fold_valid, y_fold_valid)`` for each
        of the ``n_splits`` folds.
    """

    def _take_rows(obj, positions):
        # Positional row selection that works for both pandas and array inputs.
        indexer = getattr(obj, 'iloc', None)
        return obj[positions] if indexer is None else indexer[positions]

    # CVSplit = KFold if self.learning_task == 'regression' else StratifiedKFold
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_index, test_index in cv.split(X_train, y_train):
        yield (
            _take_rows(X_train, train_index),
            _take_rows(y_train, train_index),
            _take_rows(X_train, test_index),
            _take_rows(y_train, test_index),
        )

In [None]:
# Train one Model per cross-validation fold, collecting each fitted model and
# the value returned by its fit() call.
evals_results, clfs = [], []
n_features = data['X_train'].shape[1]
fold_splits = split_and_preprocess(data['X_train'], data['y_train'])
for fold, (X_fold_tr, y_fold_tr, X_fold_va, y_fold_va) in enumerate(fold_splits):
    clf = Model(
        input_dim=n_features,
        output_dim=2,
        experiment_name=f'fold_{fold}',
    )
    evals_result = clf.fit(
        X_train=X_fold_tr, y_train=y_fold_tr,
        X_valid=X_fold_va, y_valid=y_fold_va,
        early_stopping_rounds=10000,
        report_frequency=100,
        plot=True,
    )
    print(evals_result)
    evals_results.append(evals_result)
    # Store an independent copy of the fitted model for the prediction cells.
    clfs.append(deepcopy(clf))
    # Ask PyTorch to release cached, unused GPU memory before the next fold.
    torch.cuda.empty_cache()

### Predictions

In [None]:
# Print each fold model's ROC AUC on the test split, scoring with column 1
# of its predictions.
for fold_model in clfs:
    fold_preds = fold_model.predict(data['X_test'])
    fold_auc = roc_auc_score(data['y_test'], fold_preds[:, 1])
    print(fold_auc)

In [None]:
# Average the predictions of all fold models (each contributes 1/len(clfs)),
# then score column 1 of the averaged output against the test labels.
n_models = len(clfs)
preds = sum(c.predict(data['X_test']) / n_models for c in clfs)

test_auc = roc_auc_score(data['y_test'], preds[:, 1])

# NOTE(review): dataset_name is 'census-income', while the data was read from
# whatever `out` pointed to at load time — confirm the label matches the file.
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")