In [1]:
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import xgboost as xgb

In [2]:
# Load the per-country training tables (A, B, C), indexed by their `id` column.
a_train, b_train, c_train = (
    pd.read_csv(DATA_PATHS[country]['train'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [3]:
# Filter out columns with low entropy, then build the feature matrix and the
# integer label vector for each country. `filter_columns` and `pre_process_data`
# come from `utils`; both print a short shape report when called.
#
# The `poor` label is filled by explicit assignment, not by the chained in-place
# form `frame.poor.fillna(False, inplace=True)`. Under pandas copy-on-write the
# chained form no longer updates the parent frame, which would leave NaNs behind
# for `astype(int)` to fail on.
print("Country A")
a_train_reduc = filter_columns(a_train.drop('poor', axis=1))
aX_train = pre_process_data(a_train_reduc)
a_train['poor'] = a_train['poor'].fillna(False)
ay_train = np.ravel(a_train['poor'].astype(int))

print("\nCountry B")
b_train_reduc = filter_columns(b_train.drop('poor', axis=1))
bX_train = pre_process_data(b_train_reduc)
b_train['poor'] = b_train['poor'].fillna(False)
by_train = np.ravel(b_train['poor'].astype(int))

print("\nCountry C")
c_train_reduc = filter_columns(c_train.drop('poor', axis=1))
cX_train = pre_process_data(c_train_reduc)
c_train['poor'] = c_train['poor'].fillna(False)
cy_train = np.ravel(c_train['poor'].astype(int))

Country A
Total columns: 344. To delete: 172
Input shape:	(1855, 172)
After standardization (1855, 172)
After converting categoricals:	(1855, 498)

Country B
Total columns: 441. To delete: 220
Input shape:	(3255, 221)
After standardization (3255, 221)
After converting categoricals:	(3255, 936)

Country C
Total columns: 163. To delete: 81
Input shape:	(6469, 82)
After standardization (6469, 82)
After converting categoricals:	(6469, 564)


In [7]:
from sklearn.decomposition import TruncatedSVD  # PCA

def reduce_dimensions(x, n_comp=100):
    """Project `x` onto `n_comp` components with a freshly fitted TruncatedSVD.

    Args:
        x: Feature matrix handed to `TruncatedSVD.fit_transform`.
        n_comp: Number of components to keep (default 100).

    Returns:
        A `(projected, reducer)` tuple: the transformed features and the fitted
        `TruncatedSVD` instance, so the same projection can be applied to other
        data later via `reducer.transform`.
    """
    # Fixed n_iter / random_state keep the decomposition reproducible across runs.
    reducer = TruncatedSVD(n_components=n_comp, n_iter=7, random_state=42)
    projected = reducer.fit_transform(x)
    return projected, reducer

# Fit one SVD per country (default component count) and keep both the reduced
# training features and the fitted reducer for reuse on the test set.
(aX_train_svd, a_svd), (bX_train_svd, b_svd), (cX_train_svd, c_svd) = (
    reduce_dimensions(features) for features in (aX_train, bX_train, cX_train)
)

In [5]:
# Split each country's reduced features into train / hold-out parts in the
# format requested by `xgb_format=True` (see `prepare_data` in utils).
test_size = 0.2
split_options = {'test_size': test_size, 'xgb_format': True}

xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test = prepare_data(
    aX_train_svd, ay_train, **split_options)
xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test = prepare_data(
    bX_train_svd, by_train, **split_options)
xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test = prepare_data(
    cX_train_svd, cy_train, **split_options)

In [6]:
# Shared boosting configuration: the same parameters and round count are used
# for all three country models.
num_round = 8000
params = {
    'max_depth': 20,
    'eta': 0.05,
    'silent': 0,
    'lambda': 0.5,
    'alpha': 0.5,
    'lambda_bias': 0.5,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42,
}

# One model per country, trained on that country's training split only.
xgb_a = train_xgb_model(xgb_ax_train, params=params, num_round=num_round)
xgb_b = train_xgb_model(xgb_bx_train, params=params, num_round=num_round)
xgb_c = train_xgb_model(xgb_cx_train, params=params, num_round=num_round)

In [7]:
# With PCA reduction (40 dims): 0.7444784675249769
# Without reduction: 0.7105216116701498
# With SVD reduction: 0.585940340593645
#
# Report the losses returned by `cross_validate` for each country model.
# NOTE(review): the saved output shows `Test: None` for every country, so the
# second value returned by `cross_validate` should be checked in utils.
a_losses = cross_validate(xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test, xgb_a)
print("A Loss. Train: {} - Test: {}".format(*a_losses))

b_losses = cross_validate(xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test, xgb_b)
print("B Loss. Train: {} - Test: {}".format(*b_losses))

c_losses = cross_validate(xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test, xgb_c)
print("C Loss. Train: {} - Test: {}".format(*c_losses))

A Loss. Train: 0.2744477151049117 - Test: None
B Loss. Train: 0.17159917608951814 - Test: None
C Loss. Train: 0.1456253027320118 - Test: None


In [8]:
# Prepare Submission


def _align_test_features(test_raw, train_reduced, train_features):
    """Build a test feature frame with exactly the training feature columns.

    Args:
        test_raw: Raw test DataFrame as loaded from CSV.
        train_reduced: Filtered training frame; its columns are the raw
            columns kept from the test data.
        train_features: Pre-processed training frame; its columns (and their
            order) define the layout the fitted SVD / model expect.

    Returns:
        A new DataFrame with the columns of `train_features`, in that order,
        and no missing values. `test_raw` is left untouched.
    """
    # Create dummies, standardize numeric values (done by `pre_process_data`).
    test_features = pre_process_data(test_raw[train_reduced.columns.tolist()])
    # A single reindex drops dummy columns that never appeared in training,
    # adds the training-only dummy columns filled with 0, and restores the
    # training column order so XGBoost does not explode.
    test_features = test_features.reindex(columns=train_features.columns, fill_value=0)
    return test_features.fillna(0)


# load test data (kept under separate names so this cell can be re-run safely)
a_test_raw = pd.read_csv(DATA_PATHS['A']['test'], index_col='id')
b_test_raw = pd.read_csv(DATA_PATHS['B']['test'], index_col='id')
c_test_raw = pd.read_csv(DATA_PATHS['C']['test'], index_col='id')

# Align each test set with the feature layout of its training set
a_test = _align_test_features(a_test_raw, a_train_reduc, aX_train)
b_test = _align_test_features(b_test_raw, b_train_reduc, bX_train)
c_test = _align_test_features(c_test_raw, c_train_reduc, cX_train)

# Reduce dimensions with the SVD fitted on the training data
# (skip this step if the models were trained without reduction)
a_test_svd = a_svd.transform(a_test)
b_test_svd = b_svd.transform(b_test)
c_test_svd = c_svd.transform(c_test)

# Create XGBoost matrix
a_testxgb = xgb.DMatrix(a_test_svd)
b_testxgb = xgb.DMatrix(b_test_svd)
c_testxgb = xgb.DMatrix(c_test_svd)

a_preds = xgb_a.predict(a_testxgb)
b_preds = xgb_b.predict(b_testxgb)
c_preds = xgb_c.predict(c_testxgb)

# Prepare dataframes for each country
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

submission = pd.concat([a_sub, b_sub, c_sub])
submission.to_csv('submission_xgb.csv')
print("Submission saved.")

Input shape:	(4041, 172)
After standardization (4041, 172)
After converting categoricals:	(4041, 500)
Input shape:	(1604, 221)
After standardization (1604, 221)
After converting categoricals:	(1604, 925)
Input shape:	(3187, 82)
After standardization (3187, 82)
After converting categoricals:	(3187, 547)
Submission saved.


# Tune Hyperparameters

In [6]:
# Let's try to find the optimal hyperparameters for each country
def tune_params(dtrain, dtest, y_test):
    """Grid-search XGBoost hyperparameters against a hold-out set.

    Every combination of boosting rounds, `max_depth`, `eta`,
    `min_child_weight`, `lambda`, `alpha` and `lambda_bias` is trained with
    `train_xgb_model` and scored with `log_loss` on the hold-out predictions.

    Args:
        dtrain: Training data handed to `train_xgb_model`
            (presumably an `xgb.DMatrix` -- confirm against `prepare_data`).
        dtest: Hold-out data handed to `model.predict`.
        y_test: Hold-out labels passed to `log_loss` with the predictions.

    Returns:
        `(best_hyperparams, best_num_rounds, best_loss)`: the parameter dict
        and round count of the lowest-loss combination, and that loss. If no
        combination yields a comparable loss, the result is `({}, 0, inf)`.

    NOTE(review): `log_loss` is called as `log_loss(pred, y_test)`. The helper
    comes from the star import; if it is scikit-learn's, the expected order is
    `(y_true, y_pred)` -- confirm.
    """
    import itertools  # local import keeps this cell self-contained

    # Defaults for every key; the searched keys are overridden per trial.
    base_params = {
        'max_depth': 20,
        'eta': 0.05,
        'silent': 0,
        'lambda': 0.5,
        'alpha': 0.5,
        'lambda_bias': 0.5,
        'min_child_weight': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': 42
    }

    num_rounds = [1000, 3000, 5000, 8000]
    # Insertion order defines the nesting order of the search (outer -> inner).
    search_space = {
        'max_depth': list(range(3, 15)),
        'eta': [0.01, 0.05],
        'min_child_weight': [1, 2],
        'lambda': [0.5, 1],
        'alpha': [0.5, 1],
        'lambda_bias': [0.5, 1],
    }
    searched_keys = list(search_space)
    grid = list(itertools.product(num_rounds, *search_space.values()))

    best_loss = float('inf')
    best_num_rounds = 0
    best_hyperparams = {}

    with tqdm(total=len(grid)) as pbar:
        for num_round, *values in grid:
            # Build a fresh dict for every trial. Storing a reference to one
            # shared, mutated dict would make the "best" parameters silently
            # track whatever combination was tried last.
            trial_params = {**base_params, **dict(zip(searched_keys, values))}

            model = train_xgb_model(dtrain, params=trial_params, num_round=num_round)

            pred = model.predict(dtest)
            loss = log_loss(pred, y_test)

            # Strict comparison: on ties the earliest combination is kept.
            if loss < best_loss:
                best_loss = loss
                best_hyperparams = trial_params
                best_num_rounds = num_round
            pbar.update(1)

    return best_hyperparams, best_num_rounds, best_loss

In [None]:
# Tune country A over several SVD sizes and keep the best configuration.
test_size = 0.2
best_params = {}
best_rounds = 0
best_loss = float('inf')
best_svd = None
for n_comp in [50, 100, 200]:
    # Loop-local names on purpose: overwriting aX_train_svd / a_svd / xgb_ax_*
    # here would leave them out of sync with the already trained `xgb_a`,
    # which the submission cell uses together with `a_svd`.
    trial_features, trial_svd = reduce_dimensions(aX_train, n_comp=n_comp)
    trial_train, trial_test, _, trial_y_test = prepare_data(
        trial_features, ay_train, test_size=test_size, xgb_format=True)
    trial_params, trial_rounds, trial_loss = tune_params(trial_train, trial_test, trial_y_test)
    if trial_loss < best_loss:
        best_params = trial_params
        best_rounds = trial_rounds
        best_loss = trial_loss
        best_svd = trial_svd

# Expose the winning configuration under the same naming scheme used for B and C.
a_params, a_num_rounds = best_params, best_rounds

print("A best params for loss: {} :".format(best_loss))
print(best_params)
print(best_rounds)

  8%|▊         | 124/1536 [09:25<1:30:42,  3.85s/it]

In [None]:
# tune_params returns (params, num_rounds, loss); unpacking only two names raises ValueError.
b_params, b_num_rounds, b_loss = tune_params(xgb_bx_train, xgb_bx_test, xgb_by_test)

In [None]:
# tune_params returns (params, num_rounds, loss); unpacking only two names raises ValueError.
c_params, c_num_rounds, c_loss = tune_params(xgb_cx_train, xgb_cx_test, xgb_cy_test)