In [1]:
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import scipy.stats as ss
import xgboost as xgb

In [2]:
# load training data
a_train = pd.read_csv(DATA_PATHS['A']['train'], index_col='id')
b_train = pd.read_csv(DATA_PATHS['B']['train'], index_col='id')
c_train = pd.read_csv(DATA_PATHS['C']['train'], index_col='id')

In [3]:
# Lets filter out the columns with high correlation
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

def get_highly_correlated_columns(df):
    df_objs = df.select_dtypes(include=['O'])
    del df_objs['country']
    corr_matrix = pd.DataFrame(columns=df_objs.columns, index=df_objs.columns.tolist())
    with tqdm(total=len(corr_matrix.columns.tolist()) * len(corr_matrix.columns.tolist())) as pbar:
        for col1 in df_objs.columns.tolist():
            for col2 in df_objs.columns.tolist():
                if col1 != col2:
                    confusion_matrix = pd.crosstab(df_objs[col1], df_objs[col2])
                    corr = cramers_corrected_stat(confusion_matrix)
                else:
                    corr = 1
                corr_matrix.loc[col1, col2] = corr
                pbar.update(1)
    
    cols = {}
    for c1 in corr_matrix.columns.tolist():
        s = corr_matrix.loc[c1]
        s = s[s > 0.5]
        s = list(s.index)
        s.remove(c1)
        cols[c1] = s

    cols = {k: cols[k] for k in cols if cols[k]}
    return cols

In [292]:
a_corr_cols = get_highly_correlated_columns(a_train)
b_corr_cols = get_highly_correlated_columns(b_train)
c_corr_cols = get_highly_correlated_columns(c_train)

  1%|          | 1095/114921 [00:05<09:58, 190.24it/s]


KeyboardInterrupt: 

In [293]:
# Lets delete the highly correlated columns in a greedy manner
def delete_columns(df, dict_cols):
    to_delete = list(set([elem for values in dict_cols.values() for elem in values]))
    return df.drop(to_delete, axis=1)


a_train_small = delete_columns(a_train, a_corr_cols)
b_train_small = delete_columns(b_train, b_corr_cols)
c_train_small = delete_columns(c_train, c_corr_cols)


print("Columns removed from A: {}".format(len(a_train.columns) - len(a_train_small.columns)))
print("Columns removed from B: {}".format(len(b_train.columns) - len(b_train_small.columns)))
print("Columns removed from C: {}".format(len(c_train.columns) - len(c_train_small.columns)))

Columns removed from A: 103
Columns removed from B: 183
Columns removed from C: 75


Great! we deleted hundreds of columns just by checking their correlation. Now lets train:

In [3]:
from sklearn.utils import resample
def balance(df):
    poor = df[df['poor'] == True]
    not_poor = df[df['poor'] == False]
    poor_upsampled = resample(poor, 
                              replace=True,
                              n_samples=len(not_poor),
                              random_state=42)
    return pd.concat([poor_upsampled, not_poor])

In [4]:
# Filter out columns with low entropy
print("Country A")
#a_train_reduc = a_train_small
a_train_bala = balance(a_train)
a_train_reduc = filter_columns(a_train_bala.drop('poor', axis=1))
aX_train = pre_process_data(a_train_reduc)
a_train.poor.fillna(False, inplace=True)
ay_train = np.ravel(a_train_bala.poor.astype(int))

Country A
Total columns: 344. To delete: 172
Input shape:	(9000, 172)
After standardization (9000, 172)
After converting categoricals:	(9000, 504)


In [6]:
print("\nCountry B")
#b_train_reduc = b_train_small
b_train_bala = balance(b_train)
b_train_reduc = filter_columns(b_train_bala.drop('poor', axis=1))
bX_train = pre_process_data(b_train_reduc)
b_train.poor.fillna(False, inplace=True)
by_train = np.ravel(b_train_bala.poor.astype(int))


Country B
Total columns: 441. To delete: 220
Input shape:	(6008, 221)
After standardization (6008, 221)
After converting categoricals:	(6008, 934)


In [7]:
print("\nCountry C")
#c_train_reduc = c_train_small
c_train_bala = balance(c_train)
c_train_reduc = filter_columns(c_train_bala.drop('poor', axis=1))
cX_train = pre_process_data(c_train_reduc)
c_train.poor.fillna(False, inplace=True)
cy_train = np.ravel(c_train_bala.poor.astype(int))


Country C
Total columns: 163. To delete: 81
Input shape:	(10992, 82)
After standardization (10992, 82)
After converting categoricals:	(10992, 562)


In [8]:
# Omit this step if you dont want to do dimension reduction
from sklearn.decomposition import TruncatedSVD  # PCA

def reduce_dimensions(x, n_comp=90):
    #pca = PCA(n_components=40)
    svd = TruncatedSVD(n_components=n_comp, n_iter=7, random_state=42)
    return svd.fit_transform(x), svd

# reduce dimensions for all countries
aX_train_svd, a_svd = reduce_dimensions(aX_train)
bX_train_svd, b_svd = reduce_dimensions(bX_train)
cX_train_svd, c_svd = reduce_dimensions(cX_train)

#aX_train = aX_train_svd
#bX_train = bX_train_svd
#cX_train = cX_train_svd

In [9]:
# Prepare data to train
test_size = 0.2

xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test = prepare_data(aX_train_svd, ay_train, test_size=test_size, xgb_format=True)

In [10]:
xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test = prepare_data(bX_train_svd, by_train, test_size=test_size, xgb_format=True)

In [11]:
xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test = prepare_data(cX_train_svd, cy_train, test_size=test_size, xgb_format=True)

In [428]:
num_round = 8000
#params = {'max_depth': 15, 'eta': 0.01, 'silent': 0, 'lambda': 0.5, 'alpha': 0.5, 'lambda_bias': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
params = {'max_depth': 15, 'eta': 0.01, 'silent': 0, 'lambda': 0.5, 'alpha': 0.5, 'lambda_bias': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}

xgb_a = train_xgb_model(xgb_ax_train, params=params, num_round=num_round)

In [429]:
xgb_b = train_xgb_model(xgb_bx_train, params=params, num_round=num_round)

In [430]:
xgb_c = train_xgb_model(xgb_cx_train, params=params, num_round=num_round)

In [431]:
# With PCA reduction (40 dims): 0.7444784675249769
# Without reduction: 0.7105216116701498
# With SVD reduction: 0.585940340593645
# With Cramers + SVD: 0.22948565367793244
# With Entropy + SVD: 0.1343193065703149

#xgb_ax_test = xgb_ax_train
#xgb_ay_test = xgb_ay_train
a_loss_train, a_loss_test = cross_validate(xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test, xgb_a)

print("A Loss. Train: {} - Test: {}".format(a_loss_train, a_loss_test))

A Loss. Train: 0.15369252708486553 - Test: 6.0330821042947065


In [432]:
#xgb_bx_test = xgb_bx_train
#xgb_by_test = xgb_by_train
b_loss_train, b_loss_test = cross_validate(xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test, xgb_b)
print("B Loss. Train: {} - Test: {}".format(b_loss_train, b_loss_test))

B Loss. Train: 0.1946223943355792 - Test: 2.649129660741833


In [433]:
#xgb_cx_test = xgb_cx_train
#xgb_cy_test = xgb_cy_train
c_loss_train, c_loss_test = cross_validate(xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test, xgb_c)
print("C Loss. Train: {} - Test: {}".format(c_loss_train, c_loss_test))

C Loss. Train: 0.16967765326534057 - Test: 4.680354694093998


In [434]:
# Avg loss:
lines = sum([len(aX_train), len(bX_train), len(cX_train)])
total_loss = np.average([a_loss_train, b_loss_train, c_loss_train], weights=[len(aX_train) / lines, len(bX_train) / lines, len(cX_train) / lines])
print("Averaged Train loss: {}".format(total_loss))

Averaged Train loss: 0.16689242105276678


In [435]:
# Avg loss:
lines = sum([len(aX_train), len(bX_train), len(cX_train)])  # It doesn't matter if we use train here, since the proportions will maintain
total_loss = np.average([a_loss_test, b_loss_test, c_loss_test], weights=[len(aX_train) / lines, len(bX_train) / lines, len(cX_train) / lines])
print("Averaged Test loss: {}".format(total_loss))

Averaged Test loss: 4.930524017590127


In [403]:
# Prepare Submission
# TODO: tidy this up

# load test data
a_test = pd.read_csv(DATA_PATHS['A']['test'], index_col='id')
b_test = pd.read_csv(DATA_PATHS['B']['test'], index_col='id')
c_test = pd.read_csv(DATA_PATHS['C']['test'], index_col='id')

# columns to keep from test data
a_keep = a_train_reduc.columns.tolist()
b_keep = b_train_reduc.columns.tolist()
c_keep = c_train_reduc.columns.tolist()
a_test = a_test[a_keep]
b_test = b_test[b_keep]
c_test = c_test[c_keep]

# Create dummies, standarize numeric values
a_test = pre_process_data(a_test)
b_test = pre_process_data(b_test)
c_test = pre_process_data(c_test)

# Delete new columns that were not in training set
a_diff = set(a_test.columns.tolist()) - set(aX_train.columns.tolist())
b_diff = set(b_test.columns.tolist()) - set(bX_train.columns.tolist())
c_diff = set(c_test.columns.tolist()) - set(cX_train.columns.tolist())
a_test = a_test[a_test.columns.difference(list(a_diff))]
b_test = b_test[b_test.columns.difference(list(b_diff))]
c_test = c_test[c_test.columns.difference(list(c_diff))]

# Add dummy columns that are not in the test set
a_diff = set(aX_train.columns.tolist()) - set(a_test.columns.tolist())
b_diff = set(bX_train.columns.tolist()) - set(b_test.columns.tolist())
c_diff = set(cX_train.columns.tolist()) - set(c_test.columns.tolist())
a_test = a_test.assign(**{c: 0 for c in a_diff})
b_test = b_test.assign(**{c: 0 for c in b_diff})
c_test = c_test.assign(**{c: 0 for c in c_diff})

# Reorder columns in the original way so XGBoost does not explode
a_test = a_test[aX_train.columns.tolist()]
b_test = b_test[bX_train.columns.tolist()]
c_test = c_test[cX_train.columns.tolist()]

a_test.fillna(0, inplace=True)
b_test.fillna(0, inplace=True)
c_test.fillna(0, inplace=True)

# Reduce dimensions (comment if not testing this)
a_test_svd = a_svd.transform(a_test)
b_test_svd = b_svd.transform(b_test)
c_test_svd = c_svd.transform(c_test)

# Create XGBoost matrix
a_testxgb = xgb.DMatrix(a_test_svd)
b_testxgb = xgb.DMatrix(b_test_svd)
c_testxgb = xgb.DMatrix(c_test_svd)

a_preds = xgb_a.predict(a_testxgb)
b_preds = xgb_b.predict(b_testxgb)
c_preds = xgb_c.predict(c_testxgb)

# Prepare dataframes for each country
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

submission = pd.concat([a_sub, b_sub, c_sub])
submission.to_csv('submission_xgb_entropy_svd_balancing_BESTSOFAR.csv')
print("Submission saved.")

Input shape:	(4041, 172)
After standardization (4041, 172)
After converting categoricals:	(4041, 500)
Input shape:	(1604, 221)
After standardization (1604, 221)
After converting categoricals:	(1604, 923)
Input shape:	(3187, 82)
After standardization (3187, 82)
After converting categoricals:	(3187, 546)
Submission saved.


In [283]:
# Prepare Submission
# TODO: tidy this up

# load test data
a_test = pd.read_csv(DATA_PATHS['A']['test'], index_col='id')
b_test = pd.read_csv(DATA_PATHS['B']['test'], index_col='id')
c_test = pd.read_csv(DATA_PATHS['C']['test'], index_col='id')



a_test_small = delete_columns(a_test, a_corr_cols)
b_test_small = delete_columns(b_test, b_corr_cols)
c_test_small = delete_columns(c_test, c_corr_cols)

# Create dummies, standarize numeric values
a_test = pre_process_data(a_test)
b_test = pre_process_data(b_test)
c_test = pre_process_data(c_test)

# Delete new columns that were not in training set
a_diff = set(a_test.columns.tolist()) - set(aX_train.columns.tolist())
b_diff = set(b_test.columns.tolist()) - set(bX_train.columns.tolist())
c_diff = set(c_test.columns.tolist()) - set(cX_train.columns.tolist())
a_test = a_test[a_test.columns.difference(list(a_diff))]
b_test = b_test[b_test.columns.difference(list(b_diff))]
c_test = c_test[c_test.columns.difference(list(c_diff))]

# Add dummy columns that are not in the test set
a_diff = set(aX_train.columns.tolist()) - set(a_test.columns.tolist())
b_diff = set(bX_train.columns.tolist()) - set(b_test.columns.tolist())
c_diff = set(cX_train.columns.tolist()) - set(c_test.columns.tolist())
a_test = a_test.assign(**{c: 0 for c in a_diff})
b_test = b_test.assign(**{c: 0 for c in b_diff})
c_test = c_test.assign(**{c: 0 for c in c_diff})

# Reorder columns in the original way so XGBoost does not explode
a_test = a_test[aX_train.columns.tolist()]
b_test = b_test[bX_train.columns.tolist()]
c_test = c_test[cX_train.columns.tolist()]

a_test.fillna(0, inplace=True)
b_test.fillna(0, inplace=True)
c_test.fillna(0, inplace=True)


# Reduce dimensions (comment if not testing this)
a_test_svd = a_svd.transform(a_test)
b_test_svd = b_svd.transform(b_test)
c_test_svd = c_svd.transform(c_test)


# Create XGBoost matrix
a_testxgb = xgb.DMatrix(a_test_svd)
b_testxgb = xgb.DMatrix(b_test_svd)
c_testxgb = xgb.DMatrix(c_test_svd)

a_preds = xgb_a.predict(a_testxgb)
b_preds = xgb_b.predict(b_testxgb)
c_preds = xgb_c.predict(c_testxgb)

# Prepare dataframes for each country
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

submission = pd.concat([a_sub, b_sub, c_sub])
submission.to_csv('submission_xgb_cramers_svd.csv')
print("Submission saved.")

Input shape:	(4041, 344)
After standardization (4041, 344)
After converting categoricals:	(4041, 851)
Input shape:	(1604, 441)
After standardization (1604, 441)
After converting categoricals:	(1604, 1419)
Input shape:	(3187, 163)
After standardization (3187, 163)
After converting categoricals:	(3187, 773)
Submission saved.


# Tune Hyperparams

In [12]:
# Lets try to find the optimal hyperparameters for each country
def tune_params(dtrain, dtest, y_test):
    params = {
        'max_depth': 20,
        'eta': 0.05, 
        'silent': 0,
        'lambda': 0.5,
        'alpha': 0.5,
        'lambda_bias': 0.5, 
        'min_child_weight': 1,
        'objective': 'binary:logistic', 
        'eval_metric': 'logloss', 
        'seed': 42
    }

    current_loss = 10000
    best_num_rounds = 0
    best_hyperparams = {}
    num_rounds = [2000]
    max_depths = [2, 3, 5, 10]
    etas = [0.01, 0.05]
    min_child_weights = [1, 2]
    lamdas = [0.5, 1]
    alphas = [0.5, 1]
    lambda_biases = [0.5, 1]
    total_combinations = len(num_rounds) * len(max_depths) * len(etas)*\
        len(min_child_weights) * len(lamdas) * len(alphas) * len(lambda_biases)

    with tqdm(total=total_combinations) as pbar:
        for num_round in num_rounds:
            for max_depth in max_depths:
                for eta in etas:
                    for min_child_weight in min_child_weights:
                        for lamda in lamdas:
                            for alpha in alphas:
                                for lambda_bias in lambda_biases:
                                    params['max_depth'] = max_depth
                                    params['eta'] = eta
                                    params['min_child_weight'] = min_child_weight
                                    params['lambda'] = lamda
                                    params['alpha'] = alpha
                                    params['lambda_bias'] = lambda_bias

                                    model = train_xgb_model(dtrain, params=params, num_round=num_round)

                                    pred = model.predict(dtest)
                                    loss = log_loss(pred, y_test)

                                    if loss < current_loss:
                                        current_loss = loss
                                        best_hyperparams = params
                                        best_num_rounds = num_round
                                    pbar.update(1)

    return best_hyperparams, best_num_rounds, current_loss

In [13]:
test_size = 0.2
a_best_params = {}
a_best_rounds = 0
a_best_loss = 1000
a_best_svd = None
for n in [50, 100, 200]:
    aX_train_svd, a_svd = reduce_dimensions(aX_train, n_comp=n)
    xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test = prepare_data(aX_train_svd, ay_train, test_size=test_size, xgb_format=True)
    a_params, a_num_rounds, loss = tune_params(xgb_ax_train, xgb_ax_test, xgb_ay_test)
    if loss < a_best_loss:
        a_best_params = a_params
        a_best_rounds = a_num_rounds
        a_best_loss = loss
        a_best_svd = a_svd

print("A best params for loss: {} :".format(a_best_loss))
print(a_best_params)
print(a_best_rounds)

100%|██████████| 128/128 [34:39<00:00, 15.54s/it]
100%|██████████| 128/128 [59:45<00:00, 29.36s/it]
100%|██████████| 128/128 [1:49:13<00:00, 53.87s/it]

A best params for loss: 3.576437807163302 :
{'max_depth': 10, 'eta': 0.05, 'silent': 0, 'lambda': 1, 'alpha': 1, 'lambda_bias': 1, 'min_child_weight': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
2000





In [None]:
test_size = 0.2
b_best_params = {}
b_best_rounds = 0
b_best_loss = 1000
b_best_svd = None
for n in [50, 100, 200]:
    bX_train_svd, b_svd = reduce_dimensions(bX_train, n_comp=n)
    xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test = prepare_data(bX_train_svd, by_train, test_size=test_size, xgb_format=True)
    b_params, b_num_rounds, loss = tune_params(xgb_bx_train, xgb_bx_test, xgb_by_test)
    if loss < b_best_loss:
        b_best_params = b_params
        b_best_rounds = b_num_rounds
        b_best_loss = loss
        b_best_svd = b_svd

print("B best params for loss: {} :".format(b_best_loss))
print(b_best_params)
print(b_best_rounds)

In [None]:
test_size = 0.2
c_best_params = {}
c_best_rounds = 0
c_best_loss = 1000
c_best_svd = None
for n in [50, 100, 200]:
    cX_train_svd, c_svd = reduce_dimensions(cX_train, n_comp=n)
    xgc_cx_train, xgc_cx_test, xgc_cy_train, xgc_cy_test = prepare_data(cX_train_svd, cy_train, test_size=test_size, xgc_format=True)
    c_params, c_num_rounds, loss = tune_params(xgc_cx_train, xgc_cx_test, xgc_cy_test)
    if loss < c_best_loss:
        c_best_params = c_params
        c_best_rounds = c_num_rounds
        c_best_loss = loss
        c_best_svd = c_svd

print("C best params for loss: {} :".format(c_best_loss))
print(c_best_params)
print(c_best_rounds)