In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import scipy as sc
from tqdm import tqdm
import scipy as sc
import scipy.stats as ss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# data directory
DATA_DIR = os.path.join('../..', 'data')

In [2]:
data_paths = {'A': {'train': os.path.join(DATA_DIR, 'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C_hhold_test.csv')}}

In [3]:
# load training data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')

In [4]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [10]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    #print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    #print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    #print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)

    return df

from sklearn.utils import resample
def balance(df):
    poor = df[df['poor'] == True]
    not_poor = df[df['poor'] == False]
    
    not_poor_downsampled = resample(not_poor, 
                              replace=True,
                              n_samples=int(len(not_poor) * 0.6),
                              random_state=42)
    
    poor_upsampled = resample(poor, 
                              replace=True,
                              n_samples=len(not_poor_downsampled),
                              random_state=42)
    res = pd.concat([poor_upsampled, not_poor_downsampled])
    return res.sample(frac=1)

import random

def get_best_columns(df, params={}, num_round=500):
    test_size = 0.2

    X_train = balance(df)
    X_train.poor.fillna(False, inplace=True)
    y_train = np.ravel(X_train.poor.astype(int))

    columns = X_train.columns.tolist()
    columns.remove('poor')
    columns.remove('country')
    good_cols = columns.copy()
    
    # Initial train
    X_train_proc = pre_process_data(pd.DataFrame(X_train[good_cols]))
    xgb_x_train, xgb_x_test, xgb_y_train, xgb_y_test = prepare_data(X_train_proc, y_train, test_size=test_size, xgb_format=True)
    xgb_model = train_xgb_model(xgb_x_train, params=params, num_round=num_round)
    train_loss, test_loss = cross_validate(xgb_x_train, xgb_x_test, xgb_y_train, xgb_y_test, xgb_model)
    best_loss = test_loss
    
    with tqdm(total=len(columns)) as pbar:
        for col in columns:
            to_del = random.choice(good_cols)
            tmp_good_cols = good_cols.copy()
            tmp_good_cols.remove(to_del)

            X_train_proc = pre_process_data(pd.DataFrame(X_train[tmp_good_cols]))
            xgb_x_train, xgb_x_test, xgb_y_train, xgb_y_test = prepare_data(X_train_proc, y_train, test_size=test_size, xgb_format=True)
            xgb_model = train_xgb_model(xgb_x_train, params=params, num_round=num_round)
            train_loss, test_loss = cross_validate(xgb_x_train, xgb_x_test, xgb_y_train, xgb_y_test, xgb_model)

            if test_loss < best_loss:
                best_loss = test_loss
                good_cols = tmp_good_cols
            pbar.update(1)
    return good_cols, best_loss

from sklearn.ensemble import RandomForestClassifier


def prepare_data(x, y, test_size=0.2, xgb_format=True):
    if test_size == 0:
        dtrain = x
        Y_train = y
        dtest = None
        Y_test = None
    else:
        dtrain, dtest, Y_train, Y_test = train_test_split(x, y, test_size=test_size, stratify=y, random_state=42)

    if xgb_format:
        dtrain = xgb.DMatrix(dtrain, label=Y_train)
        if dtest is not None:
            dtest = xgb.DMatrix(dtest)

    return dtrain, dtest, Y_train, Y_test


def train_rf_model(features, labels, **kwargs):

    # instantiate model
    model = RandomForestClassifier(n_estimators=50, random_state=0)

    # train model
    model.fit(features, labels)

    # get a (not-very-useful) sense of performance
    accuracy = model.score(features, labels)
    print(f"In-sample accuracy: {accuracy:0.2%}")

    return model


def train_xgb_model(dtrain, params=None, num_round=100):
    if params is None:
        params = {'max_depth': 4, 'eta': 100, 'silent': 1, 'objective': 'reg:logistic'}

    bst = xgb.train(params, dtrain, num_round)

    return bst

# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    # yt: groundtruth
    # yp: predicted
    ground = np.array(yt)
    pred = yp.astype(float)
    eps_pred = np.maximum(np.minimum(pred, 1. - 1e-15), 1e-15)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

# Cross Validate
def cross_validate(x_train, x_test, y_train, y_test, model):
    test_loss = None
    if x_test is not None:
        preds = model.predict(x_test)
        test_loss = log_loss(preds, y_test)

    preds_train = model.predict(x_train)
    train_loss = log_loss(preds_train, y_train)
    return train_loss, test_loss

def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds,
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]


def prepare_submission(data_paths, models, enforce_cols=None, to_keep_cols=None, xgb_format=False):
    # load test data
    a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
    b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
    c_test = pd.read_csv(data_paths['C']['test'], index_col='id')

    if to_keep_cols:
        a_test = a_test[to_keep_cols['a']]
        b_test = b_test[to_keep_cols['b']]
        c_test = c_test[to_keep_cols['c']]

    if enforce_cols:
        # process the test data
        a_test = pre_process_data(a_test, enforce_cols=enforce_cols['a'])
        b_test = pre_process_data(b_test, enforce_cols=enforce_cols['b'])
        c_test = pre_process_data(c_test, enforce_cols=enforce_cols['c'])

    a_test.fillna(0, inplace=True)
    b_test.fillna(0, inplace=True)
    c_test.fillna(0, inplace=True)

    if xgb_format:
        a_test = xgb.DMatrix(a_test)
        b_test = xgb.DMatrix(b_test)
        c_test = xgb.DMatrix(c_test)

    # TODO: use probabilities
    a_preds = models['a'].predict(a_test)
    b_preds = models['b'].predict(b_test)
    c_preds = models['c'].predict(c_test)
    
    a_sub = make_country_sub(a_preds, a_test, 'A')
    b_sub = make_country_sub(b_preds, b_test, 'B')
    c_sub = make_country_sub(c_preds, c_test, 'C')
    
    submission = pd.concat([a_sub, b_sub, c_sub])
    
    return submission

# Lets filter out the columns with high correlation
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

def get_highly_correlated_columns(df):
    df_objs = df.columns.tolist()
    df_objs.remove('poor')
    corr_matrix = pd.DataFrame(columns=['poor'], index=df_objs)
    with tqdm(total=len(df_objs)) as pbar:
        for col in df_objs:
            confusion_matrix = pd.crosstab(df['poor'], df[col])
            corr = cramers_corrected_stat(confusion_matrix)
            corr_matrix.loc[col, 'poor'] = corr
            pbar.update(1)
    return corr_matrix
    """
    cols = {}
    for c1 in corr_matrix.columns.tolist():
        s = corr_matrix.loc[c1]
        s = s[s > 0.5]
        s = list(s.index)
        s.remove(c1)
        cols[c1] = s

    cols = {k: cols[k] for k in cols if cols[k]}
    return cols
    """

def entropy(a):
    return - sum( (a / sum(a)) * np.log((a / sum(a))))


def get_entropies(df):
    entropies = []
    for col in df.columns.tolist():
        res = df[col].value_counts()
        entropies.append(entropy(res.values))

    return entropies


def get_low_entropy_columns(df):
    to_del = []
    entropies = get_entropies(df)
    median_entr = np.median(entropies)
    #std_entr = np.std(entropies)
    #avg_entr = np.mean(entropies)
    for i, col in enumerate(df.columns.tolist()):
        if entropies[i] < median_entr:
            to_del.append(col)
    return to_del


def filter_columns(df):
    to_del = get_low_entropy_columns(df)
    print("Total columns: {}. To delete: {}".format(len(df.columns.tolist()), len(to_del)))
    to_keep = set(df.columns.tolist()) - set(to_del)
    return df[list(to_keep)]

In [38]:
print("Country A")
params = {'max_depth': 9, 'eta': 0.05, 'silent': 0, 'lambda': 0.5, 'alpha': 0.5, 'lambda_bias': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
num_round = 500
good_cols, test_loss = get_best_columns(a_train, params=params, num_round=num_round)
print(good_cols, test_loss)

Country A


100%|██████████| 343/343 [1:28:42<00:00, 15.38s/it]

['wBXbHZmp', 'SlDKnCuu', 'KAJOWiiw', 'DsKacCdL', 'rtPrBBPl', 'tMJrvvut', 'jdetlNNF', 'maLAYXwi', 'vwpsXRGk', 'sArDRIyX', 'goxNwvnG', 'TYhoEiNm', 'bgfNZfcj', 'sYIButva', 'VZtBaoXL', 'GUvFHPNA', 'zFkComtB', 'fxbqfEWb', 'HDMHzGif', 'zzwlWZZC', 'nGTepfos', 'DxLvCGgv', 'CbABToOI', 'qgMygRvX', 'uSKnVaKV', 'hESBInAl', 'nzTeWUeM', 'BbKZUYsB', 'UCnazcxd', 'hTraVEWP', 'aCfsveTu', 'EfkPrfXa', 'NrvxpdMQ', 'nEsgxvAq', 'FcekeISI', 'NmAVTtfA', 'YTdCRVJt', 'QyBloWXZ', 'NGOnRdqc', 'HKMQJANN', 'ZRrposmO', 'wakWLjkG', 'vmZttwFZ', 'dkoIJCbY', 'NrUWfvEq', 'WqhniYIc', 'yHbEDILT', 'EJgrQqET', 'nGMEgWyl', 'IIEHQNUc', 'HfKRIwMb', 'NRVuZwXK', 'UCAmikjV', 'UGbBCHRE', 'uJYGhXqG', 'bxKGlBYX', 'nCzVgxgY', 'ltcNxFzI', 'WbEDLWBH', 'IBPMYJlv', 'MxOgekdE', 'ggNglVqE', 'YDgWYWcJ', 'SqGRfEuW', 'WiwmbjGW', 'benRXROb', 'cOSBrarW', 'JwtIxvKg', 'lRGpWehf', 'dSALvhyd', 'gfmfEyjQ', 'WbxAxHul', 'FlBqizNL', 'bEPKkJXP', 'KjkrfGLD', 'JbjHTYUM', 'HmDAlkAH', 'cqUmYeAp', 'sFWbFEso', 'fHUZugEd', 'tZKoAqgl', 'TqrXZaOw', 'galsfNtg', 'VI




In [None]:
print("Country B")
params = {'max_depth': 9, 'eta': 0.05, 'silent': 0, 'lambda': 0.5, 'alpha': 0.5, 'lambda_bias': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
num_round = 500
good_cols_b, test_loss_b = get_best_columns(b_train, params=params, num_round=num_round)
print(good_cols_b, test_loss_b)

Country B


 19%|█▊        | 82/440 [24:47<1:44:43, 17.55s/it]

In [14]:
print("\nCountry C")
c_test_reduc = filter_columns(c_test)
c_train_reduc = filter_columns(c_train.drop('poor', axis=1))

Country A
Total columns: 344. To delete: 172
Total columns: 344. To delete: 172
172 172
{'wwfmpuWA'}

Country B
Total columns: 441. To delete: 220
Total columns: 441. To delete: 220
221 221
{'zCnhAreR', 'EmHAsgcA', 'xjaMthYM'}

Country C
Total columns: 163. To delete: 81
Total columns: 163. To delete: 81
82 82
{'WWuPOkor', 'wlNGOnRd'}


## XGBoost with columns with more entropy (Submitted Jan 8 2018. Score:  0.24977 )

In [29]:
# We need to repreprocess the data with less columns
a_train_reduc = a_train[a_test_reduc.columns.tolist() + ['poor']]
b_train_reduc = b_train[b_test_reduc.columns.tolist() + ['poor']]
c_train_reduc = c_train[c_test_reduc.columns.tolist() + ['poor']]

In [33]:
a_corr_cols = get_highly_correlated_columns(a_train_reduc)
b_corr_cols = get_highly_correlated_columns(b_train_reduc)
c_corr_cols = get_highly_correlated_columns(c_train_reduc)

100%|██████████| 172/172 [00:00<00:00, 190.13it/s]
100%|██████████| 221/221 [00:01<00:00, 200.43it/s]
  
100%|██████████| 82/82 [00:00<00:00, 161.12it/s]


In [36]:
a_useful_cols = a_corr_cols[a_corr_cols['poor'] > 0.4].index.tolist() + ['poor']
print(a_useful_cols)
b_useful_cols = b_corr_cols[b_corr_cols['poor'] > 0.2].index.tolist() + ['poor']
print(b_useful_cols)
c_useful_cols = c_corr_cols[c_corr_cols['poor'] > 0.25].index.tolist() + ['poor']
print(c_useful_cols)

['QyBloWXZ', 'poor']
['lCKzGQow', 'DwxXAlcv', 'frkmPrFd', 'BjWMmVMX', 'qnCnHAnk', 'UEaRhdUa', 'TChiqwQp', 'qrOrXLPM', 'plRFsRMw', 'poor']
['DBjxSUvf', 'tFrTiLjv', 'gZWEypOM', 'qCEuAGDU', 'CBoRtiUy', 'GIwNbAsH', 'mmoCpqWS', 'VbnOIDkC', 'LhUIIEHQ', 'BBPluVrb', 'snqZfFGY', 'XsbpBUGN', 'gLDyDXsb', 'laWlBVrk', 'xyzchLjk', 'nomHWXYi', 'wcNjwEuQ', 'YmHrcUIw', 'MtkqdQSs', 'kZmWbEDL', 'kiAJBGqv', 'YACFXGNR', 'kdkPWxwS', 'qbMphwEx', 'nRXRObKS', 'poor']


In [37]:
a_train_small = a_train_reduc[a_useful_cols]
b_train_small = b_train_reduc[b_useful_cols]
c_train_small = c_train_reduc[c_useful_cols]


print("Columns removed from A: {}".format(len(a_train.columns) - len(a_train_small.columns)))
print("Columns removed from B: {}".format(len(b_train.columns) - len(b_train_small.columns)))
print("Columns removed from C: {}".format(len(c_train.columns) - len(c_train_small.columns)))

Columns removed from A: 343
Columns removed from B: 432
Columns removed from C: 138


In [38]:
aX_train = pre_process_data(a_train_small)
a_train.poor.fillna(False, inplace=True)
ay_train = np.ravel(a_train.poor.astype(int))

bX_train = pre_process_data(b_train_small)
b_train.poor.fillna(False, inplace=True)
by_train = np.ravel(b_train.poor.astype(int))

cX_train = pre_process_data(c_train_small)
c_train.poor.fillna(False, inplace=True)
cy_train = np.ravel(c_train.poor.astype(int))

Input shape:	(8203, 2)
After standardization (8203, 2)
After converting categoricals:	(8203, 3)
Input shape:	(3255, 10)
After standardization (3255, 10)
After converting categoricals:	(3255, 90)
Input shape:	(6469, 26)
After standardization (6469, 26)
After converting categoricals:	(6469, 219)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [42]:
test_size = 0.

xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test = prepare_data(aX_train, ay_train, test_size=test_size, xgb_format=True)
xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test = prepare_data(bX_train, by_train, test_size=test_size, xgb_format=True)
xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test = prepare_data(cX_train, cy_train, test_size=test_size, xgb_format=True)

In [43]:
num_round = 3000
params = {'max_depth': 9, 'eta': 0.05, 'silent': 0, 'lambda': 0.5, 'alpha': 0.5, 'lambda_bias': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}

xgb_a = train_xgb_model(xgb_ax_train, params=params, num_round=num_round)
xgb_b = train_xgb_model(xgb_bx_train, params=params, num_round=num_round)
xgb_c = train_xgb_model(xgb_cx_train, params=params, num_round=num_round)

In [44]:
print("A Loss. Train: {} - Test: {}".format(*cross_validate(xgb_ax_train, xgb_ax_test, xgb_ay_train, xgb_ay_test, xgb_a)))
print("B Loss. Train: {} - Test: {}".format(*cross_validate(xgb_bx_train, xgb_bx_test, xgb_by_train, xgb_by_test, xgb_b)))
print("C Loss. Train: {} - Test: {}".format(*cross_validate(xgb_cx_train, xgb_cx_test, xgb_cy_train, xgb_cy_test, xgb_c)))

A Loss. Train: 0.008615946642599797 - Test: None
B Loss. Train: 0.026407453189422843 - Test: None
C Loss. Train: 0.01067026217238942 - Test: None


In [46]:
# Prepare submission
models = {'a': xgb_a, 'b': xgb_b, 'c': xgb_c}
a_keep = a_train_small.columns.tolist()
a_keep.remove('poor')
b_keep = b_train_small.columns.tolist()
b_keep.remove('poor')
c_keep = c_train_small.columns.tolist()
c_keep.remove('poor')

#to_keep_cols = {'a': a_keep, 'b': b_keep, 'c': c_keep}
#enforce_cols = {'a': a_keep, 'b': b_keep, 'c': c_keep}

# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')

a_test = a_test[a_keep]
b_test = b_test[b_keep]
c_test = c_test[c_keep]


a_test = pre_process_data(a_test)
b_test = pre_process_data(b_test)
c_test = pre_process_data(c_test)

# Delete new columns that were not in training set
a_diff = set(a_test.columns.tolist()) - set(aX_train.columns.tolist())
b_diff = set(b_test.columns.tolist()) - set(bX_train.columns.tolist())
c_diff = set(c_test.columns.tolist()) - set(cX_train.columns.tolist())

a_test = a_test[a_test.columns.difference(list(a_diff))]
b_test = b_test[b_test.columns.difference(list(b_diff))]
c_test = c_test[c_test.columns.difference(list(c_diff))]

# Add dummy columns that are not in the test set
a_diff = set(aX_train.columns.tolist()) - set(a_test.columns.tolist())
b_diff = set(bX_train.columns.tolist()) - set(b_test.columns.tolist())
c_diff = set(cX_train.columns.tolist()) - set(c_test.columns.tolist())
a_test = a_test.assign(**{c: 0 for c in a_diff})
b_test = b_test.assign(**{c: 0 for c in b_diff})
c_test = c_test.assign(**{c: 0 for c in c_diff})

# Reorder columns in the original way so XGBoost does not explode
a_test = a_test[aX_train.columns.tolist()]
b_test = b_test[bX_train.columns.tolist()]
c_test = c_test[cX_train.columns.tolist()]


a_test.fillna(0, inplace=True)
b_test.fillna(0, inplace=True)
c_test.fillna(0, inplace=True)


print(a_test.shape)
print(b_test.shape)
print(c_test.shape)

a_testxgb = xgb.DMatrix(a_test)
b_testxgb = xgb.DMatrix(b_test)
c_testxgb = xgb.DMatrix(c_test)

# TODO: use probabilities
a_preds = xgb_a.predict(a_testxgb)
b_preds = xgb_b.predict(b_testxgb)
c_preds = xgb_c.predict(c_testxgb)

a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

submission = pd.concat([a_sub, b_sub, c_sub])

#submission = prepare_submission(data_paths, models, enforce_cols=enforce_cols, to_keep_cols=to_keep_cols, xgb_format=True)
submission.to_csv('submission_recent_XGB_best.csv')

Input shape:	(4041, 1)
After standardization (4041, 1)
After converting categoricals:	(4041, 2)
Input shape:	(1604, 9)
After standardization (1604, 9)
After converting categoricals:	(1604, 89)
Input shape:	(3187, 25)
After standardization (3187, 25)
After converting categoricals:	(3187, 208)
(4041, 3)
(1604, 90)
(3187, 219)
