In [2]:
%matplotlib inline

import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# data directory
DATA_DIR = os.path.join('..', 'data')

In [3]:
# Map each country code to the paths of its `<code>_hhold_train.csv` and
# `<code>_hhold_test.csv` files inside DATA_DIR.
data_paths = {
    country: {
        split: os.path.join(DATA_DIR, '{}_hhold_{}.csv'.format(country, split))
        for split in ('train', 'test')
    }
    for country in ('A', 'B', 'C')
}

In [4]:
# load training data: one frame per country, indexed by the 'id' column
a_train, b_train, c_train = (
    pd.read_csv(data_paths[country]['train'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [5]:
# Standardize features
def standardize(df, numeric_only=True):
    """Return a copy of ``df`` with its int64/float64 columns z-scored.

    Every selected column has its own mean subtracted and is divided by its
    own standard deviation (as computed by ``DataFrame.std``). Columns of any
    other dtype are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    numeric_only : bool, default True
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()

    numeric = df.select_dtypes(include=['int64', 'float64'])
    if numeric.shape[1] == 0:
        # Nothing to scale.
        return out

    # A constant column has std == 0; dividing by 1 instead maps it to all
    # zeros rather than to NaN (0 / 0).
    scale = numeric.std().replace(0, 1)

    # subtract mean and divide by std
    out[numeric.columns] = (numeric - numeric.mean()) / scale

    return out
    

def pre_process_data(df, enforce_cols=None):
    """Turn a raw feature frame into a fully numeric model input.

    Steps, with the frame's shape printed after each of the first three:
    standardize the numeric columns, one-hot encode the remaining
    categoricals with ``pd.get_dummies``, optionally align the columns to
    ``enforce_cols``, and finally replace any remaining NaN with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features (without the target column).
    enforce_cols : sequence of column labels, optional
        Columns the result must have, in this order. Columns not listed are
        dropped; listed columns that are missing are added and filled with 0.
        Pass the training frame's columns when processing test data.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    print("Input shape:\t{}".format(df.shape))

    df = standardize(df)
    print("After standardization {}".format(df.shape))

    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))

    # match test set and training set columns
    if enforce_cols is not None:
        # reindex drops the extra columns, adds the missing ones as 0 and,
        # unlike drop + assign (which appends new columns at the end), also
        # puts the columns in the same order as enforce_cols.
        df = df.reindex(columns=enforce_cols, fill_value=0)

    return df.fillna(0)

In [6]:
# Build the feature matrix (X) and the 0/1 label vector (y) for each country.
# A missing 'poor' label is treated as False (0). Only the label column is
# filled, so the raw training frames are left unmodified, and all three
# countries are handled the same way.
print("Country A")
aX_train = pre_process_data(a_train.drop('poor', axis=1))
ay_train = np.ravel(a_train['poor'].fillna(False).astype(int))
print("Shapes: aX_train {} - ay_train {}".format(aX_train.shape, ay_train.shape))

print("\nCountry B")
bX_train = pre_process_data(b_train.drop('poor', axis=1))
by_train = np.ravel(b_train['poor'].fillna(False).astype(int))
print("Shapes: bX_train {} - by_train {}".format(bX_train.shape, by_train.shape))

print("\nCountry C")
cX_train = pre_process_data(c_train.drop('poor', axis=1))
cy_train = np.ravel(c_train['poor'].fillna(False).astype(int))
print("Shapes: cX_train {} - cy_train {}".format(cX_train.shape, cy_train.shape))

Country A
Input shape:	(1855, 344)
After standardization (1855, 344)
After converting categoricals:	(1855, 849)
Shapes: aX_train (1855, 849) - ay_train (1855,)

Country B
Input shape:	(3255, 441)
After standardization (3255, 441)
After converting categoricals:	(3255, 1432)
Shapes: bX_train (3255, 1432) - by_train (3255,)

Country C
Input shape:	(6469, 163)
After standardization (6469, 163)
After converting categoricals:	(6469, 795)
Shapes: cX_train (6469, 795) - cy_train (6469,)


In [7]:
def prepare_data(x, y, test_size=0.20, random_state=42):
    """Split features and labels into train / hold-out parts for XGBoost.

    Parameters
    ----------
    x : features, passed straight to ``train_test_split``.
    y : labels aligned with the rows of ``x``.
    test_size : float, default 0.20
        Forwarded to ``train_test_split``; share of rows kept as hold-out.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(dtrain, dtest, Y_train, Y_test)`` where ``dtrain`` is a labelled
        ``xgb.DMatrix`` of the training part, ``dtest`` is an unlabelled
        ``xgb.DMatrix`` of the hold-out part, and ``Y_train`` / ``Y_test``
        are the matching label splits.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    # The hold-out matrix carries no labels; they are returned separately.
    dtest = xgb.DMatrix(X_test)
    return dtrain, dtest, Y_train, Y_test


def train_model(dtrain, params=None, num_round=100):
    """Train an XGBoost booster on ``dtrain``.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Labelled training data.
    params : dict, optional
        XGBoost parameters passed to ``xgb.train``. When omitted, a small
        default configuration for binary classification is used.
    num_round : int, default 100
        Number of boosting rounds.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    if params is None:
        # eta is the learning rate; the previous default of 100 was far
        # outside its documented [0, 1] range. The objective matches the one
        # used by the tuning code in this notebook.
        params = {'max_depth': 4, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}

    return xgb.train(params, dtrain, num_round)

In [8]:
# Split each country's features/labels into a training part and a hold-out
# part and wrap them as XGBoost matrices (see prepare_data).
# NOTE(review): this rebinds ay_train / by_train / cy_train from the full
# label vectors to the training-split labels. Re-running this cell without
# first re-running the pre-processing cell would pair the full feature
# frames with the already-shortened labels.
a_dtrain, a_dtest, ay_train, ay_test = prepare_data(aX_train, ay_train)
b_dtrain, b_dtest, by_train, by_test = prepare_data(bX_train, by_train)
c_dtrain, c_dtest, cy_train, cy_test = prepare_data(cX_train, cy_train)

## Hold-out validation test

In [9]:
# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    """Mean binary cross-entropy between labels and predicted probabilities.

    The argument order matters: ground truth first, predictions second.

    Parameters
    ----------
    yt : array-like
        Ground-truth labels (0/1).
    yp : array-like
        Predicted probabilities of the positive class. Values are clipped to
        [1e-15, 1 - 1e-15] so the logarithms stay finite.

    Returns
    -------
    float
        Average of the per-sample losses.
    """
    # np.asarray accepts lists, pandas Series and ndarrays alike.
    ground = np.asarray(yt, dtype=float)
    pred = np.asarray(yp, dtype=float)

    eps = 1e-15
    eps_pred = np.clip(pred, eps, 1. - eps)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

In [15]:
# Lets try to find the optimal hyperparameters
def tune_params(dtrain, dtest, y_test):
    """Exhaustive grid search over XGBoost hyperparameters.

    Every combination in the grid is trained on ``dtrain`` and scored with
    ``log_loss`` on the predictions for ``dtest``; the combination with the
    lowest loss wins (the first one encountered on ties).

    The grid has 3 * 17 * 4 * 3 * 2 * 3 * 3 = 11,016 combinations, each of
    which trains a model, so a full run is expensive.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Labelled training data.
    dtest : xgb.DMatrix
        Data to score each candidate on.
    y_test : array-like
        Ground-truth labels for the rows of ``dtest``.

    Returns
    -------
    (dict, int)
        The best parameter dict (an independent dict, safe to modify) and
        the number of boosting rounds it was trained with.

    Raises
    ------
    ValueError
        If no combination produced a comparable (non-NaN) loss.
    """
    base_params = {'max_depth': 4, 'eta': 0.1, 'silent': 1, 'lambda': 1, 'alpha': 1, 'lambda_bias': 1, 'min_child_weight': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}

    # Same iteration order as nested loops written top to bottom: the first
    # axis varies slowest, the last one fastest.
    grid = itertools.product(
        [10, 50, 100],            # num_round
        range(3, 20),             # max_depth
        [0.01, 0.05, 0.09, 0.1],  # eta
        [1, 2, 5],                # min_child_weight
        [0.5, 1],                 # lambda
        [0.5, 1, 2],              # alpha
        [0.5, 1, 2],              # lambda_bias
    )

    best_loss = float('inf')
    best_hyperparams = None
    best_num_rounds = None

    for num_round, max_depth, eta, min_child_weight, lamda, alpha, lambda_bias in grid:
        # Build a fresh dict per candidate. Keeping a reference to one shared,
        # repeatedly mutated dict would make the "best" result silently turn
        # into the last combination tried.
        params = dict(base_params)
        params.update({
            'max_depth': max_depth,
            'eta': eta,
            'min_child_weight': min_child_weight,
            'lambda': lamda,
            'lambda_bias': lambda_bias,
            'alpha': alpha,
        })

        model = train_model(dtrain, params=params, num_round=num_round)

        pred = model.predict(dtest)
        # log_loss takes (ground truth, prediction) in that order.
        loss = log_loss(y_test, pred)

        if loss < best_loss:
            best_loss = loss
            best_hyperparams = params
            best_num_rounds = num_round

    if best_hyperparams is None:
        raise ValueError("no hyperparameter combination produced a valid loss")

    return best_hyperparams, best_num_rounds

In [16]:
# Tune hyperparameters for country A, scoring candidates on A's hold-out split.
# tune_params trains one model per grid combination, so this cell can run for
# a long time.
print("\n\nTuning parameters for Country A")
a_params, a_num_rounds = tune_params(a_dtrain, a_dtest, ay_test)
# Tuning for countries B and C is currently disabled:
#print("\n\nTuning parameters for Country B")
#b_params, b_num_rounds = tune_params(b_dtrain, b_dtest, by_test)
#print("\n\nTuning parameters for Country C")
#c_params, c_num_rounds = tune_params(c_dtrain, c_dtest, cy_test)



Tuning parameters for Country A


KeyboardInterrupt: 

In [15]:
# Show the hyperparameters selected for country A. The B and C lines are
# disabled together with their tuning runs.
print("A params: {}".format(a_params))
#print("B params: {}".format(b_params))
#print("C params: {}".format(c_params))

A params: {'max_depth': 9, 'eta': 0.1, 'silent': 1, 'lambda': 1, 'min_child_weight': 5, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
B params: {'max_depth': 9, 'eta': 0.1, 'silent': 1, 'lambda': 1, 'min_child_weight': 5, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
C params: {'max_depth': 9, 'eta': 0.1, 'silent': 1, 'lambda': 1, 'min_child_weight': 5, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}


In [None]:
# Train the country A model with the selected hyperparameters and report the
# log loss on the hold-out and training splits.
# Uncomment to use a fixed configuration instead of the tuned one:
#a_params = {'max_depth': 5, 'eta': 0.05, 'silent': 1, 'lambda': 2, 'alpha': 1, 'lambda_bias': 1, 'min_child_weight': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
a_params = dict(a_params, silent=0)
num_round = 3000

model_a = train_model(a_dtrain, params=a_params, num_round=num_round)

a_pred = model_a.predict(a_dtest)
a_pred_train = model_a.predict(a_dtrain)

# log_loss takes (ground truth, prediction) in that order.
test_loss_a = log_loss(ay_test, a_pred)
train_loss_a = log_loss(ay_train, a_pred_train)

print("Loss A Test: {} - Train: {}".format(test_loss_a, train_loss_a))

In [None]:
# Train the country B model, reusing country A's hyperparameters, and report
# the log loss on the hold-out and training splits.
# dict(...) makes an independent copy so later edits to b_params cannot leak
# into a_params.
b_params = dict(a_params, silent=0)
num_round = 3000

model_b = train_model(b_dtrain, params=b_params, num_round=num_round)

b_pred = model_b.predict(b_dtest)
b_pred_train = model_b.predict(b_dtrain)

# log_loss takes (ground truth, prediction) in that order.
test_loss_b = log_loss(by_test, b_pred)
train_loss_b = log_loss(by_train, b_pred_train)

print("Loss B Test: {} - Train: {}".format(test_loss_b, train_loss_b))

In [None]:
# Train the country C model, reusing country A's hyperparameters, and report
# the log loss on the hold-out and training splits.
# dict(...) makes an independent copy so later edits to c_params cannot leak
# into a_params.
c_params = dict(a_params, silent=0)

model_c = train_model(c_dtrain, params=c_params, num_round=3000)

c_pred = model_c.predict(c_dtest)
c_pred_train = model_c.predict(c_dtrain)

# log_loss takes (ground truth, prediction) in that order.
test_loss_c = log_loss(cy_test, c_pred)
train_loss_c = log_loss(cy_train, c_pred_train)

print("Loss C Test: {} - Train: {}".format(test_loss_c, train_loss_c))

In [None]:
# Unweighted mean of the three per-country losses, for each split.
avg_train_loss = np.mean([train_loss_a, train_loss_b, train_loss_c])
avg_test_loss = np.mean([test_loss_a, test_loss_b, test_loss_c])

print("Avg Train loss: {}".format(avg_train_loss))
print("Avg Test loss: {}".format(avg_test_loss))

## Submission Test

In [42]:
# load test data: one frame per country, indexed by the 'id' column
a_test, b_test, c_test = (
    pd.read_csv(data_paths[country]['test'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [43]:
# process the test data
# Each test frame goes through the same pipeline as the training features and
# is aligned to the column set of its country's training frame (enforce_cols).
# NOTE(review): a_test / b_test / c_test are rebound from the raw frames to
# the processed ones, so re-running this cell without reloading the raw test
# data would process already-processed frames.
# NOTE(review): pre_process_data standardizes the frame it is given, so the
# test data is scaled with its own mean/std rather than the training set's.
a_test = pre_process_data(a_test, enforce_cols=aX_train.columns)
b_test = pre_process_data(b_test, enforce_cols=bX_train.columns)
c_test = pre_process_data(c_test, enforce_cols=cX_train.columns)

Input shape:	(4041, 344)
After standardization (4041, 344)
After converting categoricals:	(4041, 851)
Input shape:	(1604, 441)
After standardization (1604, 441)
After converting categoricals:	(1604, 1419)
Input shape:	(3187, 163)
After standardization (3187, 163)
After converting categoricals:	(3187, 773)


In [44]:
# Predict on the processed test features.
# Booster.predict expects an xgb.DMatrix rather than a DataFrame. The columns
# are selected in training order so they line up with the features each
# model was fitted on.
a_preds = model_a.predict(xgb.DMatrix(a_test[aX_train.columns]))
b_preds = model_b.predict(xgb.DMatrix(b_test[bX_train.columns]))
c_preds = model_c.predict(xgb.DMatrix(c_test[cX_train.columns]))

In [45]:
def make_country_sub(preds, test_feat, country):
    """Build the submission rows for one country.

    Parameters
    ----------
    preds : array-like
        Either a 1-D array holding the probability of ``poor`` per row, or a
        2-D array of per-class probabilities whose column 1 is that
        probability.
    test_feat : pandas.DataFrame
        Test features; only its index is used, to label the rows.
    country : str
        Country code written into the ``country`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``test_feat`` with the columns
        ``["country", "poor"]`` in that order.
    """
    preds = np.asarray(preds)

    # get just the poor probabilities: column 1 of a 2-D array, or the array
    # itself when it is already 1-D
    poor_proba = preds[:, 1] if preds.ndim == 2 else preds

    country_sub = pd.DataFrame(data=poor_proba,
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [46]:
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [47]:
# Stack the three per-country frames into a single submission frame.
submission = pd.concat([a_sub, b_sub, c_sub])

In [49]:
# Write the combined submission to 'submission.csv' (path is relative to the
# current working directory).
submission.to_csv('submission.csv')