In [25]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import scipy as sc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN

# data directory
DATA_DIR = os.path.join('..', 'data')

In [26]:
data_paths = {'A': {'train': os.path.join(DATA_DIR, 'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C_hhold_test.csv')}}

In [27]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [28]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)

    return df

In [29]:
print("Country A")
aX_train = pre_process_data(a_train.drop('poor', axis=1))
a_train.fillna(False, inplace=True)
ay_train = np.ravel(a_train.poor.astype(int))
print("Shapes: aX_train {} - ay_train {}".format(aX_train.shape, ay_train.shape))

print("\nCountry B")
bX_train = pre_process_data(b_train.drop('poor', axis=1))
b_train.fillna(False, inplace=True)
by_train = np.ravel(b_train.poor.astype(int))
print("Shapes: bX_train {} - by_train {}".format(bX_train.shape, by_train.shape))

print("\nCountry C")
cX_train = pre_process_data(c_train.drop('poor', axis=1))
cy_train = np.ravel(c_train.poor.astype(int))
print("Shapes: cX_train {} - cy_train {}".format(cX_train.shape, cy_train.shape))

Country A
Input shape:	(1855, 344)
After standardization (1855, 344)
After converting categoricals:	(1855, 849)
Shapes: aX_train (1855, 849) - ay_train (1855,)

Country B
Input shape:	(3255, 441)
After standardization (3255, 441)
After converting categoricals:	(3255, 1432)
Shapes: bX_train (3255, 1432) - by_train (3255,)

Country C
Input shape:	(6469, 163)
After standardization (6469, 163)
After converting categoricals:	(6469, 795)
Shapes: cX_train (6469, 795) - cy_train (6469,)


In [30]:
def prepare_data(x, y):
    return train_test_split(x, y, test_size=0.20, random_state=42)


def train_model(X, y, params=None):
    if params is None:
        params = {'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
    neigh = KNN(**params)
    neigh.fit(X, y) 
    return neigh

In [31]:
a_train, a_test, ay_train, ay_test = prepare_data(aX_train, ay_train)
b_train, b_test, by_train, by_test = prepare_data(bX_train, by_train)
c_train, c_test, cy_train, cy_test = prepare_data(cX_train, cy_train)

## Cross validation test

In [32]:
# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    # yt: groundtruth
    # yp: predicted
    ground = np.array(yt)
    pred = yp.astype(float)
    eps_pred = np.maximum(np.minimum(pred, 1. - 1e-15), 1e-15)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

In [35]:
# Lets try to find the optimal hyperparameters
def tune_params(X, y, X_test, y_test):
    params = {'n_neighbors': 10, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
    current_loss = 10000
    best_hyperparams = {}
    for n_neighbors in range(5, 10):
        for weight in ['uniform']:#, 'distance']:
            for algo in ['auto']:#'kd_tree', 'ball_tree', 'brute']:
                for leaf_size in [20, 30]:#[5, 10, 20, 30]:
                    for p in [1, 2]:
                        params['n_neighbors'] = n_neighbors
                        params['weights'] = weight
                        params['algorithm'] = algo
                        params['leaf_size'] = leaf_size
                        params['p'] = p

                        model = train_model(X, y, params=params)

                        pred = model.predict(X_test)
                        loss = log_loss(pred, y_test)

                        if loss < current_loss:
                            current_loss = loss
                            best_hyperparams = params

    return best_hyperparams

In [36]:
# Test set
print("\n\nTuning parameters for Country A")
a_params = tune_params(a_train, ay_train, a_test, ay_test)
#print("\n\nTuning parameters for Country B")
#b_params, b_num_rounds = tune_params(b_dtrain, b_dtest, by_test)
#print("\n\nTuning parameters for Country C")
#c_params, c_num_rounds = tune_params(c_dtrain, c_dtest, cy_test)



Tuning parameters for Country A


In [37]:
print("A params: {}".format(a_params))
#print("B params: {}".format(b_params))
#print("C params: {}".format(c_params))

A params: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}


In [38]:
model_a = train_model(a_train, ay_train, params=a_params)

a_pred = model_a.predict(a_test)
a_pred_train = model_a.predict(a_train)

test_loss_a = log_loss(a_pred, ay_test)
train_loss_a = log_loss(a_pred_train, ay_train)

print("Loss A Test: {} - Train: {}".format(test_loss_a, train_loss_a))

Loss A Test: 10.14752621085519 - Train: 8.844170231281922
