In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import scipy as sc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# data directory
DATA_DIR = os.path.join('..', 'data')

In [2]:
data_paths = {'A': {'train': os.path.join(DATA_DIR, 'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C_hhold_test.csv')}}

In [3]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [4]:
a_train.head()

Unnamed: 0_level_0,wBXbHZmp,SlDKnCuu,KAJOWiiw,DsKacCdL,rtPrBBPl,tMJrvvut,jdetlNNF,maLAYXwi,vwpsXRGk,sArDRIyX,...,sDGibZrP,CsGvKKBJ,OLpGAaEu,LrDrWRjC,JCDeZBXq,HGPWuGlV,GDUPaBQs,WuwrCsIY,AlDbXTlZ,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,XAmOF,MwLvg,...,JqHnW,MaXfS,etZsD,idRwx,LPtkN,vkbkA,qQxrL,AITFl,aQeIm,A
82739,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,lwCkE,MwLvg,...,JqHnW,MaXfS,HxnJy,idRwx,UyAms,vkbkA,qQxrL,AITFl,cecIq,A
9646,JhtDR,GUusz,BIZns,ZYabk,uxuSS,PHMVg,NDTCU,sehIp,qNABl,MwLvg,...,JqHnW,MaXfS,USRak,idRwx,UyAms,vkbkA,qQxrL,AITFl,cecIq,A
10975,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,sPNOc,MwLvg,...,JqHnW,MaXfS,USRak,idRwx,UyAms,vkbkA,qQxrL,AITFl,cecIq,A
16463,JhtDR,alLXR,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,NdlDR,MwLvg,...,JqHnW,MaXfS,etZsD,idRwx,UyAms,vkbkA,qQxrL,GAZGl,aQeIm,A


In [5]:
def entropy(a):
    return - sum( (a / sum(a)) * np.log((a / sum(a))))

In [6]:
# Lets find out all the columns whose values are almost monopolized by a single category. That is, whose values belong
# 60% (random decision TBH) of the time to the same category.
# TODO: in the future measure the entropy of the values and delete those with low entropy
df = a_train
entropies = []
for col in df.columns.tolist():
    res = df[col].value_counts()
    entropies.append(entropy(res.values))
    #if max(res.values) > len(df) * 0.7:
    #    to_del.append(col)
    #if entr < 0.4:
    #    to_del.append(col)
    
avg_entr = np.mean(entropies)
std_entr = np.std(entropies)
median_entr = np.median(entropies)

print("Entropy values: {} ± {}. Median: {}".format(avg_entr, std_entr, median_entr))

# Delete all the columns whose entropy is below average
to_del = []
for i, col in enumerate(df.columns.tolist()):
    if entropies[i] < median_entr:
        to_del.append(col)

Entropy values: 0.3829585850912254 ± 0.3739404708672668. Median: 0.3257934680141318


In [7]:
print("Total columns: {}. To delete: {}".format(len(df.columns.tolist()), len(to_del)))

Total columns: 345. To delete: 172


In [8]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)

    return df

In [9]:
# Lets delete all the columns we found to be useless
to_keep = set(df.columns.tolist()) - set(to_del)
#print(to_keep)

In [10]:
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))
print("Shapes: df_train {} - dfy_train {}".format(df_train.shape, dfy_train.shape))

Input shape:	(1855, 173)
After standardization (1855, 173)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


After converting categoricals:	(1855, 500)
Shapes: df_train (1855, 500) - dfy_train (1855,)


In [11]:
def prepare_data(x, y):
    X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test)
    return dtrain, dtest, Y_train, Y_test


def train_model(dtrain, params=None, num_round=100):
    if params is None:
        params = {'max_depth': 4, 'eta': 100, 'silent': 1, 'objective': 'reg:logistic'}

    bst = xgb.train(params, dtrain, num_round)

    return bst

# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    # yt: groundtruth
    # yp: predicted
    ground = np.array(yt)
    pred = yp.astype(float)
    eps_pred = np.maximum(np.minimum(pred, 1. - 1e-15), 1e-15)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

In [12]:
#for col in df_train.columns.tolist():
#    print(df_train[col].value_counts())

In [13]:
dtrain, dtest, y_train, y_test = prepare_data(df_train, dfy_train)

In [14]:
# Train
params = {'max_depth': 5, 'eta': 0.05, 'silent': 0, 'lambda': 2, 'alpha': 1, 'lambda_bias': 1, 'min_child_weight': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
num_round = 3000

model = train_model(dtrain, params=params, num_round=num_round)

pred = model.predict(dtest)
pred_train = model.predict(dtrain)

test_loss = log_loss(pred, y_test)
train_loss = log_loss(pred_train, y_train)

print("Loss Test: {} - Train: {}".format(test_loss, train_loss))

Loss Test: 0.09433563748211932 - Train: 0.09450553205034791


## Try with other 2 countries

#### B

In [168]:
df = b_train
to_del = []
#for col in df.columns.tolist():
#    res = df[col].value_counts()
#    if max(res.values) > len(df) * 0.5:
#        to_del.append(col)


entropies = []
for col in df.columns.tolist():
    res = df[col].value_counts()
    entropies.append(entropy(res.values))

avg_entr = np.mean(entropies)
std_entr = np.std(entropies)
median_entr = np.median(entropies)
print("Entropy values: {} ± {} . Median: {}".format(avg_entr, std_entr, median_entr))

# Delete all the columns whose entropy is below average
to_del = []
for i, col in enumerate(df.columns.tolist()):
    if entropies[i] < 0.9:
        to_del.append(col)


to_keep = set(df.columns.tolist()) - set(to_del)
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))



from sklearn.decomposition import TruncatedSVD
def reduce_dimensions(x):
    svd = TruncatedSVD(n_components=200, n_iter=10, random_state=42)
    return svd.fit_transform(x)
df_train = reduce_dimensions(df_train)


dtrain, dtest, y_train, y_test = prepare_data(df_train, dfy_train)
params = {'max_depth': 15, 'eta': 0.01, 'silent': 0, 'lambda': 1, 'alpha': 0.5, 'lambda_bias': 0, 'min_child_weight': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
num_round = 3000

model = train_model(dtrain, params=params, num_round=num_round)

pred = model.predict(dtest)
pred_train = model.predict(dtrain)

test_loss = log_loss(pred, y_test)
train_loss = log_loss(pred_train, y_train)

print("B Loss Test: {} - Train: {}".format(test_loss, train_loss))

Entropy values: 0.5983799488688347 ± 0.5441820751439844 . Median: 0.507305083138068
Input shape:	(3255, 84)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


After standardization (3255, 84)
After converting categoricals:	(3255, 519)
B Loss Test: 2.7546789167222885 - Train: 0.27297252216582646


In [69]:
df = c_train
to_del = []
for col in df.columns.tolist():
    res = df[col].value_counts()
    if max(res.values) > len(df) * 0.7:
        to_del.append(col)
to_keep = set(df.columns.tolist()) - set(to_del)
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))
dtrain, dtest, y_train, y_test = prepare_data(df_train, dfy_train)
params = {'max_depth': 5, 'eta': 0.05, 'silent': 0, 'lambda': 2, 'alpha': 1, 'lambda_bias': 1, 'min_child_weight': 2, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 42}
num_round = 3000

model = train_model(dtrain, params=params, num_round=num_round)

pred = model.predict(dtest)
pred_train = model.predict(dtrain)

test_loss = log_loss(pred, y_test)
train_loss = log_loss(pred_train, y_train)

print("C Loss Test: {} - Train: {}".format(test_loss, train_loss))

Input shape:	(6469, 97)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


After standardization (6469, 97)
After converting categoricals:	(6469, 557)
C Loss Test: 0.17188059898469973 - Train: 0.10635317006104153


In [70]:
avg_loss = np.mean([0.17188059898469973, 2.585255615957681, 0.09433563748211932])
print(avg_loss)

0.950490617475


## Lets try the same with random forests. TL;DR: they overfit a lot

In [78]:
from sklearn.ensemble import RandomForestClassifier

def train_rf_model(features, labels, **kwargs):
    
    # instantiate model
    model = RandomForestClassifier(n_estimators=50, random_state=0)
    
    # train model
    model.fit(features, labels)
    
    # get a (not-very-useful) sense of performance
    accuracy = model.score(features, labels)
    print(f"In-sample accuracy: {accuracy:0.2%}")
    
    return model

In [103]:
df = b_train
to_del = []

for col in df.columns.tolist():
    res = df[col].value_counts()
    if max(res.values) > len(df) * 0.4:
        to_del.append(col)
        
to_keep = set(df.columns.tolist()) - set(to_del)
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))


X_train, X_test, Y_train, Y_test = train_test_split(df_train, dfy_train, test_size=0.40, random_state=42)

model = train_rf_model(X_train, Y_train)

preds = model.predict(X_test)
preds_train = model.predict(X_train)
test_loss = log_loss(preds, Y_test)
train_loss = log_loss(preds_train, Y_train)

print("B Loss Test: {} - Train: {}".format(test_loss, train_loss))

Input shape:	(3255, 28)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


After standardization (3255, 28)
After converting categoricals:	(3255, 180)
In-sample accuracy: 100.00%
B Loss Test: 2.2018298068909394 - Train: 9.992007221626413e-16


In [101]:
df = a_train
to_del = []

for col in df.columns.tolist():
    res = df[col].value_counts()
    if max(res.values) > len(df) * 0.6:
        to_del.append(col)
        
to_keep = set(df.columns.tolist()) - set(to_del)
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))


X_train, X_test, Y_train, Y_test = train_test_split(df_train, dfy_train, test_size=0.40, random_state=42)

model = train_rf_model(X_train, Y_train)

preds = model.predict(X_test)
preds_train = model.predict(X_train)
test_loss = log_loss(preds, Y_test)
train_loss = log_loss(preds_train, Y_train)

print("A Loss Test: {} - Train: {}".format(test_loss, train_loss))

Input shape:	(1855, 32)
After standardization (1855, 32)
After converting categoricals:	(1855, 166)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In-sample accuracy: 100.00%
A Loss Test: 0.23274539132651606 - Train: 9.992007221626415e-16


In [100]:
df = c_train
to_del = []

for col in df.columns.tolist():
    res = df[col].value_counts()
    if max(res.values) > len(df) * 0.6:
        to_del.append(col)
        
to_keep = set(df.columns.tolist()) - set(to_del)
df_reduced = df[list(to_keep)]
df_train = pre_process_data(df_reduced)
df_train.fillna(0, inplace=True)
df.fillna(False, inplace=True)
dfy_train = np.ravel(df.poor.astype(int))


X_train, X_test, Y_train, Y_test = train_test_split(df_train, dfy_train, test_size=0.30, random_state=42)

model = train_rf_model(X_train, Y_train)

preds = model.predict(X_test)
preds_train = model.predict(X_train)
test_loss = log_loss(preds, Y_test)
train_loss = log_loss(preds_train, Y_train)

print("A Loss Test: {} - Train: {}".format(test_loss, train_loss))

Input shape:	(6469, 78)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


After standardization (6469, 78)
After converting categoricals:	(6469, 486)
In-sample accuracy: 100.00%
A Loss Test: 0.5694306195569548 - Train: 9.992007221626415e-16
