In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import h5py
import matplotlib.pyplot as plt
from numpy.random import RandomState
from numpy import unravel_index


import time
# Directory that is joined with the CSV file names in `data_paths` / `ind_paths`.
# NOTE(review): the read_csv cells load bare file names from the working
# directory instead of these paths -- confirm which location is intended.
DATA_DIR = os.path.join('Kaggle', 'poverty')

In [2]:
# Household-level CSV locations, keyed by country code and then by split.
data_paths = dict(
    A=dict(train=os.path.join(DATA_DIR, 'A_hhold_train.csv'),
           test=os.path.join(DATA_DIR, 'A_hhold_test.csv')),
    B=dict(train=os.path.join(DATA_DIR, 'B_hhold_train.csv'),
           test=os.path.join(DATA_DIR, 'B_hhold_test.csv')),
    C=dict(train=os.path.join(DATA_DIR, 'C_hhold_train.csv'),
           test=os.path.join(DATA_DIR, 'C_hhold_test.csv')),
)

# Individual-level CSV locations with the same country -> split layout.
ind_paths = dict(
    A=dict(train=os.path.join(DATA_DIR, 'A_indiv_train.csv'),
           test=os.path.join(DATA_DIR, 'A_indiv_test.csv')),
    B=dict(train=os.path.join(DATA_DIR, 'B_indiv_train.csv'),
           test=os.path.join(DATA_DIR, 'B_indiv_test.csv')),
    C=dict(train=os.path.join(DATA_DIR, 'C_indiv_train.csv'),
           test=os.path.join(DATA_DIR, 'C_indiv_test.csv')),
)

In [3]:
# Household-level training frames, one per country, indexed by the 'id' column.
a_train = pd.read_csv('A_hhold_train.csv', index_col='id')
b_train = pd.read_csv('B_hhold_train.csv', index_col='id')
c_train = pd.read_csv('C_hhold_train.csv', index_col='id')

# Individual-level training frames, also indexed by 'id'.
ai_train = pd.read_csv('A_indiv_train.csv', index_col='id')
# Column 'wJthinfa' is removed from country B's individual data
# (the same column is removed from the B test frame).
bi_train = pd.read_csv('B_indiv_train.csv', index_col='id').drop('wJthinfa', axis=1)
# Upper-case 'C' matches every other country-C file name used in this
# notebook; the lower-case spelling fails on case-sensitive file systems.
ci_train = pd.read_csv('C_indiv_train.csv', index_col='id')

In [4]:
# Household-level test frames (A, B, C), indexed by the 'id' column.
a_test, b_test, c_test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('A_hhold_test.csv', 'B_hhold_test.csv', 'C_hhold_test.csv')
)

# Individual-level test frames (A, B, C), indexed by the 'id' column.
ai_test, bi_test, ci_test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('A_indiv_test.csv', 'B_indiv_test.csv', 'C_indiv_test.csv')
)
# Country B loses 'wJthinfa', mirroring what is done to the B training frame.
bi_test = bi_test.drop('wJthinfa', axis=1)

In [5]:
# Standardize features
def standardize(df, numeric_only=True):
    """Z-score the int64/float64 columns of ``df`` in place and return it.

    Every selected column becomes ``(x - mean) / std`` using pandas' default
    standard deviation. Columns whose standard deviation is zero or undefined
    (constant columns, single-row frames) are only centred, so they come out
    as 0 instead of NaN/inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize; its numeric columns are overwritten.
    numeric_only : bool, default True
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])
    if numeric.shape[1] == 0:
        # nothing to scale
        return df
    scale = numeric.std()
    # A zero or NaN spread would turn the whole column into NaN/inf; dividing
    # by 1 instead leaves such columns merely centred.
    scale = scale.where(scale > 0, 1.0)
    df[numeric.columns] = (numeric - numeric.mean()) / scale
    return df
    

def pre_process_data(df, enforce_cols=None):
    """One-hot encode ``df`` and optionally align it to a reference column set.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is passed through ``pd.get_dummies``.
    enforce_cols : sequence of column names, optional
        When given, columns not listed are dropped, listed columns that are
        missing are added filled with 0, and the result is returned in exactly
        the order of ``enforce_cols`` so it lines up position-by-position with
        the reference frame.

    Returns
    -------
    pandas.DataFrame
        The encoded frame with remaining missing values replaced by 0.

    Notes
    -----
    Prints the shape of the frame right after encoding.
    """
    encoded = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(encoded.shape))
    if enforce_cols is not None:
        to_drop = np.setdiff1d(encoded.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, encoded.columns)

        encoded = encoded.drop(to_drop, axis=1)
        encoded = encoded.assign(**{c: 0 for c in to_add})
        # Added columns land at the end; put everything in reference order.
        encoded = encoded[list(enforce_cols)]

    # Non-mutating fill: the caller's frame is never modified.
    return encoded.fillna(0)
def drop(df, col, threshold):
    """Drop ``col`` when one value dominates it in both poverty classes.

    Value frequencies of ``col`` are counted separately for rows where
    ``df['poor']`` is True and where it is False. The column is removed only
    if, in each class, some single value accounts for more than ``threshold``
    of that class's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'poor'`` column and ``col``.
    col : column label
        Column to test.
    threshold : float
        Fraction of a class's rows a single value must exceed.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col`` when it is dominated in both classes,
        otherwise ``df`` itself.
    """
    poor_count = sum(df['poor'])
    non_count = df.shape[0] - poor_count
    col_poor = Counter(df[col][df['poor'] == True])
    col_non_poor = Counter(df[col][df['poor'] == False])
    # An empty class yields an empty Counter, so no division by zero occurs.
    poor_dominated = any(n / poor_count > threshold for n in col_poor.values())
    non_poor_dominated = any(n / non_count > threshold for n in col_non_poor.values())
    if poor_dominated and non_poor_dominated:
        return df.drop([col], axis=1)
    return df

def drop_col(hhold, indi, hhold_threshold, indi_threshold):
    """Remove near-constant columns from a household and an individual frame.

    A column is removed when some single value covers more than the given
    threshold of the rows with ``poor == True`` and some single value covers
    more than the threshold of the rows with ``poor == False``.

    Parameters
    ----------
    hhold : pandas.DataFrame
        Household frame with a ``'poor'`` column; ``'poor'`` and ``'country'``
        are never tested.
    indi : pandas.DataFrame
        Individual frame with ``'poor'`` and ``'country'`` columns; ``'poor'``,
        ``'country'`` and ``'iid'`` are never tested.
    hhold_threshold, indi_threshold : float
        Dominance thresholds for the household and individual frames.

    Returns
    -------
    tuple of pandas.DataFrame
        The pruned household frame, and the pruned individual frame with
        ``'poor'`` and ``'country'`` removed.
    """
    def _prune(frame, threshold, skip):
        # Class sizes used as denominators for the value shares.
        n_poor = sum(frame['poor'])
        n_non_poor = frame.shape[0] - n_poor
        # Iterates the labels of the frame as passed in; `frame` is rebound to
        # progressively smaller copies as columns are removed.
        for name in frame:
            if name in skip:
                continue
            poor_counts = Counter(frame[name][frame['poor'] == True])
            non_poor_counts = Counter(frame[name][frame['poor'] == False])
            poor_hit = any([n / n_poor > threshold for n in poor_counts.values()])
            non_poor_hit = any([n / n_non_poor > threshold for n in non_poor_counts.values()])
            if poor_hit and non_poor_hit:
                frame = frame.drop([name], axis=1)
        return frame

    pruned_hhold = _prune(hhold, hhold_threshold, ('poor', 'country'))
    pruned_indi = _prune(indi, indi_threshold, ('poor', 'country', 'iid'))
    return pruned_hhold, pruned_indi.drop(['poor', 'country'], axis=1)
    
def combine_hhold_indi(hhold, indi):
    """Merge household features with per-``id`` aggregates of individual data.

    The result concatenates, column-wise and aligned on the index:

    * ``family_numbers`` -- count of ``iid`` values per ``id``;
    * the per-``id`` mean of each int64/float64 individual column (missing
      values are first replaced by that column's overall mean);
    * the per-``id`` share (sum / count) of each one-hot encoded
      non-numeric individual column;
    * the household frame after ``pre_process_data``.

    Any value still missing after the concat is set to -100.
    NOTE(review): ``combine_test_hhold_indi`` fills the same gaps with 0 --
    confirm the two are meant to differ.

    Parameters
    ----------
    hhold : pandas.DataFrame
        Household frame.
    indi : pandas.DataFrame
        Individual frame groupable by ``'id'`` and containing ``'iid'``.

    Returns
    -------
    pandas.DataFrame
    """
    household = pre_process_data(hhold)

    # Household size: number of individual records sharing an id.
    members = pd.DataFrame(indi.groupby('id')['iid'].count())
    members.columns = ['family_numbers']

    people = indi.drop('iid', axis=1)
    numeric = people.select_dtypes(include=['int64', 'float64'])
    categorical = pd.get_dummies(people.drop(list(numeric.columns), axis=1))

    numeric = numeric.fillna(numeric.mean())
    categorical = pre_process_data(categorical)

    numeric = numeric.groupby('id')[list(numeric.columns)].mean()

    # Fraction of a household's members falling in each category.
    by_household = categorical.groupby('id')[list(categorical.columns)]
    categorical = by_household.sum() / by_household.count()

    combined = pd.concat([members, numeric, categorical, household], axis=1)
    return combined.fillna(-100)

def order(df_test, df_train, ori_test):
    """Arrange ``df_test`` like the training features and the raw test rows.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test feature frame; must contain every column of ``df_train``.
    df_train : pandas.DataFrame
        Frame whose columns (and their order) the result must match.
    ori_test : pandas.DataFrame
        Frame whose index gives the required row order.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``df_train`` columns of ``df_test``, in
        ``df_train`` order, with rows ordered like ``ori_test.index``.
    """
    # One vectorised selection instead of inserting columns one at a time
    # into an empty frame, which is slow and fragments wide frames.
    train_columns = list(df_train.columns)
    return df_test[train_columns].loc[ori_test.index]
def mll(y_true, y_pred):
    """Mean log loss: the average over groups of each group's binary log loss.

    Parameters
    ----------
    y_true : sequence of array-like
        One array of 0/1 labels per group (e.g. one per country).
    y_pred : sequence of array-like
        One array of predicted probabilities per group, matching ``y_true``.

    Returns
    -------
    float
        ``mean_g( -mean_i( y*log(p) + (1-y)*log(1-p) ) )``.

    Raises
    ------
    ValueError
        If no groups are given, a group is empty, or a pair of groups has
        mismatched shapes.

    Notes
    -----
    Each group's loss is averaged on its own before the groups are averaged;
    the running total is never re-divided by later groups' sizes.
    Probabilities are clipped to ``[1e-15, 1 - 1e-15]`` so exact 0/1
    predictions give a finite loss.
    """
    eps = 1e-15
    group_losses = []
    for truth, pred in zip(y_true, y_pred):
        truth = np.asarray(truth, dtype=float)
        pred = np.asarray(pred, dtype=float)
        if truth.shape != pred.shape:
            raise ValueError("y_true and y_pred groups must have matching shapes")
        if truth.size == 0:
            raise ValueError("cannot compute log loss for an empty group")
        pred = np.clip(pred, eps, 1 - eps)
        log_likelihood = truth * np.log(pred) + (1 - truth) * np.log(1 - pred)
        group_losses.append(-log_likelihood.mean())
    if not group_losses:
        raise ValueError("mll needs at least one group")
    return float(np.mean(group_losses))

In [6]:
def combine_test_hhold_indi(hhold, indi, train):
    """Build the test feature matrix restricted to the columns of ``train``.

    The household frame (without ``'country'``) and the per-``id`` aggregates
    of the individual frame are assembled the same way as in
    ``combine_hhold_indi``: ``family_numbers`` (count of ``iid`` per ``id``),
    per-``id`` means of int64/float64 columns (missing values first replaced
    by the column mean) and per-``id`` shares of one-hot encoded non-numeric
    columns. Columns absent from ``train`` are then dropped, columns of
    ``train`` that are absent here are added filled with 0, and remaining
    missing values are set to 0.

    NOTE(review): ``combine_hhold_indi`` fills remaining gaps with -100
    rather than 0 -- confirm the two are meant to differ. Column order is not
    aligned to ``train`` here.

    Parameters
    ----------
    hhold : pandas.DataFrame
        Household test frame containing a ``'country'`` column.
    indi : pandas.DataFrame
        Individual test frame groupable by ``'id'`` and containing ``'iid'``.
    train : pandas.DataFrame
        Training feature frame whose columns define the expected feature set.

    Returns
    -------
    pandas.DataFrame
    """
    household = pre_process_data(hhold.drop('country', axis=1))

    # Household size: number of individual records sharing an id.
    members = pd.DataFrame(indi.groupby('id')['iid'].count())
    members.columns = ['family_numbers']

    people = indi.drop('iid', axis=1)
    numeric = people.select_dtypes(include=['int64', 'float64'])
    categorical = pd.get_dummies(people.drop(list(numeric.columns), axis=1))

    numeric = numeric.fillna(numeric.mean())
    categorical = pre_process_data(categorical)

    numeric = numeric.groupby('id')[list(numeric.columns)].mean()

    # Fraction of a household's members falling in each category.
    by_household = categorical.groupby('id')[list(categorical.columns)]
    categorical = by_household.sum() / by_household.count()

    combined = pd.concat([members, numeric, categorical, household], axis=1)

    # Reconcile the feature set with the training columns.
    extra = np.setdiff1d(combined.columns, train.columns)
    missing = np.setdiff1d(train.columns, combined.columns)
    combined = combined.drop(extra, axis=1)
    combined = combined.assign(**{name: 0 for name in missing})

    return combined.fillna(0)

In [7]:
# For each country: drop columns where a single value covers more than 95% of
# the poor rows and a single value covers more than 95% of the non-poor rows,
# then merge household and individual features into one training matrix.
# NOTE: a later cell rebinds a_train / b_train / c_train to xgb.DMatrix
# objects, so this cell cannot be re-run after it without reloading the CSVs.
train_a, train_ai = drop_col(a_train, ai_train, 0.95, 0.95)
A_train = combine_hhold_indi(train_a, train_ai)
train_b, train_bi = drop_col(b_train, bi_train, 0.95, 0.95)
B_train = combine_hhold_indi(train_b, train_bi)
train_c, train_ci = drop_col(c_train, ci_train, 0.95, 0.95)
C_train = combine_hhold_indi(train_c, train_ci)

# Shuffle the rows. A fixed seed (the same one used for the train/validation
# split) makes the shuffle, and therefore every downstream result, repeatable.
A_train = A_train.sample(frac=1, random_state=4242)
B_train = B_train.sample(frac=1, random_state=4242)
C_train = C_train.sample(frac=1, random_state=4242)

After converting categoricals:	(8203, 612)
After converting categoricals:	(37560, 255)
After converting categoricals:	(3255, 1221)
After converting categoricals:	(20252, 1039)
After converting categoricals:	(6469, 741)
After converting categoricals:	(29913, 279)


In [8]:
# Split every training matrix into features (x) and the 'poor' target (y).
ax, ay = A_train.drop('poor', axis=1), A_train['poor'].values
bx, by = B_train.drop('poor', axis=1), B_train['poor'].values
cx, cy = C_train.drop('poor', axis=1), C_train['poor'].values

# Test feature matrices limited to the columns present in training.
A_test = combine_test_hhold_indi(a_test, ai_test, ax)
B_test = combine_test_hhold_indi(b_test, bi_test, bx)
C_test = combine_test_hhold_indi(c_test, ci_test, cx)

# Same column order as training, same row order as the raw household test CSV.
A_test = order(A_test, ax, a_test)
B_test = order(B_test, bx, b_test)
C_test = order(C_test, cx, c_test)

After converting categoricals:	(4041, 850)
After converting categoricals:	(18535, 271)
After converting categoricals:	(1604, 1418)
After converting categoricals:	(10066, 1502)
After converting categoricals:	(3187, 772)
After converting categoricals:	(14701, 296)


In [9]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# `sklearn.cross_validation` and `sklearn.grid_search` were removed in
# scikit-learn 0.20; their contents live in `sklearn.model_selection`.
# Fall back to the legacy modules only on installations that predate it.
# NOTE: StratifiedKFold has a different constructor in the two locations;
# it is not used in the cells shown here.
try:
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
except ImportError:
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split




In [10]:
# Hold out 20% of each country's rows for validation, with a fixed seed.
ax_train, ax_valid, ay_train, ay_valid = train_test_split(
    ax, ay, test_size=0.2, random_state=4242)
print('Train samples: {} Validation samples: {}'.format(len(ax_train), len(ax_valid)))
bx_train, bx_valid, by_train, by_valid = train_test_split(
    bx, by, test_size=0.2, random_state=4242)
cx_train, cx_valid, cy_train, cy_valid = train_test_split(
    cx, cy, test_size=0.2, random_state=4242)

Train samples: 6562 Validation samples: 1641


In [11]:
# Wrap each split in an xgboost DMatrix; test matrices carry no label.
# NOTE: a_train / b_train / c_train and a_test / b_test / c_test previously
# held the DataFrames read from CSV; after this cell they are DMatrix objects,
# so earlier cells that use those DataFrames cannot be re-run without
# reloading the data.
a_train = xgb.DMatrix(ax_train, label=ay_train)
a_valid = xgb.DMatrix(ax_valid, label=ay_valid)
a_test = xgb.DMatrix(A_test)

b_train = xgb.DMatrix(bx_train, label=by_train)
b_valid = xgb.DMatrix(bx_valid, label=by_valid)
b_test = xgb.DMatrix(B_test)

c_train = xgb.DMatrix(cx_train, label=cy_train)
c_valid = xgb.DMatrix(cx_valid, label=cy_valid)
c_test = xgb.DMatrix(C_test)

In [12]:
# Booster settings shared by the three per-country models.
params = {
    'objective': 'binary:logistic',
    'eta': 0.02,
    'silent': True,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
}

In [13]:
def gini(actual, pred, cmpcol=0, sortcol=1):
    """Unnormalised Gini coefficient of ``pred`` as a ranking of ``actual``.

    Rows are sorted by descending prediction (ties keep their original
    order); the statistic is the sum of the cumulative share of ``actual``
    along that ordering, minus ``(n + 1) / 2``, divided by ``n``.

    Parameters
    ----------
    actual : array-like
        Observed values (e.g. 0/1 labels).
    pred : array-like
        Predicted scores, same length as ``actual``.
    cmpcol, sortcol : int
        Unused; kept for signature compatibility.

    Returns
    -------
    float
        The Gini value; NaN when ``actual`` sums to zero.
    """
    assert len(actual) == len(pred)
    n = len(actual)
    # Builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort: the last key is primary -> descending pred, then original index.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total

    gini_sum -= (n + 1) / 2.0
    return gini_sum / n
def gini_normalized(a, p):
    """Gini of predictions ``p`` against ``a``, scaled by the Gini obtained
    when ``a`` is ranked by itself."""
    achieved = gini(a, p)
    reference = gini(a, a)
    return achieved / reference
def gini_xgb(preds, dtrain):
    """Custom evaluation metric for ``xgb.train``: normalised Gini.

    Reads the labels from ``dtrain`` via ``get_label()`` and returns a
    one-element list ``[("gini", score)]``.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [("gini", score)]


In [14]:
# Country A: report metrics on both splits every 10 rounds. Per the training
# log, 'valid-gini' drives early stopping (higher is better, patience of 100
# rounds, at most 3000 rounds).
watchlist = [(a_train, 'train'), (a_valid, 'valid')]
model_A = xgb.train(
    params,
    a_train,
    num_boost_round=3000,
    evals=watchlist,
    feval=gini_xgb,
    early_stopping_rounds=100,
    maximize=True,
    verbose_eval=10,
)

[0]	train-error:0.167784	valid-error:0.210238	train-gini:0.788735	valid-gini:0.727963
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-error:0.146754	valid-error:0.188909	train-gini:0.864997	valid-gini:0.778838
[20]	train-error:0.133496	valid-error:0.182815	train-gini:0.884366	valid-gini:0.793323
[30]	train-error:0.124505	valid-error:0.176722	train-gini:0.895611	valid-gini:0.803274
[40]	train-error:0.119781	valid-error:0.171846	train-gini:0.905231	valid-gini:0.812581
[50]	train-error:0.114599	valid-error:0.166971	train-gini:0.913201	valid-gini:0.821324
[60]	train-error:0.109418	valid-error:0.166362	train-gini:0.919341	valid-gini:0.827803
[70]	train-error:0.106675	valid-error:0.162706	train-gini:0.925568	valid-gini:0.834047
[80]	train-error:0.100427	valid-error:0.162096	train-gini:0.932171	valid-gini:0.839426
[90]	train-error:0.097074	valid-error:0.157831	train-gini:0.936735	valid

[920]	train-error:0.003505	valid-error:0.124924	train-gini:0.999911	valid-gini:0.900755
[930]	train-error:0.003353	valid-error:0.127361	train-gini:0.999921	valid-gini:0.900915
[940]	train-error:0.0032	valid-error:0.125533	train-gini:0.999934	valid-gini:0.900909
[950]	train-error:0.003048	valid-error:0.125533	train-gini:0.999946	valid-gini:0.900984
[960]	train-error:0.003048	valid-error:0.126752	train-gini:0.999952	valid-gini:0.901179
[970]	train-error:0.003048	valid-error:0.126143	train-gini:0.999959	valid-gini:0.901236
[980]	train-error:0.003048	valid-error:0.125533	train-gini:0.999964	valid-gini:0.901272
[990]	train-error:0.002743	valid-error:0.127361	train-gini:0.999971	valid-gini:0.901338
[1000]	train-error:0.002743	valid-error:0.126752	train-gini:0.999976	valid-gini:0.901413
[1010]	train-error:0.002743	valid-error:0.127361	train-gini:0.99998	valid-gini:0.901338
[1020]	train-error:0.002438	valid-error:0.126143	train-gini:0.999983	valid-gini:0.901536
[1030]	train-error:0.002438	vali

In [15]:
# Country B: report metrics on both splits every 10 rounds. Per the training
# log, 'valid-gini' drives early stopping (higher is better, patience of 100
# rounds, at most 3000 rounds).
watchlist = [(b_train, 'train'), (b_valid, 'valid')]
model_B = xgb.train(
    params,
    b_train,
    num_boost_round=3000,
    evals=watchlist,
    feval=gini_xgb,
    early_stopping_rounds=100,
    maximize=True,
    verbose_eval=10,
)


[0]	train-error:0.05914	valid-error:0.092166	train-gini:0.593721	valid-gini:0.278735
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-error:0.058756	valid-error:0.076805	train-gini:0.836096	valid-gini:0.524393
[20]	train-error:0.057604	valid-error:0.078341	train-gini:0.868216	valid-gini:0.581431
[30]	train-error:0.0553	valid-error:0.076805	train-gini:0.901541	valid-gini:0.622696
[40]	train-error:0.054147	valid-error:0.076805	train-gini:0.928725	valid-gini:0.610516
[50]	train-error:0.050691	valid-error:0.076805	train-gini:0.948356	valid-gini:0.619368
[60]	train-error:0.049539	valid-error:0.078341	train-gini:0.965512	valid-gini:0.61371
[70]	train-error:0.046467	valid-error:0.078341	train-gini:0.976592	valid-gini:0.61005
[80]	train-error:0.045699	valid-error:0.078341	train-gini:0.98422	valid-gini:0.614576
[90]	train-error:0.043395	valid-error:0.076805	train-gini:0.987876	valid-gini:

In [16]:
# Country C: report metrics on both splits every 10 rounds. Per the training
# log, 'valid-gini' drives early stopping (higher is better, patience of 100
# rounds, at most 3000 rounds).
watchlist = [(c_train, 'train'), (c_valid, 'valid')]
model_C = xgb.train(
    params,
    c_train,
    num_boost_round=3000,
    evals=watchlist,
    feval=gini_xgb,
    early_stopping_rounds=100,
    maximize=True,
    verbose_eval=10,
)

[0]	train-error:0.006377	valid-error:0.006182	train-gini:0.974071	valid-gini:0.983197
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-error:0.005411	valid-error:0.006955	train-gini:0.996495	valid-gini:0.993848
[20]	train-error:0.005024	valid-error:0.006955	train-gini:0.998051	valid-gini:0.996915
[30]	train-error:0.004444	valid-error:0.006182	train-gini:0.998575	valid-gini:0.99719
[40]	train-error:0.004251	valid-error:0.006182	train-gini:0.998733	valid-gini:0.998092
[50]	train-error:0.003671	valid-error:0.006182	train-gini:0.998977	valid-gini:0.998044
[60]	train-error:0.003478	valid-error:0.00541	train-gini:0.999279	valid-gini:0.998187
[70]	train-error:0.003092	valid-error:0.004637	train-gini:0.999413	valid-gini:0.998025
[80]	train-error:0.002319	valid-error:0.003864	train-gini:0.999678	valid-gini:0.997788
[90]	train-error:0.002319	valid-error:0.003864	train-gini:0.999889	valid-g

In [17]:
def _predict_best(model, dmatrix):
    """Predict with the trees up to the early-stopping optimum when available.

    Training above stops 100 rounds after the best validation score, so the
    returned Booster still contains those later rounds. This helper limits
    prediction to the best iteration using whichever attribute the installed
    xgboost exposes, and falls back to a plain ``predict`` otherwise.
    """
    # Older xgboost: `best_ntree_limit` is set by early stopping and
    # consumed through the `ntree_limit` argument.
    best_ntree_limit = getattr(model, 'best_ntree_limit', None)
    if best_ntree_limit:
        return model.predict(dmatrix, ntree_limit=best_ntree_limit)
    # Newer xgboost: `best_iteration` with the `iteration_range` argument.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is not None:
        return model.predict(dmatrix, iteration_range=(0, best_iteration + 1))
    return model.predict(dmatrix)


# Probability of 'poor' for every test household, per country.
a_preds = _predict_best(model_A, a_test)
b_preds = _predict_best(model_B, b_test)
c_preds = _predict_best(model_C, c_test)

In [18]:
def make_country_sub(preds, test_feat, country):
    """Build one country's submission frame.

    Parameters
    ----------
    preds : array-like
        One predicted value per row of ``test_feat`` (1-D array or list);
        stored in the ``'poor'`` column.
    test_feat : pandas.DataFrame
        Test feature frame; only its index is used, as the result's index.
    country : str
        Country code written to every row of the ``'country'`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``test_feat`` with columns ``['country', 'poor']``.
    """
    # np.asarray accepts both ndarrays and plain sequences.
    country_sub = pd.DataFrame(data=np.asarray(preds),
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]


In [19]:
# Per-country submission frames, stacked in A, B, C order.
a_sub = make_country_sub(a_preds, A_test, 'A')
b_sub = make_country_sub(b_preds, B_test, 'B')
c_sub = make_country_sub(c_preds, C_test, 'C')
submission = pd.concat([a_sub, b_sub, c_sub])
submission.to_csv('dsubmission.csv')
# Kept as the last expression so the preview is actually rendered by the
# notebook; in the middle of the cell its result was discarded.
submission.head()
