In [3]:
import pandas as pd
import xgboost as xgb
import numpy as np
import scipy as sc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Standardize features
def standardize(df):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)
        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)

    return df

In [5]:
# Check some training csv
dfA_csv = pd.read_csv('A_hhold_train.csv')

dfA = pre_process_data(dfA_csv.drop('poor', axis=1))
dfA_csv.fillna(False, inplace=True)
YA = np.ravel(dfA_csv.poor.astype(int))

#YA = list(dfA['poor'].astype(int))
#del dfA['poor']
dfA.head()

Input shape:	(1855, 345)
After standardization (1855, 345)
After converting categoricals:	(1855, 850)


Unnamed: 0,id,nEsgxvAq,OMtioXZZ,YFMZwKrU,TiwRslOh,wBXbHZmp_DkQlr,wBXbHZmp_JhtDR,SlDKnCuu_GUusz,SlDKnCuu_alLXR,KAJOWiiw_BIZns,...,JCDeZBXq_UyAms,HGPWuGlV_WKNwg,HGPWuGlV_vkbkA,GDUPaBQs_qCEuA,GDUPaBQs_qQxrL,WuwrCsIY_AITFl,WuwrCsIY_GAZGl,AlDbXTlZ_aQeIm,AlDbXTlZ_cecIq,country_A
0,-0.11103,-1.417848,0.310421,1.131639,-0.626205,0,1,1,0,0,...,0,0,1,0,1,1,0,1,0,1
1,1.167628,-0.406268,-0.50708,-0.00552,0.733261,0,1,1,0,0,...,1,0,1,0,1,1,0,0,1,1
2,-1.383718,0.605312,-0.50708,-0.00552,-0.626205,0,1,1,0,1,...,1,0,1,0,1,1,0,0,1,1
3,-1.337329,0.605312,-1.32458,-1.142679,0.733261,0,1,1,0,0,...,1,0,1,0,1,1,0,0,1,1
4,-1.145768,0.605312,0.310421,-1.142679,-0.17305,0,1,0,1,0,...,1,0,1,0,1,0,1,1,0,1


In [6]:
#df_dumA = pd.get_dummies(dfA)
XA = dfA
#XA.head()

In [7]:
X_trainA, X_testA, Y_trainA, Y_testA = train_test_split(XA, YA, test_size=0.20, random_state=42)
dtrainA = xgb.DMatrix(X_trainA, label=Y_trainA)
dtestA = xgb.DMatrix(X_testA)

In [16]:
# Set up XGBoost
param = {'max_depth': 4, 'eta':100, 'silent':1, 'objective':'reg:logistic'}
num_round = 100
bstA = xgb.train(param, dtrainA, num_round)

In [17]:
predA = bstA.predict(dtestA)

In [18]:
# Compute loss
# -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
def log_loss(yt, yp):
    # yt: groundtruth
    # yp: predicted
    ground = np.array(yt)
    pred = yp.astype(float)
    eps_pred = np.maximum(np.minimum(pred, 1. - 1e-15), 1e-15)
    loss = -(ground * np.log(eps_pred) + (1 - ground) * np.log(1 - eps_pred))
    return np.mean(loss)

In [19]:
print(log_loss(Y_testA, predA))

15.2678149023


In [24]:
# Train B
dfB = pd.read_csv('B_hhold_train.csv')
YB = list(dfB['poor'].astype(int))
del dfB['poor']
df_dumB = pd.get_dummies(dfB)
XB = df_dumB
X_trainB, X_testB, Y_trainB, Y_testB = train_test_split(XB, YB, test_size=0.20, random_state=42)
dtrainB = xgb.DMatrix(X_trainB, label=Y_trainB)
dtestB = xgb.DMatrix(X_testB)
#param = {'max_depth':2, 'eta':10, 'silent':1, 'objective':'reg:logistic'}
#num_round = 200
bstB = xgb.train(param, dtrainB, num_round)
predB = bstB.predict(dtestB)
print(log_loss(Y_testB, predB))

25.6813091845


In [27]:
# Train C
dfC = pd.read_csv('C_hhold_train.csv')
YC = list(dfC['poor'].astype(int))
del dfC['poor']
df_dumC = pd.get_dummies(dfC)
XC = df_dumC
X_trainC, X_testC, Y_trainC, Y_testC = train_test_split(XC, YC, test_size=0.20, random_state=42)
dtrainC = xgb.DMatrix(X_trainC, label=Y_trainC)
dtestC = xgb.DMatrix(X_testC)
param = {'max_depth':2, 'eta':10, 'silent':1, 'objective':'reg:logistic'}
num_round = 200
bstC = xgb.train(param, dtrainC, num_round)
predC = bstC.predict(dtestC)
print(log_loss(Y_testC, predC))

0.0816906123634


## Test

In [28]:
A = pd.read_csv('A_hhold_test.csv')
B = pd.read_csv('B_hhold_test.csv')
C = pd.read_csv('C_hhold_test.csv')
A = pd.get_dummies(A)
B = pd.get_dummies(B)
C = pd.get_dummies(C)

In [29]:
missingA = set(A.columns.tolist())
missingB = set(B.columns.tolist())
missingC = set(C.columns.tolist())
missingA = set(df_dumA.columns.tolist()) - missingA
missingB = set(df_dumB.columns.tolist()) - missingB
missingC = set(df_dumC.columns.tolist()) - missingC

for elem in missingA:
    A[elem] = 0
    
for elem in missingB:
    B[elem] = 0
    
for elem in missingC:
    C[elem] = 0

In [30]:
# Workaround for "feature_names mismatch" error
bstA.save_model('A.xgb')
bstA = xgb.Booster(param)
bstA.load_model('A.xgb')
bstB.save_model('B.xgb')
bstB = xgb.Booster(param)
bstB.load_model('B.xgb')
bstC.save_model('C.xgb')
bstC = xgb.Booster(param)
bstC.load_model('C.xgb')

In [31]:
dtestA = xgb.DMatrix(A)
dtestB = xgb.DMatrix(B)
dtestC = xgb.DMatrix(C)

predA = bstA.predict(dtestA)
predB = bstB.predict(dtestB)
predC = bstC.predict(dtestC)

In [32]:
A['country'] = 'A'
B['country'] = 'B'
C['country'] = 'C'
A['poor'] = predA
B['poor'] = predB
C['poor'] = predC
resultsA = A[['id', 'country', 'poor']]
resultsB = B[['id', 'country', 'poor']]
resultsC = C[['id', 'country', 'poor']]
result = pd.concat([resultsA, resultsB, resultsC])
result.head()

Unnamed: 0,id,country,poor
0,418,A,0.0
1,41249,A,1.0
2,16205,A,0.0
3,97501,A,1.0
4,67756,A,0.0


In [33]:
result.to_csv('submission.csv', index=False)