In [20]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

In [21]:
PATH = os.path.join( 'data', 'processed')

In [22]:
data_path = {'A': {'train': os.path.join(PATH, 'A', 'A_hhold_train.csv'),
                   'test' : os.path.join(PATH, 'A', 'A_hhold_test.csv')},
             'B': {'train': os.path.join(PATH, 'B', 'B_hhold_train.csv'),
                   'test' : os.path.join(PATH, 'B', 'B_hhold_test.csv')},
             'C': {'train': os.path.join(PATH, 'C', 'C_hhold_train.csv'), 
                    'test': os.path.join(PATH, 'C', 'C_hhold_test.csv')}}

In [23]:
a_train = pd.read_csv(data_path['A']['train'], index_col='id')
b_train = pd.read_csv(data_path['B']['train'], index_col='id')
c_train = pd.read_csv(data_path['C']['train'], index_col='id')

In [24]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
    return df
    

def pre_process_data(df, enforce_cols=None):
    print("Input shape:\t{}".format(df.shape))
        

    df = standardize(df)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    

    # match test set and training set columns
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(-999, inplace=True)
    
    return df

In [25]:
print("Country A")
aX_train = pre_process_data(a_train.drop('poor', axis=1))
ay_train = np.ravel(a_train.poor)

print("\nCountry B")
bX_train = pre_process_data(b_train.drop('poor', axis=1))
by_train = np.ravel(b_train.poor)

print("\nCountry C")
cX_train = pre_process_data(c_train.drop('poor', axis=1))
cy_train = np.ravel(c_train.poor)

Country A
Input shape:	(8203, 344)
After standardization (8203, 344)
After converting categoricals:	(8203, 859)

Country B
Input shape:	(3255, 441)
After standardization (3255, 441)
After converting categoricals:	(3255, 1432)

Country C
Input shape:	(6469, 163)
After standardization (6469, 163)
After converting categoricals:	(6469, 795)


删去部分无用的数据

In [26]:
aX_train = aX_train.drop(['country_A'],axis=1)
bX_train = bX_train.drop(['country_B'],axis=1)
cX_train = cX_train.drop(['country_C'],axis=1)

In [27]:
aX_train.head()

Unnamed: 0_level_0,nEsgxvAq,OMtioXZZ,YFMZwKrU,TiwRslOh,wBXbHZmp_DkQlr,wBXbHZmp_JhtDR,SlDKnCuu_GUusz,SlDKnCuu_alLXR,KAJOWiiw_BIZns,KAJOWiiw_TuovO,...,JCDeZBXq_LPtkN,JCDeZBXq_UyAms,HGPWuGlV_WKNwg,HGPWuGlV_vkbkA,GDUPaBQs_qCEuA,GDUPaBQs_qQxrL,WuwrCsIY_AITFl,WuwrCsIY_GAZGl,AlDbXTlZ_aQeIm,AlDbXTlZ_cecIq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,-1.44716,0.325746,1.099716,-0.628045,0,1,1,0,0,1,...,1,0,0,1,0,1,1,0,1,0
82739,-0.414625,-0.503468,-0.01605,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
9646,0.61791,-0.503468,-0.01605,-0.628045,0,1,1,0,1,0,...,0,1,0,1,0,1,1,0,0,1
10975,0.61791,-1.332682,-1.131816,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
16463,0.61791,0.325746,-1.131816,-0.180874,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,1,0


In [28]:
cX_train.head()

Unnamed: 0_level_0,LhUIIEHQ,PNAiwXUz,jmsRIiqp,NONtAKOM,kLAQgdly,WWuPOkor,CtFxPQPT,GIwNbAsH,qLDzvjiU,detlNNFh,...,obIQUcpS_YXwVA,lAvdypjD_SnGME,lAvdypjD_UUKOE,lAvdypjD_gWylU,lAvdypjD_jSoky,ARWytYMz_NwjRA,ARWytYMz_herus,eqJPmiPb_Rrpos,eqJPmiPb_mORJu,eqJPmiPb_wnPqZ
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57211,-0.606739,-0.35034,-0.313847,-0.310077,-0.704307,0.524039,0.380397,0.026222,-0.249567,-0.127064,...,1,0,0,0,1,1,0,0,0,1
62519,-0.606739,3.907434,-0.313847,0.249445,-0.390493,0.524039,-3.296468,0.026222,-0.249567,-0.127064,...,0,0,0,0,1,1,0,0,0,1
11614,0.74252,-0.299653,0.449989,0.808967,1.004237,0.524039,0.380397,-0.444972,0.541055,-0.127064,...,1,0,0,0,1,1,0,0,0,1
6470,0.74252,-0.35034,-1.077683,0.808967,1.004237,0.524039,0.292852,-0.444972,-1.040189,-0.127064,...,1,0,0,0,1,1,0,0,0,1
33558,0.74252,0.004474,0.449989,-0.310077,-0.094113,-1.142633,0.424169,-0.444972,-0.249567,0.184764,...,0,0,0,0,1,0,1,0,0,1


In [29]:
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

通过crossvalidation来做参数筛选

In [34]:
xgb_model = xgb.XGBClassifier()
parameters = {'nthread':[4], 
              'objective':['binary:logistic'],
              'learning_rate':[0.05],
              'max_depth':[10,11,12],
              'min_child_weight':[11],
              'silent':[1],
              'subsample':[0.8],
              'colsample_bytree':[0.7],
              'n_estimators':[50,70,100],
              'missing':[-999],
              'seed':[1337]
             }

In [35]:
model_A = GridSearchCV(xgb_model, param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(ay_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True)
model_A.fit(aX_train, ay_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] 

[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  38.7s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 - 1.0min
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 - 1.0min
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=100, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_r

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  7.5min


[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  45.6s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  45.9s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05,

[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed: 10.3min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[False False ...,  True False], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [10, 11, 12], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'n_estimators': [50, 70, 100], 'missing': [-999], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=2)

In [36]:
model_B = GridSearchCV(xgb_model, param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(by_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True)
model_B.fit(bX_train, by_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] 

[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  25.0s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  29.9s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=100, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  30.2s
[CV] colsample_bytree=0.7, learning_r

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  3.5min


[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  20.9s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  20.0s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  20.4s
[CV] colsample_bytree=0.7, learning_ra

[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed:  4.7min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[False False ..., False False], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [10, 11, 12], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'n_estimators': [50, 70, 100], 'missing': [-999], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=2)

In [37]:
model_C = GridSearchCV(xgb_model, param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(cy_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True)
model_C.fit(cX_train, cy_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] 

[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  35.9s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  43.5s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=100, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=11, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  42.2s
[CV] colsample_bytree=0.7, learning_r

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  4.3min


[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  16.3s
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  18.1s
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=70, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 
[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, min_child_weight=11, missing=-999, n_estimators=50, nthread=4, objective=binary:logistic, seed=1337, silent=1, subsample=0.8 -  17.3s
[CV] colsample_bytree=0.7, learning_ra

[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed:  5.3min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[False  True ..., False False], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [10, 11, 12], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'n_estimators': [50, 70, 100], 'missing': [-999], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=2)

查看最优参数组合

In [38]:
best_parameters, score, _ = max(model_B.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.8264430361413347
colsample_bytree: 0.7
learning_rate: 0.05
max_depth: 10
min_child_weight: 11
missing: -999
n_estimators: 100
nthread: 4
objective: 'binary:logistic'
seed: 1337
silent: 1
subsample: 0.8


In [39]:
a_test = pd.read_csv(data_path['A']['test'], index_col='id')
b_test = pd.read_csv(data_path['B']['test'], index_col='id')
c_test = pd.read_csv(data_path['C']['test'], index_col='id')

In [40]:
a_test = pre_process_data(a_test, enforce_cols=aX_train.columns)
b_test = pre_process_data(b_test, enforce_cols=bX_train.columns)
c_test = pre_process_data(c_test, enforce_cols=cX_train.columns)

Input shape:	(4041, 344)
After standardization (4041, 344)
After converting categoricals:	(4041, 851)
Input shape:	(1604, 441)
After standardization (1604, 441)
After converting categoricals:	(1604, 1419)
Input shape:	(3187, 163)
After standardization (3187, 163)
After converting categoricals:	(3187, 773)


因为crossvalidation之后，feature的顺序会发生变化，导致test和train的feature顺序不一致，所以在这里我把feature的顺序调整了一下

In [41]:
def order(df_test, df_train):
    new_df = pd.DataFrame()
    for key in df_train:
        new_df[key] = df_test[key]
    return new_df

In [42]:
a_test = order(a_test, aX_train)
b_test = order(b_test, bX_train)
c_test = order(c_test, cX_train)

In [43]:
a_test.head()

Unnamed: 0_level_0,nEsgxvAq,OMtioXZZ,YFMZwKrU,TiwRslOh,wBXbHZmp_DkQlr,wBXbHZmp_JhtDR,SlDKnCuu_GUusz,SlDKnCuu_alLXR,KAJOWiiw_BIZns,KAJOWiiw_TuovO,...,JCDeZBXq_LPtkN,JCDeZBXq_UyAms,HGPWuGlV_WKNwg,HGPWuGlV_vkbkA,GDUPaBQs_qCEuA,GDUPaBQs_qQxrL,WuwrCsIY_AITFl,WuwrCsIY_GAZGl,AlDbXTlZ_aQeIm,AlDbXTlZ_cecIq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
418,0.580509,-0.519164,-0.027227,-1.095034,0,1,1,0,0,1,...,0,1,0,1,0,1,0,1,0,1
41249,0.580509,1.18302,1.062118,0.730851,0,1,0,1,1,0,...,0,1,0,1,0,1,1,0,1,0
16205,-0.361216,0.331928,1.062118,-1.551505,0,1,0,1,1,0,...,0,1,0,1,0,1,1,0,1,0
97501,0.580509,1.18302,1.062118,1.643794,0,1,0,1,1,0,...,0,1,0,1,0,1,1,0,0,1
67756,-1.302941,1.18302,-0.027227,-0.638563,0,1,1,0,1,0,...,0,1,0,1,0,1,1,0,0,1


In [44]:
aX_train.head()

Unnamed: 0_level_0,nEsgxvAq,OMtioXZZ,YFMZwKrU,TiwRslOh,wBXbHZmp_DkQlr,wBXbHZmp_JhtDR,SlDKnCuu_GUusz,SlDKnCuu_alLXR,KAJOWiiw_BIZns,KAJOWiiw_TuovO,...,JCDeZBXq_LPtkN,JCDeZBXq_UyAms,HGPWuGlV_WKNwg,HGPWuGlV_vkbkA,GDUPaBQs_qCEuA,GDUPaBQs_qQxrL,WuwrCsIY_AITFl,WuwrCsIY_GAZGl,AlDbXTlZ_aQeIm,AlDbXTlZ_cecIq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,-1.44716,0.325746,1.099716,-0.628045,0,1,1,0,0,1,...,1,0,0,1,0,1,1,0,1,0
82739,-0.414625,-0.503468,-0.01605,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
9646,0.61791,-0.503468,-0.01605,-0.628045,0,1,1,0,1,0,...,0,1,0,1,0,1,1,0,0,1
10975,0.61791,-1.332682,-1.131816,0.713467,0,1,1,0,0,1,...,0,1,0,1,0,1,1,0,0,1
16463,0.61791,0.325746,-1.131816,-0.180874,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,1,0


In [45]:
a_preds = model_A.predict_proba(a_test)
b_preds = model_B.predict_proba(b_test)
c_preds = model_C.predict_proba(c_test)

In [46]:
def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [47]:
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [48]:
submission = pd.concat([a_sub, b_sub, c_sub])

In [49]:
submission.tail()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6775,C,0.005753
88300,C,0.004832
35424,C,0.00701
81668,C,0.005411
98377,C,0.007171


In [51]:
submission.to_csv('submission.csv')