# Load and transform data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load
A_hhold_train = pd.read_csv('A_hhold_train.csv')
A_indiv_train = pd.read_csv('A_indiv_train.csv')
A_hhold_test = pd.read_csv('A_hhold_test.csv')
A_indiv_test = pd.read_csv('A_indiv_test.csv')
# remove cols
A_indiv_train_1 = A_indiv_train.drop(['iid','poor','country'], axis=1)
A_indiv_test_1 = A_indiv_test.drop(['iid','country'], axis=1)
# indiv numerical cols
A_indiv_train_num_col = list(A_indiv_train_1.dtypes[A_indiv_train_1.dtypes!='object'].index)
A_indiv_train_num_col.remove('id')
# keep categorial cols only
A_indiv_train_cat = A_indiv_train_1.drop(A_indiv_train_num_col, axis=1)
A_indiv_test_cat = A_indiv_test_1.drop(A_indiv_train_num_col, axis=1)
# keep numerical cols only
A_indiv_train_num_col.append('id')
A_indiv_train_num = A_indiv_train_1[A_indiv_train_num_col]
A_indiv_test_num = A_indiv_test_1[A_indiv_train_num_col]
# pivot to get freq for cat cols
A_indiv_train_cat_frq = (A_indiv_train_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
A_indiv_test_cat_frq = (A_indiv_test_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
# mean for num cols
A_indiv_train_num_mean = A_indiv_train_num.groupby('id').mean()
A_indiv_test_num_mean = A_indiv_test_num.groupby('id').mean()
# join to hhold 
A_train = A_hhold_train.set_index('id').join(A_indiv_train_cat_frq).join(A_indiv_train_num_mean)
A_test = A_hhold_test.set_index('id').join(A_indiv_test_cat_frq).join(A_indiv_test_num_mean)
# add missing freq cols
for i in [i for i in A_indiv_test_cat_frq.columns if i not in A_indiv_train_cat_frq.columns]:
    A_train[i] = 0
for i in [i for i in A_indiv_train_cat_frq.columns if i not in A_indiv_test_cat_frq.columns]:
    A_test[i] = 0   

#'''
# Option1 Standardized   
# convert to numeric arrays
A = pd.concat([A_train,A_test], axis=0)
A.columns = [''.join(col).strip() for col in A.columns.values]
A_feature = A.drop(['poor','country'], axis=1)
# standardization
A_cols_to_std = list(A_feature.dtypes[A_feature.dtypes!='object'].index)
A_feature[A_cols_to_std] = A_feature[A_cols_to_std].apply(lambda x: (x-x.mean()) / x.std())
# Encoding categorical cols
AX_all = pd.get_dummies(A_feature,drop_first=True)
# Split back into train and test
AX_test = AX_all[AX_all.index.isin(A_test.index)].as_matrix()
AX = AX_all[AX_all.index.isin(A_train.index)].as_matrix()
# Get train target 
Ay = np.array(A.poor[A.poor.isnull()==False].astype(int))     

#'''

'''
# Option2 Not Standardized
# convert to numeric arrays
A = pd.concat([A_train,A_test], axis=0)
A.columns = [''.join(col).strip() for col in A.columns.values]
A_feature = A.drop(['poor','country'], axis=1)
# Encoding categorical cols
AX_all = pd.get_dummies(A_feature,drop_first=True)
# Split back into train and test
AX_test = AX_all[AX_all.index.isin(A_test.index)].as_matrix()
AX = AX_all[AX_all.index.isin(A_train.index)].as_matrix()
# Get train target 
Ay = np.array(A.poor[A.poor.isnull()==False].astype(int))     
'''



"\n# Option2 Not Standardized\n# convert to numeric arrays\nA = pd.concat([A_train,A_test], axis=0)\nA.columns = [''.join(col).strip() for col in A.columns.values]\nA_feature = A.drop(['poor','country'], axis=1)\n# Encoding categorical cols\nAX_all = pd.get_dummies(A_feature,drop_first=True)\n# Split back into train and test\nAX_test = AX_all[AX_all.index.isin(A_test.index)].as_matrix()\nAX = AX_all[AX_all.index.isin(A_train.index)].as_matrix()\n# Get train target \nAy = np.array(A.poor[A.poor.isnull()==False].astype(int))     \n"

In [3]:
print(A_hhold_train.shape,A_indiv_train.shape,A_indiv_train_cat.shape,A_indiv_train_cat_frq.shape
      ,A_indiv_train_num.shape,A_indiv_train_num_mean.shape,A_train.shape)
print(A_hhold_test.shape,A_indiv_test.shape,A_indiv_test_cat.shape,A_indiv_test_cat_frq.shape
      ,A_indiv_test_num.shape,A_indiv_test_num_mean.shape,A_test.shape)
print(AX_all.shape,AX.shape,AX_test.shape)

(8203, 346) (37560, 44) (37560, 39) (8203, 271) (37560, 3) (8203, 2) (8203, 621)


In [6]:
#load
B_hhold_train = pd.read_csv('B_hhold_train.csv')
B_indiv_train = pd.read_csv('B_indiv_train.csv')
B_hhold_test = pd.read_csv('B_hhold_test.csv')
B_indiv_test = pd.read_csv('B_indiv_test.csv')
# remove cols
B_indiv_train_1 = B_indiv_train.drop(['iid','poor','country'], axis=1)
B_indiv_test_1 = B_indiv_test.drop(['iid','country'], axis=1)
# indiv numerical cols
B_indiv_train_num_col = list(B_indiv_train_1.dtypes[B_indiv_train_1.dtypes!='object'].index)
B_indiv_train_num_col.remove('id')
# keep categorial cols only
B_indiv_train_cat = B_indiv_train_1.drop(B_indiv_train_num_col, axis=1)
B_indiv_test_cat = B_indiv_test_1.drop(B_indiv_train_num_col, axis=1)
# keep numerical cols only
B_indiv_train_num_col.append('id')
B_indiv_train_num = B_indiv_train_1[B_indiv_train_num_col]
B_indiv_test_num = B_indiv_test_1[B_indiv_train_num_col]
# pivot to get freq for cat cols
B_indiv_train_cat_frq = (B_indiv_train_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
B_indiv_test_cat_frq = (B_indiv_test_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
# mean for num cols
B_indiv_train_num_mean = B_indiv_train_num.groupby('id').mean()
B_indiv_test_num_mean = B_indiv_test_num.groupby('id').mean()
# join to hhold 
B_train = B_hhold_train.set_index('id').join(B_indiv_train_cat_frq).join(B_indiv_train_num_mean,lsuffix='ind_')
B_test = B_hhold_test.set_index('id').join(B_indiv_test_cat_frq).join(B_indiv_test_num_mean,lsuffix='ind_')
# add missing freq cols
for i in [i for i in B_indiv_test_cat_frq.columns if i not in B_indiv_train_cat_frq.columns]:
    B_train[i] = 0
for i in [i for i in B_indiv_train_cat_frq.columns if i not in B_indiv_test_cat_frq.columns]:
    B_test[i] = 0   
# convert to numeric arrays
B = pd.concat([B_train,B_test], axis=0)
B.columns = [''.join(col).strip() for col in B.columns.values]
B_feature = B.drop(['poor','country'], axis=1)
# standardization
B_cols_to_std = list(B_feature.dtypes[B_feature.dtypes!='object'].index)
B_feature[B_cols_to_std] = B_feature[B_cols_to_std].apply(lambda x: (x-x.mean()) / x.std())
# Encoding categorical cols
BX_all = pd.get_dummies(B_feature,drop_first=True)
# Split back into train and test
BX_test = BX_all[BX_all.index.isin(B_test.index)].as_matrix()
BX = BX_all[BX_all.index.isin(B_train.index)].as_matrix()
# Get train target 
By = np.array(B.poor[B.poor.isnull()==False].astype(int)) 



In [7]:
print(B_hhold_train.shape,B_indiv_train.shape,B_indiv_train_cat.shape,B_indiv_train_cat_frq.shape
      ,B_indiv_train_num.shape,B_indiv_train_num_mean.shape,B_train.shape)
print(B_hhold_test.shape,B_indiv_test.shape,B_indiv_test_cat.shape,B_indiv_test_cat_frq.shape
      ,B_indiv_test_num.shape,B_indiv_test_num_mean.shape,B_test.shape)
print(BX_all.shape,BX.shape,BX_test.shape)

(3255, 443) (20252, 227) (20252, 192) (3255, 1608) (20252, 33) (3255, 32) (3255, 2141)


In [10]:
#load 
C_hhold_train = pd.read_csv('C_hhold_train.csv')
C_indiv_train = pd.read_csv('C_indiv_train.csv')
C_hhold_test = pd.read_csv('C_hhold_test.csv')
C_indiv_test = pd.read_csv('C_indiv_test.csv')
# remove cols
C_indiv_train_1 = C_indiv_train.drop(['iid','poor','country'], axis=1)
C_indiv_test_1 = C_indiv_test.drop(['iid','country'], axis=1)
# indiv numerical cols
C_indiv_train_num_col = list(C_indiv_train_1.dtypes[C_indiv_train_1.dtypes!='object'].index)
C_indiv_train_num_col.remove('id')
# keep categorial cols only
C_indiv_train_cat = C_indiv_train_1.drop(C_indiv_train_num_col, axis=1)
C_indiv_test_cat = C_indiv_test_1.drop(C_indiv_train_num_col, axis=1)
# keep numerical cols only
C_indiv_train_num_col.append('id')
C_indiv_train_num = C_indiv_train_1[C_indiv_train_num_col]
C_indiv_test_num = C_indiv_test_1[C_indiv_train_num_col]
# pivot to get freq for cat cols
C_indiv_train_cat_frq = (C_indiv_train_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
C_indiv_test_cat_frq = (C_indiv_test_cat.set_index('id').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))
# mean for num cols
C_indiv_train_num_mean = C_indiv_train_num.groupby('id').mean()
C_indiv_test_num_mean = C_indiv_test_num.groupby('id').mean()
# join to hhold 
C_train = C_hhold_train.set_index('id').join(C_indiv_train_cat_frq).join(C_indiv_train_num_mean)
C_test = C_hhold_test.set_index('id').join(C_indiv_test_cat_frq).join(C_indiv_test_num_mean)
# add missing freq cols
for i in [i for i in C_indiv_test_cat_frq.columns if i not in C_indiv_train_cat_frq.columns]:
    C_train[i] = 0
for i in [i for i in C_indiv_train_cat_frq.columns if i not in C_indiv_test_cat_frq.columns]:
    C_test[i] = 0    
# convert to numeric arrays
C = pd.concat([C_train,C_test], axis=0)
C.columns = [''.join(col).strip() for col in C.columns.values]
C_feature = C.drop(['poor','country'], axis=1)
# standardization
C_cols_to_std = list(C_feature.dtypes[C_feature.dtypes!='object'].index)
C_feature[C_cols_to_std] = C_feature[C_cols_to_std].apply(lambda x: (x-x.mean()) / x.std())
# Encoding categorical cols
CX_all = pd.get_dummies(C_feature,drop_first=True)
# Split back into train and test
CX_test = CX_all[CX_all.index.isin(C_test.index)].as_matrix()
CX = CX_all[CX_all.index.isin(C_train.index)].as_matrix()
# Get train target 
Cy = np.array(C.poor[C.poor.isnull()==False].astype(int))     



In [11]:
print(C_hhold_train.shape,C_indiv_train.shape,C_indiv_train_cat.shape,C_indiv_train_cat_frq.shape
      ,C_indiv_train_num.shape,C_indiv_train_num_mean.shape,C_train.shape)
print(C_hhold_test.shape,C_indiv_test.shape,C_indiv_test_cat.shape,C_indiv_test_cat_frq.shape
      ,C_indiv_test_num.shape,C_indiv_test_num_mean.shape,C_test.shape)
print(CX_all.shape,CX.shape,CX_test.shape)

(6469, 165) (29913, 44) (29913, 36) (6469, 294) (29913, 6) (6469, 5) (6469, 464)


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
AXtrn,AXtst,Aytrn,Aytst = train_test_split(AX,Ay,test_size=0.33, random_state=42)
BXtrn,BXtst,Bytrn,Bytst = train_test_split(BX,By,test_size=0.33, random_state=42)
CXtrn,CXtst,Cytrn,Cytst = train_test_split(CX,Cy,test_size=0.33, random_state=42)

# Try models

## Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [95]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [96]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(AXtrn, Aytrn)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.7min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [97]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 600}

In [98]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid =  {'bootstrap': [False],
 'max_depth': [70,80],
 'max_features': [32,36],
 'min_samples_leaf': [1],
 'min_samples_split': [6,7],
 'n_estimators': [200,400]}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='neg_log_loss')

In [99]:
grid_search.fit(AXtrn,Aytrn)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  7.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False], 'max_depth': [50, 60, 70], 'max_features': [24, 28, 32], 'min_samples_leaf': [1], 'min_samples_split': [4, 5, 6], 'n_estimators': [400, 600, 800]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [100]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 70,
 'max_features': 32,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 400}

In [101]:
bestrf = RandomForestClassifier()
bestrf.set_params(**grid_search.best_params_)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=70, max_features=32, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
bestrf.fit(AXtrn,Aytrn)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=70, max_features=32, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
Ayprdbestrf = bestrf.predict(AXtst)
Ayprdprobbestrf = bestrf.predict_proba(AXtst)

In [104]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Aytst, Ayprdbestrf)

array([[1284,  214],
       [ 178, 1031]], dtype=int64)

In [105]:
from sklearn.metrics import log_loss
log_loss(Aytst, Ayprdprobbestrf)

0.35940109445403479

## SVM

In [107]:
from sklearn.svm import SVC as svc 
from scipy import stats
 
svc = svc(probability = True, random_state = 1)

rand_list = {"C": stats.uniform(2, 10),
             "gamma": stats.uniform(0.1, 1)}
              
svm_random = RandomizedSearchCV(svc, param_distributions = rand_list, n_iter = 20, 
                                n_jobs = 4, cv = 3, random_state = 2017, scoring = 'neg_log_loss' ) 
svm_random.fit(AXtrn,Aytrn)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=4,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002418A6470B8>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002418A6475C0>},
          pre_dispatch='2*n_jobs', random_state=2017, refit=True,
          return_train_score='warn', scoring='neg_log_loss', verbose=0)

In [108]:
svm_random.best_params_

{'C': 9.0331975797816781, 'gamma': 1.0312187582625605}

In [109]:
from sklearn.svm import SVC
bestsvc = SVC(probability=True)
bestsvc.set_params(**svm_random.best_params_)

SVC(C=9.0331975797816781, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0312187582625605,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [110]:
bestsvc.fit(AXtrn,Aytrn)

SVC(C=9.0331975797816781, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0312187582625605,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [111]:
Ayprdbestsvc = bestsvc.predict(AXtst)
Ayprdprobbestsvc= bestsvc.predict_proba(AXtst)

In [112]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Aytst, Ayprdbestsvc)

array([[1498,    0],
       [1209,    0]], dtype=int64)

In [113]:
from sklearn.metrics import log_loss
log_loss(Aytst, Ayprdprobbestsvc)

0.68754074182439362

In [114]:
from sklearn.svm import SVC
svc_test = SVC(probability=True)

In [115]:
svc_test.fit(AXtrn,Aytrn)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [116]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Aytst, svc_test.predict(AXtst))

array([[1294,  204],
       [ 187, 1022]], dtype=int64)

In [117]:
from sklearn.metrics import log_loss
log_loss(Aytst, svc_test.predict_proba(AXtst))

0.3205624269636016

In [133]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from dask.diagnostics import ProgressBar
param_grid =  [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4, 1e-5], 'C': [100, 300, 1000]}]
                #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# Create a based model
svc = SVC(probability=True)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv = 3, scoring = 'neg_log_loss')

In [134]:
grid_search.fit(AXtrn,Aytrn)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001, 1e-05], 'C': [100, 300, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [135]:
grid_search.best_params_

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}

In [136]:
from sklearn.svm import SVC
bestsvc = SVC(probability=True)
bestsvc.set_params(**grid_search.best_params_)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [137]:
bestsvc.fit(AXtrn,Aytrn)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [138]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Aytst, bestsvc.predict(AXtst))

array([[1296,  202],
       [ 149, 1060]], dtype=int64)

In [139]:
from sklearn.metrics import log_loss
log_loss(Aytst, bestsvc.predict_proba(AXtst))

0.29884170686672307

## Neural Network

In [243]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping

In [360]:
nn = Sequential()
nn.add(Dense(30, input_shape=(795,),kernel_initializer='normal', activation='relu'
             ,kernel_regularizer=regularizers.l2(0.005)))
nn.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# Compile model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [366]:
earlystop = EarlyStopping(monitor='loss', min_delta=0.0005, patience=3, 
                          verbose=1, mode='auto')
callbacks_list = [earlystop]

nn.fit(AXtrn, Aytrn, epochs=100, batch_size=3, verbose=1, callbacks=callbacks_list,
                       validation_split=0.2)

Train on 4396 samples, validate on 1100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


<keras.callbacks.History at 0x241bfe07a90>

In [367]:
Ayprednn = nn.predict(AXtst)

In [368]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Aytst, (Ayprednn>=.5).astype(int))

array([[1214,  284],
       [ 125, 1084]], dtype=int64)

In [369]:
Ayprednn_proba = np.concatenate((1-Ayprednn,Ayprednn),axis=1)

In [370]:
from sklearn.metrics import log_loss
log_loss(Aytst, Ayprednn_proba)

0.32834306457796814