In [26]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# data directory
DATA_DIR = os.path.join( 'data', 'processed')

In [18]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.
  return f(*args, **kwds)


In [19]:
# Per-country lookup of the CSV locations under DATA_DIR/<country>/.
# 'train'/'test' point at the household files, 'Itrain'/'Itest' at the
# individual-level files.
data_paths = {
    country: {
        key: os.path.join(DATA_DIR, country, '{}_{}.csv'.format(country, stem))
        for key, stem in (('train', 'hhold_train'),
                          ('test', 'hhold_test'),
                          ('Itrain', 'indiv_train'),
                          ('Itest', 'indiv_test'))
    }
    for country in ('A', 'B', 'C')
}

In [20]:
# Standardize features
def standardize(df, numeric_only=True):
    """Return a copy of ``df`` with its int64/float64 columns z-scored.

    Each selected column becomes ``(x - mean) / std`` using the pandas
    (sample) standard deviation. All other columns are passed through as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize.
    numeric_only : bool, default True
        Kept for backward compatibility; it is not consulted -- only
        int64/float64 columns are ever standardized.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left unmodified.
    """
    result = df.copy()
    numeric = result.select_dtypes(include=['int64', 'float64'])
    if numeric.shape[1] == 0:
        return result

    # A constant column has std == 0; dividing by 1 instead maps it to 0.0
    # rather than filling it with NaN from 0/0.
    scale = numeric.std().replace(0, 1)
    result[numeric.columns] = (numeric - numeric.mean()) / scale
    return result
    

def pre_process_data(df, enforce_cols=None):
    """One-hot encode categoricals, optionally align columns, and zero-fill NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features; its shape is printed before and after encoding.
    enforce_cols : sequence of column labels, optional
        When given, the result has exactly these columns, in exactly this
        order: encoded columns not listed are dropped and listed columns that
        are absent are added with value 0.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with every remaining NaN replaced by 0.
    """
    print("Input shape:\t{}".format(df.shape))

    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))

    if enforce_cols is not None:
        # reindex drops unexpected columns and adds the missing ones as 0, and
        # -- unlike drop + assign -- returns them in enforce_cols order, which
        # matters because fitted estimators consume features by position.
        df = df.reindex(columns=list(enforce_cols), fill_value=0)

    return df.fillna(0)
def drop(df, col, threshold):
    """Drop ``col`` when one value dominates it in both poverty classes.

    For the rows with ``poor == True`` and the rows with ``poor == False``
    separately, the share of each distinct value of ``col`` is computed. The
    column is removed only if, in *both* groups, some value's share exceeds
    ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a boolean ``'poor'`` column and ``col``.
    col : column label
    threshold : float
        Share (0-1) above which a value counts as dominant.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col`` when the rule fires, otherwise ``df`` itself.
    """
    poor_count = sum(df['poor'])
    non_count = df.shape[0] - poor_count
    col_poor = Counter(df[col][df['poor'] == True])
    col_non_poor = Counter(df[col][df['poor'] == False])

    # An empty group yields an empty Counter, so no division by zero happens.
    dominant_in_poor = any(n / poor_count > threshold for n in col_poor.values())
    dominant_in_non_poor = any(n / non_count > threshold for n in col_non_poor.values())

    if dominant_in_poor and dominant_in_non_poor:
        return df.drop([col], axis=1)
    return df

def drop_col(hhold, indi, hhold_threshold, indi_threshold):
    """Remove near-constant columns from the household and individual frames.

    A column is removed when, in both the ``poor == True`` rows and the
    ``poor == False`` rows, a single value's share exceeds the frame's
    threshold. ``'poor'`` and ``'country'`` are never tested (nor ``'iid'``
    in the individual frame).

    Parameters
    ----------
    hhold, indi : pandas.DataFrame
        Household and individual frames; each must have ``'poor'`` and
        ``'country'`` columns.
    hhold_threshold, indi_threshold : float
        Dominance share (0-1) applied to the respective frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The pruned household frame, and the pruned individual frame with its
        ``'poor'`` and ``'country'`` columns removed as well.
    """
    def _dominated_columns(frame, threshold, skip):
        # Columns where one value exceeds `threshold` in both poverty classes.
        poor_mask = frame['poor'] == True
        non_mask = frame['poor'] == False
        n_poor = sum(frame['poor'])
        n_non = frame.shape[0] - n_poor
        found = []
        for col in frame:
            if col in skip:
                continue
            poor_counts = Counter(frame[col][poor_mask])
            non_counts = Counter(frame[col][non_mask])
            if (any(c / n_poor > threshold for c in poor_counts.values())
                    and any(c / n_non > threshold for c in non_counts.values())):
                found.append(col)
        return found

    # Collect first, drop once: avoids copying the frame for every column.
    hhold_cols = _dominated_columns(hhold, hhold_threshold, ('poor', 'country'))
    if hhold_cols:
        hhold = hhold.drop(hhold_cols, axis=1)

    indi_cols = _dominated_columns(indi, indi_threshold, ('poor', 'country', 'iid'))
    if indi_cols:
        indi = indi.drop(indi_cols, axis=1)

    return hhold, indi.drop(['poor', 'country'], axis=1)
    
def combine_hhold_indi(hhold, indi):
    """Merge household features with per-household aggregates of the individual rows.

    Both frames are grouped/joined on ``'id'``; the loading cells read the
    CSVs with ``index_col='id'``, so ``'id'`` is expected to be the index name.

    Returns a frame holding, side by side: ``family_numbers`` (count of
    ``iid`` values per id), the per-id mean of each numeric individual column,
    the per-id sum/count of each one-hot individual column, and the encoded
    household columns. Remaining gaps are filled with -100.
    """
    # Encode household categoricals and zero-fill household NaNs.
    hhold = pre_process_data(hhold)
    # Household size = number of individual rows sharing the same id.
    iid_count = pd.DataFrame(indi.groupby('id')['iid'].count())
    iid_count.columns = ['family_numbers']
    indi = indi.drop('iid',axis=1)
    # Split individual columns into int64/float64 ones and everything else.
    indi_num = indi.select_dtypes(include=['int64', 'float64'])
    indi_obj = indi.drop(list(indi_num.columns), axis = 1)
    indi_obj = pd.get_dummies(indi_obj)
    # Numeric gaps are imputed with the column mean over all individuals.
    indi_num = indi_num.fillna(indi_num.mean())
    # indi_obj is already one-hot encoded here; this call zero-fills any NaN.
    indi_obj = pre_process_data(indi_obj)
    indi_num = indi_num.groupby('id')[list(indi_num.columns)].mean()
#     indi_num = standardize(indi_num)
    # Per household: sum of each dummy divided by its count of values.
    indi_obj = indi_obj.groupby('id')[list(indi_obj.columns)].sum()/indi_obj.groupby('id')[list(indi_obj.columns)].count()
    com = pd.concat([iid_count,indi_num,indi_obj,hhold],axis=1)
    # NOTE(review): training gaps become -100 here, while
    # combine_test_hhold_indi fills test gaps with 0 -- confirm this is intended.
    com.fillna(-100,inplace=True)
    return com

def order(df_test, df_train, ori_test):
    """Return ``df_test`` with the training column order and the original row order.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Processed test features; must contain every column of ``df_train``.
    df_train : pandas.DataFrame
        Training features whose column order is authoritative.
    ori_test : pandas.DataFrame
        Frame whose index gives the desired row order.

    Returns
    -------
    pandas.DataFrame
        ``df_test`` restricted to ``df_train``'s columns (in that order), with
        rows selected by ``ori_test.index``.

    Raises
    ------
    KeyError
        If a training column or a requested row label is missing from ``df_test``.
    """
    # One label-based selection instead of inserting columns one at a time
    # into an empty frame, which is slow on frames this wide.
    return df_test.loc[ori_test.index, list(df_train)]
def mll(y_true, y_pred):
    """Mean of the per-group binary log losses.

    Parameters
    ----------
    y_true : sequence of array-likes
        One array of 0/1 labels per group (e.g. one per country).
    y_pred : sequence of array-likes
        Matching arrays of predicted probabilities for the positive class.

    Returns
    -------
    float
        The log loss averaged within each group, then averaged across groups.

    Raises
    ------
    ValueError
        If there are no groups or a group is empty.
    """
    eps = 1e-15  # keeps log() finite when a prediction is exactly 0 or 1
    group_losses = []
    for truth, pred in zip(y_true, y_pred):
        truth = np.asarray(truth, dtype=float)
        pred = np.clip(np.asarray(pred, dtype=float), eps, 1 - eps)
        if truth.size == 0:
            raise ValueError("mll: every group needs at least one sample")
        # Each group is averaged on its own; the earlier running total was
        # re-divided by every later group's size.
        group_losses.append(-np.mean(truth * np.log(pred) + (1 - truth) * np.log(1 - pred)))
    if not group_losses:
        raise ValueError("mll: no groups supplied")
    return float(np.mean(group_losses))

def combine_test_hhold_indi(hhold, indi, train):
    """Build test features like combine_hhold_indi, then match ``train``'s column set.

    ``'country'`` is dropped from the household frame before encoding. After
    merging, columns absent from ``train.columns`` are removed and training
    columns absent from the merged frame are added as 0. The column *order*
    is not changed here (the ``order`` helper handles that). Remaining gaps
    are filled with 0.
    """
    hhold = pre_process_data(hhold.drop('country',axis = 1))
    # Household size = number of individual rows sharing the same id.
    iid_count = pd.DataFrame(indi.groupby('id')['iid'].count())
    iid_count.columns = ['family_numbers']
    indi = indi.drop('iid',axis=1)
    # Split individual columns into int64/float64 ones and everything else.
    indi_num = indi.select_dtypes(include=['int64', 'float64'])
    indi_obj = indi.drop(list(indi_num.columns), axis = 1)
    indi_obj = pd.get_dummies(indi_obj)
    # Numeric gaps are imputed with the column mean over the test individuals.
    indi_num = indi_num.fillna(indi_num.mean())
    # indi_obj is already one-hot encoded here; this call zero-fills any NaN.
    indi_obj = pre_process_data(indi_obj)
    indi_num = indi_num.groupby('id')[list(indi_num.columns)].mean()
#     indi_num = standardize(indi_num)
    # Per household: sum of each dummy divided by its count of values.
    indi_obj = indi_obj.groupby('id')[list(indi_obj.columns)].sum()/indi_obj.groupby('id')[list(indi_obj.columns)].count()
    com = pd.concat([iid_count,indi_num,indi_obj,hhold],axis=1)
    # Reconcile the column set with the training features.
    to_drop = np.setdiff1d(com.columns, train.columns)
    to_add = np.setdiff1d(train.columns, com.columns)

    com.drop(to_drop, axis=1, inplace=True)
    com = com.assign(**{c: 0 for c in to_add})
    
    # NOTE(review): gaps are filled with 0 here but with -100 in
    # combine_hhold_indi -- confirm the mismatch is intended.
    com.fillna(0, inplace=True)
    return com

In [21]:
# load training data
# Household-level training frames, indexed by household id.
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

# Individual-level training frames; several rows can share one household id.
# NOTE(review): 'wJthinfa' is removed for country B only -- reason not recorded here.
ai_train = pd.read_csv(data_paths['A']['Itrain'], index_col='id')
bi_train = pd.read_csv(data_paths['B']['Itrain'], index_col='id').drop('wJthinfa',axis=1)
ci_train = pd.read_csv(data_paths['C']['Itrain'], index_col='id')

In [22]:
# load test data
# Household-level test frames, indexed by household id.
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')
# Individual-level test frames; 'wJthinfa' is dropped for B, mirroring the training load.
ai_test = pd.read_csv(data_paths['A']['Itest'], index_col='id')
bi_test = pd.read_csv(data_paths['B']['Itest'], index_col='id').drop('wJthinfa',axis=1)
ci_test = pd.read_csv(data_paths['C']['Itest'], index_col='id')

In [23]:
# Prune near-constant columns (95% dominance in both classes), then merge the
# household and aggregated individual features for each country.
train_a, train_ai = drop_col(a_train, ai_train, 0.95, 0.95)
A_train = combine_hhold_indi(train_a, train_ai)
train_b, train_bi = drop_col(b_train, bi_train, 0.95, 0.95)
B_train = combine_hhold_indi(train_b, train_bi)
train_c, train_ci = drop_col(c_train, ci_train, 0.95, 0.95)
C_train = combine_hhold_indi(train_c, train_ci)

# Shuffle the rows. The seed is fixed because the later train/validation
# splits are seeded too, and they are only reproducible if this shuffle is.
SHUFFLE_SEED = 42
A_train = A_train.sample(frac=1, random_state=SHUFFLE_SEED)
B_train = B_train.sample(frac=1, random_state=SHUFFLE_SEED)
C_train = C_train.sample(frac=1, random_state=SHUFFLE_SEED)

Input shape:	(8203, 227)
After converting categoricals:	(8203, 612)
Input shape:	(37560, 255)
After converting categoricals:	(37560, 255)
Input shape:	(3255, 340)
After converting categoricals:	(3255, 1221)
Input shape:	(20252, 1039)
After converting categoricals:	(20252, 1039)
Input shape:	(6469, 155)
After converting categoricals:	(6469, 741)
Input shape:	(29913, 279)
After converting categoricals:	(29913, 279)


In [24]:
# Split each country's merged training frame into features (X) and the 'poor' label (Y).
aX_train = A_train.drop('poor', axis = 1) 
aY_train = A_train['poor'].values
bX_train = B_train.drop('poor', axis = 1) 
bY_train = B_train['poor'].values
cX_train = C_train.drop('poor', axis = 1) 
cY_train = C_train['poor'].values
# Build the test features with the same column set as the training features...
A_test = combine_test_hhold_indi(a_test, ai_test, aX_train)
B_test = combine_test_hhold_indi(b_test, bi_test, bX_train)
C_test = combine_test_hhold_indi(c_test, ci_test, cX_train)
# ...then put columns in training order and rows in the raw test file's order.
A_test = order(A_test, aX_train, a_test)
B_test = order(B_test, bX_train, b_test)
C_test = order(C_test, cX_train, c_test)

Input shape:	(4041, 343)
After converting categoricals:	(4041, 850)
Input shape:	(18535, 271)
After converting categoricals:	(18535, 271)
Input shape:	(1604, 440)
After converting categoricals:	(1604, 1418)
Input shape:	(10066, 1502)
After converting categoricals:	(10066, 1502)
Input shape:	(3187, 162)
After converting categoricals:	(3187, 772)
Input shape:	(14701, 296)
After converting categoricals:	(14701, 296)


In [25]:
import math as mt
import csv
from sparsesvd import sparsesvd #used for matrix factorization
import numpy as np
from scipy.sparse import csc_matrix #used for sparse matrix
from scipy.sparse.linalg import * #used for matrix multiplication


In [27]:
def make_country_sub(preds, test_feat, country):
    """Build one country's submission frame from predicted class probabilities.

    Parameters
    ----------
    preds : 2-D array-like with at least two columns
        Class probabilities; column 1 is used as the probability of ``poor``.
    test_feat : pandas.DataFrame
        Only its index is used, to label the output rows.
    country : str
        Country code written to every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``['country', 'poor']`` indexed like ``test_feat``.
    """
    # np.asarray lets nested lists or DataFrames be sliced like an ndarray.
    poor_proba = np.asarray(preds)[:, 1]
    country_sub = pd.DataFrame(data=poor_proba,
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.cross_validation import StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# Hold out 20% of each country's rows for validation (fixed seed, not stratified).
ax_train,ax_valid,ay_train,ay_valid = train_test_split(aX_train, aY_train, test_size=0.2, random_state=4242)
print('Train samples: {} Validation samples: {}'.format(len(ax_train), len(ax_valid)))
bx_train,bx_valid,by_train,by_valid = train_test_split(bX_train, bY_train, test_size=0.2, random_state=4242)
cx_train,cx_valid,cy_train,cy_valid = train_test_split(cX_train, cY_train, test_size=0.2, random_state=4242)

Train samples: 6562 Validation samples: 1641


# random forest

In [112]:
# Random-forest search grid (7 x 7 x 6 = 294 combinations).
# Values listed here override the same-named arguments set on the estimator
# that is passed to GridSearchCV.
params  = { 
           "n_estimators" : [9, 18, 27, 36, 45, 54, 63],
           "max_depth" : [1, 5, 10, 15, 20, 25, 30],
           "min_samples_leaf" : [1, 2, 4, 6, 8, 10]}

In [None]:
# Grid-search a random forest for country A with 10-fold CV.
# (The estimator was previously spelled RandomForestForest, a NameError.)
RF = RandomForestClassifier(min_samples_split=100, min_samples_leaf=20, max_depth=8,
                            max_features='sqrt', random_state=10)
model_A = GridSearchCV(estimator=RF, param_grid=params, cv=10).fit(ax_train, ay_train)

In [None]:
# Grid-search a random forest for country B (ROC AUC, 5 stratified folds).
# (The estimator was previously spelled RandomForestForest, a NameError.)
# NOTE(review): trans_btrain is only created in the SVD section further down,
# so this cell fails on a fresh top-to-bottom run -- confirm the intended input.
model_B = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=100,
                                                        min_samples_leaf=20, max_depth=8,
                                                        max_features='sqrt', random_state=10),
                       param_grid=params, n_jobs=5,
                       # The model_selection splitter takes only fold settings;
                       # GridSearchCV hands it the labels when splitting.
                       cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True),
                       scoring='roc_auc').fit(trans_btrain, by_train)

In [95]:
# Fit a fixed random forest on country A's SVD features and report validation accuracy.
# Only non-default settings are spelled out; in particular `min_impurity_split`
# is no longer passed, because current scikit-learn rejects that keyword.
# NOTE(review): trans_atrain / trans_avalid are only created in the SVD section
# further down, so this cell fails on a fresh top-to-bottom run.
tscore_A = RandomForestClassifier(n_estimators=60, max_depth=7, max_features='sqrt',
                                  min_samples_leaf=20, min_samples_split=100,
                                  n_jobs=1, random_state=10).fit(trans_atrain, ay_train)
tscore_A.score(trans_avalid, ay_valid)

0.7915904936014625

In [111]:
from sklearn.ensemble import RandomForestClassifier

def randomforest(features, labels, **kwargs):
    """Fit a RandomForestClassifier and print its in-sample accuracy.

    Parameters
    ----------
    features : array-like
        Training feature matrix.
    labels : array-like
        Training labels.
    **kwargs
        Forwarded to ``RandomForestClassifier``; they override the defaults
        used here (``n_estimators=50``, ``random_state=0``).

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # Keyword arguments used to be accepted but silently ignored.
    settings = {'n_estimators': 50, 'random_state': 0}
    settings.update(kwargs)
    model = RandomForestClassifier(**settings)

    model.fit(features, labels)

    # Accuracy on the training data itself: an optimistic sanity check only.
    accuracy = model.score(features, labels)
    print(f"In-sample accuracy: {accuracy:0.2%}")

    return model

In [112]:
# Fit one forest per country on the full (unsplit) training features.
model_a = randomforest(aX_train, aY_train)
model_b = randomforest(bX_train, bY_train)
model_c = randomforest(cX_train, cY_train)

In-sample accuracy: 100.00%
In-sample accuracy: 99.94%
In-sample accuracy: 100.00%


In [113]:
# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')
# process the test data
# NOTE(review): aX_train also holds the aggregated individual-level columns
# from combine_hhold_indi. The raw household file has none of them, so
# enforce_cols adds them as all-zero columns. A_test / B_test / C_test built
# earlier already contain real values for them -- confirm which frames should
# be scored. The saved output's "After standardization" lines are not printed
# by the current pre_process_data, so that output looks stale.
a_test = pre_process_data(a_test, enforce_cols=aX_train.columns)
b_test = pre_process_data(b_test, enforce_cols=bX_train.columns)
c_test = pre_process_data(c_test, enforce_cols=cX_train.columns)

Input shape:	(4041, 344)
After standardization (4041, 344)
After converting categoricals:	(4041, 851)
Input shape:	(1604, 441)
After standardization (1604, 441)
After converting categoricals:	(1604, 1419)
Input shape:	(3187, 163)
After standardization (3187, 163)
After converting categoricals:	(3187, 773)


In [114]:
def make_country_sub(preds, test_feat, country):
    """Build one country's submission frame from predicted class probabilities.

    NOTE: this redefines the identically named helper from an earlier cell.

    Parameters
    ----------
    preds : 2-D array-like with at least two columns
        Class probabilities; column 1 is used as the probability of ``poor``.
    test_feat : pandas.DataFrame
        Only its index is used, to label the output rows.
    country : str
        Country code written to every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``['country', 'poor']`` indexed like ``test_feat``.
    """
    # np.asarray lets nested lists or DataFrames be sliced like an ndarray.
    poor_proba = np.asarray(preds)[:, 1]
    country_sub = pd.DataFrame(data=poor_proba,
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [115]:
# Score the household-only test frames with the per-country models.
# NOTE(review): these frames carry zeros for every aggregated individual-level
# feature the models were trained on -- confirm A_test / B_test / C_test were
# not the intended inputs.
a_preds = model_a.predict_proba(a_test)
b_preds = model_b.predict_proba(b_test)
c_preds = model_c.predict_proba(c_test)
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [116]:
# Stack the three country frames into one submission table.
submission = pd.concat([a_sub, b_sub, c_sub])
# Only the cell's last expression is rendered, so head() is computed but not shown.
submission.head()
submission.tail()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6775,C,0.3
88300,C,0.2
35424,C,0.2
81668,C,0.28
98377,C,0.18


# logistic regression

In [26]:
# Stratified 90/10 split of country A's full training set.
# Labels must be aY_train (one per row of aX_train); ay_train is the label
# vector of the earlier 80% split and has a different length.
xtrain, xvalid, ytrain, yvalid = train_test_split(aX_train, aY_train,
                                                  stratify=aY_train,
                                                  random_state=42,
                                                  test_size=0.1, shuffle=True)

In [27]:
print (xtrain.shape)
print (xvalid.shape)

(7382, 859)
(821, 859)


In [60]:
from sklearn.linear_model import LogisticRegression

def logistic_regression(features, labels, **kwargs):
    """Fit a LogisticRegression model and print its in-sample accuracy.

    Parameters
    ----------
    features : array-like
        Training feature matrix.
    labels : array-like
        Training labels.
    **kwargs
        Forwarded to ``LogisticRegression``; they override the default used
        here (``C=0.0001``).

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    # Keyword arguments used to be accepted but silently ignored.
    settings = {'C': 0.0001}
    settings.update(kwargs)
    model = LogisticRegression(**settings)

    model.fit(features, labels)

    # Accuracy on the training data itself: an optimistic sanity check only.
    accuracy = model.score(features, labels)
    print(f"In-sample accuracy: {accuracy:0.2%}")

    return model

In [61]:
# Fit one logistic regression per country on the full training features.
# The label vectors must be the full-length *Y_train arrays; the lower-case
# ay_train / by_train / cy_train belong to the 80% split and do not line up
# with aX_train / bX_train / cX_train.
model_a = logistic_regression(aX_train, aY_train)
model_b = logistic_regression(bX_train, bY_train)
model_c = logistic_regression(cX_train, cY_train)


In-sample accuracy: 79.46%
In-sample accuracy: 92.29%
In-sample accuracy: 85.02%


In [49]:
# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')
# process the test data
# NOTE(review): same pattern as the random-forest section. The household file
# lacks the aggregated individual-level columns present in aX_train, so
# enforce_cols adds them as all-zero columns -- confirm A_test / B_test /
# C_test were not the intended inputs.
a_test = pre_process_data(a_test, enforce_cols=aX_train.columns)
b_test = pre_process_data(b_test, enforce_cols=bX_train.columns)
c_test = pre_process_data(c_test, enforce_cols=cX_train.columns)

Input shape:	(4041, 344)
After standardization (4041, 344)
After converting categoricals:	(4041, 851)
Input shape:	(1604, 441)
After standardization (1604, 441)
After converting categoricals:	(1604, 1419)
Input shape:	(3187, 163)
After standardization (3187, 163)
After converting categoricals:	(3187, 773)


In [50]:
# Score the household-only test frames with the per-country logistic models.
# NOTE(review): these frames carry zeros for the aggregated individual-level
# features -- confirm A_test / B_test / C_test were not the intended inputs.
a_preds = model_a.predict_proba(a_test)
b_preds = model_b.predict_proba(b_test)
c_preds = model_c.predict_proba(c_test)
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [51]:
# Stack the three country frames into one submission table.
submission = pd.concat([a_sub, b_sub, c_sub])
# Only the cell's last expression is rendered, so head() is computed but not shown.
submission.head()
submission.tail()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6775,C,0.368843
88300,C,0.303913
35424,C,0.302237
81668,C,0.305962
98377,C,0.260666


# SVD + SVM

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.cross_validation import StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier


# Hold out 20% of each country's rows for validation (fixed seed, not stratified).
# This repeats the split from the random-forest section with the same arguments.
ax_train,ax_valid,ay_train,ay_valid = train_test_split(aX_train, aY_train, test_size=0.2, random_state=4242)
print('Train samples: {} Validation samples: {}'.format(len(ax_train), len(ax_valid)))
bx_train,bx_valid,by_train,by_valid = train_test_split(bX_train, bY_train, test_size=0.2, random_state=4242)
cx_train,cx_valid,cy_train,cy_valid = train_test_split(cX_train, cY_train, test_size=0.2, random_state=4242)

Train samples: 6562 Validation samples: 1641


In [29]:
def transform(trainfeatures, testfeatures, validfeatures, n, **kwargs):
    """Project three feature sets with a TruncatedSVD and then standard-scale them.

    Both the SVD and the scaler are fitted on ``trainfeatures`` only and then
    applied to all three inputs.

    Parameters
    ----------
    trainfeatures, testfeatures, validfeatures : array-like
        Feature matrices with identical columns.
    n : int
        Number of SVD components to keep.
    **kwargs
        Forwarded to ``decomposition.TruncatedSVD`` (for example
        ``random_state`` for a reproducible decomposition).

    Returns
    -------
    tuple
        ``(train, test, valid)`` transformed arrays -- note the order.
    """
    # Keyword arguments used to be accepted but silently ignored.
    svd = decomposition.TruncatedSVD(n_components=n, **kwargs)
    svd.fit(trainfeatures)
    train_svd = svd.transform(trainfeatures)

    scl = preprocessing.StandardScaler()
    scl.fit(train_svd)

    train_out = scl.transform(train_svd)
    test_out = scl.transform(svd.transform(testfeatures))
    valid_out = scl.transform(svd.transform(validfeatures))
    return train_out, test_out, valid_out



In [30]:
# Reduce each country's features; transform() returns (train, test, valid).
# NOTE(review): the component counts 340 / 440 / 160 are unexplained here --
# record how they were chosen.
trans_atrain, trans_atest, trans_avalid = transform(ax_train, A_test, ax_valid, 340)
trans_btrain, trans_btest, trans_bvalid = transform(bx_train, B_test, bx_valid, 440)
trans_ctrain, trans_ctest, trans_cvalid = transform(cx_train, C_test, cx_valid, 160)

In [44]:
# RBF-kernel SVC search grid used for countries A and B (3 x 5 = 15 candidates).
svm_model = SVC()
parameters = [{'C': [0.1,1,10],'gamma': [0.1,0.01,0.001,0.0001,0.00001],'class_weight':['balanced'],'kernel': ['rbf']}]

In [45]:
# Tune the SVC for country A on the SVD features (ROC AUC, 5 stratified folds).
# The model_selection splitter takes only fold settings; GridSearchCV hands it
# the labels when splitting.
model_A = GridSearchCV(svm_model, param_grid=parameters, n_jobs=5,
                       cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True),
                       scoring='roc_auc',
                       verbose=2).fit(trans_atrain, ay_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  30.1s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  30.1s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  30.1s
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  30.0s
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  30.0s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV] C

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  5.1min


[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=  22.5s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=  22.7s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=  22.6s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=  22.5s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=  12.3s
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=  12.8s
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=  12.4s
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ............
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ............
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf .........

[Parallel(n_jobs=5)]: Done  75 out of  75 | elapsed:  9.6min finished


In [46]:
model_A.best_estimator_

SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
model_A.best_score_

0.95061715930450497

In [56]:
# Refit country A's SVC with the grid-search winner (C=10, gamma=0.001), this
# time with probability=True so predict_proba is available, then report
# validation accuracy. Arguments left out are SVC defaults.
model_A = SVC(kernel='rbf', C=10, gamma=0.001, class_weight='balanced',
              probability=True).fit(trans_atrain, ay_train)
model_A.score(trans_avalid, ay_valid)

0.86227909811090797

In [15]:
# Tune the SVC for country B on the SVD features (ROC AUC, 5 stratified folds).
# The model_selection splitter takes only fold settings; GridSearchCV hands it
# the labels when splitting.
model_B = GridSearchCV(svm_model, param_grid=parameters, n_jobs=5,
                       cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_btrain, by_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV] C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf .............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   5.6s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   5.6s
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   5.6s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV]  C=0.1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   5.5s
[CV] C=0.1, class_weight=balanced, gamma=0.01, kernel=rbf ............
[CV]  C=

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.3min


[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=   6.4s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=   6.4s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.01, kernel=rbf, total=   6.4s
[CV] C=1, class_weight=balanced, gamma=0.001, kernel=rbf .............
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=   3.1s
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ............
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=   3.0s
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ............
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=   2.9s
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ............
[CV]  C=1, class_weight=balanced, gamma=0.001, kernel=rbf, total=   3.1s
[CV] C=1, class_weight=balanced, gamma=0.0001, kernel=rbf ........

[Parallel(n_jobs=5)]: Done  75 out of  75 | elapsed:  2.2min finished


In [16]:
model_B.best_score_

0.80145408707639754

In [17]:
model_B.best_estimator_

SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
# Refit country B's SVC with the grid-search winner (C=1, gamma=0.001), this
# time with probability=True so predict_proba is available, then report
# validation accuracy. Arguments left out are SVC defaults.
model_B = SVC(kernel='rbf', C=1, gamma=0.001, class_weight='balanced',
              probability=True).fit(trans_btrain, by_train)
model_B.score(trans_bvalid, by_valid)

0.88018433179723499

In [36]:
# Search grid for country C: much larger C and smaller gamma than for A and B
# (3 x 4 = 12 candidates). This rebinds `parameters`, so earlier grid cells
# re-run after this one would use these values.
svm_model = SVC()
parameters = [{'C': [1e+7,1e+8,1e+9],'gamma': [1e-4,1e-5,1e-6,1e-7],'class_weight':['balanced'],'kernel': ['rbf']}]

In [37]:
# Tune the SVC for country C on the SVD features (ROC AUC, 5 stratified folds).
# The model_selection splitter takes only fold settings; GridSearchCV hands it
# the labels when splitting.
model_C = GridSearchCV(svm_model, param_grid=parameters, n_jobs=5,
                       cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_ctrain, cy_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf ...
[CV] C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf ...
[CV] C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf ...
[CV] C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf ...
[CV] C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf ...
[CV]  C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf, total=   8.9s
[CV] C=10000000.0, class_weight=balanced, gamma=1e-05, kernel=rbf ....
[CV]  C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf, total=   9.0s
[CV] C=10000000.0, class_weight=balanced, gamma=1e-05, kernel=rbf ....
[CV]  C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf, total=   9.1s
[CV]  C=10000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf, total=   9.0s
[CV] C=10000000.0, class_weight=balanced, gamma=1e-05, kernel=rbf ....
[CV] C=10000000.0, class_weight=balance

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  5.0min


[CV]  C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf, total=  11.7s
[CV] C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf ...
[CV]  C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf, total=  17.0s
[CV] C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf ...
[CV]  C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf, total=  13.2s
[CV] C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf ...
[CV]  C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf, total=  22.7s
[CV] C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf ...
[CV]  C=100000000.0, class_weight=balanced, gamma=1e-07, kernel=rbf, total=  25.2s
[CV] C=1000000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf .
[CV]  C=1000000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf, total=   7.7s
[CV] C=1000000000.0, class_weight=balanced, gamma=0.0001, kernel=rbf .
[CV]  C=100000000.0, class_weight=balanced, gamma=1e-06, kernel=rbf, total

[Parallel(n_jobs=5)]: Done  60 out of  60 | elapsed: 11.0min finished


In [None]:
# Same refit of country A's SVC as in the earlier cell (C=10, gamma=0.001,
# probability enabled). Arguments left out are SVC defaults.
model_A = SVC(kernel='rbf', C=10, gamma=0.001, class_weight='balanced',
              probability=True).fit(trans_atrain, ay_train)
model_A.score(trans_avalid, ay_valid)

In [None]:
# Same refit of country B's SVC as in the earlier cell (C=1, gamma=0.001,
# probability enabled). Arguments left out are SVC defaults.
model_B = SVC(kernel='rbf', C=1, gamma=0.001, class_weight='balanced',
              probability=True).fit(trans_btrain, by_train)
model_B.score(trans_bvalid, by_valid)

In [70]:
# Fit country C's SVC with probability estimates enabled and report validation
# accuracy. Arguments left out are SVC defaults.
# NOTE(review): C=1e6 lies outside the grid searched for country C
# (1e7, 1e8, 1e9) -- confirm where this value came from.
model_C = SVC(kernel='rbf', C=1000000.0, gamma=1e-05, class_weight='balanced',
              probability=True).fit(trans_ctrain, cy_train)
model_C.score(trans_cvalid, cy_valid)

0.9667697063369397

from libsvm import svm
from libsvm import svmutil

In [66]:
# Score the SVD-transformed test features with the per-country SVCs.
a_preds = model_A.predict_proba(trans_atest)
b_preds = model_B.predict_proba(trans_btest)
c_preds = model_C.predict_proba(trans_ctest)

# convert preds to data frames
# a_test / b_test / c_test supply only the row index here; the trans_*test
# arrays come from A_test / B_test / C_test, which were put in that row order.
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [67]:
# Stack the three country frames into one submission table.
submission = pd.concat([a_sub, b_sub, c_sub])
# Only the cell's last expression is rendered, so head() is computed but not shown.
submission.head()
submission.tail()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6775,C,1e-07
88300,C,1e-07
35424,C,0.0002093219
81668,C,1e-07
98377,C,1e-07


In [62]:
# Write whichever `submission` frame is currently in memory; the file name
# assumes the SVM section was the last one run.
submission.to_csv('SVM.csv')