In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# data directory
DATA_DIR = os.path.join( 'data', 'processed')

data_paths = {'A': {'train': os.path.join(DATA_DIR, 'A', 'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A', 'A_hhold_test.csv'),
                    'Itrain': os.path.join(DATA_DIR, 'A', 'A_indiv_train.csv'),
                    'Itest': os.path.join(DATA_DIR, 'A', 'A_indiv_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B', 'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B', 'B_hhold_test.csv'),
                    'Itrain': os.path.join(DATA_DIR, 'B', 'B_indiv_train.csv'),
                    'Itest': os.path.join(DATA_DIR, 'B', 'B_indiv_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C', 'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C', 'C_hhold_test.csv'),
                    'Itrain': os.path.join(DATA_DIR, 'C', 'C_indiv_train.csv'),
                    'Itest': os.path.join(DATA_DIR, 'C', 'C_indiv_test.csv')}}

In [29]:
# Standardize features
def standardize(df, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    return df
    

def pre_process_data(df, enforce_cols=None):
#     df = standardize(df)
    df = pd.get_dummies(df)
    print("After converting categoricals:\t{}".format(df.shape))
    if enforce_cols is not None:
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)

        df.drop(to_drop, axis=1, inplace=True)
        df = df.assign(**{c: 0 for c in to_add})
    
    df.fillna(0, inplace=True)
    
    return df 
def drop(df, col, threshold):
    poor_count = sum(df['poor'])
    non_count = df.shape[0]-poor_count
    col_poor = Counter(df[col][df['poor'] == True])
    col_non_poor = Counter(df[col][df['poor'] == False])
    key = set(col_poor.keys())|set(col_poor.keys())
    poor_value, non_poor_value = any([i/poor_count>threshold for i in col_poor.values()]), any([i/non_count>threshold for i in col_non_poor.values()])
    if poor_value & non_poor_value:
        return df.drop([col],axis=1)
    else: return df

def drop_col(hhold, indi, hhold_threshold, indi_threshold):
    hhold_poor = sum(hhold['poor'])
    hhold_no = hhold.shape[0] - hhold_poor
    indi_poor = sum(indi['poor'])
    indi_no = indi.shape[0] - indi_poor
    for col in hhold:
        if col == 'poor' or col == 'country': continue
        h_col_poor = Counter(hhold[col][hhold['poor'] == True])
        h_col_non = Counter(hhold[col][hhold['poor'] == False])
        if any([i/hhold_poor>hhold_threshold for i in h_col_poor.values()]) & any([i/hhold_no>hhold_threshold for i in h_col_non.values()]):
            hhold = hhold.drop([col], axis = 1)
    for col in indi:
        if col == 'poor' or col == 'country' or col == 'iid': continue
        i_col_poor = Counter(indi[col][indi['poor'] == True])
        i_col_non = Counter(indi[col][indi['poor'] == False])
        if any([i/indi_poor>indi_threshold for i in i_col_poor.values()]) & any([i/indi_no>indi_threshold for i in i_col_non.values()]):
            indi = indi.drop([col], axis = 1)
    return hhold, indi.drop(['poor','country'],axis=1)
    
def combine_hhold_indi(hhold, indi):
    hhold = pre_process_data(hhold)
    iid_count = pd.DataFrame(indi.groupby('id')['iid'].count())
    iid_count.columns = ['family_numbers']
    indi = indi.drop('iid',axis=1)
    indi_num = indi.select_dtypes(include=['int64', 'float64'])
    indi_obj = indi.drop(list(indi_num.columns), axis = 1)
    indi_obj = pd.get_dummies(indi_obj)
    indi_num = indi_num.fillna(indi_num.mean())
    indi_obj = pre_process_data(indi_obj)
    indi_num = indi_num.groupby('id')[list(indi_num.columns)].mean()
#     indi_num = standardize(indi_num)
    indi_obj = indi_obj.groupby('id')[list(indi_obj.columns)].sum()/indi_obj.groupby('id')[list(indi_obj.columns)].count()
    com = pd.concat([iid_count,indi_num,indi_obj,hhold],axis=1)
    com.fillna(-100,inplace=True)
    return com

def order(df_test, df_train, ori_test):
    new_df = pd.DataFrame()
    for key in df_train:
        new_df[key] = df_test[key]
    new_df = new_df.loc[ori_test.index]
    return new_df
def mll(y_true,y_pred):
    loss = 0
    for i in range(3):
        for j in range(len(y_true[i])):
            loss += y_true[i][j]*np.log(y_pred[i][j])+(1-y_true[i][j])*np.log(1-y_pred[i][j])
        loss = loss / (j+1)
    return -loss/3

def combine_test_hhold_indi(hhold, indi, train):
    hhold = pre_process_data(hhold.drop('country',axis = 1))
    iid_count = pd.DataFrame(indi.groupby('id')['iid'].count())
    iid_count.columns = ['family_numbers']
    indi = indi.drop('iid',axis=1)
    indi_num = indi.select_dtypes(include=['int64', 'float64'])
    indi_obj = indi.drop(list(indi_num.columns), axis = 1)
    indi_obj = pd.get_dummies(indi_obj)
    indi_num = indi_num.fillna(indi_num.mean())
    indi_obj = pre_process_data(indi_obj)
    indi_num = indi_num.groupby('id')[list(indi_num.columns)].mean()
#     indi_num = standardize(indi_num)
    indi_obj = indi_obj.groupby('id')[list(indi_obj.columns)].sum()/indi_obj.groupby('id')[list(indi_obj.columns)].count()
    com = pd.concat([iid_count,indi_num,indi_obj,hhold],axis=1)
    to_drop = np.setdiff1d(com.columns, train.columns)
    to_add = np.setdiff1d(train.columns, com.columns)

    com.drop(to_drop, axis=1, inplace=True)
    com = com.assign(**{c: 0 for c in to_add})
    
    com.fillna(0, inplace=True)
    return com

In [39]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

ai_train = pd.read_csv(data_paths['A']['Itrain'], index_col='id')
bi_train = pd.read_csv(data_paths['B']['Itrain'], index_col='id').drop('wJthinfa',axis=1)
ci_train = pd.read_csv(data_paths['C']['Itrain'], index_col='id')

In [40]:
# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')
ai_test = pd.read_csv(data_paths['A']['Itest'], index_col='id')
bi_test = pd.read_csv(data_paths['B']['Itest'], index_col='id').drop('wJthinfa',axis=1)
ci_test = pd.read_csv(data_paths['C']['Itest'], index_col='id')

In [41]:
train_a,train_ai = drop_col(a_train,ai_train,0.95,0.95)
A_train = combine_hhold_indi(train_a,train_ai)
train_b,train_bi = drop_col(b_train,bi_train,0.95,0.95)
B_train = combine_hhold_indi(train_b,train_bi)
train_c,train_ci = drop_col(c_train,ci_train,0.95,0.95)
C_train = combine_hhold_indi(train_c,train_ci)
A_train = A_train.sample(frac=1)
B_train = B_train.sample(frac=1)
C_train = C_train.sample(frac=1)

After converting categoricals:	(8203, 612)
After converting categoricals:	(37560, 255)
After converting categoricals:	(3255, 1221)
After converting categoricals:	(20252, 1039)
After converting categoricals:	(6469, 741)
After converting categoricals:	(29913, 279)


In [59]:
aX_train = A_train.drop('poor', axis = 1) 
ay_train = A_train['poor'].values
bX_train = B_train.drop('poor', axis = 1) 
by_train = B_train['poor'].values
cX_train = C_train.drop('poor', axis = 1) 
cy_train = C_train['poor'].values
A_test = combine_test_hhold_indi(a_test, ai_test, aX_train)
B_test = combine_test_hhold_indi(b_test, bi_test, bX_train)
C_test = combine_test_hhold_indi(c_test, ci_test, cX_train)
A_test = order(A_test, ax, a_test)
B_test = order(B_test, bx, b_test)
C_test = order(C_test, cx, c_test)

After converting categoricals:	(4041, 850)
After converting categoricals:	(18535, 271)
After converting categoricals:	(1604, 1418)
After converting categoricals:	(10066, 1502)
After converting categoricals:	(3187, 772)
After converting categoricals:	(14701, 296)


In [43]:
def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.cross_validation import StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier

def transform(trainfeatures, testfeatures, n,**kwargs):
    svd = decomposition.TruncatedSVD(n_components=n)
    svd.fit(trainfeatures)
    trainfeatures_svd = svd.transform(trainfeatures)
    testfeatures_svd = svd.transform(testfeatures)
    
    scl = preprocessing.StandardScaler()
    scl.fit(trainfeatures_svd)
    trainfeatures_svd_scl = scl.transform(trainfeatures_svd)
    testfeatures_svd_scl = scl.transform(testfeatures_svd)   
    return trainfeatures_svd_scl,testfeatures_svd_scl

In [62]:
trans_atrain, trans_atest = transform(aX_train, A_test, 340)
trans_btrain, trans_btest = transform(bX_train, B_test, 440)
trans_ctrain, trans_ctest = transform(cX_train, C_test, 160)

In [66]:
svm_model = SVC()
#parameters = [{'C': [1, 10, 100, 1000],'gamma': [1e-3, 1e-4], 'probability':['True']}]
#parameters = [{"C":[0.1, 1, 10], "gamma": [1, 0.1, 0.01]}]
#parameters = [{'C': [1, 10, 100, 1000],'gamma': [0.001,0.0001],'kernel': ['rbf']},
              #{'C': [1, 10, 100], 'kernel': ['linear']}]
#parameters = [{'C': [1,30,60,100],'gamma': [0.1,0.01,0.003,0.005],'class_weight':['balanced'],'kernel': ['rbf']}]
parameters = [{'C': [0.1,1,10],'gamma': [1,0.1,0.01],'class_weight':['balanced'],'kernel': ['rbf']}]


In [68]:
model_A = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(ay_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2).fit(trans_atrain, ay_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  41.8s
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  41.8s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  41.8s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  41.7s
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  41.6s
[CV] C=1, class_

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  7.3min


[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=  39.6s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=  40.1s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=  39.7s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=  35.4s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=  35.4s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=  35.0s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=  35.4s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf 

[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed: 17.3min finished


In [79]:
model_B = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(by_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_btrain, by_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  10.9s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  11.0s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  11.1s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  11.1s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.8min


[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.5s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.4s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.5s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.6s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.5s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.3s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.3s
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=r

[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed:  4.4min finished


In [81]:
model_C = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(cy_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_ctrain, cy_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  19.0s
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  19.0s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  19.3s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  19.2s
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=  19.3s
[CV] C=1, class_

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  2.0min


[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.0s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.1s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   5.9s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   6.8s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   6.8s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   6.5s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   6.6s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf 

[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed:  4.3min finished


In [85]:
best_parameters, score, _ = max(model_A_10.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.947856551668
C: 30
class_weight: 'balanced'
gamma: 0.003
kernel: 'rbf'




In [80]:
best_parameters, score, _ = max(model_B.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.795449531144
C: 1
class_weight: 'balanced'
gamma: 0.003
kernel: 'rbf'




In [82]:
best_parameters, score, _ = max(model_C.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.928627598648
C: 1
class_weight: 'balanced'
gamma: 0.003
kernel: 'rbf'




In [74]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


In [76]:
model_A = SVC(C=30 , gamma= 0.003,kernel='rbf',class_weight='balanced', probability=True).fit(trans_atrain, ay_train)

In [88]:
model_B = SVC(C=1 , gamma= 0.003,kernel='rbf',class_weight='balanced', probability=True).fit(trans_btrain, by_train)

In [89]:
model_C = SVC(C=1, gamma= 0.003 ,kernel='rbf',class_weight='balanced', probability=True).fit(trans_ctrain, cy_train)

In [None]:
model_A = SVC(C=10 , gamma= 0.001, kernel='rbf', probability=True).fit(trans_atrain, ay_train)
model_B = SVC(C=30 , gamma= 0.003,kernel='rbf',class_weight='balanced', probability=True).fit(trans_btrain, by_train)
model_C = SVC(C=3500, gamma= 0.0001 ,kernel='rbf', probability=True).fit(trans_ctrain, cy_train)

In [None]:
title = 'Learning Curves (SVM, linear kernel)' 
plot_learning_curve(model_A, title, aX_train, ay_train, cv=StratifiedKFold(ay_train, n_folds=5,shuffle=True))
plt.show()

In [90]:
a_preds = model_A.predict_proba(trans_atest)
b_preds = model_B.predict_proba(trans_btest)
c_preds = model_C.predict_proba(trans_ctest)

# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [91]:
submission = pd.concat([a_sub, b_sub, c_sub])
submission.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.494756
41249,A,0.003597
16205,A,0.255519
97501,A,0.001413
67756,A,0.990229


In [92]:
submission.to_csv('SVC_improved.csv')

In [102]:
svm_model = SVC()
#parameters = [{'C': [1, 10, 100, 1000],'gamma': [1e-3, 1e-4], 'probability':['True']}]
#parameters = [{"C":[0.1, 1, 10], "gamma": [1, 0.1, 0.01]}]
#parameters = [{'C': [1, 10, 100, 1000],'gamma': [0.001,0.0001],'kernel': ['rbf']},
              #{'C': [1, 10, 100], 'kernel': ['linear']}]
parameters = [{'C': [1,30,60,100],'gamma': [0.1,0.01,0.003,0.005],'class_weight':['balanced'],'kernel': ['rbf']}]

In [21]:
model_A = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(ay_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2).fit(trans_atrain, ay_train)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  26.6s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  26.5s
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  26.7s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  26.9s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ...........

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  5.4min


[CV] .................. C=1000, gamma=0.001, kernel=rbf, total=  26.4s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=1000, gamma=0.001, kernel=rbf, total=  26.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  35.8s
[CV] C=1, kernel=linear ..............................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  36.2s
[CV] C=1, kernel=linear ..............................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  36.5s
[CV] C=1, kernel=linear ..............................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  36.6s
[CV] C=1, kernel=linear ..............................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=  37.0s
[CV] C=1, kernel=linear ..............................................
[CV] .

[Parallel(n_jobs=5)]: Done  55 out of  55 | elapsed: 181.0min finished


In [103]:
model_B = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(by_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_btrain, by_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV] C=1, class_weight=balanced, gamma=0.1, kernel=rbf ...............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   8.3s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   8.3s
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   8.3s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class_weight=balanced, gamma=0.1, kernel=rbf, total=   8.3s
[CV] C=1, class_weight=balanced, gamma=0.01, kernel=rbf ..............
[CV]  C=1, class

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.4min


[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.003, kernel=rbf, total=   6.8s
[CV] C=30, class_weight=balanced, gamma=0.005, kernel=rbf ............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.1s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.0s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.0s
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   7.1s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=30, class_weight=balanced, gamma=0.005, kernel=rbf, total=   6.9s
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV] C=60, class_weight=balanced, gamma=0.1, kernel=rbf ..............
[CV]  C=60, class_weight=balanced, gamma=0.1, kernel=rbf, t

[Parallel(n_jobs=5)]: Done  80 out of  80 | elapsed:  3.5min finished


In [49]:
model_C = GridSearchCV(svm_model,param_grid=parameters, n_jobs=5,
                       cv=StratifiedKFold(cy_train, n_folds=5,shuffle=True),
                       scoring='roc_auc',
                       verbose=2,
                       refit=True).fit(trans_ctrain, cy_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=3000, gamma=0.001, kernel=rbf .................................
[CV] C=3000, gamma=0.001, kernel=rbf .................................
[CV] C=3000, gamma=0.001, kernel=rbf .................................
[CV] C=3000, gamma=0.001, kernel=rbf .................................
[CV] C=3000, gamma=0.001, kernel=rbf .................................
[CV] .................. C=3000, gamma=0.001, kernel=rbf, total=   9.4s
[CV] C=3000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=3000, gamma=0.001, kernel=rbf, total=   9.5s
[CV] C=3000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=3000, gamma=0.001, kernel=rbf, total=   9.5s
[CV] C=3000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=3000, gamma=0.001, kernel=rbf, total=   9.5s
[CV] C=3000, gamma=0.0001, kernel=rbf ................................
[CV] ............

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.5min


[CV] C=4000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=4000, gamma=0.001, kernel=rbf, total=   7.1s
[CV] C=4000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=4000, gamma=0.001, kernel=rbf, total=   6.5s
[CV] C=4000, gamma=0.0001, kernel=rbf ................................
[CV] .................. C=4000, gamma=0.001, kernel=rbf, total=   6.6s
[CV] C=4000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=4000, gamma=0.0001, kernel=rbf, total=  16.0s
[CV] ................. C=4000, gamma=0.0001, kernel=rbf, total=  16.2s
[CV] C=4000, gamma=1e-05, kernel=rbf .................................
[CV] C=4000, gamma=1e-05, kernel=rbf .................................
[CV] ................. C=4000, gamma=0.0001, kernel=rbf, total=  16.2s
[CV] C=4000, gamma=1e-05, kernel=rbf .................................
[CV] ................. C=4000, gamma=0.0001, kernel=rbf, total=  15.9s
[CV] C

[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed:  2.0min finished


In [23]:
best_parameters, score, _ = max(model_A.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.946174614135
C: 10
gamma: 0.001
kernel: 'rbf'




In [104]:
best_parameters, score, _ = max(model_B.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.790196851668
C: 30
class_weight: 'balanced'
gamma: 0.003
kernel: 'rbf'




In [73]:
best_parameters, score, _ = max(model_C.grid_scores_, key=lambda x:x[1])
print('Raw ACU score', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r"%(param_name, best_parameters[param_name]))

Raw ACU score 0.954474888704
C: 3500
gamma: 0.0001
kernel: 'rbf'




In [105]:
model_A = SVC(C=10 , gamma= 0.001, kernel='rbf', probability=True).fit(trans_atrain, ay_train)
model_B = SVC(C=30 , gamma= 0.003,kernel='rbf',class_weight='balanced', probability=True).fit(trans_btrain, by_train)
model_C = SVC(C=3500, gamma= 0.0001 ,kernel='rbf', probability=True).fit(trans_ctrain, cy_train)

In [106]:
a_preds = model_A.predict_proba(trans_atest)
b_preds = model_B.predict_proba(trans_btest)
c_preds = model_C.predict_proba(trans_ctest)

# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')
b_sub = make_country_sub(b_preds, b_test, 'B')
c_sub = make_country_sub(c_preds, c_test, 'C')

In [107]:
submission = pd.concat([a_sub, b_sub, c_sub])
submission.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.905004
41249,A,0.002598
16205,A,0.793149
97501,A,0.001034
67756,A,0.984502


In [108]:
submission.to_csv('SVC_adjusted.csv')