In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns

from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MICE
import patsy

from sklearn import metrics
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict



### Imputing Missing Data

In [2]:
df=pd.read_csv('../assets/titanic.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
#creating data frame to conduct data imputation techniques 

list_of_cols_i_want = ['Survived', 'Pclass', 'Sex','SibSp','Parch','Age', 'Fare'];

df1=df[list_of_cols_i_want];
df1.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Age,Fare
0,0,3,male,1,0,22.0,7.25
1,1,1,female,1,0,38.0,71.2833
2,1,3,female,0,0,26.0,7.925
3,1,1,female,1,0,35.0,53.1
4,0,3,male,0,0,35.0,8.05


In [4]:
df_impute=df1.copy()

df_impute['Sex']=df['Sex'].apply(lambda x: '1' if x =='female' else '2')

df_impute.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Age,Fare
0,0,3,2,1,0,22.0,7.25
1,1,1,1,1,0,38.0,71.2833
2,1,3,1,0,0,26.0,7.925
3,1,1,1,1,0,35.0,53.1
4,0,3,2,0,0,35.0,8.05


In [5]:
X=df_impute.as_matrix().astype(float)


In [6]:
X

array([[  0.    ,   3.    ,   2.    , ...,   0.    ,  22.    ,   7.25  ],
       [  1.    ,   1.    ,   1.    , ...,   0.    ,  38.    ,  71.2833],
       [  1.    ,   3.    ,   1.    , ...,   0.    ,  26.    ,   7.925 ],
       ..., 
       [  0.    ,   3.    ,   1.    , ...,   2.    ,      nan,  23.45  ],
       [  1.    ,   1.    ,   2.    , ...,   0.    ,  26.    ,  30.    ],
       [  0.    ,   3.    ,   2.    , ...,   0.    ,  32.    ,   7.75  ]])

In [7]:
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# X_incomplete[missing_mask] = np.nan
missing_mask 

array([[False,  True, False, ...,  True, False,  True],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ..., 
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [ True, False, False, ...,  True, False,  True]], dtype=bool)

### KNN Imputation

In [8]:
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)


Imputing row 1/891 with 0 missing, elapsed time: 0.163
Imputing row 101/891 with 0 missing, elapsed time: 0.248
Imputing row 201/891 with 0 missing, elapsed time: 0.249
Imputing row 301/891 with 1 missing, elapsed time: 0.250
Imputing row 401/891 with 0 missing, elapsed time: 0.251
Imputing row 501/891 with 0 missing, elapsed time: 0.252
Imputing row 601/891 with 0 missing, elapsed time: 0.253
Imputing row 701/891 with 0 missing, elapsed time: 0.254
Imputing row 801/891 with 0 missing, elapsed time: 0.254


### Convex Optimization

In [9]:
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

----------------------------------------------------------------------------
	SCS v1.2.6 - Splitting Conic Solver
	(c) Brendan O'Donoghue, Stanford University, 2012-2016
----------------------------------------------------------------------------
Lin-sys: sparse-indirect, nnz in A = 1252462, CG tol ~ 1/iter^(2.00)
eps = 1.00e-03, alpha = 1.50, max_iters = 2500, normalize = 1, scale = 1.00
Variables n = 818878, constraints m = 831352
Cones:	primal zero / dual free vars: 408990
	linear vars: 18711
	sd vars: 403651, sd blks: 1
Setup time: 1.18e-01s
----------------------------------------------------------------------------
 Iter | pri res | dua res | rel gap | pri obj | dua obj | kap/tau | time (s)
----------------------------------------------------------------------------
     0|      inf       inf       nan      -inf       inf       inf  1.03e+00 
   100| 2.61e-03  1.45e-03  3.59e-04  2.68e+03  2.69e+03  1.70e-12  1.53e+02 
   140| 6.01e-04  3.68e-04  2.71e-04  2.71e+03  2.70e+03  1.7

### Spectral Regulation Algorithm 

In [10]:
softImpute = SoftImpute()
biscaler = BiScaler()

In [11]:
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

[BiScaler] Initial log residual value = 11.912883
[BiScaler] Iter 1: log residual = 3.977100, log improvement ratio=7.935783
[BiScaler] Iter 2: log residual = 3.175702, log improvement ratio=0.801398
[BiScaler] Iter 3: log residual = 2.355897, log improvement ratio=0.819804
[BiScaler] Iter 4: log residual = 1.557900, log improvement ratio=0.797997
[BiScaler] Iter 5: log residual = 0.796267, log improvement ratio=0.761634
[BiScaler] Iter 6: log residual = 0.076422, log improvement ratio=0.719845
[BiScaler] Iter 7: log residual = -0.610033, log improvement ratio=0.686455
[BiScaler] Iter 8: log residual = -1.273936, log improvement ratio=0.663903
[BiScaler] Iter 9: log residual = -1.923431, log improvement ratio=0.649495
[BiScaler] Iter 10: log residual = -2.563711, log improvement ratio=0.640280
[BiScaler] Iter 11: log residual = -3.197936, log improvement ratio=0.634225
[BiScaler] Iter 12: log residual = -3.828048, log improvement ratio=0.630111
[BiScaler] Iter 13: log residual = -4.455

In [12]:
# Spectral Regulization Algorithm normalized
X_filled_softnorm = softImpute.complete(X_incomplete_normalized)
X_filled_softnorm

[SoftImpute] Max Singular Value of X_init = 45.137077
[SoftImpute] Iter 1: observed MAE=0.022666 rank=6
[SoftImpute] Iter 2: observed MAE=0.022674 rank=6
[SoftImpute] Iter 3: observed MAE=0.022683 rank=6
[SoftImpute] Iter 4: observed MAE=0.022692 rank=6
[SoftImpute] Iter 5: observed MAE=0.022700 rank=6
[SoftImpute] Iter 6: observed MAE=0.022709 rank=6
[SoftImpute] Iter 7: observed MAE=0.022717 rank=6
[SoftImpute] Iter 8: observed MAE=0.022725 rank=6
[SoftImpute] Iter 9: observed MAE=0.022731 rank=6
[SoftImpute] Iter 10: observed MAE=0.022737 rank=6
[SoftImpute] Iter 11: observed MAE=0.022743 rank=6
[SoftImpute] Iter 12: observed MAE=0.022747 rank=6
[SoftImpute] Iter 13: observed MAE=0.022752 rank=6
[SoftImpute] Iter 14: observed MAE=0.022755 rank=6
[SoftImpute] Iter 15: observed MAE=0.022758 rank=6
[SoftImpute] Iter 16: observed MAE=0.022761 rank=6
[SoftImpute] Iter 17: observed MAE=0.022763 rank=6
[SoftImpute] Iter 18: observed MAE=0.022765 rank=6
[SoftImpute] Iter 19: observed MAE=0.

array([[-0.72636572,  0.33722682,  0.14149873, ..., -0.99269169,
        -0.50785266, -0.46142699],
       [ 0.60513798, -0.99905096, -0.98817914, ..., -1.28087936,
         0.33773145,  1.47973364],
       [ 2.17122223,  0.52114378, -1.19068743, ..., -0.33375358,
        -0.24715491, -0.46674274],
       ..., 
       [-1.00367818, -0.12878502, -0.96942763, ...,  2.0062342 ,
        -0.05229809,  0.10572413],
       [ 1.92962341, -1.43088132,  0.29474125, ..., -0.57416249,
        -0.23851651,  0.62422081],
       [-0.81019345,  1.47992769,  1.21270401, ..., -0.34580623,
         0.5494491 , -1.24968775]])

In [13]:
X_filled_softimpute = biscaler.inverse_transform(X_filled_softnorm)

X_filled_softimpute

array([[  5.55111512e-17,   3.00000000e+00,   2.00000000e+00, ...,
          0.00000000e+00,   2.20000000e+01,   7.25000000e+00],
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          5.55111512e-17,   3.80000000e+01,   7.12833000e+01],
       [  1.00000000e+00,   3.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   2.60000000e+01,   7.92500000e+00],
       ..., 
       [  5.55111512e-17,   3.00000000e+00,   1.00000000e+00, ...,
          2.00000000e+00,   2.83079986e+01,   2.34500000e+01],
       [  1.00000000e+00,   1.00000000e+00,   2.00000000e+00, ...,
          0.00000000e+00,   2.60000000e+01,   3.00000000e+01],
       [  0.00000000e+00,   3.00000000e+00,   2.00000000e+00, ...,
          0.00000000e+00,   3.20000000e+01,   7.75000000e+00]])

In [14]:
# Spectral Regulization Algorithm no biscale

X_filled_soft = softImpute.complete(X_incomplete)

X_filled_soft

[SoftImpute] Max Singular Value of X_init = 1834.310889
[SoftImpute] Iter 1: observed MAE=0.676697 rank=3
[SoftImpute] Iter 2: observed MAE=0.676640 rank=3
[SoftImpute] Iter 3: observed MAE=0.676576 rank=3
[SoftImpute] Iter 4: observed MAE=0.676507 rank=3
[SoftImpute] Iter 5: observed MAE=0.676436 rank=3
[SoftImpute] Iter 6: observed MAE=0.676362 rank=3
[SoftImpute] Iter 7: observed MAE=0.676287 rank=3
[SoftImpute] Iter 8: observed MAE=0.676211 rank=3
[SoftImpute] Iter 9: observed MAE=0.676135 rank=3
[SoftImpute] Iter 10: observed MAE=0.676060 rank=3
[SoftImpute] Iter 11: observed MAE=0.675985 rank=3
[SoftImpute] Iter 12: observed MAE=0.675914 rank=3
[SoftImpute] Iter 13: observed MAE=0.675844 rank=3
[SoftImpute] Iter 14: observed MAE=0.675777 rank=3
[SoftImpute] Iter 15: observed MAE=0.675713 rank=3
[SoftImpute] Iter 16: observed MAE=0.675652 rank=3
[SoftImpute] Iter 17: observed MAE=0.675597 rank=3
[SoftImpute] Iter 18: observed MAE=0.675553 rank=3
[SoftImpute] Iter 19: observed MAE=

array([[  0.        ,   3.        ,   2.        , ...,   0.        ,
         22.        ,   7.25      ],
       [  1.        ,   1.        ,   1.        , ...,   0.        ,
         38.        ,  71.2833    ],
       [  1.        ,   3.        ,   1.        , ...,   0.        ,
         26.        ,   7.925     ],
       ..., 
       [  0.        ,   3.        ,   1.        , ...,   2.        ,
          6.60840507,  23.45      ],
       [  1.        ,   1.        ,   2.        , ...,   0.        ,
         26.        ,  30.        ],
       [  0.        ,   3.        ,   2.        , ...,   0.        ,
         32.        ,   7.75      ]])

### Multiple Imputation by Chain  Equations

In [15]:
X_filled_mice = MICE().complete(X_incomplete)

[MICE] Completing matrix with shape (891, 7)
[MICE] Starting imputation round 1/110, elapsed time 0.000
[MICE] Starting imputation round 2/110, elapsed time 0.003
[MICE] Starting imputation round 3/110, elapsed time 0.004
[MICE] Starting imputation round 4/110, elapsed time 0.005
[MICE] Starting imputation round 5/110, elapsed time 0.005
[MICE] Starting imputation round 6/110, elapsed time 0.006
[MICE] Starting imputation round 7/110, elapsed time 0.006
[MICE] Starting imputation round 8/110, elapsed time 0.007
[MICE] Starting imputation round 9/110, elapsed time 0.007
[MICE] Starting imputation round 10/110, elapsed time 0.008
[MICE] Starting imputation round 11/110, elapsed time 0.010
[MICE] Starting imputation round 12/110, elapsed time 0.011
[MICE] Starting imputation round 13/110, elapsed time 0.012
[MICE] Starting imputation round 14/110, elapsed time 0.013
[MICE] Starting imputation round 15/110, elapsed time 0.013
[MICE] Starting imputation round 16/110, elapsed time 0.013
[MIC

### Getting RMSE for the methods' comparison

In [16]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(predictions, targets, name):
    rmse=sqrt(mean_squared_error(~np.isnan(predictions), ~np.isnan(targets)))
    print ("{} RMSE: %0.10f" % rmse).format(name)

rmse(X_filled_knn[missing_mask], X[missing_mask], "Knn")
rmse(X_filled_nnm[missing_mask], X[missing_mask], "Nnm")
rmse(X_filled_softnorm[missing_mask], X[missing_mask], "SoftImp Norm")
rmse(X_filled_soft[missing_mask], X[missing_mask], "SoftImp")
rmse(X_filled_mice[missing_mask], X[missing_mask], "Mice")


Knn RMSE: 0.1242259987
Nnm RMSE: 0.1242259987
SoftImp Norm RMSE: 0.1242259987
SoftImp RMSE: 0.1242259987
Mice RMSE: 0.1242259987


  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,
  output_errors = np.average((y_true - y_pred) ** 2, axis=0,


#### Because all RMSEs are the same, it seems that all types of imputations led to the same result. I decided to double check this by running several algorithms with selected types of imputation. I reasoned that if the algorithms’ scores match, all types of imputations indeed led to the same result.  


### Checking Knn Imputation

In [17]:
cols_to_merge = ['Embarked', 'Fare','Pclass','Survived'];
df_merge=df[cols_to_merge];

In [18]:


knn_frame = pd.DataFrame({'Survived':X_filled_knn[:,0],'Pclass':X_filled_knn[:,1],\
                       'Sex':X_filled_knn[:,2],'SibSp':X_filled_knn[:,3],
                        'Parch':X_filled_knn[:,4],'Age':X_filled_knn[:,5],"Fare":X_filled_knn[:,6]});

df_knn = pd.concat([knn_frame, df_merge], axis=1, join='inner').dropna()
df_knn.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null float64
Sex         889 non-null float64
SibSp       889 non-null float64
Survived    889 non-null float64
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 83.3+ KB


In [19]:
df_knn=df_knn.T.drop_duplicates().T

df_knn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null object
Fare        889 non-null object
Parch       889 non-null object
Pclass      889 non-null object
Sex         889 non-null object
SibSp       889 non-null object
Survived    889 non-null object
Embarked    889 non-null object
dtypes: object(8)
memory usage: 62.5+ KB


In [20]:
df_knn['Pclass']=df_knn['Pclass'].astype(int);
df_knn['Survived']=df_knn['Survived'].astype(int);
df_knn['Sex']=df_knn['Sex'].astype(int);
df_knn['Age']=df_knn['Age'].astype(float);
df_knn['SibSp']=df_knn['SibSp'].astype(float);
df_knn['Parch']=df_knn['Parch'].astype(float);
df_knn['Fare']=df_knn['Fare'].astype(float);

df_knn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null int64
Sex         889 non-null int64
SibSp       889 non-null float64
Survived    889 non-null int64
Embarked    889 non-null object
dtypes: float64(4), int64(3), object(1)
memory usage: 62.5+ KB


In [21]:
from sklearn.preprocessing import StandardScaler
cont_var=['Fare','Age','Parch','SibSp']
x=df_knn[cont_var]


X_norm =  StandardScaler().fit_transform(x);
df_stand=pd.DataFrame(X_norm,columns=['Fare','Age','Parch','SibSp'])
len(df_stand)


889

In [22]:

X = patsy.dmatrix('~ C(Pclass)+C(Sex) + C(Embarked)', df_knn)

df_cat=pd.DataFrame(X, columns=X.design_info.column_names)
df_cat.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S]
0,1.0,0.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0


In [23]:
df2 = pd.concat([df_cat, df_stand], axis=1)
df2.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S],Fare,Age,Parch,SibSp
0,1.0,0.0,1.0,1.0,0.0,1.0,-0.50024,-0.561925,-0.474326,0.43135
1,1.0,0.0,0.0,0.0,0.0,0.0,0.788947,0.604068,-0.474326,0.43135
2,1.0,0.0,1.0,0.0,0.0,1.0,-0.48665,-0.270427,-0.474326,-0.475199
3,1.0,0.0,0.0,0.0,0.0,1.0,0.422861,0.385444,-0.474326,0.43135
4,1.0,0.0,1.0,1.0,0.0,1.0,-0.484133,0.385444,-0.474326,-0.475199


In [24]:
X_knn=df2;
y_knn=df_knn['Survived'];

In [25]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

cv = StratifiedKFold(y_knn, n_folds=10, shuffle=True, random_state=5)


lg1 = LogisticRegression(random_state=5)
dt1  = DecisionTreeClassifier(class_weight='balanced',min_samples_split=50,random_state=5)
bdt1 = BaggingClassifier(DecisionTreeClassifier(random_state=5),random_state=5)
rf1 = RandomForestClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
et1 = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
abc1 = AdaBoostClassifier(random_state=5)
gbst1 = GradientBoostingClassifier(random_state=5)
bnb1 = BernoulliNB()
svc1= SVC(kernel='linear',probability=True,random_state=5)


def score(model, name):
    s = cross_val_score(model, X_knn, y_knn, cv=cv, n_jobs=-1)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))
        
score(dt1,"Decision Tree")
score(lg1, "Logistic Regres")
score(bdt1, "Bagging DT")
score(rf1, "Random Forest")
score(et1, "Extra Trees")
score(abc1, "Ada Boost")
score(gbst1, "Grad Boosting")
score(bnb1, "Bernoulli NB")
score(svc1, "SVC")

Decision Tree Score:	0.813 ± 0.055
Logistic Regres Score:	0.808 ± 0.039
Bagging DT Score:	0.826 ± 0.039
Random Forest Score:	0.837 ± 0.045
Extra Trees Score:	0.812 ± 0.041
Ada Boost Score:	0.802 ± 0.042
Grad Boosting Score:	0.843 ± 0.031
Bernoulli NB Score:	0.775 ± 0.026
SVC Score:	0.785 ± 0.039


### Checking Convex Optimization Imputation

In [26]:

nnm_frame = pd.DataFrame({'Survived':X_filled_nnm[:,0],'Pclass':X_filled_nnm[:,1],\
                       'Sex':X_filled_nnm[:,2],'SibSp':X_filled_nnm[:,3],
                        'Parch':X_filled_nnm[:,4],'Age':X_filled_nnm[:,5],"Fare":X_filled_nnm[:,6]});

df_nnm = pd.concat([nnm_frame, df_merge], axis=1, join='inner').dropna()
df_nnm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null float64
Sex         889 non-null float64
SibSp       889 non-null float64
Survived    889 non-null float64
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 83.3+ KB


In [27]:
df_nnm=df_nnm.T.drop_duplicates().T

df_nnm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null object
Fare        889 non-null object
Parch       889 non-null object
Pclass      889 non-null object
Sex         889 non-null object
SibSp       889 non-null object
Survived    889 non-null object
Embarked    889 non-null object
dtypes: object(8)
memory usage: 62.5+ KB


In [28]:
df_nnm['Pclass']=df_nnm['Pclass'].astype(int);
df_nnm['Survived']=df_nnm['Survived'].astype(int);
df_nnm['Sex']=df_nnm['Sex'].astype(int);
df_nnm['Age']=df_nnm['Age'].astype(float);
df_nnm['SibSp']=df_nnm['SibSp'].astype(float);
df_nnm['Parch']=df_nnm['Parch'].astype(float);
df_nnm['Fare']=df_nnm['Fare'].astype(float);

df_nnm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null int64
Sex         889 non-null int64
SibSp       889 non-null float64
Survived    889 non-null int64
Embarked    889 non-null object
dtypes: float64(4), int64(3), object(1)
memory usage: 62.5+ KB


In [29]:
from sklearn.preprocessing import StandardScaler
cont_var=['Fare','Age','Parch','SibSp']
x=df_nnm[cont_var]


X_norm =  StandardScaler().fit_transform(x);
df_stand=pd.DataFrame(X_norm,columns=['Fare','Age','Parch','SibSp'])
len(df_stand)

889

In [30]:

X = patsy.dmatrix('~ C(Pclass)+C(Sex) + C(Embarked)', df_nnm)

df_cat=pd.DataFrame(X, columns=X.design_info.column_names)
df_cat.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S]
0,1.0,0.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0


In [31]:
df2 = pd.concat([df_cat, df_stand], axis=1)
df2.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S],Fare,Age,Parch,SibSp
0,1.0,0.0,1.0,1.0,0.0,1.0,-0.50024,-0.192693,-0.474326,0.43135
1,1.0,0.0,0.0,0.0,0.0,0.0,0.788947,0.805479,-0.474326,0.43135
2,1.0,0.0,1.0,0.0,0.0,1.0,-0.48665,0.05685,-0.474326,-0.475199
3,1.0,0.0,0.0,0.0,0.0,1.0,0.422861,0.618322,-0.474326,0.43135
4,1.0,0.0,1.0,1.0,0.0,1.0,-0.484133,0.618322,-0.474326,-0.475199


In [32]:
X_nnm=df2;
y_nnm=df_nnm['Survived'];

In [33]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

cv = StratifiedKFold(y_nnm, n_folds=10, shuffle=True, random_state=5)

# X_resampled1
# y_resampled1

lg2= LogisticRegression(random_state=5)
dt2 = DecisionTreeClassifier(class_weight='balanced',min_samples_split=50,random_state=5)
bdt2 = BaggingClassifier(DecisionTreeClassifier(random_state=5),random_state=5)
rf2 = RandomForestClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
et2 = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
abc2 = AdaBoostClassifier(random_state=5)
gbst2 = GradientBoostingClassifier(random_state=5)
bnb2 = BernoulliNB()
svc2= SVC(kernel='linear',probability=True,random_state=5)


def score(model, name):
    s = cross_val_score(model, X_nnm, y_nnm, cv=cv, n_jobs=-1)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))
        
score(dt2,"Decision Tree")
score(lg2, "Logistic Regres")
score(bdt2, "Bagging DT")
score(rf2, "Random Forest")
score(et2, "Extra Trees")
score(abc2, "Ada Boost")
score(gbst2, "Grad Boosting")
score(bnb2, "Bernoulli NB")
score(svc2, "SVC")

Decision Tree Score:	0.784 ± 0.049
Logistic Regres Score:	0.801 ± 0.046
Bagging DT Score:	0.813 ± 0.045
Random Forest Score:	0.803 ± 0.047
Extra Trees Score:	0.804 ± 0.034
Ada Boost Score:	0.798 ± 0.043
Grad Boosting Score:	0.823 ± 0.042
Bernoulli NB Score:	0.754 ± 0.038
SVC Score:	0.786 ± 0.039


### Checking Spectral Regulization Algorithm 

### Normalized

In [34]:
# X_filled_softimpute = biscaler.inverse_transform(X_filled_softnorm)

# X_filled_softimpute


softnorm_frame = pd.DataFrame({'Survived':X_filled_softimpute[:,0],'Pclass':X_filled_softimpute[:,1],\
                       'Sex':X_filled_softimpute[:,2],'SibSp':X_filled_softimpute[:,3],
                        'Parch':X_filled_softimpute[:,4],'Age':X_filled_softimpute[:,5],"Fare":X_filled_softimpute[:,6]});



In [35]:
cols_to_merge = ['Embarked', 'Fare','Pclass','Survived','Sex', 'SibSp','Parch'];
df_merge=df[cols_to_merge];

In [36]:
df_softnorm = pd.concat([df_merge,softnorm_frame.iloc[:, 0:2]], axis=1, join='inner').dropna()
df_softnorm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
Sex         889 non-null object
SibSp       889 non-null int64
Parch       889 non-null int64
Age         889 non-null float64
Fare        889 non-null float64
dtypes: float64(3), int64(4), object(2)
memory usage: 69.5+ KB


In [37]:
df_softnorm=df_softnorm.iloc[:,:-1]

df_softnorm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
Sex         889 non-null object
SibSp       889 non-null int64
Parch       889 non-null int64
Age         889 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


In [38]:
df_softnorm['SibSp']=df_softnorm['SibSp'].astype(float);
df_softnorm['Parch']=df_softnorm['Parch'].astype(float);



df_softnorm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
Sex         889 non-null object
SibSp       889 non-null float64
Parch       889 non-null float64
Age         889 non-null float64
dtypes: float64(4), int64(2), object(2)
memory usage: 62.5+ KB


In [39]:
from sklearn.preprocessing import StandardScaler
cont_var=['Fare','Parch','SibSp','Age']
x=df_softnorm[cont_var]


X_norm =  StandardScaler().fit_transform(x);
df_stand=pd.DataFrame(X_norm,columns=['Fare','Parch','Age','SibSp'])
len(df_stand)

889

In [40]:
X = patsy.dmatrix('~ C(Pclass)+C(Sex)+ C(Embarked)', df_softnorm)

df_cat=pd.DataFrame(X, columns=X.design_info.column_names)
df_cat.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.male],C(Embarked)[T.Q],C(Embarked)[T.S]
0,1.0,0.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0


In [41]:
df3 = pd.concat([df_cat, df_stand], axis=1)
df3.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.male],C(Embarked)[T.Q],C(Embarked)[T.S],Fare,Parch,Age,SibSp
0,1.0,0.0,1.0,1.0,0.0,1.0,-0.50024,-0.474326,0.43135,-0.586701
1,1.0,0.0,0.0,0.0,0.0,0.0,0.788947,-0.474326,0.43135,0.646958
2,1.0,0.0,1.0,0.0,0.0,1.0,-0.48665,-0.474326,-0.475199,-0.278286
3,1.0,0.0,0.0,0.0,0.0,1.0,0.422861,-0.474326,0.43135,0.415647
4,1.0,0.0,1.0,1.0,0.0,1.0,-0.484133,-0.474326,-0.475199,0.415647


In [42]:
X_softnorm=df3;
y_softnorm=df_softnorm['Survived'];

In [43]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

cv = StratifiedKFold(y_softnorm, n_folds=10, shuffle=True, random_state=5)


lg3= LogisticRegression(random_state=5)
dt3 = DecisionTreeClassifier(class_weight='balanced',min_samples_split=50,random_state=5)
bdt3 = BaggingClassifier(DecisionTreeClassifier(random_state=5),random_state=5)
rf3 = RandomForestClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
et3 = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
abc3 = AdaBoostClassifier(random_state=5)
gbst3 = GradientBoostingClassifier(random_state=5)
bnb3 = BernoulliNB()
svc3= SVC(kernel='linear',probability=True,random_state=5)


def score(model, name):
    s = cross_val_score(model, X_softnorm, y_softnorm, cv=cv, n_jobs=-1)
    print "{} Outter Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))
        
score(dt3,"Decision Tree")
score(lg3, "Logistic Regres")
score(bdt3, "Bagging DT")
score(rf3, "Random Forest")
score(et3, "Extra Trees")
score(abc3, "Ada Boost")
score(gbst3, "Grad Boosting")
score(bnb3, "Bernoulli NB")
score(svc3, "SVC")

Decision Tree Outter Score:	0.809 ± 0.057
Logistic Regres Outter Score:	0.802 ± 0.038
Bagging DT Outter Score:	0.832 ± 0.043
Random Forest Outter Score:	0.817 ± 0.045
Extra Trees Outter Score:	0.794 ± 0.045
Ada Boost Outter Score:	0.804 ± 0.045
Grad Boosting Outter Score:	0.827 ± 0.039
Bernoulli NB Outter Score:	0.753 ± 0.038
SVC Outter Score:	0.786 ± 0.039


### Regular No Biscale

In [44]:
soft_frame = pd.DataFrame({'Survived':X_filled_soft[:,0],'Pclass':X_filled_soft[:,1],\
                       'Sex':X_filled_soft[:,2],'SibSp':X_filled_soft[:,3],
                        'Parch':X_filled_soft[:,4],'Age':X_filled_soft[:,5],"Fare":X_filled_soft[:,6]});

df_soft = pd.concat([soft_frame, df_merge], axis=1, join='inner').dropna()
df_soft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 14 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null float64
Sex         889 non-null float64
SibSp       889 non-null float64
Survived    889 non-null float64
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
Sex         889 non-null object
SibSp       889 non-null int64
Parch       889 non-null int64
dtypes: float64(8), int64(4), object(2)
memory usage: 104.2+ KB


In [45]:
df_soft=df_soft.iloc[:,0:8]
df_soft.info()

df_soft

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null float64
Sex         889 non-null float64
SibSp       889 non-null float64
Survived    889 non-null float64
Embarked    889 non-null object
dtypes: float64(7), object(1)
memory usage: 62.5+ KB


Unnamed: 0,Age,Fare,Parch,Pclass,Sex,SibSp,Survived,Embarked
0,22.000000,7.2500,0.0,3.0,2.0,1.0,0.0,S
1,38.000000,71.2833,0.0,1.0,1.0,1.0,1.0,C
2,26.000000,7.9250,0.0,3.0,1.0,0.0,1.0,S
3,35.000000,53.1000,0.0,1.0,1.0,1.0,1.0,S
4,35.000000,8.0500,0.0,3.0,2.0,0.0,0.0,S
5,4.636919,8.4583,0.0,3.0,2.0,0.0,0.0,Q
6,54.000000,51.8625,0.0,1.0,2.0,0.0,0.0,S
7,2.000000,21.0750,1.0,3.0,2.0,3.0,0.0,S
8,27.000000,11.1333,2.0,3.0,1.0,0.0,1.0,S
9,14.000000,30.0708,0.0,2.0,1.0,1.0,1.0,C


In [46]:
df_soft['Pclass']=df_soft['Pclass'].astype(int);
df_soft['Survived']=df_soft['Survived'].astype(int);
df_soft['Sex']=df_soft['Sex'].astype(int);


df_soft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null int64
Sex         889 non-null int64
SibSp       889 non-null float64
Survived    889 non-null int64
Embarked    889 non-null object
dtypes: float64(4), int64(3), object(1)
memory usage: 62.5+ KB


In [47]:
from sklearn.preprocessing import StandardScaler
cont_var=['Fare','Parch','SibSp','Age']
x=df_soft[cont_var]


X_norm =  StandardScaler().fit_transform(x);
df_stand=pd.DataFrame(X_norm,columns=['Fare','Parch','Age','SibSp'])
len(df_stand)

889

In [48]:
X = patsy.dmatrix('~ C(Pclass)+C(Sex)+ C(Embarked)', df_soft)

df_cat=pd.DataFrame(X, columns=X.design_info.column_names)
df_cat.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S]
0,1.0,0.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0


In [49]:
df4 = pd.concat([df_cat, df_stand], axis=1)
df4.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S],Fare,Parch,Age,SibSp
0,1.0,0.0,1.0,1.0,0.0,1.0,-0.50024,-0.474326,0.43135,-0.189658
1,1.0,0.0,0.0,0.0,0.0,0.0,0.788947,-0.474326,0.43135,0.805884
2,1.0,0.0,1.0,0.0,0.0,1.0,-0.48665,-0.474326,-0.475199,0.059227
3,1.0,0.0,0.0,0.0,0.0,1.0,0.422861,-0.474326,0.43135,0.61922
4,1.0,0.0,1.0,1.0,0.0,1.0,-0.484133,-0.474326,-0.475199,0.61922


In [50]:
X_soft=df4;
y_soft=df_soft['Survived'].values;



In [51]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

cv = StratifiedKFold(y_soft, n_folds=10, shuffle=True, random_state=5)


lg3= LogisticRegression(random_state=5)
dt3 = DecisionTreeClassifier(class_weight='balanced',min_samples_split=50,random_state=5)
bdt3 = BaggingClassifier(DecisionTreeClassifier(random_state=5),random_state=5)
rf3 = RandomForestClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
et3 = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
abc3 = AdaBoostClassifier(random_state=5)
gbst3 = GradientBoostingClassifier(random_state=5)
bnb3 = BernoulliNB()
svc3= SVC(kernel='linear',probability=True,random_state=5)


def score(model, name):
    s = cross_val_score(model, X_soft, y_soft, cv=cv, n_jobs=-1)
    print "{} Outter Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))
        
score(dt3,"Decision Tree")
score(lg3, "Logistic Regres")
score(bdt3, "Bagging DT")
score(rf3, "Random Forest")
score(et3, "Extra Trees")
score(abc3, "Ada Boost")
score(gbst3, "Grad Boosting")
score(bnb3, "Bernoulli NB")
score(svc3, "SVC")

Decision Tree Outter Score:	0.781 ± 0.049
Logistic Regres Outter Score:	0.8 ± 0.048
Bagging DT Outter Score:	0.81 ± 0.032
Random Forest Outter Score:	0.816 ± 0.04
Extra Trees Outter Score:	0.787 ± 0.029
Ada Boost Outter Score:	0.795 ± 0.041
Grad Boosting Outter Score:	0.826 ± 0.037
Bernoulli NB Outter Score:	0.754 ± 0.038
SVC Outter Score:	0.786 ± 0.039


### Checking MICE Imputation

In [545]:
mice_frame = pd.DataFrame({'Survived':X_filled_mice[:,0],'Pclass':X_filled_mice[:,1],\
                       'Sex':X_filled_mice[:,2],'SibSp':X_filled_mice[:,3],
                        'Parch':X_filled_mice[:,4],'Age':X_filled_mice[:,5],"Fare":X_filled_mice[:,6]});

df_mice = pd.concat([mice_frame, df_merge], axis=1, join='inner').dropna()
df_mice.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null float64
Sex         889 non-null float64
SibSp       889 non-null float64
Survived    889 non-null float64
Embarked    889 non-null object
Fare        889 non-null float64
Pclass      889 non-null int64
Survived    889 non-null int64
dtypes: float64(8), int64(2), object(1)
memory usage: 83.3+ KB


In [546]:
df_mice=df_mice.T.drop_duplicates().T

df_mice.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null object
Fare        889 non-null object
Parch       889 non-null object
Pclass      889 non-null object
Sex         889 non-null object
SibSp       889 non-null object
Survived    889 non-null object
Embarked    889 non-null object
dtypes: object(8)
memory usage: 62.5+ KB


In [547]:
df_mice['Pclass']=df_mice['Pclass'].astype(int);
df_mice['Survived']=df_mice['Survived'].astype(int);
df_mice['Sex']=df_mice['Sex'].astype(int);
df_mice['Age']=df_mice['Age'].astype(float);
df_mice['SibSp']=df_mice['SibSp'].astype(float);
df_mice['Parch']=df_mice['Parch'].astype(float);
df_mice['Fare']=df_mice['Fare'].astype(float);

df_mice.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
Age         889 non-null float64
Fare        889 non-null float64
Parch       889 non-null float64
Pclass      889 non-null int64
Sex         889 non-null int64
SibSp       889 non-null float64
Survived    889 non-null int64
Embarked    889 non-null object
dtypes: float64(4), int64(3), object(1)
memory usage: 62.5+ KB


In [548]:
from sklearn.preprocessing import StandardScaler
cont_var=['Fare','Parch','SibSp','Age']
x=df_mice[cont_var]


X_norm =  StandardScaler().fit_transform(x);
df_stand=pd.DataFrame(X_norm,columns=['Fare','Parch','Age','SibSp'])
len(df_stand)

889

In [549]:
X = patsy.dmatrix('~ C(Pclass)+C(Sex)+ C(Embarked)', df_mice)

df_cat=pd.DataFrame(X, columns=X.design_info.column_names)
df_cat.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S]
0,1.0,0.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,1.0,0.0,1.0


In [550]:
df2 = pd.concat([df_cat, df_stand], axis=1)
df2.head()

Unnamed: 0,Intercept,C(Pclass)[T.2],C(Pclass)[T.3],C(Sex)[T.2],C(Embarked)[T.Q],C(Embarked)[T.S],Fare,Parch,Age,SibSp
0,1.0,0.0,1.0,1.0,0.0,1.0,-0.50024,-0.474326,0.43135,-0.580877
1,1.0,0.0,0.0,0.0,0.0,0.0,0.788947,-0.474326,0.43135,0.649514
2,1.0,0.0,1.0,0.0,0.0,1.0,-0.48665,-0.474326,-0.475199,-0.27328
3,1.0,0.0,0.0,0.0,0.0,1.0,0.422861,-0.474326,0.43135,0.418816
4,1.0,0.0,1.0,1.0,0.0,1.0,-0.484133,-0.474326,-0.475199,0.418816


In [551]:
X_mice=df2;
y_mice=df_mice['Survived'];

In [552]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

cv = StratifiedKFold(y_mice, n_folds=10, shuffle=True, random_state=5)

# X_resampled1
# y_resampled1

lg3= LogisticRegression(random_state=5)
dt3 = DecisionTreeClassifier(class_weight='balanced',min_samples_split=50,random_state=5)
bdt3 = BaggingClassifier(DecisionTreeClassifier(random_state=5),random_state=5)
rf3 = RandomForestClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
et3 = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1,random_state=5)
abc3 = AdaBoostClassifier(random_state=5)
gbst3 = GradientBoostingClassifier(random_state=5)
bnb3 = BernoulliNB()
svc3= SVC(kernel='linear',probability=True,random_state=5)


def score(model, name):
    s = cross_val_score(model, X_mice, y_mice, cv=cv, n_jobs=-1)
    print "{} Outter Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))
        
score(dt3,"Decision Tree")
score(lg3, "Logistic Regres")
score(bdt3, "Bagging DT")
score(rf3, "Random Forest")
score(et3, "Extra Trees")
score(abc3, "Ada Boost")
score(gbst3, "Grad Boosting")
score(bnb3, "Bernoulli NB")
score(svc3, "SVC")

Decision Tree Outter Score:	0.794 ± 0.052
Logistic Regres Outter Score:	0.808 ± 0.038
Bagging DT Outter Score:	0.805 ± 0.041
Random Forest Outter Score:	0.825 ± 0.053
Extra Trees Outter Score:	0.781 ± 0.049
Ada Boost Outter Score:	0.804 ± 0.047
Grad Boosting Outter Score:	0.828 ± 0.038
Bernoulli NB Outter Score:	0.765 ± 0.028
SVC Outter Score:	0.786 ± 0.039


In [53]:
df_knn.to_csv(path_or_buf="../assets/titanic_imp.csv", index=False)