In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [4]:
# Feature matrix, read from a .npy file in the working directory.
# The 'tatanic' spelling is kept as-is: it has to match the file name on disk.
x = np.load(file='tatanic_X_train.npy')

In [5]:
# Target vector that pairs with `x`, read from a .npy file in the working directory.
# The 'tatanic' spelling is kept as-is: it has to match the file name on disk.
y = np.load(file='tatanic_y_train.npy')

In [6]:
# Preview the first five rows of the feature matrix.
x[:5]

array([[0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
        0.125     , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.473882  , 0.13913574, 0.        , 0.        , 1.        ,
        0.125     , 0.25      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.32356257, 0.01546857, 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.    

In [7]:
# Preview the first five target labels.
y[:5]

array([0., 1., 1., 1., 0.])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation.
# random_state is pinned (same seed the estimators below use) so the split,
# and every score derived from it, is reproducible after a kernel restart;
# without it each run shuffles the data differently.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

In [9]:
# Base learners; the seed is pinned on the two estimators that accept one.
clflog = LogisticRegression(random_state=1)
clfdt = DecisionTreeClassifier(random_state=1)
clfgn = GaussianNB()

# Voting ensembles over the three base learners.
# The tree is labelled 'dt' (it was 'rf', which reads as "random forest" and
# did not match the 'dt' label / `dt__*` parameter keys used further down).
# voting='hard' takes the majority of predicted labels;
# voting='soft' averages the predicted class probabilities.
eclf_h = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='hard',
)
eclf_s = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='soft',
)

In [10]:
# Everything to be compared: the three base learners, then both ensembles.
models = [
    clflog,
    clfdt,
    clfgn,
    eclf_h,
    eclf_s,
]

In [11]:
# Fit every model on the training split and report its mean accuracy on the
# held-out split.
for model in models:
    model.fit(x_train, y_train)
    # score() runs the prediction itself, so the separate predict() call whose
    # result was never read has been dropped.
    score = model.score(x_test, y_test)
    print(model)
    print(score)
    print('-'*20)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.8277153558052435
--------------------
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')
0.8202247191011236
--------------------
GaussianNB(priors=None, var_smoothing=1e-09)
0.41198501872659177
--------------------
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_stat



In [22]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split for each entry in `models`:
# prints the per-fold scores, their mean, and a separator line.
# NOTE(review): the saved output under this cell lists four results although
# `models` holds five entries at this point in the notebook; the cell appears
# to have been executed after `models` was rebound further down. Re-run the
# notebook top to bottom to refresh it.
for model in models:
    scores = cross_val_score(model, x_train, y_train, cv=5)
    print(scores, scores.mean(), '-'*20, sep='\n')

[0.76       0.744      0.81451613 0.82258065 0.87903226]
0.8040258064516129
--------------------
[0.696      0.768      0.76612903 0.77419355 0.7983871 ]
0.760541935483871
--------------------
[0.736      0.768      0.80645161 0.7983871  0.83870968]
0.7895096774193548
--------------------
[0.696      0.768      0.76612903 0.77419355 0.7983871 ]
0.760541935483871
--------------------




In [13]:
# Two-member ensemble: logistic regression plus decision tree.
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)

# eclf1 votes 'hard', eclf2 votes 'soft'; both wrap the same two estimators
# under the labels 'lr' and 'dt'.
eclf1, eclf2 = (
    VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting=mode)
    for mode in ('hard', 'soft')
)

In [14]:
# Rebind `models` to the two base learners and the two-member ensembles, then
# run 5-fold cross-validation on the training split for each of them.
models = [
    clf1,
    clf2,
    eclf1,
    eclf2,
]
for model in models:
    scores = cross_val_score(model, x_train, y_train, cv=5)
    # One call, newline-separated: fold scores, mean, separator.
    print(scores, scores.mean(), '-'*20, sep='\n')

[0.76       0.744      0.81451613 0.82258065 0.87903226]
0.8040258064516129
--------------------
[0.696      0.768      0.76612903 0.77419355 0.7983871 ]
0.760541935483871
--------------------
[0.736      0.768      0.80645161 0.7983871  0.83870968]
0.7895096774193548
--------------------
[0.696      0.768      0.76612903 0.77419355 0.7983871 ]
0.760541935483871
--------------------




In [15]:
# Candidate values for the logistic regression's C parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Grid-search space. Keys follow "<estimator label>__<parameter>", so 'lr__*'
# targets the estimator labelled 'lr' and 'dt__*' the one labelled 'dt'.
# Size: 1 * 1 * 7 * 2 * 8 * 9 = 1008 parameter combinations.
params = {
    "lr__solver": ['liblinear'],
    "lr__penalty": ["l2"],
    "lr__C": c_params,
    "dt__criterion": ["gini", "entropy"],
    "dt__max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
    "dt__min_samples_leaf": list(range(1, 10)),
}

In [18]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over `params` for the hard-voting ensemble,
# fitted on the full x / y arrays.
# The fitted search gets its own name because the next cell rebinds `grid` to
# the soft-voting search, which would otherwise throw this result away.
grid_hard = GridSearchCV(estimator=eclf1, param_grid=params, cv=5)
grid_hard.fit(x, y)

# Alias kept so any existing reference to `grid` keeps working.
grid = grid_hard

In [19]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over `params` for the soft-voting ensemble,
# fitted on the full x / y arrays. fit() returns the search object itself, so
# construction and fitting are chained into one binding of `grid`.
grid = GridSearchCV(eclf2, params, cv=5).fit(x, y)