In [76]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


In [77]:
# Locate the processed train/test CSV files relative to the parent directory.
processed_dir = os.path.join(os.path.pardir, "data", "processed")
pr_train_path = os.path.join(processed_dir, "train.csv")
pr_test_path = os.path.join(processed_dir, "test.csv")

# Read both files the same way, using PassengerId as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (pr_train_path, pr_test_path)
)
# Call train_df.info() / test_df.info() here to inspect dtypes and null counts.

In [78]:
# Feature matrix: every column from "Age" through the last column, cast to float.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` works on both old and new versions.
X = train_df.loc[:, "Age":].values.astype("float")
# Target vector. A Series' `.values` is already a 1-D ndarray, so the extra
# ravel() (deprecated on Series in recent pandas) is not needed.
y = train_df["Survived"].values
print(type(X), type(y))
print(X.shape, y.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(891, 32) (891,)


In [79]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; random_state pins the shuffle.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Show the shape of each partition in the order: train X, train y, test X, test y.
print(*(part.shape for part in (Xtrain, ytrain, Xtest, ytest)))

(712, 32) (712,) (179, 32) (179,)


In [80]:
# Compare the positive-class rate (mean of the label array) of the two partitions.
for fmt, split_labels in (
    ("mean survival in train set : {0:.3f}", ytrain),
    ("mean survival in test set : {0:.3f}", ytest),
):
    print(fmt.format(np.mean(split_labels)))

mean survival in train set : 0.383
mean survival in test set : 0.385


In [81]:
import sklearn
# sklearn.__version__ was noted as '0.19.0' when this cell was written.
from sklearn.dummy import DummyClassifier as dc
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Baseline: always predict the class that is most frequent in the training labels.
model_dummy = dc(strategy="most_frequent", random_state=25)
model_dummy.fit(Xtrain, ytrain)
print("Score for baseline model : {0:.2f}".format(model_dummy.score(Xtest,ytest)))

# Predict once and reuse the result for every metric instead of calling
# predict() again for each print statement.
baseline_pred = model_dummy.predict(Xtest)
print("Accuracy of our baseline model : {0:.2f}".format(accuracy_score(ytest, baseline_pred)))
print("Confusion Matrix of our baseline model : \n{0}".format(confusion_matrix(ytest, baseline_pred)))
# The recorded confusion matrix shows no positive predictions, so precision
# is reported as 0 together with an undefined-metric warning.
print("Precision Score of our baseline model : {0:.2f}".format(precision_score(ytest, baseline_pred)))
print("Recall Score of our baseline model : {0:.2f}".format(recall_score(ytest, baseline_pred)))

Score for baseline model : 0.61
Accuracy of our baseline model : 0.61
Confusion Matrix of our baseline model : 
[[110   0]
 [ 69   0]]
Precision Score of our baseline model : 0.00
Recall Score of our baseline model : 0.00


  'precision', 'predicted', average, warn_for)


In [82]:
# Write the baseline model's predictions for the processed test set to a CSV.
# The same steps are wrapped in get_submission_file() in the next cell.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
testX = test_df.values.astype("float")
pred = model_dummy.predict(testX)
df_submit = pd.DataFrame({"PassengerId":test_df.index,"Survived":pred})
import pathlib2 as pl2
submit_data_path = os.path.join(os.path.pardir,'data','external')
# Create the output directory (and any missing parents) if it does not exist yet.
pl2.Path(submit_data_path).mkdir(parents=True, exist_ok=True)
sub_fl_path = os.path.join(submit_data_path,"baseline_pred.csv")
# index=False: PassengerId is already an explicit column of df_submit.
df_submit.to_csv(sub_fl_path,index=False)


In [83]:
def get_submission_file(model, filename):
    """Predict on the processed test set and save the result as a CSV file.

    Reads the module-level ``test_df`` frame, converts it to a float matrix,
    asks ``model`` for predictions and writes a two-column table
    (``PassengerId``, ``Survived``) to ``../data/external/<filename>``.
    The output directory is created if it is missing.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside ``../data/external``.

    Returns
    -------
    None
    """
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    testX = test_df.values.astype("float")
    pred = model.predict(testX)
    df_submit = pd.DataFrame({"PassengerId": test_df.index, "Survived": pred})
    submit_data_path = os.path.join(os.path.pardir, 'data', 'external')
    # Standard-library equivalent of pathlib2's Path.mkdir(parents=True, exist_ok=True).
    os.makedirs(submit_data_path, exist_ok=True)
    sub_fl_path = os.path.join(submit_data_path, filename)
    # index=False: PassengerId is already an explicit column of df_submit.
    df_submit.to_csv(sub_fl_path, index=False)

In [84]:
# Write the baseline model's test-set predictions to baseline_pred.csv using the helper above.
get_submission_file(model_dummy,"baseline_pred.csv")

In [85]:
from sklearn.linear_model import LogisticRegression as lr

# Version 1: logistic regression with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
mdl_lr_1 = lr().fit(Xtrain, ytrain)
lr_1_accuracy = mdl_lr_1.score(Xtest, ytest)
print("Score of logistic regression - version 1 : {0:.2f}".format(lr_1_accuracy))

Score of logistic regression - version 1 : 0.83


In [86]:
def get_metrics(Xtest, ytest, model, model_name="model"):
    """Print accuracy, confusion matrix, precision and recall for a fitted model.

    Parameters
    ----------
    Xtest : array-like
        Feature matrix to evaluate on.
    ytest : array-like
        True labels corresponding to ``Xtest``.
    model : object
        Fitted classifier exposing ``predict(X)``.
    model_name : str, optional
        Label used in the printed messages. Earlier versions always printed
        "baseline model", which mislabelled every non-baseline classifier.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
    # Predict once and share the result across all four metrics.
    ypred = model.predict(Xtest)
    print("Accuracy of our {0} : {1:.2f}".format(model_name, accuracy_score(ytest, ypred)))
    print("Confusion Matrix of our {0} : \n{1}".format(model_name, confusion_matrix(ytest, ypred)))
    print("Precision Score of our {0} : {1:.2f}".format(model_name, precision_score(ytest, ypred)))
    print("Recall Score of our {0} : {1:.2f}".format(model_name, recall_score(ytest, ypred)))

In [87]:
# Report accuracy, confusion matrix, precision and recall of the first logistic regression on the held-out split.
get_metrics(Xtest, ytest, mdl_lr_1)

Accuracy of our baseline model : 0.83
Confusion Matrix of our baseline model : 
[[95 15]
 [15 54]]
Precision Score of our baseline model : 0.78
Recall Score of our baseline model : 0.78


In [88]:
# Inspect the fitted coefficients of the first logistic regression (one weight per input feature).
mdl_lr_1.coef_

array([[-0.02840734,  0.00455631, -0.50017004,  0.61922838, -0.81414743,
         0.12823264, -0.17253859, -0.39355488,  0.52215008,  1.09939125,
         0.40346551, -0.18369316, -0.30021028,  0.96558544,  0.48281794,
        -0.3451608 ,  0.28258585,  1.21850069,  0.56334183, -1.44612507,
         1.07146232, -0.11345497, -0.47306807,  0.16297326,  0.24746349,
         0.27998252,  0.4128233 ,  0.49202884,  0.46214499,  0.14906873,
         0.37253571,  0.73070686]])

In [89]:
# Write the first logistic regression's test-set predictions to lr_1_pred.csv.
get_submission_file(mdl_lr_1,"lr_1_pred.csv")

In [90]:
# Unfitted logistic regression with a fixed seed; it is the base estimator for the grid searches that follow.
mdl_lr_2 = lr(random_state=0)

In [91]:
from sklearn.model_selection import GridSearchCV as gsv
from pprint import pprint as pp

# Hyper-parameter grid: regularisation strength C crossed with the penalty type.
param = {
    "C": [1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0],
    "penalty": ["l1", "l2"],
}
# 3-fold cross-validated search on the unscaled training split.
# fit() returns the search object itself, so it can be chained onto the constructor.
clf = gsv(mdl_lr_2, param_grid=param, cv=3).fit(Xtrain, ytrain)
pp(clf.best_params_)
print("Best Score : {0:.2f}".format(clf.best_score_))
print("Score of logistic regression - version 2 : {0:.2f}".format(clf.score(Xtest, ytest)))

{'C': 1.0, 'penalty': 'l1'}
Best Score : 0.83
Score of logistic regression - version 2 : 0.83


In [92]:
# Write test-set predictions from the grid-search object `clf` to lr_2_pred.csv.
get_submission_file(clf,"lr_2_pred.csv")

In [96]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scale the training features; the scaler learns its range from the training split only.
scaler = MinMaxScaler()
Xtrain_sc = scaler.fit_transform(Xtrain)
# Sanity check on the first feature column: print its min and max after scaling.
first_feature_sc = Xtrain_sc[:, 0]
print(first_feature_sc.min(), first_feature_sc.max())

0.0 1.0


In [97]:
# Apply the already-fitted min-max scaler to the held-out split (transform only, no refit).
Xtest_sc = scaler.transform(Xtest)

In [102]:
# Standardise the features using statistics learned from the training split only.
std_scaler = StandardScaler()
Xtrain_stdsc = std_scaler.fit_transform(Xtrain)
# Print the range of the first standardised feature column as a quick check.
first_feature_std = Xtrain_stdsc[:, 0]
print(first_feature_std.min(), first_feature_std.max())
Xtest_stdsc = std_scaler.transform(Xtest)

from pprint import pprint as pp
# Version 3: same grid as before, 5 folds, on standardised features.
# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validation, so each fold's validation rows contribute to the scaling statistics.
clf1 = gsv(mdl_lr_2, param_grid=param, cv=5).fit(Xtrain_stdsc, ytrain)
pp(clf1.best_params_)
print("Best Score : {0:.2f}".format(clf1.best_score_))
print("Score of logistic regression - version 3 : {0:.2f}".format(clf1.score(Xtest_stdsc, ytest)))

-2.14395510183 3.80311270129
{'C': 1.0, 'penalty': 'l1'}
Best Score : 0.82
Score of logistic regression - version 3 : 0.84


In [104]:
import pickle

# Persist the tuned search object and its matching scaler side by side, so the
# same preprocessing can be applied to new data before predicting.
models_dir = os.path.join(os.path.pardir, 'models')
# Create the target directory if needed so the writes below cannot fail on a missing folder.
os.makedirs(models_dir, exist_ok=True)
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scalar_file_path = os.path.join(os.path.pardir, 'models', 'lr_scalar.pkl')

# Context managers guarantee the handles are closed even if pickling raises.
with open(model_file_path, "wb") as mdl_fl_pkl, open(scalar_file_path, "wb") as scl_fl_pkl:
    pickle.dump(clf1, mdl_fl_pkl)
    pickle.dump(std_scaler, scl_fl_pkl)

# Round-trip check: reload both objects from disk. The read handles are now
# closed as well (they were previously left open).
# Only unpickle files you trust -- these were written by this notebook just above.
with open(model_file_path, "rb") as mdl_fl_pkl, open(scalar_file_path, "rb") as scl_fl_pkl:
    clf_loaded = pickle.load(mdl_fl_pkl)
    scl_loaded = pickle.load(scl_fl_pkl)

print(clf_loaded, scl_loaded)
# Scale the raw held-out features with the reloaded scaler before scoring.
X_test_scaled = scl_loaded.transform(Xtest)
print("Score for persisted logistic regression : {0:.2f}".format(clf_loaded.score(X_test_scaled,ytest)))



GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0) StandardScaler(copy=True, with_mean=True, with_std=True)
Score for persisted logistic regression : 0.84
