In [1]:
import numpy as np

seed_ = 20200218
np.random.seed(seed_)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import stats

sns.set(context='paper', style='whitegrid', rc={'figure.figsize':(21/2, 9/2), 'figure.dpi': 120})
color_palette = sns.color_palette("Set2")
sns.set_palette(color_palette)

# Sk-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Keras
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv("./data/train_cleaned.csv")

In [3]:
X = df.drop(columns=['Survived'], axis=1)
y = df['Survived']

In [4]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.0,1,0,7.25,0,0,1
1,1,0,38.0,1,0,71.2833,1,0,0
2,3,0,26.0,0,0,7.925,0,0,1
3,1,0,35.0,1,0,53.1,0,0,1
4,3,1,35.0,0,0,8.05,0,0,1


In [5]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# 
## Normalize input
## split train test

In [6]:
scaler = StandardScaler()
scaler.fit(X)

X_normalized = scaler.transform(X)

In [7]:
X_normalized

array([[ 0.82412268,  0.73779931, -0.57937536, ..., -0.48127201,
        -0.30813068,  0.6155339 ],
       [-1.57529024, -1.35538212,  0.64994903, ...,  2.07782705,
        -0.30813068, -1.62460588],
       [ 0.82412268, -1.35538212, -0.27204426, ..., -0.48127201,
        -0.30813068,  0.6155339 ],
       ...,
       [ 0.82412268, -1.35538212, -0.04154594, ..., -0.48127201,
        -0.30813068,  0.6155339 ],
       [-1.57529024,  0.73779931, -0.27204426, ...,  2.07782705,
        -0.30813068, -1.62460588],
       [ 0.82412268,  0.73779931,  0.18895238, ..., -0.48127201,
         3.24537633, -1.62460588]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2)

# 
## Grid Search for LogisticRegression

In [18]:
logistic_model = LogisticRegression()

In [10]:
logistic_params_grid = [
    {'penalty': ['l1'], 
     'solver': ['liblinear', 'saga'], 
     'max_iter': [50, 100, 200]
    },
    {'penalty': ['l2'], 
     'solver': ['newton-cg', 'sag', 'lbfgs', 'liblinear', 'saga'], 
     'max_iter': [50, 100, 200]
    }
]

In [11]:
lr_clf = GridSearchCV(logistic_model, param_grid=logistic_params_grid, cv=5)
lr_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid=[{'max_iter': [50, 100, 200], 'penalty': ['l1'],
                          'solver': ['liblinear', 'saga']},
                         {'max_iter': [50, 100, 200], 'penalty': ['l2'],
                          'solver': ['newton-cg', 'sag', 'lbfgs', 'liblinear',
                                     'saga']}])

In [12]:
lr_clf.best_params_

{'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear'}

In [13]:
lr_clf.best_score_

0.7929577464788733

In [14]:
lr_clf.best_estimator_

LogisticRegression(max_iter=50, penalty='l1', solver='liblinear')

In [15]:
best_lr = lr_clf.best_estimator_
best_lr.fit(X_train, y_train)

LogisticRegression(max_iter=50, penalty='l1', solver='liblinear')

In [16]:
best_lr.score(X_train, y_train)

0.7915492957746478

In [17]:
best_lr.score(X_test, y_test)

0.7921348314606742

# 
## Grid Search for Ridge

In [19]:
ridge_model = RidgeClassifier()

In [32]:
ridge_params_grid = [
    {
        'alpha': [1, 0.5, 0.1, 0.01, 0.001],
        'max_iter': [100, 200, 400, 1000],
        'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
]

In [33]:
r_clf = GridSearchCV(ridge_model, param_grid=ridge_params_grid, cv=5)

In [34]:
r_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RidgeClassifier(),
             param_grid=[{'alpha': [1, 0.5, 0.1, 0.01, 0.001],
                          'max_iter': [100, 200, 400, 1000],
                          'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg',
                                     'sag', 'saga']}])

In [35]:
r_clf.best_params_

{'alpha': 1, 'max_iter': 100, 'solver': 'svd'}

In [36]:
r_clf.best_score_

0.7929577464788733

In [37]:
r_clf.best_estimator_

RidgeClassifier(alpha=1, max_iter=100, solver='svd')

In [38]:
best_ridge = r_clf.best_estimator_

In [39]:
best_ridge.fit(X_train, y_train)

RidgeClassifier(alpha=1, max_iter=100, solver='svd')

In [40]:
best_ridge.score(X_train, y_train)

0.8028169014084507

In [41]:
best_ridge.score(X_test, y_test)

0.7921348314606742