### Setup the imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
from matplotlib import pyplot as plt

%matplotlib inline

### Read in the data

In [None]:
# Load the training CSV, using its first column as the DataFrame index.
df = pd.read_csv('train.csv', index_col=0)
# Show the first five rows as the cell output.
df.head(n=5)

### Basic EDA

In [None]:
# Missing data overview: heatmap of the boolean NaN mask of the whole frame.
sns.heatmap(data=df.isnull())

In [None]:
# Pairwise plots of the columns, coloured by the 'Survived' label.
sns.pairplot(data=df, hue='Survived')

In [None]:
# Strength of correlation: feature-feature, and feature-dependent variable.
# The absolute correlation is used because it is the magnitude of the
# correlation, not its direction, that we are interested in.
# Only numeric columns are correlated: the frame also holds text columns
# (e.g. 'Sex'), and pandas >= 2.0 raises on those instead of dropping them.
plt.figure(figsize=(12, 8))
sns.heatmap(df.select_dtypes(include='number').corr().abs(), annot=True)

### Prepare the X and y data

In [None]:
# Target vector: the 'Survived' column.
y = df.loc[:, 'Survived']
# Feature matrix: the raw columns that feed the feature engineering step.
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

### Train test split X and y data

In [None]:
# Fix the seed so the split, and every score derived from it, is reproducible
# from one run of the notebook to the next.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

### Feature Engineering

In [None]:
def feature_engineering(X, agemean):
    """Add engineered features to a DataFrame in place.

    Missing ages are replaced by ``agemean``, binary ``female`` and ``child``
    indicators are derived, and each indicator and its complement are
    multiplied by ``Pclass`` to form interaction terms. The raw ``Sex``
    column is removed at the end.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Sex`` and
        ``Pclass``. It is modified in place.
    agemean : float
        Value used to fill missing ``Age`` entries.

    Returns
    -------
    None
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the X['Age'] selection: the chained in-place form is deprecated and
    # leaves X untouched when pandas copy-on-write is active.
    X['Age'] = X['Age'].fillna(agemean)
    # one-hot encoding
    X['female'] = (X['Sex'] == 'female').astype(int)
    X['child'] = (X['Age'] < 14).astype(int)
    # interaction terms
    X['female_pclass'] = X['female']       * X['Pclass']
    X['male_pclass']   = (1 - X['female']) * X['Pclass']
    X['child_pclass']  = X['child']        * X['Pclass']
    X['adult_pclass']  = (1 - X['child'])  * X['Pclass']
    # 'Sex' is now represented by the 'female' indicator.
    del X['Sex']

In [None]:
# Take explicit copies of both splits before they are modified in place.
Xtrain, Xtest = Xtrain.copy(), Xtest.copy()

In [None]:
# Mean age per (Pclass, Sex) group over the full frame; it is not passed to
# the feature engineering calls in this cell.
agemeans = df.groupby(by=['Pclass', 'Sex'])['Age'].agg('mean')

# The imputation value comes from the training split only and is reused for
# the test split, which avoids 'data leakage' through .mean().
agemean = Xtrain.loc[:, 'Age'].mean()
feature_engineering(Xtrain, agemean)
feature_engineering(Xtest, agemean)

In [None]:
# Sanity check: shapes of the training features and training labels.
(Xtrain.shape, ytrain.shape)

In [None]:
# Sanity check: shapes of the test features and test labels.
(Xtest.shape, ytest.shape)

### LogReg Model

In [None]:
# Logistic regression classifier with the solver iteration cap set to 1000.
m = LogisticRegression(max_iter=1000)

### Cross-validate the model and interpret the results

In [None]:
# 10-fold cross-validated accuracy of the model on the training split.
cv_all = cross_val_score(estimator=m, X=Xtrain, y=ytrain, scoring='accuracy', cv=10)
# Summarise the fold scores by their mean and (population) standard deviation.
cv_mean = np.mean(cv_all)
cv_std = cv_all.std()
f"Cross-validation mean {cv_mean:5.3f} +- {cv_std:5.3f}"

In [None]:
# Let's visualise the distribution of the results: one bar per fold.
plt.bar(x=range(10), height=cv_all)

In [None]:
# Fit on the whole training split and score the model on that same data.
train_score = m.fit(Xtrain, ytrain).score(Xtrain, ytrain)
train_score

In [None]:
# Does the training score lie strictly between the worst and best fold scores?
cv_all.min() < train_score < cv_all.max()

In [None]:
# Is the score inside one sigma of the cross-validation mean?
# The absolute deviation is compared so that a score far *below* the mean
# is not reported as being inside the band.
abs(train_score - cv_mean) < cv_std

In [None]:
# Score the fitted model on the held-out test split.
test_score = m.score(X=Xtest, y=ytest)
test_score

### Evaluate the model's predictive power

In [None]:
# Predicted class labels for the test features, used by the confusion matrix.
ypred = m.predict(Xtest)

In [None]:
# Normalise the confusion matrix - it's easier to interpret.
# Each cell becomes a percentage of all test samples, rounded to 2 decimals.
norm_cm = confusion_matrix(ytest, ypred)
norm_cm = (norm_cm / norm_cm.sum()) * 100
norm_cm = norm_cm.round(2)

In [None]:
# Annotated heatmap of the percentage confusion matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(data=norm_cm, annot=True)

In [None]:
# ROC curve, built from the second column of predict_proba on the test set.
ypred_proba = m.predict_proba(Xtest)[:, 1]
fpr, tpr, threshold = roc_curve(ytest, ypred_proba)
plt.figure(figsize=(12, 8))
plt.plot(fpr, tpr)
# The x-axis is the false positive rate; that is a different quantity from
# precision, so the label names only the rate that is actually plotted.
plt.xlabel('false positive rate')
plt.ylabel('true positive rate / recall')
plt.title('roc curve')

### Examine the strength of the coefficients for each feature

In [None]:
# One horizontal bar per fitted coefficient. The labels must come from
# Xtrain (the engineered columns the model was fitted on), not from the raw
# X frame, which has a different set and number of columns.
coef_positions = range(len(Xtrain.columns))
plt.barh(coef_positions, m.coef_[0])
plt.yticks(coef_positions, Xtrain.columns)
# Trailing None suppresses the notebook's echo of the yticks return value.
None

### Evaluate the statistical significance of the results

In [None]:
import statsmodels.discrete.discrete_model as sm

# Fit a statsmodels Logit on the training split to get a coefficient table.
# NOTE(review): the design matrix is passed as-is, with no explicit constant
# column - confirm whether an intercept is wanted here.
logit = sm.Logit(ytrain, Xtrain)
f = logit.fit(maxiter=300)
# Print the fitted parameters, a separator line, then the full summary.
print(f.params, '-' * 40, f.summary(), sep='\n')

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow forest: 20 trees, each limited to a depth of 2.
m = RandomForestClassifier(max_depth=2, n_estimators=20)
# 5-fold cross-validated accuracy on the training split (shown as cell output).
cross_val_score(estimator=m, X=Xtrain, y=ytrain, scoring='accuracy', cv=5)

In [None]:
# Fit the forest on the training split and show its score on that same data.
m.fit(Xtrain, ytrain).score(Xtrain, ytrain)

In [None]:
# Score of the current model `m` on the held-out test split.
m.score(Xtest, ytest)

In [None]:
# One horizontal bar per feature importance, labelled with the training columns.
plt.barh(y=range(10), width=m.feature_importances_)
plt.yticks(ticks=range(10), labels=Xtrain.columns)
# Trailing None suppresses the notebook's echo of the yticks return value.
None