In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the Kaggle Titanic training set and list the available columns.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df.columns

In [None]:
# Drop the row identifier without mutating in place, so re-running this cell
# does not raise a KeyError once the column is already gone.
df = df.drop(columns='PassengerId', errors='ignore')

target = 'Survived'

# Select columns by dtype family rather than comparing against the strings
# 'int'/'float', whose width depends on the platform's default integer size.
num = [c for c in df.select_dtypes(include='number').columns if c != target]
cat = df.select_dtypes(include='object').columns.tolist()

from sklearn.model_selection import train_test_split
# Fixed seed so the 80/20 split is reproducible across runs.
train, test = train_test_split(df, test_size=.2, random_state=0)

# Explore

In [None]:
# Correlate numeric columns only: pandas >= 2.0 no longer drops string columns
# silently, so a bare `train.corr()` raises on Name/Sex/Ticket/Cabin/Embarked.
corr = train.select_dtypes(include='number').corr()
corr[target].sort_values(ascending=False)

In [None]:
# Blank the diagonal on a copy: writing through `corr.values` fails when that
# array is read-only (pandas copy-on-write) and would otherwise leave `corr`
# with a zeroed self-correlation for any cell that is re-run afterwards.
corr_offdiag = corr.to_numpy(copy=True)
np.fill_diagonal(corr_offdiag, 0)
sns.heatmap(
    pd.DataFrame(corr_offdiag, index=corr.index, columns=corr.columns),
    annot=True, center=0, cmap='RdBu',
);

In [None]:
# Number of missing values per column in the full training file.
df.isna().sum()

In [None]:
# Survival rate for each sex.
train.groupby('Sex')[target].mean()

In [None]:
def _title_token(full_name):
    # Text after the first comma, trimmed, up to the first space.
    after_comma = full_name.split(',')[1]
    return after_comma.strip().split(' ')[0]

prefix = train['Name'].apply(_title_token)
train.groupby(['Sex', prefix])[target].agg(['mean', 'count', 'sum'])

In [None]:
# Split passengers into zero-fare / paid-fare and break each group down by title.
zero_fare = train['Fare'].eq(0)
train.groupby([zero_fare, prefix])[target].agg(['mean', 'count', 'sum'])


In [None]:
# Rows whose name contains "Capt".
has_capt = train['Name'].str.contains('Capt')
train[has_capt]

In [None]:
# Passengers recorded with a fare of exactly zero.
train[train['Fare'] == 0]

In [None]:
# Compare passengers whose ticket is the literal string 'LINE' with everyone else.
is_line_ticket = train['Ticket'].eq('LINE')
train.groupby(is_line_ticket)[target].agg(['mean', 'count', 'sum'])

In [None]:
# Survival rate, survivor count and group size per port of embarkation.
train.groupby('Embarked')[target].agg(['mean', 'sum', 'count'])

In [None]:
# Fare and survival statistics for every (port, passenger class) combination.
by_port_and_class = train.groupby(['Embarked', 'Pclass'])
by_port_and_class[['Fare', target]].agg(['mean', 'sum', 'count'])

In [None]:
# Group by the first character of the cabin string; non-string (missing) cabins go to 'na'.
cabin_letter = [c[0] if type(c) is str else 'na' for c in train['Cabin']]
train.groupby(cabin_letter)[target].agg(['mean', 'sum', 'count'])

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

class DataSelect(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    Parameters
    ----------
    attributes : column label or list of labels
        Passed straight to ``X[...]`` in ``transform``.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured attributes."""
        wanted = self.attributes
        return X[wanted]

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive 0/1 indicator features from raw passenger columns.

    ``transform`` reads ``Age``, ``Sex``, ``Embarked``, ``Cabin``, ``Name``,
    ``SibSp`` and ``Parch`` from ``X`` and returns a new DataFrame (with a
    fresh default index) containing the columns ``child``, ``man``, ``woman``,
    ``emb_s``, ``emb_q``, ``cabin``, ``mr`` and ``male_alone``.
    """

    # Single age boundary shared by the child/adult flags. The previous code
    # used `<= 15` for children but `> 16` for adults, so passengers aged in
    # (15, 16] were assigned to neither group.
    CHILD_MAX_AGE = 15

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Build the indicator frame; rows with a missing Age get 0 for child/man/woman."""
        age = X['Age']
        is_male = X['Sex'] == 'male'
        is_female = X['Sex'] == 'female'
        is_adult = age > self.CHILD_MAX_AGE  # NaN compares False

        dframe = pd.DataFrame({
            'child': np.where(age <= self.CHILD_MAX_AGE, 1, 0),
            'man': np.where(is_male & is_adult, 1, 0),
            'woman': np.where(is_female & is_adult, 1, 0),
            'emb_s': np.where(X['Embarked'] == 'S', 1, 0),
            'emb_q': np.where(X['Embarked'] == 'Q', 1, 0),
            # Kept as a Python loop so an all-missing (float) Cabin column still works.
            'cabin': [
                1 if isinstance(c, str) and c[:1] in ('B', 'D', 'E') else 0
                for c in X['Cabin']
            ],
            # Literal match: as a regex, 'Mr.' also matched "Mrs." because '.'
            # matches any character. `na=False` keeps missing names at 0.
            'mr': np.where(X['Name'].str.contains('Mr.', regex=False, na=False), 1, 0),
            'male_alone': np.where(
                is_male & (X['SibSp'] == 0) & (X['Parch'] == 0), 1, 0
            ),
        })
        # Remembered so the names can be reported after a transform call.
        self.feature_names = dframe.columns.values
        return dframe

    def get_feature_names_out(self, input_features=None):
        """Return the column names produced by the last ``transform`` call.

        ``input_features`` is accepted (and ignored) so the method matches the
        signature scikit-learn uses when it queries transformers for names.
        """
        return self.feature_names

# Numeric branch: pick the numeric inputs, fill gaps with the median, standardise.
num_pp = Pipeline([
    ('dataselect', DataSelect(['Age', 'Pclass', 'Fare'])),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the current spelling first and fall back for older installs.
try:
    sex_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    sex_encoder = OneHotEncoder(sparse=False)

# Categorical branch: dense one-hot encoding of Sex.
cat_pp = Pipeline([
    ('dataselect', DataSelect(['Sex'])),
#     ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', sex_encoder),
])

# Engineered branch: hand-built indicator features.
eng_pp = Pipeline([
    ('dataselect', DataSelect(['Age', 'Sex', 'Embarked', 'Name', 'Pclass', 'SibSp', 'Parch', 'Cabin'])),
    ('engineer', FeatureEngineer()),
])

# Concatenate the three branches column-wise.
pipe = FeatureUnion([
    ('num', num_pp),
    ('cat', cat_pp),
    ('eng', eng_pp),
])

# Fit on the training split only, then apply the same fitted transform to both splits.
pipe.fit(train)
train_pre = pipe.transform(train)
test_pre = pipe.transform(test)

# Select Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate classifiers; the tree-based ones are seeded for reproducibility.
machines = [
    GaussianNB(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
]

def getBaseline(lst, X, y):
    """Fit each estimator in ``lst`` and record its mean cross-validated score.

    Parameters
    ----------
    lst : iterable of estimators
        Each one is fitted in place on ``(X, y)``.
    X, y : features and labels passed to ``fit`` and ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with columns ``model`` (the fitted estimator)
        and ``base`` (mean of ``cross_val_score`` with its default settings).
    """
    rows = []
    # Iterate over the argument, not the notebook-level `machines` list, so the
    # function evaluates whatever the caller actually passes in.
    for m in lst:
        m.fit(X, y)
        rows.append({
            'model': m,
            'base': cross_val_score(m, X, y).mean(),
        })
    return pd.DataFrame(rows)

# Baseline table for all candidates on the preprocessed training split.
models = getBaseline(lst=machines, X=train_pre, y=train[target])
models

In [None]:
# Third row of the baseline table (the RandomForestClassifier, given the order of `machines`).
model = models['model'].iloc[2]

# Hyperparameter Tuning

In [None]:
# Show the estimator's current hyperparameters (nested ones included).
model.get_params(deep=True)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: forest size crossed with maximum tree depth.
params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 6, 7, 8, 9],
)

# Exhaustive search with GridSearchCV's default cross-validation and scoring.
grid = GridSearchCV(estimator=model, param_grid=params)
grid.fit(train_pre, train[target])

grid.best_estimator_

In [None]:
# Random forest with hand-set hyperparameters, trained on the full training split.
tuned_settings = {'n_estimators': 200, 'max_depth': 7}
model_tune = RandomForestClassifier(random_state=0, **tuned_settings)
model_tune.fit(train_pre, train[target])

# Validation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_predict

def getScores(m, X, y, fitted=False):
    """Return a dict of binary-classification metrics for ``m`` on ``(X, y)``.

    Parameters
    ----------
    m : classifier
    X, y : features and true labels to score against.
    fitted : bool, default False
        False: predictions are out-of-fold, from ``cross_val_predict`` with
        its default settings.
        True: ``m`` is assumed to be already fitted and is used as-is via
        ``predict`` / ``predict_proba``; use this for a held-out split.

    Returns
    -------
    dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
    ``roc_auc_score``.
    """
    has_proba = hasattr(m, 'predict_proba')
    if fitted:
        prediction = m.predict(X)
        # Column 1 is taken as the positive class (binary target assumed).
        score = m.predict_proba(X)[:, 1] if has_proba else prediction
    else:
        prediction = cross_val_predict(m, X, y)
        score = (
            cross_val_predict(m, X, y, method='predict_proba')[:, 1]
            if has_proba else prediction
        )
    return {
        'accuracy': accuracy_score(y, prediction),
        'precision': precision_score(y, prediction),
        'recall': recall_score(y, prediction),
        'f1_score': f1_score(y, prediction),
        # ROC AUC ranks by score; hard 0/1 labels collapse the curve to one point.
        'roc_auc_score': roc_auc_score(y, score),
    }

# Training columns use out-of-fold predictions. Test columns score the models
# exactly as they were trained on the training split: cross-validating on the
# test split would refit them on test data instead of evaluating them.
scores = pd.DataFrame({
    'tr_base': getScores(model, train_pre, train[target]),
    'tr_tune': getScores(model_tune, train_pre, train[target]),
    'te_base': getScores(model, test_pre, test[target], fitted=True),
    'te_tune': getScores(model_tune, test_pre, test[target], fitted=True),
})

# Relative change (in percent) of the tuned model over the baseline.
scores['tr_diff'] = (scores['tr_tune'] / scores['tr_base'] - 1) * 100
scores['te_diff'] = (scores['te_tune'] / scores['te_base'] - 1) * 100
scores = scores.drop(columns=['tr_base', 'te_base'])
scores

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# Out-of-fold positive-class probabilities for the baseline and tuned models.
pred_base = cross_val_predict(model, train_pre, train[target], method='predict_proba')[:,1]
pred_tune = cross_val_predict(model_tune, train_pre, train[target], method='predict_proba')[:,1]

# Note: the *_tr names hold the baseline curve and the *_te names the tuned
# curve; both are computed on the training split.
pr_tr, re_tr, th_tr = precision_recall_curve(train[target], pred_base)
pr_te, re_te, th_te = precision_recall_curve(train[target], pred_tune)

fig, ax = plt.subplots()
ax.plot(pr_tr, re_tr, label='base')
ax.plot(pr_te, re_te, label='tune')
# Label the axes explicitly: precision is on x and recall on y here.
ax.set(xlabel='precision', ylabel='recall',
       title='Precision vs. recall (cross-validated, training split)')
ax.legend();

# Post-Analysis

In [None]:
# Column names in the same order the FeatureUnion concatenates its branches.
feature_labels = np.concatenate([
    num_pp.named_steps['dataselect'].attributes,
    cat_pp.named_steps['encode'].get_feature_names_out(["Sex"]),
    eng_pp.named_steps['engineer'].get_feature_names_out(),
])

# Importances of the grid search's best estimator, in percent, largest first.
importance_table = pd.DataFrame({
    'attributes': feature_labels,
    'importance': grid.best_estimator_.feature_importances_ * 100,
})
importance_table.sort_values(by='importance', ascending=False)

In [None]:
# train_ch = pd.DataFrame(
#     data=train_pre
#     , columns=np.concatenate([
#         num_pp.named_steps['dataselect'].attributes
#         , cat_pp.named_steps['encode'].get_feature_names_out(["Sex"])
#         , eng_pp.named_steps['engineer'].get_feature_names_out()
#     ])
# )

In [None]:
# Out-of-fold predictions of the tuned model, stored next to the true labels.
pred = cross_val_predict(model_tune, train_pre, train[target])
train['pred'] = pred
# True where the prediction disagrees with the actual outcome.
train['diff'] = train[target].ne(pred)

In [None]:
# train.loc[(train['diff'] == True ) & (train['pred'] == 1) & (train['Sex'] =='female')]

# Notes
- 

# Submission

In [None]:
# Score the Kaggle test file with the tuned model and write the submission CSV.
samp = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = pd.DataFrame({
    'PassengerId': samp['PassengerId'],
    'Survived': model_tune.predict(pipe.transform(samp)),
})
submission.to_csv('submission.csv', index=False)