## Introduction to this notebook

After today's lesson about 'Feature Engineering' using 'ColumnTransformer()' and 'Pipeline()', I would like to achieve accuracy results similar to those in the 3_WP notebook by using these functions, in less time and with more concise code.

## 1. Load data and some basic EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

# new utils
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

# Display estimators (the column transformer and the pipelines below) as
# diagrams when they are shown as a cell output.
set_config(display='diagram')

In [None]:
# Training data; its "Survived" column is used as the target further down.
df = pd.read_csv("./data/Kaggle_Data/train.csv")
# Data the fitted pipelines predict on to build the Kaggle submission file.
X_test = pd.read_csv("./data/Kaggle_Data/test.csv")

In [None]:
# Number of missing values per column in the test data.
X_test.isna().sum()

In [None]:
# Number of missing values per column in the training data.
df.isna().sum()

In [None]:
# Heatmap of the missing-value mask of the test data; the trailing ';' hides the Axes repr.
sns.heatmap(X_test.isna());

In [None]:
# Show the test rows whose "Fare" is missing.
X_test[X_test["Fare"].isna()]

## 2. Feature Engineering outside of scikit-learn

In [None]:
# Fill the missing "Age" values with the rounded mean age of the passenger's
# "Pclass". The class means are computed separately for each frame.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that form is chained assignment, which
# pandas deprecates and which no longer updates the parent frame once
# Copy-on-Write is enabled. Assigning also keeps the cell idempotent.

df["Age"] = df["Age"].fillna(df.groupby("Pclass")["Age"].transform("mean").round(0))
X_test["Age"] = X_test["Age"].fillna(X_test.groupby("Pclass")["Age"].transform("mean").round(0))

In [None]:
# Combine siblings/spouses ("SibSp") and parents/children ("Parch") into a
# single "Family" column holding the number of relatives aboard.

for frame in (df, X_test):
    frame["Family"] = frame["SibSp"].add(frame["Parch"])

In [None]:
def extract_title(full_name):
    """Return the text between the first ',' and the following '.' in a name, stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split(".")[0].strip()


# Add the extracted title as a new "Title" column on both frames.
for frame in (df, X_test):
    frame["Title"] = frame["Name"].map(extract_title)

In [None]:
# Fill the missing "Fare" value(s) in the test data with the column mean.
# Assign the result back rather than using fillna(inplace=True) on the column
# selection, which is chained assignment and does not update X_test under
# pandas Copy-on-Write.
X_test["Fare"] = X_test["Fare"].fillna(X_test["Fare"].mean())

In [None]:
# Re-check the missing values in the test data after the fills above.
X_test.isna().sum()

## 3. Train-Test Split

In [None]:
# Target vector, and the feature frame made of every column except the target.
y_train = df["Survived"]
X_train = df.drop(columns="Survived")

In [None]:
# Compare the (rows, columns) shapes of the training and test feature frames.
X_train.shape, X_test.shape

## 3. Define ColumnTransformers

In [None]:
# Numeric columns are standardised.
numeric_features = ["Age", "Fare"]
numeric_transformer = StandardScaler()

# These columns are one-hot encoded; handle_unknown="ignore" keeps transform
# from failing on categories that were not present when the encoder was fitted.
categorical_features = ["Sex", "Pclass", "Family", "Title"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# "Embarked" gets its own two-step pipeline: fill missing values with the most
# frequent value, then one-hot encode.
embarked_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Define the preprocessor: a list of (name, transformer, columns) triples that
# is handed to ColumnTransformer in the next cell.

preprocessor = [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("embarked", embarked_transformer, ["Embarked"])
    ]

In [None]:
# Columns not listed in `preprocessor` are dropped from the model input.
column_transformer = ColumnTransformer(transformers=preprocessor, remainder="drop")

In [None]:
# Display the column transformer.
column_transformer

## 4. Train ML models

### 4.1 Logistic Regression

### 4.1.1 Normal Model

In [None]:
# Preprocessing followed by a class-weighted logistic regression.
log_reg_pipeline = Pipeline(
    steps=[
        ("column_transformer", column_transformer),
        ("log_reg", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

In [None]:
# Fit the preprocessing steps and the logistic regression on the training data.
log_reg_pipeline.fit(X_train, y_train)

In [None]:
# Check the missing values in the test data once more before predicting on it.
X_test.isna().sum()

In [None]:
# Predict on the test data with the fitted logistic-regression pipeline;
# these predictions are one candidate for the Kaggle submission.
predictions_logreg = log_reg_pipeline.predict(X_test)

In [None]:
# Display the logistic-regression predictions.
predictions_logreg

In [None]:
# Accuracy on the data the pipeline was fitted on (not a held-out estimate).
logreg_train_accuracy = round(log_reg_pipeline.score(X_train, y_train), 2)
print(f"""The train accuracy of log_reg_pipeline is: {logreg_train_accuracy}""")

### 4.1.2 Evaluating classifiers

In [None]:
from sklearn.metrics import accuracy_score

# Predictions on the training data; also reused by the metric cells below.
ypred = log_reg_pipeline.predict(X_train)

train_accuracy = round(accuracy_score(y_train, ypred), 2)
print(f"Accuracy: {train_accuracy}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Training-set precision, recall and F1 for the logistic-regression predictions.
precision = round(precision_score(y_train, ypred), 2)
recall = round(recall_score(y_train, ypred), 2)
f1 = round(f1_score(y_train, ypred), 2)

# The space before the first "\n" is kept so the printed text matches exactly.
print(f"Precision = {precision} \nRecall = {recall}\nF1 = {f1}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

conf = confusion_matrix(y_train, ypred)
conf

In [None]:
plot_confusion_matrix(log_reg_pipeline, X_train, y_train, normalize=None)

In [None]:
# Plot the confusion matrix computed above, labelling the axes with the
# classes of the fitted pipeline.
disp = ConfusionMatrixDisplay(confusion_matrix=conf, display_labels=log_reg_pipeline.classes_)
disp.plot()
plt.show()

### 4.2 Random Forest

In [None]:
# Preprocessing followed by a small, shallow random forest.
# random_state is fixed so that the fitted forest, its score and its
# predictions are reproducible across Restart & Run All.
forest_pipeline = Pipeline(steps = 
                        [('column_transformer', column_transformer),
                         ('forest', RandomForestClassifier(n_estimators = 35, max_depth = 3, random_state = 42))
                        ])

In [None]:
# Fit the random-forest pipeline on the training data.
# NOTE: column_transformer is the same object in every pipeline in this
# notebook, so this call refits it as well.
forest_pipeline.fit(X_train, y_train)

In [None]:
# Predict on the test data with the fitted random-forest pipeline.
# This previously called svc_pipeline, which is the wrong model and is not
# defined until a later cell, so it failed on a fresh kernel.
predictions_rf = forest_pipeline.predict(X_test)

In [None]:
# Display the random-forest predictions (fixes the misspelled name `predictons_rf`).
predictions_rf

In [None]:
# Accuracy on the data the pipeline was fitted on (not a held-out estimate).
forest_train_accuracy = round(forest_pipeline.score(X_train, y_train), 2)
print(f"""The train accuracy of forest_pipeline is: {forest_train_accuracy}""")

### 4.3 Support Vector Model

In [None]:
# Preprocessing followed by a support vector classifier with a polynomial kernel.
svc_pipeline = Pipeline(
    steps=[
        ("column_transformer", column_transformer),
        ("svc", SVC(C=1, kernel="poly")),
    ]
)

In [None]:
# Fit the SVC pipeline on the training data.
# NOTE: column_transformer is the same object in every pipeline in this
# notebook, so this call refits it as well.
svc_pipeline.fit(X_train, y_train)

In [None]:
# Predict on the test data with the fitted SVC pipeline; these are the
# predictions written to the submission file below.
predictions_svc = svc_pipeline.predict(X_test)

In [None]:
# Display the SVC predictions.
predictions_svc

In [None]:
# Accuracy on the data the pipeline was fitted on (not a held-out estimate).
svc_train_accuracy = round(svc_pipeline.score(X_train, y_train), 2)
print(f"""The train accuracy of svc_pipeline is: {svc_train_accuracy}""")

## 5. Create a CSV in order to submit to Kaggle

In [None]:
# Choose which predictions should be submitted to Kaggle: swap predictions_svc
# for another model's predictions array to submit that model instead.
# The frame pairs each test PassengerId with its predicted "Survived" value.
submission = pd.DataFrame({'PassengerId':X_test['PassengerId'],'Survived':predictions_svc})

In [None]:
# Write the submission frame to CSV without the index column and confirm the file name.
filename = 'Titanic_Predictions_DB.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')