# Titanic - Machine Learning from Disaster

## Decision Tree Model

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Constants

In [None]:
# Path to the processed training CSV, relative to the notebook's working directory.
PATH_PROCESSED_TRAIN_CSV = "data/processed_train.csv"

## Data Collection

In [None]:
# Read the processed training data into a DataFrame.
train_df = pd.read_csv(PATH_PROCESSED_TRAIN_CSV)
# Bare last expression so the notebook renders the frame as the cell output.
train_df

### Feature Selection

_Legend:_

X: Feature Variables (or Independent Variables)

y: Target Variable (or Dependent Variable)

In [None]:
# The "Survived" column is the label the model has to predict.
y = train_df["Survived"]
# Every remaining column is used as a feature.
X = train_df.drop(columns=["Survived"])

### Split dataset into train and test sets

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Shapes of the training features and the training target.
X_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test features and the test target.
X_test.shape, y_test.shape

## Modeling

In [None]:
from common import functions as func

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

**Finding the Best Hyperparameters**

*Note: Running the code below may take anywhere from a few minutes to a few hours.*

*Uncomment and run it when you need to optimize hyperparameters.*

In [None]:
# Search space for the decision-tree hyperparameters (kept commented out; see note above).
# The integer ranges start at the smallest value scikit-learn accepts for each
# parameter: max_depth >= 1, min_samples_split >= 2, min_samples_leaf >= 1.
# Starting at 0 would put invalid candidates into the search.
# space = dict()
# space['criterion'] = ["gini", "entropy"]
# space['splitter'] = ["best", "random"]
# space['max_depth'] = list(range(1, 50))
# space['min_samples_split'] = list(range(2, 10))
# space['min_samples_leaf'] = list(range(1, 10))

# func.show_best_hyperparameter_optimization(
#     DecisionTreeClassifier(),
#     space,
#     X_train,
#     y_train
# )

Best Score: 0.818920548771295

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 4, 'splitter': 'random'}

**Build and train the model**

In [None]:
# Decision tree configured with the best hyperparameters recorded above.
# splitter="random" draws candidate splits at random, so the seed is pinned to
# keep the fitted tree (and every metric/plot derived from it) reproducible
# across Restart & Run All.
decision_tree_classifier = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    min_samples_split=4,
    splitter="random",
    random_state=42,
)

# Scale the features, then fit the tree. The classifier instance is used as the
# final pipeline step, so `decision_tree_classifier` is fitted by pipeline.fit().
pipeline = make_pipeline(
    StandardScaler(),
    decision_tree_classifier,
)

# Assign the return value so the cell does not echo the fitted pipeline.
_ = pipeline.fit(X_train, y_train)

__Check the most relevant features for the training model__

In [None]:
# decision_tree_classifier is the final pipeline step, fitted by pipeline.fit above.
# NOTE(review): X_train is presumably passed for its column names — confirm in common.functions.
func.get_feature_importances(decision_tree_classifier, X_train)

__Predict and show model result__

In [None]:
# Predict labels for the held-out test features.
y_predict = pipeline.predict(X_test)
# NOTE(review): the full X and y are passed alongside the test labels and predictions;
# presumably the helper uses them for cross-validation — confirm in common.functions.
func.show_model_result(pipeline, X, y, y_test, y_predict)

**Show ROC Curve and Area Under the Curve (AUC)**

In [None]:
# NOTE(review): y_predict holds hard class labels from pipeline.predict; an ROC curve is
# normally built from scores/probabilities — confirm the helper derives them from `pipeline`.
func.show_curve_roc(pipeline, X_test, y_test, y_predict)

__Compare Ground Truth vs Model Predictions__

In [None]:
# Hands the test features, their true labels and the predicted labels to the helper.
# NOTE(review): presumably lists the misclassified rows — confirm in common.functions.
func.get_error_prediction(X_test, y_test, y_predict)

In [None]:
# Look up the training rows whose Ticket value equals 203.
train_df.loc[train_df["Ticket"].eq(203)]

In [None]:
# Render the fitted decision tree on a 20x25-inch figure.
plt.figure(figsize=(20, 25))
tree.plot_tree(
    decision_tree_classifier,
    # The tree was fit on the scaler's array output, so without names the nodes
    # are labelled x[i]. StandardScaler keeps column order, so X_train's columns
    # line up with the tree's feature indices.
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=10,
)
plt.show()