# Decision Tree
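 This notebook trains tree-based classifiers (a single decision tree, bagged trees and a random forest) to predict whether a passenger on the Titanic survived, and then tunes the random forest.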
 ![image.png](attachment:image.png)

In [None]:
# importing libraries
import numpy as np
import pandas as pd

from sklearn.datasets import load_wine


# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Splitting data into training and testing set
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# from sklearn.metrics import f1_score, precision_score, recall_score

# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import export_graphviz
from subprocess import call

# import os

In [None]:
# Read the dataset from the working directory and preview the first rows.
csv_path = "titanic.csv"
titanic_df = pd.read_csv(csv_path)
titanic_df.head()

In [None]:
# Summarise the columns (dtype and non-null count) to spot missing values.
titanic_df.info()

In [None]:
# You can also drop whichever other columns you'd like here.
# The frame is reassigned rather than mutated in place, and columns that are
# already gone are ignored, so re-running this cell does not raise a KeyError.
columns_to_drop = ["PassengerId", "Cabin", "Name", "Ticket"]
titanic_df = titanic_df.drop(columns=columns_to_drop, errors="ignore")

### One-Hot Encoding
One-hot encoding converts a categorical variable into a set of binary indicator columns (one per category) so that a model can use it as numeric input. Let's take a look at the "Sex" column.

In [None]:
# List the distinct categories found in the "Sex" column.
pd.unique(titanic_df["Sex"])

In [None]:
# One-hot encode both categorical columns in a single call. With no explicit
# prefix, each indicator column is named "<original column>_<category>".
titanic_df = pd.get_dummies(titanic_df, columns=["Sex", "Embarked"])
titanic_df.head()

Features and target

In [None]:
# Separate the label column (y) from the predictor columns (X).
target_column = "Survived"
y = titanic_df[target_column]
X = titanic_df.drop(target_column, axis=1)
X.shape

Train Test Split

In [None]:
# One seed shared by the split and by every model below, so results are
# reproducible from run to run.
r = 42

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=r,
    test_size=0.2,
)

### 1. Decision Tree, Bagged Trees & Random Forest

Fit each model on the training set and record its accuracy on the test set.


In [None]:
# Each display label is paired with the (unfitted) model it describes; the two
# parallel lists used by the evaluation loop are derived from these pairs.
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=r)),
    (
        "Bagged Tree",
        BaggingClassifier(
            estimator=DecisionTreeClassifier(random_state=r),
            random_state=r,
        ),
    ),
    ("Random Forest", RandomForestClassifier(random_state=r)),
]

names = [label for label, _ in models]
classifier = [model for _, model in models]

In [None]:
# Fit every model on the training split and collect [label, test accuracy] rows.
data = []
for name, model in zip(names, classifier):
    model.fit(X_train, y_train)
    y_prediction = model.predict(X_test)

    # Score the predictions we already have; calling .score() here would run
    # predict on X_test a second time to arrive at the same number.
    accuracy = accuracy_score(y_test, y_prediction)
    data.append([name, accuracy])

In [None]:
from tabulate import tabulate

# Render the [model, accuracy] rows as a boxed table; the first column
# (model name) is centred.
results_table = tabulate(
    data,
    headers=["Model", "Accuracy"],
    tablefmt="fancy_outline",
    colalign=("center",),
)
print(results_table)

### 2. Feature Importance

In [None]:
# Fit a random forest and read its built-in (Gini) feature importances.
rf = RandomForestClassifier(random_state=r).fit(X_train, y_train)

# One importance score per predictor column
feature_names = X.columns
importances = rf.feature_importances_

# Pair each feature with its score, then order from most to least important
feature_imp_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
feature_imp_df = feature_imp_df.sort_values(by="Importance", ascending=False)

feature_imp_df

In [None]:
# Horizontal bar chart of the importances, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(data=feature_imp_df, x="Importance", y="Feature", ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance score")
ax.set_ylabel("Features")

As shown in the graph above, the "Fare" feature contributes the most to predicting survival.

### 3. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for the random forest using an exhaustive grid search.
from sklearn.model_selection import GridSearchCV

# Base model whose hyperparameters are searched
rf = RandomForestClassifier(random_state=r)

# Candidate values tried in every combination
params = {
    "max_depth": [2, 3, 5, 10, 20],
    "max_features": [1, 2, 3, 6, 10],
    "n_estimators": [10, 25, 30, 50, 100, 200],
}

# Candidates are ranked by cross-validated accuracy. This is a classification
# task and accuracy is the metric used to compare models elsewhere in the
# notebook; "r2" is a regression score and is not appropriate here.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation on the training split
    n_jobs=-1,  # use all available cores
    verbose=1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Display the estimator configured with the best parameter combination found.
grid_search.best_estimator_

In [None]:
# Show the winning combination and unpack each value for the final model.
best_params = grid_search.best_params_
print(best_params)
best_depth, best_features, best_estimators = (
    best_params[key] for key in ("max_depth", "max_features", "n_estimators")
)

### 4. Comparing Models (second time)

In [None]:
# Refit a random forest with the tuned hyperparameters and add its test
# accuracy to the comparison table.
tuned_label = "Random Forest with pre-pruning"

new_log = RandomForestClassifier(
    max_depth=best_depth,
    max_features=best_features,
    n_estimators=best_estimators,
    random_state=r,
)
new_log.fit(X_train, y_train)
y_prediction = new_log.predict(X_test)

# Score the predictions we already have rather than predicting again via .score()
new_accuracy = accuracy_score(y_test, y_prediction)

# Drop any row left behind by an earlier run of this cell before adding the
# fresh result, so re-running does not duplicate the entry in the table.
data = [row for row in data if row[0] != tuned_label]
data.append([tuned_label, new_accuracy])

In [None]:
from tabulate import tabulate

# Same table layout as before, now including the tuned model's row.
table_options = {
    "headers": ["Model", "Accuracy"],
    "tablefmt": "fancy_outline",
    "colalign": ("center",),
}
print(tabulate(data, **table_options))