<a href="https://colab.research.google.com/github/evanlin917/titanicML-DecisionTree/blob/main/titanicML_DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install graphviz
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

#allows a file to be uploaded to Google Colab directly from the computer
from google.colab import files
uploaded = files.upload()

#using pandas to create a DataFrame to organize the data
df = pd.read_csv("titanic_train.csv")
df = df.drop("passenger_id", axis = 'columns')
df = df.drop("name", axis = 'columns')
df = df.drop("ticket", axis = 'columns')
df = df.drop("embarked", axis = 'columns')
df = df.drop("cabin", axis = 'columns')
df = df.drop("boat", axis = 'columns')
df = df.drop("body", axis = 'columns')
df = df.drop("home.dest", axis = 'columns')
df = df.dropna(subset = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare'])
print(df.head)

#creates a new column in the DataFrame with boolean values denoting if certain passengers are male or female
df['male'] = df['sex'] == 'male'
print(df.head)

#defining the feature matrix and creating a NumPy array to hold the data of the DataFrame
x = df[['pclass', 'male', 'age', 'sibsp', 'parch', 'fare']].values
print(x)

#defining the target and creating a NumPy array to hold the data of the DataFrame
y = df['survived'].values
print(y)

#creating the decision tree model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier() #default creation of model sets its impurity to gini

#splitting the dataset into training and test datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 22)

#running the decision tree on the training and test datasets
model.fit(x_train, y_train)

#using the recently created model to predict the survivability of a certain passenger
print(model.predict([[3, True, 38.0, 0, 0, 8.6625]]))

#printing the accuracy, precision, and recall score of the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("accuracy: ", model.score(x_test, y_test))
y_pred = model.predict(x_test)
print("precision: ", precision_score(y_test, y_pred))
print("recall: ", recall_score(y_test, y_pred))

#comparing a decision tree with gini impurity with a decision tree of entropy impurity using k-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True)
for criterion in ['gini', 'entropy']:
  print("Decision Tree - {}".format(criterion))
  accuracy = []
  precision = []
  recall = []
  for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    new_model = DecisionTreeClassifier(criterion = 'entropy') #creating another decision tree model based on entropy impurity
    new_model.fit(x_train, y_train)
    y_pred = new_model.predict(x_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    print("accuracy: ", np.mean(accuracy))
    print("precision: ", np.mean(precision))
    print("recall: ", np.mean(recall))

#creating another decision tree with features of only passenger class and male/female
feature_names = ['pclass', 'male']
new_x = df[feature_names].values
new_y = df['survived'].values
modelThree = DecisionTreeClassifier()
modelThree.fit(new_x, new_y)

#creating an image to visualize the decision tree
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import Image
dot_file = export_graphviz(modelThree, feature_names = feature_names)
graph = graphviz.Source(dot_file)
graph.render(filename = 'modelThreeTree', format = 'png', cleanup = 'True')

#conducting pre-pruning on the decision tree to prevent it from overfitting
pruned_decision_tree = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 2, max_leaf_nodes = 10)
pruned_decision_tree.fit(new_x, new_y)

#looping through values on the decision tree using grid search
from sklearn.model_selection import GridSearchCV
param_grid = {
              'max-depth': depth,
              'min_samples_leaf': leaves,
              'max_leaf_nodes': [10, 20, 35, 50]}
modelFour = DecisionTreeClassifier()
gs = GridSearchCV(modelFour, param_grid, scoring = 'f1', cv = 5)
gs.fit(new_x, new_y)
print("best params: ", gs.best_params_)
print("best score: ", gs.best_score_)

Saving titanic_train.csv to titanic_train (14).csv
<bound method NDFrame.head of      pclass     sex   age  sibsp  parch     fare  survived
1         3    male  38.0      0      0   8.6625         0
2         3  female  30.0      1      1  24.1500         0
3         2  female  54.0      1      3  23.0000         1
4         2    male  40.0      0      0  13.0000         0
5         3    male  28.0      0      0  22.5250         0
..      ...     ...   ...    ...    ...      ...       ...
845       1    male  55.0      0      0  50.0000         0
846       1    male  58.0      0      0  29.7000         0
847       2  female  24.0      1      0  26.0000         1
848       3  female   3.0      1      1  13.7750         0
849       2    male  52.0      0      0  13.0000         0

[675 rows x 7 columns]>
<bound method NDFrame.head of      pclass     sex   age  sibsp  parch     fare  survived   male
1         3    male  38.0      0      0   8.6625         0   True
2         3  female  30.

NameError: ignored