# Decision Tree Classification on Titanic Dataset
##### Dataset can be obtained from Kaggle at: https://www.kaggle.com/competitions/titanic/overview
##### Will need titanic_unscaled dataset from ML Lab 2 KNN for this to work
##### In this notebook, apply these tools and methodologies:
##### 1. ETL Titanic dataset
##### 2. Apply Decision Tree on dataset to see prediction accuracy

In [30]:
import pandas as pd
import os

# # Show current working directory - the directory where all your files are saved by default
# os.getcwd()

# Set path for new working directory
path = "C:/Users/Sarah/Faris Stuff/USM Data Science Masters Files/CDS503/Week 4 - 09 Mar/Data"
os.chdir(path) 

# # Check to see if current directory has changed
# os.getcwd()

# Read data from CSV to a data frame named df
df = pd.read_csv('titanic_unscaled.csv') 
# Display the data
df

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare
0,0,3,1,22.0,1,0,7.2500
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.9250
3,1,1,0,35.0,1,0,53.1000
4,0,3,1,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
882,0,2,1,27.0,0,0,13.0000
883,1,1,0,19.0,0,0,30.0000
884,0,3,0,7.0,1,2,23.4500
885,1,1,1,26.0,0,0,30.0000


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   survived   887 non-null    int64  
 1   pclass     887 non-null    int64  
 2   sex        887 non-null    int64  
 3   age        887 non-null    float64
 4   sibspouse  887 non-null    int64  
 5   parchild   887 non-null    int64  
 6   fare       887 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 48.6 KB


In [32]:
# Convert integer to string: survived, pclass, sex - because we need it as a category, not a number
columns_to_convert = ['survived', 'pclass', 'sex']
for col in columns_to_convert:
    df[col] = df[col].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   survived   887 non-null    object 
 1   pclass     887 non-null    object 
 2   sex        887 non-null    object 
 3   age        887 non-null    float64
 4   sibspouse  887 non-null    int64  
 5   parchild   887 non-null    int64  
 6   fare       887 non-null    float64
dtypes: float64(2), int64(2), object(3)
memory usage: 48.6+ KB


In [33]:
# Indicate the target column
target = df['survived']
# Indicate the columns that will serve as features
features = df.drop('survived', axis = 1)

In [34]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split data into train, validation and test sets
# Split the dataset into training + development set and test set
x, x_test, y, y_test = train_test_split(features, target, test_size = 0.2, random_state = 0)

# Split the dataset into training set and development set
x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size = 0.2, random_state = 10)

In [35]:
# Training a decision tree model
# Import decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier with default parameters
dtree = DecisionTreeClassifier()

# Train the model using the training set
dtree.fit(x_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [43]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Predict the target for the development dataset
dev_predict = dtree.predict(x_dev)

# Evaluate the decision tree model on development set
# Compute the model accuracy on the development set: How often is the classifier correct?
print("Accuracy (Dev): ", metrics.accuracy_score(y_dev, dev_predict))

# Import scikit-learn metrics methods
from sklearn.metrics import classification_report, confusion_matrix
# Print confusion matrix
print(f'Confusion Matrix:')
print(confusion_matrix(y_dev, dev_predict))
# Print precision, recall and F1
print(classification_report(y_dev, dev_predict))

Accuracy (Dev):  0.8309859154929577
Confusion Matrix:
[[74 11]
 [13 44]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86        85
           1       0.80      0.77      0.79        57

    accuracy                           0.83       142
   macro avg       0.83      0.82      0.82       142
weighted avg       0.83      0.83      0.83       142



In [44]:
# Predict the target for the development dataset
test_predict = dtree.predict(x_test)

# Evaluate the decision tree model on development set
# Compute the model accuracy on the development set: How often is the classifier correct?
print("Accuracy (Test): ", metrics.accuracy_score(y_test, test_predict))

# Print confusion matrix
print(f'Confusion Matrix:')
print(confusion_matrix(y_test, test_predict))
# Print precision, recall and F1
print(classification_report(y_test, test_predict))

Accuracy (Test):  0.7865168539325843
Confusion Matrix:
[[97 20]
 [18 43]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       117
           1       0.68      0.70      0.69        61

    accuracy                           0.79       178
   macro avg       0.76      0.77      0.76       178
weighted avg       0.79      0.79      0.79       178



##### Without tuning, with just default Decision Tree results:
##### Accuracy (Dev):  0.8309859154929577
##### Accuracy (Test):  0.7865168539325843

# Random parameter tuning

In [46]:
# Create a decision tree classifier with criterion = entropy and max_depth = 3
dtree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 3)
# Train the model using the training set
dtree.fit(x_train, y_train)
# Predict the target for the development dataset
dev_predict = dtree.predict(x_dev)
# Compute the model accuracy on the development set: How often is the classifier correct?
print("Accuracy (Dev): ", metrics.accuracy_score(y_dev, dev_predict))
# Predict the target for the development dataset
test_predict = dtree.predict(x_test)
# Evaluate the decision tree model on development set
# Compute the model accuracy on the development set: How often is the classifier correct?
print("Accuracy (Test): ", metrics.accuracy_score(y_test, test_predict))

Accuracy (Dev):  0.8169014084507042
Accuracy (Test):  0.7808988764044944


# Hyperparameter Tuning

In [48]:
# Use Grid Search to further optimize Decision Tree. Called Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Split original dataset into train and test sets, no need dev because will be doing cross validation. 80/20
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 0)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'criterion': ['gini', 'entropy', 'log_loss']
}

# Initialize the Decision Tree Classifier
optDTree = DecisionTreeClassifier()

# 'scoring' can be set to metrics like 'accuracy', 'f1', 'roc_auc' as needed
grid_search = GridSearchCV(estimator=optDTree, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearch to the data
grid_search.fit(x_train, y_train)

# Print the best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# Use the best estimator for final evaluation
best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print(f"Test Set Accuracy with Best Model: {accuracy}")

Best Parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Cross-Validation Score: 0.8336030366596743
Test Set Accuracy with Best Model: 0.8033707865168539


##### Results: 
###### No Parameter Tuning
###### Accuracy (Dev):  0.8309859154929577
###### Accuracy (Test):  0.7865168539325843
######
###### Random Parameter Tuning
###### Accuracy (Dev):  0.8169014084507042
###### Accuracy (Test):  0.7808988764044944
######
###### Hyperparameter Tuning
###### Accuracy (Dev):  0.8336030366596743
###### Accuracy (Test):  0.8033707865168539
######
###### Conclusion: Hyperparameter tuning best fitting and best test accuracy score

In [49]:
# Load export_text library
from sklearn.tree import export_text
tree_rules = export_text(dtree, feature_names=list(x_train))
# Print deccision tree rules
print(tree_rules)

|--- sex <= 0.50
|   |--- pclass <= 2.50
|   |   |--- fare <= 28.86
|   |   |   |--- class: 1
|   |   |--- fare >  28.86
|   |   |   |--- class: 1
|   |--- pclass >  2.50
|   |   |--- fare <= 23.35
|   |   |   |--- class: 1
|   |   |--- fare >  23.35
|   |   |   |--- class: 0
|--- sex >  0.50
|   |--- fare <= 10.87
|   |   |--- fare <= 7.13
|   |   |   |--- class: 0
|   |   |--- fare >  7.13
|   |   |   |--- class: 0
|   |--- fare >  10.87
|   |   |--- age <= 3.50
|   |   |   |--- class: 1
|   |   |--- age >  3.50
|   |   |   |--- class: 0

