# Importing Data

In [1]:
import pandas as pd

In [2]:
titanic_data = pd.read_csv('Titanic_coursework_entire_dataset_23-24.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survival
0,1,3.0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,3,3.0,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1
4,5,3.0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


# Data Preprocessing + Setting X and Y

In [3]:
x = titanic_data.drop(['Survival', 'PassengerId'], axis = 'columns')
y = titanic_data['Survival']

In [4]:
from sklearn.preprocessing import LabelEncoder

le_name = LabelEncoder()
le_sex = LabelEncoder()
le_ticket = LabelEncoder()
le_embarked = LabelEncoder()

x['name_encoded'] = le_name.fit_transform(x['Name'])
x['sex_encoded'] = le_sex.fit_transform(x['Sex'])
x['ticket_encoded'] = le_ticket.fit_transform(x['Ticket'])
x['embarked_encoded'] = le_embarked.fit_transform(x['Embarked'])

x.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,name_encoded,sex_encoded,ticket_encoded,embarked_encoded
0,3.0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,108,1,522,2
1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,190,0,595,0
2,3.0,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,353,0,668,2
3,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,272,0,49,2
4,3.0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,15,1,471,2


In [5]:
x_encoded = x.drop(['Name', 'Sex', 'Ticket', 'Embarked'], axis = 'columns')
x_encoded.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,name_encoded,sex_encoded,ticket_encoded,embarked_encoded
0,3.0,22.0,1,0,7.25,108,1,522,2
1,1.0,38.0,1,0,71.2833,190,0,595,0
2,3.0,26.0,0,0,7.925,353,0,668,2
3,1.0,35.0,1,0,53.1,272,0,49,2
4,3.0,35.0,0,0,8.05,15,1,471,2


In [6]:
#imputation method is used to replace the NaN values with mean in this case
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

# Splitting the Data

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size = 0.27, random_state = 42)

In [8]:
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

# Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# Decision Tree Model

In [10]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(x_train_scaled, y_train)

In [11]:
predictions = model.predict(x_test_scaled)
predictions

array([0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0],
      dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Precision: ", precision_score(y_test, predictions))
print("Recall: ", recall_score(y_test, predictions))

Accuracy:  0.7427385892116183
Precision:  0.6697247706422018
Recall:  0.7373737373737373


# Hyperparameter Tuning

In [13]:
model2 = tree.DecisionTreeClassifier(max_depth = 4, ccp_alpha = 0.01)
model2.fit(x_train_scaled, y_train)

In [14]:
predictions = model2.predict(x_test_scaled)

In [15]:
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Precision: ", precision_score(y_test, predictions))
print("Recall: ", recall_score(y_test, predictions))

Accuracy:  0.8298755186721992
Precision:  0.8222222222222222
Recall:  0.7474747474747475
