`python -m venv venv`

`.\venv\Scripts\activate`

In [None]:
#pip install pandas numpy matplotlib seaborn scikit-learn

In [None]:
#import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics

In [None]:
# Load the spreadsheet into a DataFrame and preview its first ten rows.
data_path = "../files/titanic3.xls"
df = pd.read_excel(data_path)
df.head(10)

In [None]:
# Summarise columns (dtypes, non-null counts), then flag every column that
# still contains at least one missing value.
df.info()
df.isna().any()

In [None]:
# Extract the title token (e.g. "Mr.") from a passenger name.
def find_title(name):
    """Return the first whitespace-separated token of *name* ending in '.'.

    Args:
        name: The full name string to scan.

    Returns:
        The first token that ends with a period (period included), or
        ``None`` when no token does.
    """
    dotted_tokens = (token for token in name.split() if token.endswith('.'))
    return next(dotted_tokens, None)
# Derive a 'title' column by running find_title over every passenger name.
df['title'] = df['name'].map(find_title)

df.head()


In [None]:
# Fill in missing ages with the average age of passengers sharing the same title.
#
# The per-title means are mapped back onto the rows instead of merged in:
# an inner merge silently drops every row whose title is missing. The result
# is assigned back to df['age'] because calling fillna(inplace=True) on the
# df['age'] selection is a chained in-place update, which does not modify df
# under pandas copy-on-write.
overall_avg_age = df['age'].mean()
title_avg_age = df.groupby('title')['age'].mean()
df['age'] = df['age'].fillna(df['title'].map(title_avg_age))
# Rows without a title, or whose title group has no known age, fall back to
# the overall mean of the observed ages so no NaN reaches the models.
df['age'] = df['age'].fillna(overall_avg_age)

print(df)
    

In [None]:
# Fill in missing fares with the overall average fare.
# The filled Series is assigned back to the column: fillna(inplace=True) on
# the df['fare'] selection is a chained in-place update and does not modify
# df under pandas copy-on-write.
avg_fare = df['fare'].mean()

df['fare'] = df['fare'].fillna(avg_fare)

In [None]:

# Replace missing 'embarked' and 'home.dest' values with the placeholder 'X'.
# The 'cabin' column is deliberately left untouched in this cell.
for column in ('embarked', 'home.dest'):
    df[column] = df[column].fillna('X')

# Frequency tables for both columns, placeholder included.
home_value_counts = df['home.dest'].value_counts()
embarking_value_counts = df['embarked'].value_counts()

print(home_value_counts)
print(embarking_value_counts)




In [None]:
# Flag whether a passenger had a cabin recorded, then drop the raw column.
# notna() is used so that is_cabin is True when a cabin IS present; the
# previous isnull() produced the inverse of what the column name states.
df['is_cabin'] = df['cabin'].notna()
df.drop(['cabin'], axis=1, inplace=True)

In [None]:
# Re-run the column summary and list which columns still hold missing values.
df.info()
df.isna().any()

In [None]:
# Peek at the frame, then list the distinct values of 'embarked'.
df.head()
embarked_values = df['embarked'].unique()
print(embarked_values)

In [None]:
# One-hot encode 'sex' and append the indicator columns to df.
df['sex'] = df['sex'].astype('category')
enc = OneHotEncoder()

df_encoded = enc.fit_transform(df[['sex']])
# Reuse df's index for the encoded frame: concat(axis=1) aligns rows by index
# label, so a fresh 0..n-1 index would misalign rows (and introduce NaN rows)
# whenever df's index is not the default range.
sex_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    df_encoded,
    index=df.index,
    columns=enc.get_feature_names_out(['sex']),
)

df = pd.concat([df, sex_encoded_df], axis=1)

In [None]:
# One-hot encode the embarking location and append the indicator columns to df.
df['embarked'] = df['embarked'].astype('category')

df_encoded = enc.fit_transform(df[['embarked']])
# Reuse df's index for the encoded frame: concat(axis=1) aligns rows by index
# label, so a fresh 0..n-1 index would misalign rows (and introduce NaN rows)
# whenever df's index is not the default range.
embarked_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    df_encoded,
    index=df.index,
    columns=enc.get_feature_names_out(['embarked']),
)

df = pd.concat([df, embarked_encoded_df], axis=1)

In [None]:
# #encode the Title 
# df['title'] = df['title'].astype('category')

# df_encoded =  enc.fit_transform(df[['title']])
# title_encoded_df = pd.DataFrame.sparse.from_spmatrix(df_encoded, columns=enc.get_feature_names_out(['title']))

# df = pd.concat([df, title_encoded_df], axis=1)

In [None]:
# Remove the raw categorical columns now that their one-hot versions exist.
# 'title' stays in the frame for now.
df = df.drop(columns=['sex', 'embarked'])



In [None]:
# Print the first five rows to verify the current set of columns.
print(df.head())

In [None]:
# begin making the model
# Columns kept out of the feature matrix: the target ('survived') plus the
# text/identifier-like columns. NOTE(review): 'boat' and 'body' are presumably
# excluded because they reflect the outcome - confirm against the dataset docs.
excluded_cols = {
    'survived',
    'name',
    'body',
    'boat',
    'ticket',
    'home.dest',
    'title',
}
feature_cols = [col for col in df.columns if col not in excluded_cols]

X = df[feature_cols]
y = df['survived']

print(feature_cols)
# Split dataset into training set and test set: 80% training and 20% test.
# A fixed random_state keeps the split (and the reported metrics) reproducible
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Print the feature matrix X to check which columns and rows it contains.
print(X)

In [None]:
# Print the target vector y.
print(y)

In [None]:
# Build and train both classifiers on the training split; fit() returns the
# fitted estimator, so construction and training are chained.
clf_rf = RandomForestClassifier(
    n_estimators=1001,
    max_leaf_nodes=16,
    n_jobs=1,
).fit(X_train, y_train)

clf_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

In [None]:
# Score the random forest on the held-out split.
rf_y_pred = clf_rf.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test, rf_y_pred)
print("rf accuracy:", rf_accuracy)

# Random forest confusion matrix (counts), labelled with the classes in y.
rf_cm = confusion_matrix(y_test, rf_y_pred)
class_labels = np.unique(y)
disp = ConfusionMatrixDisplay(confusion_matrix=rf_cm, display_labels=class_labels)
disp.plot()

plt.title("Random Forest Classifier Confusion Matrix")
plt.show()

In [None]:
# Score the logistic regression on the held-out split.
lr_y_pred = clf_lr.predict(X_test)
lr_accuracy = metrics.accuracy_score(y_test, lr_y_pred)
print("lr accuracy:", lr_accuracy)

# Logistic regression confusion matrix (counts), labelled with the classes in y.
lr_cm = confusion_matrix(y_test, lr_y_pred)
class_labels = np.unique(y)
disp = ConfusionMatrixDisplay(confusion_matrix=lr_cm, display_labels=class_labels)
disp.plot()

plt.title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
# Row-normalise the random forest confusion matrix so each true-class row
# sums to 1, then draw it as an annotated heatmap.
rf_row_totals = rf_cm.sum(axis=1, keepdims=True)
rf_cm_normalized = rf_cm / rf_row_totals
sns.heatmap(rf_cm_normalized, annot=True, linewidths=0.01)

In [None]:
# Row-normalise the logistic regression confusion matrix so each true-class
# row sums to 1, then draw it as an annotated heatmap.
lr_row_totals = lr_cm.sum(axis=1, keepdims=True)
lr_cm_normalized = lr_cm / lr_row_totals
sns.heatmap(lr_cm_normalized, annot=True, linewidths=0.01)

In [None]:
# Show the first five rows of the processed DataFrame.
df.head()