# Import Libraries and Data

In [7]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import seaborn as sns

In [9]:
# Load in the train and test datasets.
# DATA_DIR is the single place to edit when the CSVs live somewhere else
# (the folder was previously repeated inside each read_csv call).
DATA_DIR = 'C:/Users/Amber/Documents/Projects/TitanicProject/titanicData'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

## Variables:

<b>PassengerId</b> <br>
<b>Survived</b> Bool, 0=No, 1=Yes <br>
<b>Pclass</b> = Ticket Class (1=1st, 2=2nd, or 3=3rd), serves as proxy for socio-economic status (SES)<br>
<b>Name</b> passenger's name<br>
<b>Sex</b> ('male' or 'female') <br>
<b>Age</b> in years, fractional if less than 1 and in form xx.5 if estimated<br>
<b>SibSp</b> = # of siblings/spouses aboard the Titanic <br>
<b>Parch</b> = # of parents/children aboard the Titanic <br>
<b>Ticket</b> (ticket #) <br>
<b>Fare</b> passenger fare <br>
<b>Cabin</b> (cabin #) <br>
<b>Embarked</b> = port of embarkation (C=Cherbourg, Q=Queenstown, S=Southampton)

In [16]:
# Preview the first five training rows (head() defaults to n=5).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Preview the first five test rows (head() defaults to n=5).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Clean and Organize Data

# Predictions

### Simple rates by gender

In [19]:
# Baseline sanity check: 'gender_submission.csv' predicts that every female
# passenger survived and every male passenger died. Measure how well the first
# half of that guess holds in the training set.
women = train.loc[train["Sex"] == 'female', "Survived"]

# Survived is coded 0/1, so (number of survivors) / (number of women) is the survival rate.
rate_women = women.sum() / len(women)

# NOTE: despite the "%" in the label, the value printed is a fraction, not a percentage.
print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [21]:
# Same measurement for the male passengers in the training set.
men = train.loc[train["Sex"] == 'male', "Survived"]

# Survived is coded 0/1, so (number of survivors) / (number of men) is the survival rate.
rate_men = men.sum() / len(men)

# NOTE: despite the "%" in the label, the value printed is a fraction, not a percentage.
print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


Almost 75% of women survived, but only about 19% of men did.  Gender does seem to be a strong indicator of survival, so the 'gender_submission.csv' baseline is not a terrible first attempt.

### Random Forest Model

The following code creates a random forest of 100 trees based on patterns between four columns ("Pclass", "Sex", "SibSp", and "Parch").  It first constructs the trees in the random forest from our training data, then generates predictions for the test data passengers.  Finally, this code saves those predictions in "my_submission.csv".

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Target column the forest learns to predict.
y = train["Survived"]

# Predictor columns; get_dummies one-hot encodes the string column ("Sex")
# and passes the numeric columns through unchanged.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train[features])

# Encode the test set independently, then force it onto exactly the training
# columns (same names, same order). Without this, a category that appears in
# only one of the two files would give the model a feature matrix that does
# not match what it was fit on. Columns missing from the test set are filled
# with 0 ("category not present").
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

# random_state is fixed so the forest is reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Write one row per test passenger: its id and the predicted survival flag.
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [31]:
# In-sample check: predict on the same training rows the model was fit on and
# store predicted vs. actual labels side by side. Because these rows were seen
# during fitting, any accuracy derived from them is optimistic.
predictions2 = model.predict(X)

output2 = train[["PassengerId"]].assign(
    SurvivedPred=predictions2,
    Survived=train["Survived"],
)
output2.to_csv('trainingcheck.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [33]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate actual labels against predicted labels for the training rows.
conf_mat = confusion_matrix(
    y_true=output2["Survived"],
    y_pred=output2["SurvivedPred"],
)
print(conf_mat)

[[492  57]
 [107 235]]


492 TN (did not survive and were predicted to not survive)<br>
57 FP (did not survive but were predicted to)<br>
107 FN (survived but were not predicted to)<br>
235 TP (survived and were predicted to)<br>

In [39]:
def accuracy(confusion_matrix):
    """Return the fraction of correct predictions in a confusion matrix.

    Correct predictions sit on the main diagonal, so the accuracy is the
    diagonal sum divided by the sum of every cell.

    Parameters
    ----------
    confusion_matrix : 2-D array-like
        Square matrix of counts. A NumPy array or any nested sequence that
        ``np.asarray`` can convert (e.g. a list of lists) is accepted.
        Note: inside this function the parameter name shadows any imported
        function called ``confusion_matrix``.

    Returns
    -------
    float
        Accuracy as a fraction (multiply by 100 for a percentage). There is
        no guard against a matrix whose cells sum to zero.
    """
    matrix = np.asarray(confusion_matrix)
    diagonal_sum = matrix.trace()
    sum_of_all_elements = matrix.sum()
    return diagonal_sum / sum_of_all_elements
# Report the in-sample accuracy as a percentage rounded to two decimals.
rf_accuracy_pct = round(100 * accuracy(conf_mat), 2)
print("Random Forest Accuracy = ", rf_accuracy_pct, "%")

Random Forest Accuracy =  81.59 %
