<a href="https://colab.research.google.com/github/cmannnn/titanic/blob/master/titanic_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Titanic Dataset

https://www.kaggle.com/c/titanic


The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.


Variable Notes:

pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [13]:
# imports

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error

from google.colab import files
import io

In [15]:
# google colab uploader
uploaded = files.upload()

In [14]:
# uploading training data
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# uploading testing data
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))
test_data.head()

KeyError: ignored

In [6]:
# importing the data path
TITANIC_PATH = os.path.join('Desktop', 'datasets', 'titanic')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [7]:
# importing the the data from folders
def load_titanic_data(filename, titanic_path = TITANIC_PATH):
  csv_path = os.path.join(titanic_path, filename)
  return pd.read_csv(csv_path)

In [8]:
train_data = load_titanic_data('train.csv')
test_data = load_titanic_data('test.csv')

FileNotFoundError: ignored

In [None]:
# import train data
train_data = pd.read_csv('/Users/cman/Desktop/code/titanic/train.csv')
# print(train_data.columns)

# import test data
test_data = pd.read_csv('/Users/cman/Desktop/code/titanic/test.csv')
# print(test_data.columns)

# test + train
dfs = [train_data, test_data]
# print(train_data.columns)
# print(test_data.columns)

# test + train DataFrame
all_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# naming each data set
train_data.name = 'Training Data'
test_data.name = 'Testing Data'
all_data.name = 'All Data'



In [None]:
# def function to show NaN values
'''def nan_val(df):
	for col in df.columns:
		if df[col].isnull().sum() != 0:
			print('{} column has {} missing data points'.format(col, df[col].isnull().sum()))
	print('\n')

for df in dfs:
	print('{}'.format(df.name))
	nan_val(df)'''



In [None]:
# fix 177 missing age NaN's in training data and testing data
corr_matrix = train_data.corr().abs()
corr_matrix_ = corr_matrix.unstack()

# sorting correlation matrix
corr_matrix_sort = corr_matrix_.sort_values(kind='quicksort', ascending=False).reset_index()

# creating new descriptive columns from sorted correlation
corr_matrix_resort = corr_matrix_sort.rename(columns={'level_0':'feature 1', 'level_1':'feature 2', 0:'corr'})

# which feature is most correlated to age?
print(corr_matrix_resort[corr_matrix_resort['feature 1'] == 'Age'])



In [None]:
# heatmap of feature correlations
plt.figure(figsize = (8,6))
sns.heatmap(corr_matrix, annot=True, cbar=True, linewidths=0.3, linecolor='black')
plt.title('Feature correlation', fontsize=15)
plt.xlabel('Feature 1', labelpad=-18)
plt.xticks(rotation=45, fontsize=10)
plt.ylabel('Feature 2', labelpad=-5)
plt.yticks(rotation=45, fontsize=10)
plt.show()

In [None]:
# fix 687 missing cabin NaN's in training data and 1 in testing data
age_by_pclass = all_data.groupby(['Pclass']).median()['Age']

for pclass in range(1, 4):
	print('Median age of Pclass {} is: {}'.format(pclass, age_by_pclass[pclass]))


# fix 2 missing embarked NaN's in training data and 327 in testing data

In [None]:
# women survival rate
women = train_data.loc[train_data.Sex == 'female']['Survived']
women_rate = sum(women)/len(women)
# print('The % of women that survived is:', women_rate*100)

# men survival rate
men = train_data.loc[train_data.Sex == 'male']['Survived']
men_rate = sum(men)/len(men)
# print('The % of men that survived is:', men_rate*100)



In [None]:
# class 1 survival rate
pclass1 = train_data.loc[train_data.Pclass == 1]['Survived']
rate_pclass1 = sum(pclass1)/len(pclass1)
# print('The % of First Class that survived is:', rate_pclass1*100)

# class 2 survival rate
pclass2 = train_data.loc[train_data.Pclass == 2]['Survived']
rate_pclass2 = sum(pclass2)/len(pclass2)
# print('The % of Second Class that survived is:', rate_pclass2*100)

# class 3 survival rate
pclass3 = train_data.loc[train_data.Pclass == 3]['Survived']
rate_pclass3 = sum(pclass3)/len(pclass3)
# print('The % of Third Class that survived is:', rate_pclass3*100)



In [None]:
# sikit random forest 
# y variable
y = train_data['Survived']


# variables looking into
features = ['Pclass', 'Sex', 'SibSp', 'Parch']

# indicator train variables
X = pd.get_dummies(train_data[features])

# indicator test variables
X_test = pd.get_dummies(test_data[features])

# model
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

# fitting the model
fit_model = model.fit(X, y)

# prediction
prediction = fit_model.predict(X_test)

output_pred = pd.DataFrame({'Survived': prediction})

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=10)

tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
	print('Scores:', scores)
	print('Mean:', scores.mean())
	print('Standard deviation:', scores.std())


# print(display_scores(tree_rmse_scores))