In [155]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Set Up the Data
1. Read in the data
2. Check for missing (NaN) values and impute or drop them

In [156]:
# Load the train and test CSVs in one pass; both are read with default options.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Report (rows, columns) for each frame, then preview the first ten training rows.
print(train_df.shape, test_df.shape)
train_df.head(n=10)

(891, 12) (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [157]:
# Show the dtype pandas inferred for each training column.
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [158]:
# Count missing values per column before any imputation.
print(train_df.isna().sum())

# Replace NaN values in Age with the average age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# warns on pandas 2.x and leaves train_df unchanged under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

# Ignore Cabin here since we are not using this variable for model building for now.

# Drop the rows with NaN Embarked values since there are only two observations.
train_df = train_df.dropna(subset=['Embarked'])

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [159]:
# Count missing values per column in the test set before imputation.
print(test_df.isna().sum())

# Replace NaN values in Age and Fare with the column average.
# Assigning the filled column back (rather than chained fillna(inplace=True))
# guarantees test_df itself is updated on every pandas version.
for column in ['Age', 'Fare']:
    test_df[column] = test_df[column].fillna(test_df[column].mean())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## Exploratory Data Analysis
1. Distribution
2. Correlation

In [None]:
# Class balance of the target: raw counts plus the overall survival rate.
print(train_df.Survived.value_counts(), '\n------------------------------', 
      '\nAverage survival rate is %.2f%%' % (train_df.Survived.mean()*100), 
      '\n------------------------------')

# value_counts() orders by frequency, so sort_index() pins the order to
# Survived = 0, 1 and keeps the bars aligned with the hard-coded labels.
# Scale to percentages so the plotted values match the "%" in the title.
survived_pct = train_df.Survived.value_counts(normalize=True).sort_index() * 100

plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.bar(['Not Survived','Survived'], survived_pct)
plt.title("% of Passengers")
plt.ylabel("% of Passengers")
plt.show()

In [None]:
# Survival outcome grouped by ticket class (group keys come back sorted: 1, 2, 3).
survived_by_class = train_df.groupby(['Pclass'])['Survived']
print(survived_by_class.agg(['count', 'mean']), '\n------------------------------')

# Share of passengers in each class, in the same sorted order, used as bar width.
class_share = train_df.Pclass.value_counts(normalize=True).sort_index()

fig, ax = plt.subplots()
ax.bar(['1st','2nd', '3rd'], survived_by_class.mean(), width=class_share)
ax.set_title("Survival Rate by Ticket Class")
ax.set_xlabel("Ticket Class; bar width represents # of passengers")
ax.set_ylabel("Survival Rate")
plt.show()

In [None]:
# Survival outcome grouped by sex (group keys come back sorted: female, male).
survived_by_sex = train_df.groupby(['Sex'])['Survived']
print(survived_by_sex.agg(['count', 'mean']), '\n------------------------------')

# Share of passengers of each sex, in the same sorted order, used as bar width.
sex_share = train_df.Sex.value_counts(normalize=True).sort_index()

fig, ax = plt.subplots()
ax.bar(['Female','Male'], survived_by_sex.mean(), width=sex_share)
ax.set_title("Survival Rate by Gender")
ax.set_xlabel("Gender; bar width represents # of passengers")
ax.set_ylabel("Survival Rate")
plt.show()

In [None]:
# Survival outcome grouped by port of embarkation (group keys come back sorted: C, Q, S).
survived_by_port = train_df.groupby(['Embarked'])['Survived']
print(survived_by_port.agg(['count', 'mean']), '\n------------------------------')

# Share of passengers per port, in the same sorted order, used as bar width.
port_share = train_df.Embarked.value_counts(normalize=True).sort_index()

fig, ax = plt.subplots()
ax.bar(['C','Q', 'S'], survived_by_port.mean(), width=port_share)
ax.set_title("Survival Rate by Port of Embarkation")
ax.set_xlabel("Port of Embarkation; bar width represents # of passengers")
ax.set_ylabel("Survival Rate")
plt.show()

In [None]:
# Passenger count and survival rate for each SibSp value.
sibsp_summary = train_df.groupby(['SibSp'])['Survived'].agg(['count', 'mean'])
print(sibsp_summary, '\n------------------------------')

# Small histogram of the raw SibSp values (default binning).
fig, ax = plt.subplots(figsize=(4, 2))
ax.hist(train_df['SibSp'])
plt.show()

In [None]:
# Passenger count and survival rate for each Parch value.
parch_summary = train_df.groupby(['Parch'])['Survived'].agg(['count', 'mean'])
print(parch_summary, '\n------------------------------')

# Small histogram of the raw Parch values (default binning).
fig, ax = plt.subplots(figsize=(4, 2))
ax.hist(train_df['Parch'])
plt.show()

In [None]:
# Survival by 10-year age band.
# np.arange excludes its stop value, so the stop must be 90 for the last bin
# edge to be 80; with a stop of 80 the edges end at 70 and any passenger older
# than 70 falls outside every bin and is silently dropped from the table.
age_bands = pd.cut(train_df['Age'], np.arange(0, 90, 10))

# observed=False keeps empty bands in the table, the long-standing default behaviour.
print(train_df.groupby(age_bands, observed=False)['Survived'].agg(['count', 'mean']), 
      '\n------------------------------')

# Fine-grained histogram of the raw ages.
fig, ax = plt.subplots(figsize=(10, 2))
ax.hist(train_df.Age, bins=50)
plt.show()

In [None]:
# Survival by fare band, 50 units wide.
# The edges run from 0 up to at least the largest fare, and include_lowest=True
# closes the first bin on the left. With fixed edges 0..250 and a left-open
# first bin, fares above 250 and fares of exactly 0 match no bin and are
# silently dropped from the table.
fare_edges = np.arange(0, train_df['Fare'].max() + 50, 50)
fare_bands = pd.cut(train_df['Fare'], fare_edges, include_lowest=True)

# observed=False keeps empty bands in the table, the long-standing default behaviour.
print(train_df.groupby(fare_bands, observed=False)['Survived'].agg(['count', 'mean']), 
      '\n------------------------------')

# Fine-grained histogram of the raw fares.
fig, ax = plt.subplots(figsize=(10, 2))
ax.hist(train_df.Fare, bins=50)
plt.show()

In [None]:
# Pairwise correlation of the target and the numeric features, shaded with one
# colour scale across the whole table (axis=None).
numeric_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
correlations = train_df[numeric_columns].corr()
correlations.style.background_gradient(cmap='coolwarm', axis=None)

## Model Building

In [160]:
from sklearn.model_selection import train_test_split

# Model inputs; Sex and Embarked are strings and are one-hot encoded by get_dummies.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = pd.get_dummies(train_df[features])
y = train_df['Survived']

# Hold out 20% of the training rows for validation. A fixed random_state makes
# the split, and every accuracy reported below, reproducible across runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Encode the test set, then align it to the training columns so both frames
# have the same dummy columns in the same order even if a category is missing
# from (or extra in) the test data.
X_test = pd.get_dummies(test_df[features]).reindex(columns=X.columns, fill_value=0)

### Logistic Regression

First, we try a simple logistic regression with no interaction terms.

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression on the main effects only.
# The features are not scaled, so the solver can hit the default 100-iteration
# cap before converging (ConvergenceWarning); a higher cap lets it finish.
logit1 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logit1_pred = logit1.predict(X_validation)

In [None]:
# Accuracy of the baseline logistic regression on the rows it was fitted on.
logit1.score(X=X_train, y=y_train)

In [None]:
# Accuracy of the baseline logistic regression on the held-out validation rows.
logit1.score(X=X_validation, y=y_validation)

Next, we test whether the effects of age and ticket class differ by sex, by adding `Age × Sex_male` and `Pclass × Sex_male` interaction terms.

In [None]:
def add_sex_interactions(frame):
    """Return a copy of ``frame`` with the two sex interaction columns appended.

    Adds ``Age*Sex_male`` and ``Pclass*Sex_male`` (in that order), each the
    element-wise product of the named columns. ``frame`` must already contain
    ``Age``, ``Pclass`` and ``Sex_male``; it is not modified.
    """
    return frame.assign(**{
        'Age*Sex_male': frame['Age'] * frame['Sex_male'],
        'Pclass*Sex_male': frame['Pclass'] * frame['Sex_male'],
    })


# Apply the same transformation to all three feature frames. assign() builds
# new frames rather than writing columns into the train/validation split
# outputs, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
X_train = add_sex_interactions(X_train)
X_validation = add_sex_interactions(X_validation)
X_test = add_sex_interactions(X_test)

In [None]:
# Logistic regression refitted on the current X_train columns.
# The features are not scaled, so the solver can hit the default 100-iteration
# cap before converging (ConvergenceWarning); a higher cap lets it finish.
logit2 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logit2_pred = logit2.predict(X_validation)

In [None]:
# Accuracy of the second logistic regression on the rows it was fitted on.
logit2.score(X=X_train, y=y_train)

In [None]:
# Accuracy of the second logistic regression on the held-out validation rows.
logit2.score(X=X_validation, y=y_validation)

In [None]:
# Predict survival for the test set with the second logistic regression.
logit2_pred_test = logit2.predict(X_test)
# Writing a submission from these predictions is intentionally left disabled:
# the random forest cell further down writes to the same 'submission.csv' path.
#output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': logit2_pred_test})
#output.to_csv('submission.csv', index=False)

### Random Forest

In [162]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 5, with a fixed seed.
RF1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)
RF1.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=5, random_state=1)

In [163]:
# Accuracy of the random forest on the rows it was fitted on.
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_train, y_pred=RF1.predict(X_train))

0.8579465541490858

In [164]:
# Accuracy of the random forest on the held-out validation rows.
accuracy_score(y_true=y_validation, y_pred=RF1.predict(X_validation))

0.8202247191011236

In [165]:
# Predict the test set with the random forest and write a two-column
# submission file (PassengerId, Survived) without the index.
RF1_pred_test = RF1.predict(X_test)
output = test_df[['PassengerId']].assign(Survived=RF1_pred_test)
output.to_csv('submission.csv', index=False)