Titanic dataset
===

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the Titanic training data from train.csv in the notebook's working directory
data = pd.read_csv('./train.csv')

In [None]:
# Preview the first five rows to see what each column looks like
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Summarise dtypes and non-null counts per column to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [None]:
# Check which distinct values the Sex column contains
data['Sex'].unique()

array(['male', 'female'], dtype=object)

In [None]:
# Drop the Cabin column: it is mostly missing (only 204 of 891 rows are non-null).
# Rebinding instead of using inplace=True, together with errors='ignore', keeps
# this cell safe to re-run after Cabin has already been removed.
data = data.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Confirm that Cabin is gone and see which columns still have missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


In [None]:
# Median passenger age; the next cell uses the median to fill the missing Age values
data['Age'].median()

28.0

In [None]:
# Impute missing ages with the column median
median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(median_age)

In [None]:
# Verify that Age no longer has missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


In [None]:
# One-hot encode Sex; drop_first=True keeps a single indicator column (Sex_male)
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

In [None]:
# Check the encoded frame: Sex should now be replaced by the Sex_male indicator column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
Sex_male       891 non-null uint8
dtypes: float64(2), int64(5), object(3), uint8(1)
memory usage: 70.6+ KB


### Create train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns fed to the models; Survived is the target
feature_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male']
X = data[feature_columns]
y = data['Survived']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

### Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Instantiate logistic regression with scikit-learn's default settings
lr = LogisticRegression()
# Fit on the training split; as the cell's last expression, the fitted estimator is displayed
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
# Evaluate model
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predict survival labels for the held-out test set with the fitted logistic regression
prediction = lr.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the predictions against the true test labels
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.77      0.86      0.81       154
           1       0.78      0.65      0.71       114

   micro avg       0.77      0.77      0.77       268
   macro avg       0.77      0.76      0.76       268
weighted avg       0.77      0.77      0.77       268



In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes
print(confusion_matrix(y_test, prediction))

[[133  21]
 [ 40  74]]


### Naive Bayes model

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
%%time
nb = MultinomialNB()
nb.fit(X_train, y_train)
prediction = nb.predict(X_test)
# classification report
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       154
           1       0.69      0.46      0.55       114

   micro avg       0.68      0.68      0.68       268
   macro avg       0.68      0.65      0.65       268
weighted avg       0.68      0.68      0.67       268

CPU times: user 7.17 ms, sys: 3.89 ms, total: 11.1 ms
Wall time: 9.8 ms


In [None]:
%%time
lr = LogisticRegression()
lr.fit(X_train, y_train)
prediction = nb.predict(X_test)
# classification report
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       154
           1       0.69      0.46      0.55       114

   micro avg       0.68      0.68      0.68       268
   macro avg       0.68      0.65      0.65       268
weighted avg       0.68      0.68      0.67       268

CPU times: user 14.7 ms, sys: 414 µs, total: 15.1 ms
Wall time: 13 ms


