# Tutorial 1 The Titanic Disaster

## Project Summary
Predict which passengers survived the Titanic shipwreck.

## Importing Some Basic Libraries

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## Importing the Dataset

In [3]:
dataset = pd.read_csv('./Titanic_Data.csv')

## Showing the Dataset in a Table

In [4]:
dataset.head(8)  # first 8 rows

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,No
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Yes
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Yes
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Yes
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,No
5,6,3,"Moran, Mr. James",male,,0,0,8.4583,Q,No
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,S,No
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,S,No


## A Quick Review of the Data

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   PassengerId                        891 non-null    int64  
 1   Pclass                             891 non-null    int64  
 2   Name                               891 non-null    object 
 3   Sex                                891 non-null    object 
 4   Age                                714 non-null    float64
 5   Number of Siblings/Spouses Aboard  891 non-null    int64  
 6   Number of Parents/Children Aboard  891 non-null    int64  
 7   Fare                               891 non-null    float64
 8   Embarked                           889 non-null    object 
 9   Survived                           891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


## Encoding Categorical Data

### Encoding the Input Data

In [6]:
gender = {'male':0.0, 'female':1.0}

In [7]:
dataset['Sex'] = dataset['Sex'].map(gender)

In [8]:
dataset['Sex'].head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Sex, dtype: float64

In [9]:
print(dataset['Embarked'].unique())

['S' 'C' 'Q' nan]


In [10]:
ports = {'S':0.0, 'C':1.0, 'Q':2.0}

In [11]:
dataset['Embarked'] = dataset['Embarked'].map(ports)

In [12]:
dataset['Embarked'].head()

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: Embarked, dtype: float64

### Encoding the Output Data (Labels)

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dataset['Survived'] = le.fit_transform(dataset['Survived'])

In [14]:
dataset['Survived'].head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Dropping Irrelevant Input Data 

In [15]:
dataset.drop(['PassengerId', 'Name'], axis=1, inplace=True)

## Checking the Preprocessed Dataset

In [16]:
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,3,0.0,22.0,1,0,7.25,0.0,0
1,1,1.0,38.0,1,0,71.2833,1.0,1
2,3,1.0,26.0,0,0,7.925,0.0,1
3,1,1.0,35.0,1,0,53.1,0.0,1
4,3,0.0,35.0,0,0,8.05,0.0,0


## Splitting the Dataset into the Training Set and Test Set

In [17]:
X = dataset.iloc[:, 0:7]
y = dataset.iloc[:, -1]

## Showing the Input Data in a Table Format

In [18]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked
0,3,0.0,22.0,1,0,7.25,0.0
1,1,1.0,38.0,1,0,71.2833,1.0
2,3,1.0,26.0,0,0,7.925,0.0
3,1,1.0,35.0,1,0,53.1,0.0
4,3,0.0,35.0,0,0,8.05,0.0


## A Quick Check of the Output Data

In [19]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Taking Care of Missing Data Inputs

In [20]:
from sklearn.impute import SimpleImputer

In [21]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [22]:
X = imputer.fit_transform(X)

## Split Dataset into training & test sets

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [25]:
X_train.shape

(712, 7)

In [26]:
X_test.shape

(179, 7)

## Scaling the Features

In [27]:
# Scale the age and fare
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, [2,5]] = sc.fit_transform(X_train[:, [2,5]])

## Training and Testing Predictive Models

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
# Support vector machine
from sklearn.svm import SVC
sv_classifier = SVC(kernel='rbf')
sv_classifier.fit(X_train, y_train)
X_test[:, [2,5]] = sc.fit_transform(X_test[:, [2,5]])
Y_pred = sv_classifier.predict(X_test)
print(accuracy_score(y_test, Y_pred))

0.7932960893854749


In [30]:
from sklearn.linear_model import LogisticRegression
# Linear regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
X_test[:, [2,5]] = sc.fit_transform(X_test[:, [2,5]])
Y_pred = lr.predict(X_test)
print(accuracy_score(y_test, Y_pred))

0.8100558659217877
