In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import dtale as dt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.svm import SVC, LinearSVC

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import SGDClassifier

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, cross_val_predict


#### Load the dataset

The train and test were provided separately

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Using the D-tale library to display the dataset

In [8]:
dt.show(train)

2020-12-04 03:54:19,364 - INFO     - NumExpr defaulting to 4 threads.




In [9]:
dt.show(test)



In [5]:
train.shape

(891, 12)

In [6]:
test.shape

(418, 11)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Performing Data Cleaning and Analysis
<!-- #### 1. Understanding meaning of each column: -->
<!-- <br>Data Dictionary: -->
<br>**Variable        Description**</br>
1. Survived	- Survived (1) or died (0)
2. Pclass -	Passenger’s class (1 = 1st, 2 = 2nd, 3 = 3rd)
3. Name	- Passenger’s name
4. Sex -	Passenger’s sex
5. Age	- Passenger’s age
6. SibSp -	Number of siblings/spouses aboard
7. Parch -	Number of parents/children aboard (Some children travelled only with a nanny, therefore parch=0 for them.)
8. Ticket -	Ticket number
9. Fare -	Fare
10. Cabin -	Cabin
11. Embarked -	Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [10]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Observation:

From the above, it was discovered that;

1. The train dataset was (891 rows, 12 column) and the test (418 rows, 11 collumns)
2. There are missing value in age, Embarked and cabin for both train and test dataset
3. Some columns are not needed to determining the survival of a person e.g. Name, PassengerId, Ticket 

#### Deleting unnecessary column for both train and test dataset

In [12]:
"""
Deleting the columns not required for determining the survival of a person in the train data
"""

del train['PassengerId']
del train['Ticket']
del train['Fare']
del train['Cabin']
del train['Name']

In [13]:
"""
Similarly, deleting the columns not required for determining the survival of a person for the test.csv data
"""

del test['Ticket']
del test['Fare']
del test['Cabin']
del test['Name']

## Imputation method for missing value

The cabin columns has been removed since it is not usefull.

#### Embarked column has 2 missing value in the train data, though removing or doing an imputation will not make much of a difference, Imputation was chosen. The missing value will be replaced by the mode since it is an object data type.


#### Age column has missing value for some of the people in both training and testing data. It can be solved by 
* filling the ones who have survived with the mean age of the survived people
* similarly fill those who haven't survived with the mean age of all non-survived people.

#### But Note, this type of imputation will not be proper for testing, this is because the null will be filled with the single value of mean to predict their survival status but this will not generalise as the case may be. 

* To solve the issue, an array of random numbers which are generated from mean age value in regards to standard deviation and is_null will be used for the missing value imputation.

In [14]:
# we create a list of training and testing dataset

titanic_data = [train, test]

for data in titanic_data:
    mean = train['Age'].mean()
    std = test['Age'].std()
    is_null = data["Age"].isnull().sum()
    
    # random numbers from mean, standard deviation and is_null will be computed
    random_age = np.random.randint(mean - std, mean + std, size = is_null)

    # fill NaN values in Age column with random values generated
    age_slice = data["Age"].copy()
    age_slice[np.isnan(age_slice)] = random_age
    data['Age'] = age_slice
    data['Age'] = train['Age'].astype(int)
    
train['Age'].isnull().sum

<bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool>

In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Pclass    891 non-null    int64 
 2   Sex       891 non-null    object
 3   Age       891 non-null    int32 
 4   SibSp     891 non-null    int64 
 5   Parch     891 non-null    int64 
 6   Embarked  889 non-null    object
dtypes: int32(1), int64(4), object(2)
memory usage: 45.4+ KB


In [16]:
train['Embarked'].fillna((train['Embarked'].value_counts().index[0]), inplace=True)

In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  418 non-null    int64 
 1   Pclass       418 non-null    int64 
 2   Sex          418 non-null    object
 3   Age          418 non-null    int32 
 4   SibSp        418 non-null    int64 
 5   Parch        418 non-null    int64 
 6   Embarked     418 non-null    object
dtypes: int32(1), int64(4), object(2)
memory usage: 21.4+ KB


In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Pclass    891 non-null    int64 
 2   Sex       891 non-null    object
 3   Age       891 non-null    int32 
 4   SibSp     891 non-null    int64 
 5   Parch     891 non-null    int64 
 6   Embarked  891 non-null    object
dtypes: int32(1), int64(4), object(2)
memory usage: 45.4+ KB


In [19]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.359147,0.523008,0.381594
std,0.486592,0.836071,13.559485,1.102743,0.806057
min,0.0,1.0,0.0,0.0,0.0
25%,0.0,2.0,20.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,37.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


In [20]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch
count,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,28.521531,0.447368,0.392344
std,120.810458,0.841838,13.300647,0.89676,0.981429
min,892.0,1.0,0.0,0.0,0.0
25%,996.25,1.0,20.0,0.0,0.0
50%,1100.5,3.0,28.0,0.0,0.0
75%,1204.75,3.0,36.0,1.0,0.0
max,1309.0,3.0,71.0,8.0,9.0


### Categorical Encoding

#### Column sex and Embarked neede to be encoded as they are categorical feature.
* One-Hot Encoding will be used in encoding the sex feature
* Label encoding will be used for Embarked features

In [27]:

# mark the variable as categorical type

train['Sex'] = train['Sex'].astype('category')
test['Sex'] = test['Sex'].astype('category')

train['Embarked'] = train['Embarked'].astype('category')
test['Embarked'] = test['Embarked'].astype('category')

# Also convert the passenger's class to category
train['Pclass'] = train['Pclass'].astype('category')
test['Pclass'] = test['Pclass'].astype('category')

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    category
 2   Sex       891 non-null    category
 3   Age       891 non-null    int32   
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Embarked  891 non-null    category
dtypes: category(3), int32(1), int64(3)
memory usage: 27.4 KB


In [22]:
train['Sex'] = pd.get_dummies(train['Sex'])
test["Sex"] = pd.get_dummies(test['Sex'])

train['Embarked'] = train['Embarked'].cat.codes
test['Embarked'] = test['Embarked'].cat.codes

In [29]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22,1,0,2
1,1,1,1,38,1,0,0
2,1,3,1,26,0,0,2
3,1,1,1,35,1,0,2
4,0,3,0,35,0,0,2


In [30]:
train[['Pclass','Survived']].groupby(['Pclass']).mean().sort_values(by='Survived',ascending=True)

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
3,0.242363
2,0.472826
1,0.62963


In [24]:
dt.show(train)

