In [2]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 104 kB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.9.1


In [4]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 96 kB/s eta 0:00:011
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install feature-engine

# Importing all required libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import plotly.offline as pyo
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import scipy.stats as stats
import statistics
import re

from feature_engine.transformation import PowerTransformer
from feature_engine.imputation import RandomSampleImputer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import plot_tree, export_text

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



In [7]:
# Load the training data from train.csv (path is relative to the working directory)

dataset = pd.read_csv('train.csv')
print(dataset.shape)  # (rows, columns)

dataset.head()  # preview the first five rows

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Data analysis: distribution of the target feature

# Order the counts by class label (0, then 1) so each bar lines up with its tick label.
# value_counts() on its own orders by frequency, which would silently swap the labels
# if class 1 were ever the larger class.
survival_counts = dataset['Survived'].value_counts().sort_index()

data = [go.Bar(x = ['Not Survived', 'Survived'], y = survival_counts, marker=dict(color='#DD0F32'))]
layout = go.Layout(title = 'Titanic Dataset: Target Feature')

fig = go.Figure(data, layout)
fig.update_layout(autosize = False, width = 500, height = 500, yaxis_title = 'Count')

pyo.iplot(fig)

In [12]:
# Share of each target class, expressed as a percentage rounded to two decimals
survived_counts = dataset['Survived'].value_counts()
total_rows = len(dataset['Survived'])
(survived_counts / total_rows).apply(lambda share: round(share*100, 2))

0    61.62
1    38.38
Name: Survived, dtype: float64

This is an imbalanced dataset: 61.62% of the passengers did not survive the Titanic disaster, while only 38.38% survived.

In order to balance the dataset, we have to resample it using methods such as oversampling the minority class (i.e., class 1) with synthetic samples, undersampling the majority class (i.e., class 0), or combining both over- and undersampling.

##### The Variable Types
Checking the data type of all features is an important step before proceeding further. It is possible for pandas to make incorrect assumptions about the dataset while loading.

In this dataset, we can see that the dtype of two features is incorrect.

In [13]:
# Summarise each column's non-null count and dtype to spot incomplete or mis-typed features
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


'Survived' is our target feature. Classification algorithms work only when the target feature is a category. Hence, we must convert the dtype of 'Survived'.

According to the given data_description.txt file, Pclass is a category, but pandas inferred it as int64 when reading the file. We must convert it as well.

##### Datatype Conversion
Convert the datatypes of the 'Survived' and 'Pclass' features to object, since their values are category labels rather than numeric quantities.

In [14]:
# Both columns were read as int64; store them with the generic object dtype instead
for label_column in ('Survived', 'Pclass'):
    dataset[label_column] = dataset[label_column].astype('object')

In [15]:
# Object-dtype columns are treated as categorical; the target 'Survived' is left out
categorical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column != 'Survived' and column_dtype == 'O'
]
print("Total number of Categorical Features: ", len(categorical_features))

print(categorical_features)

Total number of Categorical Features:  6
['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [16]:
# Every column that is neither categorical nor the target is treated as numerical
non_numerical = set(categorical_features) | {'Survived'}
numerical_features = [column for column in dataset.columns if column not in non_numerical]
print("Total number of Numerical Features: ", len(numerical_features))

print(numerical_features)

Total number of Numerical Features:  5
['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare']


###### Missing Values
Missing values create a bias in our dataset that affects the performance of our model. They reduce the statistical power of hypothesis tests (our ability to correctly accept or reject the null hypothesis) and also the representativeness of the sample.

If a feature has a high percentage of missing values, we can drop it entirely to avoid the biased results that imputing most of its values would introduce.

In [18]:
# Columns of any dtype (not only numerical ones) that contain at least one null value,
# listed in the frame's column order
features_with_missing_values = [column for column in dataset.columns if dataset[column].isnull().any()]
print('Total number of features which have null values: ', len(features_with_missing_values))
print(features_with_missing_values)

Total number of Numerical features which have null values:  3
['Age', 'Cabin', 'Embarked']


##### Visualizing the percentage of missing values

In [19]:
# Percentage of missing entries per affected feature, largest first
percentage_of_na = dataset[features_with_missing_values].isnull().mean().sort_values(ascending=False) * 100
feature_names = percentage_of_na.index.tolist()

layout = go.Layout(title = 'Percentage of missing values in each feature')
data = [go.Bar(x = feature_names, y = percentage_of_na, marker = {'color': '#CD7F32'})]

fig = go.Figure(data = data, layout = layout)
fig.update_layout(autosize = False, width = 600, height = 550, yaxis_title = 'Percentage')

pyo.iplot(fig)

About 77.1% of Cabin values are missing. We can drop this feature.

About 19.86% of Age values are missing. We will check the distribution and then decide what to do with it.

Since only two rows (0.22%) are missing a value in the Embarked column, we can drop those rows.

##### Relationship between the missing values and target variable

In [20]:
def analyse_missing_data(df, feature):
    """Plot how many rows have `feature` missing versus present.

    A copy of `df` is made, `feature` is replaced on the copy by a 0/1
    missing flag, and the non-null 'Survived' entries are counted for each
    flag value. The counts are shown as a plotly bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `feature` and a 'Survived' column. It is not modified.
    feature : str
        Name of the column whose missingness is analysed.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    flagged = df.copy()

    # 1 -> If value is missing and 0 -> If value is not missing
    flagged[feature] = np.where(flagged[feature].isnull(), 1, 0)
    counts = flagged.groupby(feature)['Survived'].agg('count')

    # Derive each tick label from the flag it belongs to. Hard-coding both labels
    # positionally mislabels (or mismatches the length of) the bars whenever only
    # one of the two groups is present.
    labels = ['Missing' if flag == 1 else 'Not Missing' for flag in counts.index]

    traces = [go.Bar(x = labels, y = counts)]
    layout = go.Layout(title = 'Relationship between {} and Survived'.format(feature))

    fig = go.Figure(traces, layout)
    # The bars are row counts, so the axis is titled accordingly (not with the feature name)
    fig.update_layout(autosize = False, width = 600, height = 400, yaxis_title = 'Count')

    pyo.iplot(fig)

In [21]:
# Draw one missing-vs-present bar chart for every feature that contains null values
for feature in features_with_missing_values:
    analyse_missing_data(dataset, feature)