# Titanic Dataset
The dataset contains passenger information of individuals who boarded the Titanic with infomration such as the survival status, class, fare and other variables.

In [1]:
# General libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#modelling libraries

#Data splitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Import metrics

# Classification metrics
from sklearn.metrics import classification_report


# Data Obtainance, Cleaning & Feature Engineering

Data loading

In [3]:
titanic_data = sns.load_dataset('titanic')
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


Data Cleaning

In [5]:
# Clean age, embarked, embark_town and deck
titanic_data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Cleaning

Remove duplicates

In [6]:
titanic_data.duplicated().sum()

107

In [10]:
titanic_data.drop_duplicates()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [11]:
titanic_data.duplicated().count()

891

In [12]:
titanic_data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
886     True
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

Dealing with missing data

In [13]:
# check for missing values
titanic_data.isnull().sum()


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [14]:
# Check percentages of missing data
titanic_data.isnull().mean()

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [16]:
# Impute age
titanic_data['age'].fillna(titanic_data['age'].mean(), inplace=True)

In [18]:
# Check value count
titanic_data['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [19]:
# Check embark_town value count
titanic_data['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [21]:
# Check deck value count
titanic_data['deck'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [22]:
# Impute embarked
titanic_data['embarked'].fillna('S', inplace=True)

In [23]:
# Impute embark_town
titanic_data['embark_town'].fillna('Southampton', inplace=True)

In [26]:
# Drop deck since it has over 70% missing values
titanic_data.drop('deck', axis=1, inplace=True)

In [27]:
# Check for null values again
titanic_data.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [28]:
# Checks the data
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [32]:
titanic_data.drop_duplicates(inplace=True)

In [33]:
titanic_data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
885    False
887    False
888    False
889    False
890    False
Length: 780, dtype: bool

In [34]:
titanic_data.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [35]:
# Check for irregularities in the data
report = ProfileReport(titanic_data)
report.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [36]:
# Create two datasets
titanic_classification = titanic_data.copy()

In [57]:
# Use label encoder to encode data

encoder = LabelEncoder()

# Create a mapper dictonary
mapper = {}

target_values = list(titanic_classification['survived'].unique())
for value, key in enumerate(target_values):
    mapper[key] = value
    
    print(mapper)
    
    titanic_classification['survived'] = titanic_classification['survived'].replace(mapper)
    
# Encode the categorical column - return columns that are categorical
cat_columns = [column for column in titanic_classification.columns if titanic_classification[column].dtype == 'O']

# Label encode - change all data to numeric
for column in cat_columns:
    titanic_classification[column] = encoder.fit_transform(titanic_classification[column])



{0: 0}
{0: 0, 1: 1}


In [51]:
titanic_classification.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,0,Third,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,1,First,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,0,Third,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,0,First,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,0,Third,1,True,2,0,True


In [55]:
# Split data - to train, test and split
x_c = titanic_classification.drop('survived', axis=1)
y_c = titanic_classification['survived']

# Create out first split
x_train_c, x_holdout_c, y_train_c, y_holdout_c = train_test_split(x_c, y_c, test_size=0.3, random_state=42)
x_test_c, x_val_c, y_test_c, y_val_c = train_test_split(x_holdout_c, y_holdout_c, test_size=0.5, random_state=42)



# Modelling

In [56]:
# Classification problem

# Instantiate our model
dt_classifier = DecisionTreeClassifier()

# Train model
dt_classifier.fit(x_train_c, y_train_c)

# Predictions on our model
y_predictions = dt_classifier.predict(x_val_c)

# Evaluate our model
print(classification_report(y_val_c, y_predictions))


ValueError: Cannot cast object dtype to float32

You can convert them to strings and do one hot encoding or label encoding