In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import re 
from pathlib import Path
import tarfile 
import urllib.request
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


In [2]:
def download_titanic_dataset():
    """Fetch the Titanic tarball if needed and unpack it under ``datasets/``.

    The download and the extraction are guarded independently:

    * ``datasets/titanic.tgz`` is downloaded only when it is not on disk.
    * The archive is unpacked whenever ``datasets/titanic`` is missing, so a
      run can recover when the tarball is present but the extracted files
      are not (interrupted or deleted extraction).

    Returns:
        None. The function only has filesystem (and possibly network) effects.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "titanic.tgz"
    # read_dataset() looks for CSVs under datasets/titanic, so that directory
    # is used as the marker for "already extracted".
    extracted_dir = datasets_dir / "titanic"

    if not tarball_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/titanic.tgz"
        urllib.request.urlretrieve(url, tarball_path)

    if not extracted_dir.is_dir():
        # The "data" extraction filter rejects absolute paths, ".." traversal
        # and special files. It only exists on Python versions that expose
        # tarfile.data_filter, so fall back to the default behaviour otherwise.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(tarball_path) as titanic_tarball:
            titanic_tarball.extractall(path=datasets_dir, **extract_kwargs)

def read_dataset(dataset_type):
    """Load one Titanic CSV split into a DataFrame.

    Args:
        dataset_type: File stem of the CSV to read (the notebook uses
            'train' and 'test'); it is interpolated into
            ``datasets/titanic/<dataset_type>.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = f'datasets/titanic/{dataset_type}.csv'
    frame = pd.read_csv(csv_path)
    return frame

def categorical_count(data, cat_col):
    """Plot and tabulate the frequency of each value in a categorical column.

    Draws a seaborn count plot of ``data[cat_col]`` and returns a table with
    the count and the percentage share of each distinct value.

    Args:
        data: DataFrame containing ``cat_col``.
        cat_col: Name of the column to summarise.

    Returns:
        DataFrame with columns ``[cat_col, 'count', 'percent']``, one row per
        distinct value, sorted by the category value. ``percent`` is the share
        of all counted rows, rounded to 2 decimals.
    """
    sns.countplot(x=data[cat_col])
    plt.show()

    cat_col_count = (
        data[cat_col]
        .value_counts()
        # Name the index and the values explicitly: pandas < 2.0 would
        # otherwise label the category column 'index' after reset_index().
        .rename_axis(cat_col)
        .rename('count')
        .reset_index()
    )
    cat_col_count['percent'] = (
        cat_col_count['count'] / cat_col_count['count'].sum() * 100
    ).round(2)
    # sort_values returns a new frame; the result has to be kept, otherwise
    # the table stays in value_counts order.
    cat_col_count = cat_col_count.sort_values(cat_col, ignore_index=True)
    return cat_col_count

def feature_survival(data, col):
    """Plot and tabulate row counts per (feature value, survived) pair.

    Groups ``data`` by ``col`` and ``'survived'``, counts the rows in each
    group, and draws a seaborn bar plot of each group's percentage share,
    split by ``survived``.

    Rows are counted with ``groupby(...).size()`` rather than by counting
    non-null ``passengerid`` values, so the function does not require a
    ``passengerid`` column. The two give the same counts whenever
    ``passengerid`` has no missing values.

    Args:
        data: DataFrame containing ``col`` and a ``'survived'`` column.
        col: Name of the feature column to break survival down by.

    Returns:
        DataFrame with columns ``[col, 'survived', 'count', 'percent']``.
        ``percent`` is each group's share of the total number of counted rows
        (not the survival rate within a feature value), rounded to 2 decimals.
    """
    data_count = (
        data.groupby([col, 'survived'])
        .size()
        .reset_index(name='count')
    )
    data_count['percent'] = (
        data_count['count'] / data_count['count'].sum() * 100
    ).round(2)
    sns.barplot(x=data_count[col], y=data_count['percent'], hue=data_count['survived'])
    plt.show()
    return data_count

In [3]:
# Make sure the Titanic CSVs are on disk
download_titanic_dataset()

# Load both splits into DataFrames
train_df, test_df = (read_dataset(split) for split in ('train', 'test'))

# Work on a copy so train_df itself is left unmodified
titanic = train_df.copy()

In [4]:
# Normalise every column label to lower case
titanic = titanic.rename(columns=str.lower)

# Preview the first 5 rows
titanic.head(5)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Print a summary of the working copy of the training set:
# index range, per-column non-null counts and dtypes, and memory usage
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Cast pclass from int64 to object so it is handled as a label
# rather than as a numeric quantity
titanic['pclass'] = titanic['pclass'].astype('object')

# Confirm the new dtype; this call is what produces the summary
# displayed as this cell's output
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    object 
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 83.7+ KB


In [6]:
# Percentage of missing values for each column that has fewer non-null
# entries than rows
missing_cols = ['age', 'cabin', 'embarked']
n_rows = len(titanic)

for col in missing_cols:
    n_missing = titanic[col].isna().sum()
    pct_missing = round(n_missing / n_rows * 100, 2)
    print(f"{col}: {pct_missing} %")

age: 19.87 %
cabin: 77.1 %
embarked: 0.22 %


In [7]:
# summary statistics on titanic dataset
# NOTE(review): the saved output below still lists pclass, although the cell above
# (execution count 8) casts pclass to object. That suggests this cell (count 7) was
# executed before the cast; presumably a fresh top-to-bottom run would no longer
# show pclass here -- re-run the notebook in order and confirm.
titanic.describe()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.4167,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292
