In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

#visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Increase default plot size
# (figure.figsize is the default (width, height) of new figures, in inches)
from pylab import rcParams
rcParams['figure.figsize'] = (6, 5)

#warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below -- consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test splits from the local titanic-dataset folder
og_train_df, og_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic-dataset/train.csv', 'titanic-dataset/test.csv')
)

# Preview the first rows of the training split
og_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#look into data type
# Prints each column's dtype and non-null count for the training frame
og_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [43]:
# Both frames collected in one list so the same step can be applied to each.
# The list stores references, so in-place changes made through it modify
# og_train_df / og_test_df themselves.
full_dataset = [og_train_df, og_test_df]

In [42]:
#inspect missing values
def frame_missing(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by the column names of ``df``, ordered from
    the most to the fewest missing entries. It has two positionally labelled
    columns: the first holds the number of nulls in each column, the second
    holds that number as a percentage of the row count of ``df``.
    """
    n_rows = df.shape[0]
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = null_counts / n_rows * 100
    return pd.concat([null_counts, null_share], axis=1)

# Build the per-column missing-value summary for each split
miss_train = frame_missing(og_train_df)
miss_test = frame_missing(og_test_df)

# Label the count / percentage columns so the two splits can be told apart
miss_train.columns = ["train - total nan", "train - % nan"]
miss_test.columns = ["test - total nan", "test - % nan"]

# Side-by-side view, aligned on the feature names
pd.concat([miss_train, miss_test], axis=1)

Unnamed: 0,train - total nan,train - % nan,test - total nan,test - % nan
Age,177,19.86532,86.0,20.574163
Cabin,687,77.104377,327.0,78.229665
Embarked,2,0.224467,0.0,0.0
Fare,0,0.0,1.0,0.239234
Name,0,0.0,0.0,0.0
Parch,0,0.0,0.0,0.0
PassengerId,0,0.0,0.0,0.0
Pclass,0,0.0,0.0,0.0
Sex,0,0.0,0.0,0.0
SibSp,0,0.0,0.0,0.0


In [44]:
#fill missing embarked values - train df
# Assign the filled column back onto the frame. Calling fillna(inplace=True)
# on `og_train_df.Embarked` acts on an intermediate Series (chained
# assignment), which newer pandas versions do not propagate to the frame.
# Column assignment keeps og_train_df the same object, so anything already
# holding a reference to it (e.g. full_dataset) sees the filled values.
og_train_df['Embarked'] = og_train_df['Embarked'].fillna('C')

# Sanity check: number of nulls left in the column (expected 0)
og_train_df['Embarked'].isnull().sum()

0

In [46]:
#drop missing fare values - test df
# Drop the *rows* of the frame whose Fare is missing. The previous
# `og_test_df.Fare.dropna(inplace=True)` only shrank the column Series handed
# back by attribute access; the frame itself kept the row and its NaN.
# inplace=True is deliberate: full_dataset holds a reference to this same
# frame object, and rebinding the name would leave that list pointing at the
# un-cleaned frame.
# NOTE(review): this removes a passenger from the test split -- confirm that
# downstream prediction/submission code tolerates the reduced row count.
og_test_df.dropna(subset=['Fare'], inplace=True)

# Sanity check: number of nulls left in the column (expected 0)
og_test_df['Fare'].isnull().sum()

0

In [47]:
# Cabin is missing for most rows in both splits, so the whole column is removed.
# The drop is done in place on every frame referenced by full_dataset.
for df in full_dataset:
    df.drop('Cabin', axis=1, inplace=True)

In [49]:
#everything is cleaned except Age
# NOTE: plain assignment creates aliases, not copies -- cln_train_df and
# cln_test_df are the same objects as og_train_df and og_test_df, so a change
# made through either name is visible through both.
cln_train_df = og_train_df
cln_test_df = og_test_df

# Preview the first rows of the training frame
cln_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [None]:
# Report the (rows, columns) tuple of each cleaned split
train_shape, test_shape = cln_train_df.shape, cln_test_df.shape
print("Shape of our train dataset: ", train_shape)
print("Shape of our test dataset: ", test_shape)