In [1]:
! pip install matplotlib --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting matplotlib
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 4.6 MB/s 
[?25hCollecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (295 kB)
[K     |████████████████████████████████| 295 kB 44.6 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.38.0-py3-none-any.whl (965 kB)
[K     |████████████████████████████████| 965 kB 48.4 MB/s 
Installing collected packages: fonttools, contourpy, matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.2.2
    Uninstalling matplotlib-3.2.2:
      Successfully uninstalled matplotlib-3.2.2
Successfully installed contourpy-1.0.6 fonttools-4.38.0 matplotlib-3.6.2


# Load Library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Load Dataset

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
pd.set_option('display.max_columns', None)
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
150,151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.525,,S
542,543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,347082,31.275,,S
221,222,0,2,"Bracken, Mr. James H",male,27.0,0,0,220367,13.0,,S
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S
658,659,0,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0,,S


# Data Preprocessing

In [5]:
#impute missing value in embarked column with mode
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [6]:
#impute missing value in age column with mean
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = train['Age'].fillna(train['Age'].median())

In [7]:
#impute missing value in test fare column with median
test['Fare'] = train['Fare'].fillna(train['Fare'].median())

In [8]:
train.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis = 1, inplace=True)
test.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis = 1, inplace=True)

# Feature Encoding

In [9]:
cats_encode = ['Sex', 'Embarked'] 

for col in cats_encode:
  print(f'value counts of column {col}')
  print(train[col].value_counts())
  print('---'*10, '\n')

value counts of column Sex
male      577
female    314
Name: Sex, dtype: int64
------------------------------ 

value counts of column Embarked
S    646
C    168
Q     77
Name: Embarked, dtype: int64
------------------------------ 



In [10]:
cats_encode = ['Sex', 'Embarked'] 

for col in cats_encode:
  print(f'value counts of column {col}')
  print(test[col].value_counts())
  print('---'*10, '\n')

value counts of column Sex
male      266
female    152
Name: Sex, dtype: int64
------------------------------ 

value counts of column Embarked
S    270
C    102
Q     46
Name: Embarked, dtype: int64
------------------------------ 



In [11]:
# label encode for train sex
mapping_sex = {
    'male' : 1,
    'female' : 0
}

train['Sex'] = train['Sex'].map(mapping_sex)
test['Sex'] = test['Sex'].map(mapping_sex)

In [12]:
# label encode for train sex
mapping_embarked = {
    'S' : 1,
    'C' : 2,
    'Q' : 3
}

train['Embarked'] = train['Embarked'].map(mapping_embarked)
test['Embarked'] = test['Embarked'].map(mapping_embarked)

In [13]:
train.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
651,1,2,0,18.0,0,1,23.0,1
593,0,3,0,28.0,0,2,7.75,3
550,1,1,1,17.0,0,2,110.8833,2
99,0,2,1,34.0,1,0,26.0,1
851,0,3,1,74.0,0,0,7.775,1


In [14]:
test.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
366,3,1,60.0,1,0,75.25,2
253,3,1,30.0,0,0,16.1,1
295,3,1,28.0,0,0,27.7208,1
21,3,1,34.0,0,1,13.0,1
5,3,1,28.0,0,0,8.4583,1


# Modeling

In [15]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [16]:
X = train.drop('Survived', axis=1)
y = train['Survived']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
### XGBoost
xg = XGBClassifier()
score = cross_validate(xg, X, y, cv=10, scoring='accuracy', return_train_score=True)
print('accuracy (train): '+ str(score['train_score'].mean()))
print('accuracy (test): '+ str(score['test_score'].mean()))

accuracy (train): 0.8762949056821119
accuracy (test): 0.829450686641698


In [19]:
### XGBoost
xg = XGBClassifier()
xg.fit(X_train, y_train)
prediction = xg.predict(X_test)
accuracy_score(y_test, prediction)

0.8059701492537313

In [20]:
submission_pred = xg.predict(test)

In [21]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'PassengerId' : df_test['PassengerId'],
    'Survived' : submission_pred
})
submission.sample(5)

Unnamed: 0,PassengerId,Survived
169,1061,0
353,1245,0
130,1022,0
278,1170,1
39,931,0


In [22]:
test['Survived'] = submission['Survived']
test.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
160,3,0,44.0,0,0,16.1,3,1
81,1,1,29.0,1,0,9.5,1,0
133,3,1,29.0,1,0,26.0,2,0
52,2,0,49.0,2,1,76.7292,1,1
270,1,1,28.0,0,0,31.0,2,0


In [23]:
submission.to_csv('submit.csv', index=False)