# Welcome to my first Kaggle challenge: the Titanic dataset

This project was written in a Google Colab notebook, so it differs slightly from a regular Jupyter notebook: for example, Google Drive has to be mounted before the CSV files stored there can be read.

In [2]:
# Mount Google Drive at /content/drive so the CSV files stored there can be read by path.
# This prompts for an authorization code when run interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


My first idea was to hold out part of the training dataset for local validation before submitting to the Kaggle platform, and to use a common machine learning algorithm, Random Forest, to see what score it achieves.

In [0]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [94]:
# Read the train and test CSV files from the mounted Drive folder.
df_train = pd.read_csv('/content/drive/My Drive/DataScience/titanic/train.csv')
df_test = pd.read_csv('/content/drive/My Drive/DataScience/titanic/test.csv')

# Report the (rows, columns) shape of each frame: test first, then train.
for label, frame in (("Test", df_test), ("Train", df_train)):
    print(label + " dataset size: ", frame.shape)

Test dataset size:  (418, 11)
Train dataset size:  (891, 12)


Note: most machine learning algorithms require numeric features, so categorical columns such as 'Sex' have to be converted to numbers. I used `get_dummies` to create binary indicator columns (for example, 'Sex_female' and 'Sex_male').

In [95]:
# One-hot encode the categorical columns; each generated column is prefixed
# with the name of the column it came from (e.g. Sex_female, Embarked_C).
categorical_cols = ['Sex', 'Embarked']
df_train = pd.get_dummies(
    df_train,
    columns=categorical_cols,
    prefix=categorical_cols,
)
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1
5,6,0,3,"Moran, Mr. James",,0,0,330877,8.4583,,0,1,0,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51.8625,E46,0,1,0,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",2.0,3,1,349909,21.075,,0,1,0,0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,,1,0,0,0,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,,1,0,1,0,0


Note: most machine learning algorithms cannot handle NaN or very large values, so I filled the missing ages with the mean age.

In [96]:
# Fill missing ages with the mean age of the column.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df_train['Age']: that form operates on a column
# selection, is deprecated chained assignment, and leaves df_train unchanged
# when pandas Copy-on-Write is enabled.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

# Count the remaining missing values per column to confirm Age is now complete.
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

In [97]:
# Show how many passengers fall under each distinct 'Parch' value.
df_train['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

The choice of features is essential. Here I quickly picked just a few of them to test the algorithm.

In [0]:
# Columns fed to the model: a few raw numeric columns plus the one-hot
# indicator columns produced by get_dummies.
features = [
    'Pclass',
    'Age',
    'SibSp',
    'Sex_female',
    'Sex_male',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]

# Target vector and feature matrix taken from the training frame.
y = df_train['Survived']
X = df_train[features]

In [99]:
# Hold out 35% of the training rows for local validation; the fixed
# random_state keeps the split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=0,
)
(X_train.shape, X_valid.shape)

((579, 8), (312, 8))

In [100]:
# Random forest with 100 trees, using all available cores (n_jobs=-1) and a
# fixed seed so the fitted model is reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)
# fit() returns the estimator, so the fitted model is displayed as the cell output.
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [101]:
# Predict on the held-out validation rows.
p = model.predict(X_valid)

# Accuracy: the fraction of validation rows where the prediction equals the label.
(p == y_valid).mean()

0.7564102564102564

The value below is a baseline: it predicts that every female passenger survives and every male passenger does not.

In [105]:
# Baseline prediction: the Sex_female indicator itself (1 = predicted survivor).
baseline = X_valid['Sex_female']

# Fraction of validation rows where the baseline matches the true label.
(baseline == y_valid).mean()

0.7756410256410257

In [0]:
# Apply the same preprocessing steps to the test set that were applied to the
# training set: one-hot encode 'Sex' and 'Embarked', then impute missing ages.
df_test = pd.get_dummies(df_test, prefix=['Sex', 'Embarked'], columns=['Sex', 'Embarked'])

# Assign the imputed column back instead of using fillna(..., inplace=True) on
# df_test['Age']: the in-place form acts on a column selection, is deprecated
# chained assignment, and leaves df_test unchanged under pandas Copy-on-Write,
# which would pass NaN ages on to model.predict.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())

# Select the same feature columns, in the same order, that the model was fitted on.
X_test = df_test[features]

In [0]:
# Predict survival for the test-set rows; `p` is rebound here and now holds
# the test predictions rather than the validation predictions.
p = model.predict(X_test)

In [0]:
# Build the submission: predictions named 'Survived', indexed by PassengerId.
sub = pd.Series(p, index=df_test['PassengerId'], name='Survived')

# Write the file by path so pandas manages the file handle and newline handling
# itself. header=True is passed explicitly because the default for
# Series.to_csv has differed between pandas versions, and the submission file
# needs the "PassengerId,Survived" header row.
sub.to_csv('/content/drive/My Drive/DataScience/titanic/predict.csv', header=True)