In [1]:
import pandas as pd
import numpy as np

---
---
## ETL

In [2]:
# Read the labelled training split and the unlabelled test split from ./titanic.
train_data, test_data = (
    pd.read_csv("./titanic/train.csv"),
    pd.read_csv("./titanic/test.csv"),
)
# Show the number of rows in each split as the cell output.
train_data.shape[0], test_data.shape[0]

(891, 418)

In [3]:
# Fill missing ages with the mean age of the training set.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning on recent
# pandas and leaves train_data unchanged once Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# CabinLetter is the first character of the Cabin value; rows with a missing
# Cabin stay NaN (the .str accessor propagates missing values).
train_data['CabinLetter'] = train_data['Cabin'].str[0]

In [4]:
# Feature matrix: drop the label plus the identifier / free-text columns.
X = train_data.drop(
    ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
    axis="columns",
)
# Target vector: the Survived column.
y = train_data['Survived']

In [5]:
# Preview the first five rows of the raw feature matrix.
X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter
0,3,male,22.0,1,0,7.25,S,
1,1,female,38.0,1,0,71.2833,C,C
2,3,female,26.0,0,0,7.925,S,
3,1,female,35.0,1,0,53.1,S,C
4,3,male,35.0,0,0,8.05,S,


In [6]:
# One-hot encode the categorical columns (dummy_na=True adds a *_nan indicator
# per encoded column), then drop the redundant Sex_male indicator and the
# Sex_nan / Embarked_nan indicators.
X = (
    pd.get_dummies(X, columns=['Sex', 'Embarked', 'CabinLetter'], dummy_na=True)
    .drop(columns=['Sex_male', 'Sex_nan', 'Embarked_nan'])
)
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q,Embarked_S,CabinLetter_A,CabinLetter_B,CabinLetter_C,CabinLetter_D,CabinLetter_E,CabinLetter_F,CabinLetter_G,CabinLetter_T,CabinLetter_nan
0,3,22.0,1,0,7.25,0,0,0,1,0,0,0,0,0,0,0,0,1
1,1,38.0,1,0,71.2833,1,1,0,0,0,0,1,0,0,0,0,0,0
2,3,26.0,0,0,7.925,1,0,0,1,0,0,0,0,0,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0,1,0,0,0,0,0,0
4,3,35.0,0,0,8.05,0,0,0,1,0,0,0,0,0,0,0,0,1


---
---
## Modeling

In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle
from tqdm import tqdm

In [9]:
# Hold out 15% of the labelled rows for a final check; the split is seeded so
# it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
# Show the size of each part as the cell output.
X_train.shape[0], X_test.shape[0]

(757, 134)

In [21]:
def random_forest_cross_val(X, y, params, n_splits=3):
    """Return the mean validation accuracy of a random forest over stratified folds.

    Parameters
    ----------
    X : feature table, indexed positionally through ``.iloc``.
    y : labels aligned with ``X``, indexed positionally through ``.iloc``.
    params : mapping providing 'n_estimators', 'max_depth' and
        'min_samples_split' for the classifier.
    n_splits : number of stratified folds (default 3).

    Returns
    -------
    The sum of the per-fold accuracies divided by ``n_splits``.
    """
    # Shuffled, seeded folds so every parameter set is scored on the same splits.
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in folds.split(X, y):
        # A fresh, identically seeded forest is trained for each fold.
        forest = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            n_jobs=-1,
            random_state=42,
            verbose=0,
        )
        forest.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        holdout_pred = forest.predict(X.iloc[holdout_rows])
        fold_scores.append(accuracy_score(y.iloc[holdout_rows], holdout_pred))

    return sum(fold_scores) / n_splits

In [22]:
def grid_search_CV(X, y, params_GridSearch, n_splits=5):
    """Exhaustively search a parameter grid and return the best combination.

    Every combination produced by ``ParameterGrid(params_GridSearch)`` is scored
    with ``random_forest_cross_val``; the combination with the highest mean
    accuracy is printed and returned. On ties the earliest combination wins.

    Parameters
    ----------
    X, y : training features and labels, forwarded to ``random_forest_cross_val``.
    params_GridSearch : grid specification accepted by ``ParameterGrid``.
    n_splits : number of cross-validation folds per combination (default 5).

    Returns
    -------
    The best-scoring parameter dict.
    """
    # Start below any reachable accuracy so the first candidate is always kept.
    top_acc = float('-inf')
    top_params = []

    # The progress-bar options belong to tqdm(); passing them to list() raises
    # TypeError because list() accepts no keyword arguments.
    candidates = list(ParameterGrid(params_GridSearch))
    for prms in tqdm(candidates, ascii=True, desc='Params Tuning:'):
        acc = random_forest_cross_val(X, y, prms, n_splits=n_splits)

        if acc > top_acc:
            top_acc = acc
            top_params = prms

    print(f"Best params {top_params} | Accuracy: {top_acc}")

    return top_params

---
### Random Forest

In [23]:
# Search space for the random forest: 5 x 5 x 4 = 100 combinations.
params_GridSearch = {
    'n_estimators': [20, 30, 50, 100, 200],
    'max_depth': [2, 3, 4, 6, 8],
    'min_samples_split': [2, 3, 4, 8],
}

In [25]:
# Tune the forest on the training part only; the hold-out rows stay unseen.
best_params = grid_search_CV(
    X=X_train,
    y=y_train,
    params_GridSearch=params_GridSearch,
)

Best params {'max_depth': 6, 'min_samples_split': 4, 'n_estimators': 100} | Accuracy: 0.8242767514813524


In [26]:
# Refit a forest on the full training part using the tuned hyper-parameters.
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    verbose=0,
    **{
        name: best_params[name]
        for name in ('n_estimators', 'max_depth', 'min_samples_split')
    },
)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_split=4, n_jobs=-1,
                       random_state=42)

In [27]:
# Predict labels for the hold-out rows.
y_pred = model.predict(X=X_test)

In [28]:
# Accuracy of the hold-out predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8059701492537313

In [29]:
# classification_report returns a multi-line string. Print it so the table is
# rendered with real line breaks rather than shown as an escaped one-line repr.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.81      0.87      0.84        78\n           1       0.80      0.71      0.75        56\n\n    accuracy                           0.81       134\n   macro avg       0.80      0.79      0.80       134\nweighted avg       0.81      0.81      0.80       134\n'

In [30]:
# Persist the fitted model. The context manager flushes and closes the file
# handle, which a bare open() passed straight to pickle.dump never does.
with open('./models/model-randForest_n100_maxDepth6_minSamplesSplit4.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

---
### Submission file

In [9]:
# Reload the pickled random forest from ./models, closing the file handle when
# done. NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./models/model-randForest_n100_maxDepth6_minSamplesSplit4.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [11]:
# Build the submission feature matrix with the same preprocessing as training.
# NOTE: this rebinds X_test, replacing the hold-out split created by
# train_test_split earlier in the notebook.

# Impute with the training-set means. The result is assigned back to the
# column: fillna(inplace=True) on a column selection is chained assignment and
# does not reliably modify test_data on recent pandas.
test_data['Age'] = test_data['Age'].fillna(train_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].mean())

# CabinLetter is the first character of the Cabin value; missing stays NaN.
test_data['CabinLetter'] = test_data['Cabin'].str[0]

X_test = test_data.drop(columns=['PassengerId','Name','Ticket','Cabin'])
X_test = pd.get_dummies(X_test, columns=['Sex','Embarked','CabinLetter'], dummy_na=True)

# Align with the training columns in one step: indicators absent from the test
# data are added as all-zero columns, columns unseen in training are dropped,
# and the column ORDER is made identical to X. Appending missing columns at the
# end would leave them out of position, so the model would read the wrong
# feature from each slot.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [13]:
# Predict survival for the submission rows with the reloaded model.
y_pred = loaded_model.predict(X=X_test)

In [14]:
# Peek at the first ten predicted labels.
y_pred[0:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [15]:
# Two-column submission table: the passenger id next to its predicted label.
predictions = test_data[['PassengerId']].assign(Survived=y_pred)
predictions.head(n=10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [16]:
# Write the submission file without the DataFrame index column.
predictions.to_csv(path_or_buf='submission.csv', index=False)