# Chapter 3: Classification
## Ex. 3: Titanic classifier

In [None]:
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import sklearn as sk
# NOTE(review): cross_val_predict is documented under sklearn.model_selection;
# this import appears to rely on sklearn.calibration re-using it internally.
from sklearn.calibration import cross_val_predict
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Global scikit-learn options: display estimators as diagrams and make
# transformers return pandas objects instead of NumPy arrays.
sk.set_config(display="diagram", transform_output="pandas")

In [2]:
# Download the Titanic competition archive into datasets/ with the Kaggle CLI
# (assumes the CLI is installed and API credentials are configured — TODO confirm).
!kaggle competitions download -p datasets titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Unpack the downloaded archive; the context manager closes the zip file
# handle as soon as extraction finishes instead of leaving it to the GC.
with ZipFile("datasets/titanic.zip") as archive:
    archive.extractall("datasets/titanic")

# Both CSVs are indexed by PassengerId so the index carries through to the
# predictions written out at the end of the notebook.
titanic_train = pd.read_csv("datasets/titanic/train.csv", index_col="PassengerId")
titanic_test = pd.read_csv("datasets/titanic/test.csv", index_col="PassengerId")

In [4]:
# Target is the "Survived" column; the training features are everything else.
y_train = titanic_train["Survived"]
X_train = titanic_train.drop(columns=["Survived"])
# The test frame is copied as-is so later steps never touch the loaded original.
X_test = titanic_test.copy()

X_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# One seed shared by every stochastic component (fold shuffling and the
# forest itself) so that Restart Kernel -> Run All reproduces the same scores.
RANDOM_STATE = 42

preprocessing = make_column_transformer(
    # These columns are not used as features.
    ("drop", ["Name", "Ticket", "Cabin"]),
    # Numeric columns: impute missing values (mean by default), then standardise.
    (
        make_pipeline(SimpleImputer(), StandardScaler()),
        make_column_selector(dtype_include="number"),
    ),
    # Dense output is needed because transform_output="pandas" is set globally;
    # binary categories collapse to a single indicator column.
    (OneHotEncoder(drop="if_binary", sparse_output=False), ["Sex", "Embarked"]),
    remainder="passthrough",
)

model = make_pipeline(
    preprocessing,
    RandomForestClassifier(random_state=RANDOM_STATE),
)

# Keys use the step name that make_pipeline derives from the estimator class.
param_grid = {
    "randomforestclassifier__n_estimators": [10, 100, 1000],
    # Floats are interpreted as a fraction of the training samples.
    "randomforestclassifier__min_samples_leaf": [0.001, 0.01, 0.1],
}

grid_search = GridSearchCV(
    model,
    param_grid,
    # Shuffled, stratified folds with a fixed seed so fold membership is stable.
    cv=StratifiedKFold(shuffle=True, random_state=RANDOM_STATE),
    scoring="accuracy",
    verbose=3,
)

grid_search

In [7]:
# Run the cross-validated search over param_grid on the training data
# (GridSearchCV refits the best candidate on the full training set by default).
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=10;, score=0.799 total time=   0.1s
[CV 2/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=10;, score=0.837 total time=   0.0s
[CV 3/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=10;, score=0.787 total time=   0.0s
[CV 4/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=10;, score=0.792 total time=   0.0s
[CV 5/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=10;, score=0.753 total time=   0.0s
[CV 1/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=100;, score=0.827 total time=   0.2s
[CV 2/5] END randomforestclassifier__min_samples_leaf=0.001, randomforestclassifier__n_estimators=100;, score=0.820 total time=   0.2s


In [8]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'randomforestclassifier__min_samples_leaf': 0.01,
 'randomforestclassifier__n_estimators': 10}

In [10]:
# Mean cross-validated score (accuracy) achieved by the best candidate.
grid_search.best_score_

0.8249011361496453

In [11]:
# Accuracy on the same data the model was fitted on, so it is an optimistic
# figure; compare it with the cross-validated scores.
accuracy_score(y_true=y_train, y_pred=grid_search.predict(X_train))

0.8473625140291807

In [12]:
# Accuracy of out-of-fold predictions from the best pipeline (default CV splitter).
accuracy_score(
    y_true=y_train,
    y_pred=cross_val_predict(grid_search.best_estimator_, X=X_train, y=y_train),
)

0.819304152637486

In [13]:
# Baseline: a classifier that ignores the features, scored the same way as the
# tuned model so the two accuracies are directly comparable.
dummy_classifier = DummyClassifier()
accuracy_score(
    y_true=y_train,
    y_pred=cross_val_predict(dummy_classifier, X=X_train, y=y_train),
)

0.6161616161616161

In [14]:
# Write test-set predictions as a PassengerId-indexed "Survived" column.
predictions_path = Path("models/03_titanic_test_predictions.csv")
# to_csv does not create missing directories, so make sure the target exists
# on a fresh checkout before writing.
predictions_path.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame(
    grid_search.predict(X_test),
    columns=["Survived"],
    index=X_test.index,
).to_csv(predictions_path)