In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read both splits with identical options; PassengerId becomes the row index
# instead of a feature column.
train, test = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ('data/train.csv', 'data/test.csv')
)
train.shape, test.shape

((891, 11), (418, 10))

In [3]:
# Show the first two rows to get a feel for the raw columns.
train.head(n=2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# Class balance of the target: number of training rows per Survived value.
train["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [5]:
# Column dtypes and non-null counts, to see which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
# Name of the target column. The identifier is a misspelling of "label_name";
# it is left unchanged because later cells reference it under this name.
lable_name = "Survived"

In [7]:
# Target vector comes straight from the label column.
y_train = train[lable_name]

# Feature frames: the free-text "Name" column is excluded from both splits,
# and the label column is additionally removed from the training features.
X_train = train.drop([lable_name, "Name"], axis="columns")
X_test = test.drop(["Name"], axis="columns")

In [8]:
# Every non-numeric feature column is treated as categorical in both splits.
cat_col = X_train.select_dtypes(exclude="number").columns
print(cat_col)

# One column -> dtype mapping applied to train and test alike.
as_category = dict.fromkeys(cat_col, "category")
X_train = X_train.astype(as_category)
X_test = X_test.astype(as_category)

Index(['Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


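* [HistGradientBoostingClassifier API reference](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html)
* [User guide: histogram-based gradient boosting](https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting)
* [Example: categorical feature support in gradient boosting](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html)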

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Preprocessing: standardize numeric columns, one-hot encode categorical ones.
# - handle_unknown="ignore": categories that appear only at transform time
#   (e.g. in the test split) do not raise.
# - sparse_threshold=0: always return a dense ndarray. Previously the code
#   called .toarray() on the result, which only works while the stacked output
#   happens to be sparse enough for ColumnTransformer to return a sparse
#   matrix; with denser data it returns an ndarray and .toarray() raises
#   AttributeError.
ct = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown="ignore"),
     make_column_selector(dtype_include="category")),
    sparse_threshold=0,
)

# Fit on the training features only, then apply the same fitted transform to test.
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)
X_train_ct.shape, X_test_ct.shape

((891, 840), (418, 840))

In [10]:
# Wrap the transformed training matrix in a DataFrame labelled with the
# transformer's output column names, then summarize each column.
df_X_train = pd.DataFrame(
    data=X_train_ct,
    columns=ct.get_feature_names_out(),
)
df_X_train.describe()

Unnamed: 0,standardscaler__Pclass,standardscaler__Age,standardscaler__SibSp,standardscaler__Parch,standardscaler__Fare,onehotencoder__Sex_female,onehotencoder__Sex_male,onehotencoder__Ticket_110152,onehotencoder__Ticket_110413,onehotencoder__Ticket_110465,...,onehotencoder__Cabin_F33,onehotencoder__Cabin_F38,onehotencoder__Cabin_F4,onehotencoder__Cabin_G6,onehotencoder__Cabin_T,onehotencoder__Cabin_nan,onehotencoder__Embarked_C,onehotencoder__Embarked_Q,onehotencoder__Embarked_S,onehotencoder__Embarked_nan
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,-8.772133e-17,2.388379e-16,4.3860660000000004e-17,5.3829000000000005e-17,3.9873330000000004e-18,0.352413,0.647587,0.003367,0.003367,0.002245,...,0.003367,0.001122,0.002245,0.004489,0.001122,0.771044,0.188552,0.08642,0.722783,0.002245
std,1.000562,1.000701,1.000562,1.000562,1.000562,0.47799,0.47799,0.057961,0.057961,0.047351,...,0.057961,0.033501,0.047351,0.06689,0.033501,0.420397,0.391372,0.281141,0.447876,0.047351
min,-1.566107,-2.016979,-0.4745452,-0.4736736,-0.6484217,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.3693648,-0.6595416,-0.4745452,-0.4736736,-0.4891482,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.8273772,-0.1170488,-0.4745452,-0.4736736,-0.3573909,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.8273772,0.571831,0.4327934,-0.4736736,-0.02424635,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,0.8273772,3.465126,6.784163,6.974147,9.667167,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Summary of the per-column summaries: one row per feature after transposing,
# then describe() across all features.
df_X_train.describe().transpose().describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,840.0,840.0,840.0,840.0,840.0,840.0,840.0,840.0
mean,890.789286,0.004761905,0.045964,-0.006166,-0.001746,0.002863,0.00635,1.027045
std,6.10708,0.04483991,0.08117,0.093654,0.051874,0.071329,0.080151,0.422489
min,714.0,-8.772133e-17,0.033501,-2.016979,-0.659542,-0.474545,-0.473674,0.827377
25%,891.0,0.001122334,0.033501,0.0,0.0,0.0,0.0,1.0
50%,891.0,0.001122334,0.033501,0.0,0.0,0.0,0.0,1.0
75%,891.0,0.001122334,0.033501,0.0,0.0,0.0,0.0,1.0
max,891.0,0.7710438,1.000701,0.0,1.0,1.0,1.0,9.667167


In [12]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline  # NOTE(review): not used in the cells visible here

# Histogram-based gradient boosting classifier with a fixed random_state.
model = HistGradientBoostingClassifier(random_state=42)
model

In [13]:
from sklearn.model_selection import cross_validate

# 3-fold cross-validation of the classifier on the preprocessed training matrix.
# NOTE(review): the matrix was produced by a transformer fitted on all training
# rows, so the folds share preprocessing statistics.
cv_result = cross_validate(
    estimator=model,
    X=X_train_ct,
    y=y_train,
    cv=3,
)
cv_result

{'fit_time': array([3.36035085, 3.40240192, 3.3999908 ]),
 'score_time': array([0.00230718, 0.0027051 , 0.00193405]),
 'test_score': array([0.8047138 , 0.83501684, 0.83501684])}

In [14]:
# Fit the classifier on the full preprocessed training set (this cell only fits;
# prediction on the test set happens in a later cell).
model.fit(X_train_ct, y_train)

[User guide: permutation feature importance](https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance)

In [15]:
from sklearn.inspection import permutation_importance

# Permutation importance of each transformed feature, measured on the same
# training data the model was fitted on, with 3 shuffles per feature.
result = permutation_importance(
    estimator=model,
    X=X_train_ct,
    y=y_train,
    n_repeats=3,
    random_state=0,
)

In [16]:
# Output column names of the fitted column transformer; indexed by feature
# position when reporting importances.
feature_names = ct.get_feature_names_out()

In [17]:
# Report features whose mean permutation importance is more than two standard
# deviations above zero, most important first.
#
# Fix: the name used to be padded to only 8 characters with no separator, so
# every (longer) feature name ran straight into its score, e.g.
# "onehotencoder__Sex_female0.213". Names are now padded to the longest
# reported name and followed by a two-space separator.
importance_mean = result.importances_mean
importance_std = result.importances_std

significant = [
    i for i in importance_mean.argsort()[::-1]
    if importance_mean[i] - 2 * importance_std[i] > 0
]
# default=0 keeps this safe when no feature passes the threshold.
name_width = max((len(feature_names[i]) for i in significant), default=0)

for i in significant:
    print(f"{feature_names[i]:<{name_width}}  "
          f"{importance_mean[i]:.3f}"
          f" +/- {importance_std[i]:.3f}")

onehotencoder__Sex_female0.213 +/- 0.011
standardscaler__Age0.175 +/- 0.006
standardscaler__Fare0.137 +/- 0.005
standardscaler__Pclass0.083 +/- 0.004
standardscaler__SibSp0.029 +/- 0.006
onehotencoder__Cabin_nan0.021 +/- 0.003
onehotencoder__Embarked_C0.012 +/- 0.002
standardscaler__Parch0.010 +/- 0.003
onehotencoder__Embarked_S0.004 +/- 0.001


In [18]:
# Predict labels for the preprocessed test set and peek at the first five.
y_pred_hgb = model.predict(X_test_ct)
y_pred_hgb[:5]

array([0, 0, 0, 0, 0])

In [19]:
# Distribution of the predicted classes on the test set.
pd.Series(y_pred_hgb).value_counts()

0    270
1    148
Name: count, dtype: int64

In [21]:
# Load data/gender_submission.csv indexed by PassengerId; its Survived column
# is overwritten with the model's predictions in a later cell.
submit = pd.read_csv("data/gender_submission.csv", index_col="PassengerId")
submit.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [22]:
# The assignment below is positional: y_pred_hgb is a plain array whose rows
# follow X_test's order. Verify that the template's PassengerId index matches
# X_test's index exactly, so a reordered or different template fails loudly
# instead of silently attaching predictions to the wrong passengers.
if not submit.index.equals(X_test.index):
    raise ValueError(
        "PassengerId index of the submission template does not match X_test"
    )
submit["Survived"] = y_pred_hgb

In [24]:
# Write the submission (PassengerId index included) and read it back to
# eyeball the file that was actually produced.
submission_path = "submission_hgb.csv"
submit.to_csv(submission_path)
pd.read_csv(submission_path)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
