In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, Binarizer, KBinsDiscretizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector

In [2]:
train_data = pd.read_csv("./titanic/train.csv")
test_data = pd.read_csv("./titanic/test.csv")
len(train_data), len(test_data)

(891, 418)

---
---
## Random forest

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
df_train = train_data.loc[~train_data['Embarked'].isna(), ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']]
X_train = df_train.drop(columns='Survived')
y_train = df_train['Survived']

X_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']]

In [5]:
pipe_age = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform') )

pipe_fare = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans') )

def valid_letter(l):
    return l == 'A' or l == 'B' or l == 'C' or l == 'D' or l == 'E' or l == 'F'

def extract_cabin_letter(X):
    return X.applymap(lambda x : str(x)[0] if not pd.isna(x) and valid_letter(str(x)[0]) else 'nan')

cabin_letter = FunctionTransformer(extract_cabin_letter)

pipe_cabin = make_pipeline(
    cabin_letter,
    OneHotEncoder(drop='if_binary') )

column_transformer = make_column_transformer(
    (pipe_cabin, ['Cabin']),
    (pipe_age, ['Age']),
    (pipe_fare, ['Fare']),
    (Binarizer(threshold=0), ['SibSp','Parch']),
    (OneHotEncoder(drop='if_binary'), ['Sex','Embarked']),
    remainder='passthrough')

model = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

pipe = make_pipeline(column_transformer, model)

In [6]:
pipe.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('pipeline-1',
                                   Pipeline(steps=[('functiontransformer',
                                                    FunctionTransformer(func=<function extract_cabin_letter at 0x7fbe208eea60>)),
                                                   ('onehotencoder',
                                                    OneHotEncoder(drop='if_binary'))]),
                                   ['Cabin']),
                                  ('pipeline-2',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer()),
                                                   ('kbinsdiscretizer',
                                                    KBinsDiscretizer(encode='ordinal',
                                                                     n_bins=8,
                                                

In [7]:
pd.DataFrame(column_transformer.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,3.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0


In [8]:
param_grid = {'randomforestclassifier__n_estimators':[30,50,100],
              'randomforestclassifier__max_depth':[2,3,4,6],
              'randomforestclassifier__min_samples_split':[2,4,6]}

In [9]:
gs_cv = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=10, n_jobs=-1)

In [10]:
%%time
gs_cv.fit(X_train, y_train);

CPU times: user 2.45 s, sys: 396 ms, total: 2.84 s
Wall time: 24.4 s


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('functiontransformer',
                                                                                          FunctionTransformer(func=<function extract_cabin_letter at 0x7fbe208eea60>)),
                                                                                         ('onehotencoder',
                                                                                          OneHotEncoder(drop='if_binary'))]),
                                                                         ['Cabin']),
                                                                        ('pipeline-2',
                                                               

In [11]:
gs_cv.best_score_, gs_cv.best_params_

(0.8313329928498467,
 {'randomforestclassifier__max_depth': 6,
  'randomforestclassifier__min_samples_split': 4,
  'randomforestclassifier__n_estimators': 100})

In [12]:
pickle.dump(gs_cv, open('./models/model-randForestCLF-20211210.pkl', 'wb'))

In [None]:
# test = pickle.load(open('./models/model-catboostCLF.pkl', 'rb'))
# test.predict(X_test)[:10]

In [13]:
y_pred = gs_cv.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [14]:
predictions = pd.DataFrame({'PassengerId':test_data['PassengerId'], 'Survived':y_pred})
predictions.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [15]:
predictions.to_csv('submission.csv', index=False)

#### Testing Cross-validation

In [7]:
cross_val_score(pipe, X_train, y_train, cv=10, scoring='accuracy').mean()

0.8313329928498467

#### Sequential Feature Selection

In [23]:
sfs = SequentialFeatureSelector(pipe, forward=True, k_features=6, scoring='accuracy', cv=10, n_jobs=-1)
_ = sfs.fit(X_train, y_train)

In [24]:
sfs.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  'feature_names': ('Pclass',)},
 2: {'feature_idx': (0, 1),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  'feature_names': ('Pclass', 'Sex')},
 3: {'feature_idx': (0, 1, 2),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  'feature_names': ('Pclass', 'Sex', 'Age')},
 4: {'feature_idx': (0, 1, 2, 3),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  'feature_names': ('Pclass', 'Sex', 'Age', 'SibSp')},
 5: {'feature_idx': (0, 1, 2, 3, 4),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  'feature_names': ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch')},
 6: {'feature_idx': (0, 1, 2, 3, 4, 5),
  'cv_scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]),
  'avg_score': nan,
  '

---
---
## Gradient boosting

In [3]:
from catboost import CatBoostClassifier

In [4]:
df_train = train_data.loc[~train_data['Embarked'].isna(), ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X_train = df_train.drop(columns='Survived')
y_train = df_train['Survived']

X_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

In [5]:
pipe_age = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform') )

pipe_fare = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans') )

column_transformer = make_column_transformer(
    (pipe_age, ['Age']),
    (pipe_fare, ['Fare']),
    (Binarizer(threshold=0), ['SibSp','Parch']),
    remainder='passthrough')

model = CatBoostClassifier(random_seed=42, cat_features=[5,6])

pipe = make_pipeline(column_transformer, model)

In [6]:
pipe.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('pipeline-1',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer()),
                                                   ('kbinsdiscretizer',
                                                    KBinsDiscretizer(encode='ordinal',
                                                                     n_bins=8,
                                                                     strategy='uniform'))]),
                                   ['Age']),
                                  ('pipeline-2',
                                   Pipeline(steps=[('simpleimputer',
                                                    SimpleImputer()),
                                                   ('kbinsdiscretizer',
                                                    KBinsDiscretizer(encode='ordinal',
                     

In [7]:
pd.DataFrame(column_transformer.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6
0,2,0,1,0,3,male,S
1,3,3,1,0,1,female,C
2,2,0,0,0,3,female,S
3,3,2,1,0,1,female,S
4,3,0,0,0,3,male,S


In [8]:
param_grid = {'catboostclassifier__n_estimators':[30,50,100],
              'catboostclassifier__learning_rate':[0.1,0.3,0.5],
              'catboostclassifier__l2_leaf_reg':[0,1,3,7],
              'catboostclassifier__depth':[2,3,4,6]}

In [9]:
gs_cv = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=10, n_jobs=-1)

In [10]:
%%time
gs_cv.fit(X_train, y_train);

0:	learn: 0.5925852	total: 53.9ms	remaining: 5.34s
1:	learn: 0.5302514	total: 54.9ms	remaining: 2.69s
2:	learn: 0.4940811	total: 55.6ms	remaining: 1.8s
3:	learn: 0.4717085	total: 56.1ms	remaining: 1.35s
4:	learn: 0.4581755	total: 56.7ms	remaining: 1.08s
5:	learn: 0.4438225	total: 57.2ms	remaining: 896ms
6:	learn: 0.4374819	total: 57.8ms	remaining: 768ms
7:	learn: 0.4332104	total: 58.3ms	remaining: 671ms
8:	learn: 0.4264228	total: 58.9ms	remaining: 595ms
9:	learn: 0.4249912	total: 59.4ms	remaining: 534ms
10:	learn: 0.4234998	total: 59.9ms	remaining: 484ms
11:	learn: 0.4221396	total: 60.5ms	remaining: 443ms
12:	learn: 0.4208055	total: 61ms	remaining: 408ms
13:	learn: 0.4198669	total: 61.5ms	remaining: 378ms
14:	learn: 0.4180385	total: 62.1ms	remaining: 352ms
15:	learn: 0.4135591	total: 62.5ms	remaining: 328ms
16:	learn: 0.4124998	total: 63.1ms	remaining: 308ms
17:	learn: 0.4097350	total: 63.6ms	remaining: 290ms
18:	learn: 0.4048749	total: 64.2ms	remaining: 274ms
19:	learn: 0.4041883	tota

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('kbinsdiscretizer',
                                                                                          KBinsDiscretizer(encode='ordinal',
                                                                                                           n_bins=8,
                                                                                                           strategy='uniform'))]),
                                                   

In [11]:
gs_cv.best_score_, gs_cv.best_params_

(0.8312946884576098,
 {'catboostclassifier__depth': 4,
  'catboostclassifier__l2_leaf_reg': 3,
  'catboostclassifier__learning_rate': 0.3,
  'catboostclassifier__n_estimators': 100})

In [12]:
pickle.dump(gs_cv, open('./models/model-catboostCLF-20211210.pkl', 'wb'))

In [23]:
# test = pickle.load(open('./models/model-catboostCLF.pkl', 'rb'))
# test.predict(X_test)[:10]

In [13]:
y_pred = gs_cv.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [14]:
predictions = pd.DataFrame({'PassengerId':test_data['PassengerId'], 'Survived':y_pred})
predictions.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [15]:
predictions.to_csv('submission.csv', index=False)

---
## Logistic Regression

---
## Support Vector Machines

In [None]:
# TODO:
# Probar forward/backward selection: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html