In [20]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, Binarizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
train_data = pd.read_csv("./titanic/train.csv")
test_data = pd.read_csv("./titanic/test.csv")
len(train_data), len(test_data)

(891, 418)

---
### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
df_train = train_data.loc[~train_data['Embarked'].isna(), ['Survived','Pclass','Sex','Age','Fare','Cabin','Embarked']]
X_train = df_train.drop(columns='Survived')
y_train = df_train['Survived']

X_test = test_data[['Pclass','Sex','Age','Fare','Cabin','Embarked']]

In [4]:
def extract_cabin_letter(X):
    return X.applymap(lambda x : str(x)[0] if not pd.isna(x) else 'nan')

cabin_letter = FunctionTransformer(extract_cabin_letter)

In [5]:
columns_transformer_1 = make_column_transformer(
    (cabin_letter, ['Cabin']),
    (SimpleImputer(missing_values=np.nan, strategy='mean'), ['Age','Fare']),
    (OneHotEncoder(drop='if_binary'), ['Sex','Embarked']),
    remainder='passthrough')

In [6]:
columns_transformer_2 = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), [0]),  # 0 is 'Cabin'
    remainder='passthrough')

In [7]:
model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=4,
                               n_jobs=-1, random_state=42, verbose=0)

In [8]:
pipe_full = make_pipeline(columns_transformer_1, columns_transformer_2, model)

In [None]:
cross_val_score(pipe_full, X_train, y_train, cv=5, scoring='accuracy').mean()

In [10]:
pipe_full.fit(X_train, y_train)
pipe_full.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

---
## Gradient boosting

In [3]:
from catboost import CatBoostClassifier

In [4]:
df_train = train_data.loc[~train_data['Embarked'].isna(), ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X_train = df_train.drop(columns='Survived')
y_train = df_train['Survived']

X_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

In [5]:
column_transformer = make_column_transformer(
    (SimpleImputer(missing_values=np.nan, strategy='mean'), ['Age','Fare']),
    (Binarizer(threshold=0), ['SibSp','Parch']),
    remainder='passthrough')

model = CatBoostClassifier(random_seed=42, cat_features=[5,6])

pipe = make_pipeline(column_transformer, model)

In [6]:
pipe.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('simpleimputer', SimpleImputer(),
                                   ['Age', 'Fare']),
                                  ('binarizer', Binarizer(threshold=0),
                                   ['SibSp', 'Parch'])])),
 ('catboostclassifier', <catboost.core.CatBoostClassifier at 0x7fe6cbc3bbb0>)]

In [7]:
pd.DataFrame(column_transformer.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6
0,22,7.25,1,0,3,male,S
1,38,71.2833,1,0,1,female,C
2,26,7.925,0,0,3,female,S
3,35,53.1,1,0,1,female,S
4,35,8.05,0,0,3,male,S


In [8]:
param_grid = {'catboostclassifier__n_estimators':[30,50,100],
              'catboostclassifier__learning_rate':[0.1,0.3,0.5],
              'catboostclassifier__l2_leaf_reg':[0,1,3,7],
              'catboostclassifier__depth':[2,3,4,6]}

In [9]:
gs_cv = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=10, n_jobs=-1)

In [10]:
%%time
gs_cv.fit(X_train, y_train);

0:	learn: 0.5340434	total: 52.8ms	remaining: 5.23s
1:	learn: 0.4667734	total: 54.3ms	remaining: 2.66s
2:	learn: 0.4309812	total: 55.3ms	remaining: 1.79s
3:	learn: 0.4196095	total: 56.6ms	remaining: 1.36s
4:	learn: 0.4160502	total: 57.6ms	remaining: 1.09s
5:	learn: 0.4093924	total: 58.6ms	remaining: 919ms
6:	learn: 0.4046885	total: 59.8ms	remaining: 794ms
7:	learn: 0.4033602	total: 60.6ms	remaining: 696ms
8:	learn: 0.3993996	total: 61.7ms	remaining: 624ms
9:	learn: 0.3938602	total: 63.8ms	remaining: 574ms
10:	learn: 0.3920298	total: 64.4ms	remaining: 521ms
11:	learn: 0.3915657	total: 64.9ms	remaining: 476ms
12:	learn: 0.3898432	total: 65.5ms	remaining: 438ms
13:	learn: 0.3860493	total: 66.1ms	remaining: 406ms
14:	learn: 0.3836698	total: 66.8ms	remaining: 378ms
15:	learn: 0.3822327	total: 67.3ms	remaining: 354ms
16:	learn: 0.3803194	total: 68ms	remaining: 332ms
17:	learn: 0.3747768	total: 68.7ms	remaining: 313ms
18:	learn: 0.3713557	total: 69.4ms	remaining: 296ms
19:	learn: 0.3700617	tot

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('simpleimputer',
                                                                         SimpleImputer(),
                                                                         ['Age',
                                                                          'Fare']),
                                                                        ('binarizer',
                                                                         Binarizer(threshold=0),
                                                                         ['SibSp',
                                                                          'Parch'])])),
                                       ('catboostclassifier',
                                        <catboost.core.CatBoostClassifier o

In [14]:
gs_cv.best_score_, gs_cv.best_params_

(0.8380107252298263,
 {'catboostclassifier__depth': 4,
  'catboostclassifier__l2_leaf_reg': 1,
  'catboostclassifier__learning_rate': 0.5,
  'catboostclassifier__n_estimators': 100})

In [21]:
pickle.dump(gs_cv, open('./models/model-catboostCLF.pkl', 'wb'))

In [23]:
# test = pickle.load(open('./models/model-catboostCLF.pkl', 'rb'))
# test.predict(X_test)[:10]

In [17]:
y_pred = gs_cv.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [18]:
predictions = pd.DataFrame({'PassengerId':test_data['PassengerId'], 'Survived':y_pred})
predictions.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [19]:
predictions.to_csv('submission.csv', index=False)

In [None]:
# TODO:
# Comparar el rendimiento de varios modelos: https://www.youtube.com/watch?v=Vc-qn5VcJmw&list=PLbUIzNzCKD8CAak1UTDtJUfNnCmMrV-BC&index=7
# Probar forward/backward selection: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# Probar otros modelos: Logistic regression, SVM