In [80]:
import catboost
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from util import *

In [2]:
# Show which GPU is visible; fall back to a message instead of raising on CPU-only machines.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device available"

'NVIDIA GeForce GTX 1070'

In [6]:
# Read the training and test tables from the local data directory.
train, test = map(pd.read_csv, ("./data/train.csv", "./data/test.csv"))

In [7]:
# Number of missing values in each column of the training table.
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

There are two options for model selection: compare cross-validation (CV) scores, or evaluate on a separate hold-out validation set.

In [19]:
# Separate the target column from the feature columns.
X = train.drop(columns="Transported")
y = train["Transported"]

In [20]:
from sklearn.model_selection import train_test_split
# Hold out a stratified 20% validation split; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [63]:
from catboost import CatBoostClassifier



In [150]:
# Baseline CatBoost classifier with 1000 boosting iterations and otherwise default settings.
model = CatBoostClassifier(iterations=1000)

In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Positional indices, within the ColumnTransformer output built below, of the
# columns that are passed to CatBoost as categorical features.
# NOTE(review): these indices depend on the order and output width of the
# transformers listed below (the "gender" step is currently commented out) —
# confirm they still point at the intended columns whenever a step is added
# or removed.
cat_cols = [0, 1, 2, 3, 5, 6]
# Preprocessing: each entry is (name, transformer, input columns); any column
# not listed is dropped (remainder="drop").
c = ColumnTransformer([
    # Fill missing route information with the most frequent value.
    ("path", SimpleImputer(strategy="most_frequent"), ["Destination", "HomePlanet"]),
    # ("gender", GenderExtractor(), "Name"),
    # CabinExtractor / ExtractGroupMembership are not defined in this notebook
    # (presumably provided by `from util import *`); their output columns are
    # not visible here.
    ("cabin", CabinExtractor(), "Cabin"),
    ("group", ExtractGroupMembership(), "PassengerId"),
    # Missing spending amounts are replaced with 0.
    ("expences", SimpleImputer(strategy="constant", fill_value=0), ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]),
    # Missing ages are replaced with the median age of the fitted data.
    ("age", SimpleImputer(strategy="median"), ["Age"]),
    # Flags are cast with int() and then imputed with a constant.
    # NOTE(review): the cast runs before the imputation; how missing values
    # pass through FunctionalTransformer is not visible here — confirm.
    # `fill_value=-0` is simply the integer 0.
    ("special_conditions", make_pipeline(
        FunctionalTransformer(lambda x: int(x)),
        SimpleImputer(strategy="constant", fill_value=-0), 
    ), ["VIP", "CryoSleep"]),
], remainder="drop")

In [152]:
from sklearn.pipeline import Pipeline
# Chain the preprocessing transformer and the classifier into a single estimator.
pipeline = Pipeline(steps=[("prep", c), ("model", model)])
pipeline.get_params()

{'memory': None,
 'steps': [('prep', ColumnTransformer(transformers=[('path',
                                    SimpleImputer(strategy='most_frequent'),
                                    ['Destination', 'HomePlanet']),
                                   ('gender', GenderExtractor(), 'Name'),
                                   ('cabin', CabinExtractor(), 'Cabin'),
                                   ('group', ExtractGroupMembership(),
                                    'PassengerId'),
                                   ('expences',
                                    SimpleImputer(fill_value=0,
                                                  strategy='constant'),
                                    ['RoomService', 'FoodCourt', 'ShoppingMall',
                                     'Spa', 'VRDeck']),
                                   ('age', SimpleImputer(strategy='median'),
                                    ['Age']),
                                   ('special_conditions',
     

In [32]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation split (the left element is evaluated first).
X_train_prepd, X_val_prepd = c.fit_transform(X_train), c.transform(X_val)

In [108]:
# CatBoost model trained on the GPU; the validation split is passed as
# eval_set so its loss is reported alongside the training loss.
model = CatBoostClassifier(
    task_type="GPU",
    devices="0:1",
    iterations=500,
    depth=6,
)
model.fit(
    X_train_prepd,
    y_train,
    eval_set=(X_val_prepd, y_val),
    cat_features=cat_cols,
)

Learning rate set to 0.080457
0:	learn: 0.6532070	test: 0.6530449	best: 0.6530449 (0)	total: 67.5ms	remaining: 33.7s
1:	learn: 0.6209778	test: 0.6211569	best: 0.6211569 (1)	total: 120ms	remaining: 29.8s
2:	learn: 0.5992353	test: 0.5999640	best: 0.5999640 (2)	total: 172ms	remaining: 28.5s
3:	learn: 0.5726558	test: 0.5739548	best: 0.5739548 (3)	total: 225ms	remaining: 27.9s
4:	learn: 0.5551168	test: 0.5568973	best: 0.5568973 (4)	total: 278ms	remaining: 27.5s
5:	learn: 0.5369734	test: 0.5400460	best: 0.5400460 (5)	total: 330ms	remaining: 27.2s
6:	learn: 0.5215763	test: 0.5251278	best: 0.5251278 (6)	total: 383ms	remaining: 27s
7:	learn: 0.5111958	test: 0.5150063	best: 0.5150063 (7)	total: 446ms	remaining: 27.4s
8:	learn: 0.5032709	test: 0.5077069	best: 0.5077069 (8)	total: 500ms	remaining: 27.3s
9:	learn: 0.4940363	test: 0.4986767	best: 0.4986767 (9)	total: 554ms	remaining: 27.1s
10:	learn: 0.4852197	test: 0.4905355	best: 0.4905355 (10)	total: 617ms	remaining: 27.4s
11:	learn: 0.4775439	te

<catboost.core.CatBoostClassifier at 0x1b4435947f0>

In [76]:
# Element-wise converter from the string labels "True"/"False" to real booleans.
# `otypes` pins the output dtype so the converter also works on empty input
# (np.vectorize cannot infer a dtype from zero elements and raises otherwise).
f = np.vectorize(lambda x: x == "True", otypes=[bool])
# Preview the validation predictions after converting them with `f`.
f(model.predict(X_val_prepd))

array([ True,  True, False, ...,  True, False, False])

In [77]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the single model on the validation split.
print(classification_report(y_val, f(model.predict(X_val_prepd))))

              precision    recall  f1-score   support

       False       0.82      0.80      0.81       863
        True       0.81      0.82      0.81       876

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739



In [94]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class EnsembleClassifier(BaseEstimator, ClassifierMixin):
    """Majority-vote ensemble of several instances of one model class.

    Parameters
    ----------
    model : callable
        Estimator class (or factory); invoked as ``model(**params)`` once per
        entry of ``grid``.
    grid : sequence of dict
        One keyword-argument dict per ensemble member. Must be re-iterable,
        because the members are rebuilt on every ``fit``.
    pred_preprocess_hook : callable, optional
        Applied to each member's ``predict`` output before the votes are
        averaged; must return something that can be added to a float array
        (for example 0/1 or boolean votes). Defaults to the identity.

    Attributes
    ----------
    models : list
        The ensemble members (unfitted after construction, fitted after ``fit``).
    """

    def __init__(self, model, grid, pred_preprocess_hook=lambda x : x) -> None:
        super().__init__()
        self.model = model
        self.grid = grid
        self.pred_preprocess_hook = pred_preprocess_hook
        # Built eagerly so that `models` exists right after construction.
        self.models = self._build_models()

    def _build_models(self):
        """Instantiate one member per parameter set in the current ``grid``."""
        return [self.model(**params) for params in self.grid]

    def fit(self, X, y, *args, **kwargs):
        """Fit every member on the same data and return ``self``.

        Extra positional and keyword arguments are forwarded unchanged to each
        member's ``fit``.

        Raises
        ------
        ValueError
            If ``grid`` yields no parameter sets.
        """
        # Rebuild from the current parameters so that changes made through
        # set_params() are honoured instead of fitting stale members.
        self.models = self._build_models()
        if not self.models:
            raise ValueError("EnsembleClassifier needs at least one parameter set in `grid`.")
        for member in self.models:
            member.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X):
        """Return the majority vote (1 or 0) of all members for each row of ``X``.

        Raises
        ------
        ValueError
            If the ensemble has no members (would otherwise divide by zero).
        """
        if not self.models:
            raise ValueError("EnsembleClassifier has no models to predict with.")
        votes = np.zeros(len(X))
        for member in self.models:
            votes += self.pred_preprocess_hook(member.predict(X))
        votes /= len(self.models)
        # Majority vote: strictly more than half of the members must vote 1,
        # so an exact tie resolves to 0.
        return np.where(votes > 0.5, 1, 0)

# Ten parameter sets that differ only in their random seed.
grid = [{"iterations": 100, "random_seed": seed} for seed in range(10)]

# Ensemble of CatBoost models; `f` converts each member's predictions before voting.
model2 = EnsembleClassifier(CatBoostClassifier, grid, pred_preprocess_hook=f)

In [95]:
# Train the ensemble on the preprocessed training split, passing the
# categorical column indices and the validation split as eval_set.
model2.fit(
    X_train_prepd,
    y_train,
    eval_set=(X_val_prepd, y_val),
    cat_features=cat_cols,
)

Learning rate set to 0.051161
0:	learn: 0.6676736	test: 0.6677046	best: 0.6677046 (0)	total: 27.4ms	remaining: 27.4s
1:	learn: 0.6469028	test: 0.6479576	best: 0.6479576 (1)	total: 56.4ms	remaining: 28.1s
2:	learn: 0.6271241	test: 0.6281823	best: 0.6281823 (2)	total: 85.1ms	remaining: 28.3s
3:	learn: 0.6121571	test: 0.6139168	best: 0.6139168 (3)	total: 114ms	remaining: 28.5s
4:	learn: 0.5953151	test: 0.5971939	best: 0.5971939 (4)	total: 142ms	remaining: 28.3s
5:	learn: 0.5815776	test: 0.5835880	best: 0.5835880 (5)	total: 171ms	remaining: 28.3s
6:	learn: 0.5669531	test: 0.5692917	best: 0.5692917 (6)	total: 200ms	remaining: 28.3s
7:	learn: 0.5564051	test: 0.5587331	best: 0.5587331 (7)	total: 228ms	remaining: 28.3s
8:	learn: 0.5511142	test: 0.5536552	best: 0.5536552 (8)	total: 251ms	remaining: 27.6s
9:	learn: 0.5404796	test: 0.5430342	best: 0.5430342 (9)	total: 283ms	remaining: 28s
10:	learn: 0.5315861	test: 0.5340885	best: 0.5340885 (10)	total: 316ms	remaining: 28.4s
11:	learn: 0.5244023	

EnsembleClassifier(grid=[{'iterations': 1000, 'random_seed': 0},
                         {'iterations': 1000, 'random_seed': 1},
                         {'iterations': 1000, 'random_seed': 2},
                         {'iterations': 1000, 'random_seed': 3},
                         {'iterations': 1000, 'random_seed': 4},
                         {'iterations': 1000, 'random_seed': 5},
                         {'iterations': 1000, 'random_seed': 6},
                         {'iterations': 1000, 'random_seed': 7},
                         {'iterations': 1000, 'random_seed': 8},
                         {'iterations': 1000, 'random_seed': 9}],
                   model=<class 'catboost.core.CatBoostClassifier'>,
                   pred_preprocess_hook=<numpy.vectorize object at 0x000001B4430EE4F0>)

In [99]:
# Per-class precision / recall / F1 of the ensemble on the validation split.
print(classification_report(y_val, model2.predict(X_val_prepd)))

              precision    recall  f1-score   support

       False       0.80      0.81      0.81       863
        True       0.81      0.80      0.81       876

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739



## CV with randomized search (`RandomizedSearchCV`)

In [176]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, uniform

# Candidate CatBoost settings; every key gets the "model__" prefix so that the
# values are routed to the "model" step of the pipeline.
catboost_grid = {
    f"model__{param_name}": candidates
    for param_name, candidates in {
        "iterations": [100, 500, 1000],
        "learning_rate": [2e-2],
        "depth": [2, 3, 5],
        "subsample": [0.2, 0.3, 0.5, 1],
        "verbose": [False],
        "cat_features": [cat_cols],
    }.items()
}

# Randomized search: 15 sampled candidates, 5-fold CV, scored by accuracy.
grid = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=catboost_grid,
    n_iter=15,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=3,
    verbose=4,
)
grid

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, uniform

# Reuse the `cat_cols` list defined next to the ColumnTransformer instead of
# rebinding it here. The list previously hard-coded in this cell,
# [0, 1, 2, 3, 4, 6, 7], is shifted by one position relative to that
# definition, which matches a transformer that still contains the
# (now commented-out) "gender" step. Rebinding it would silently change the
# categorical columns for every later cell that reads `cat_cols`.
# If the transformer layout changes, update `cat_cols` where it is defined.
catboost_grid = {
    "iterations": [100, 500, 1000],
    "learning_rate": [2e-2],
    "depth": [2, 3, 5],
    "subsample": [0.2, 0.3, 0.5, 1],
    "verbose": [False],
    "cat_features": [cat_cols]
}
# Prefix every key so the values are routed to the "model" step of the pipeline.
catboost_grid = { "model__" + k: v for k, v in catboost_grid.items() }

# Randomized search: 15 sampled candidates, 5-fold CV, scored by accuracy.
grid = RandomizedSearchCV(
    pipeline, 
    catboost_grid, 
    scoring="accuracy", 
    cv=5, 
    random_state=42,
    n_iter=15,
    n_jobs=3,
    verbose=4
)
grid

In [154]:
# Cast the target with int() via FunctionalTransformer (which is applied to a
# one-column frame) and take that single column back out as a Series.
y_train_prepd = (
    FunctionalTransformer(lambda x: int(x))
    .fit_transform(y_train.to_frame())
    .iloc[:, 0]
)
y_train_prepd

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

In [155]:
# Show only the columns flagged as categorical. Note that this call re-fits `c` on X_train.
c.fit_transform(X_train)[:, cat_cols]

array([['TRAPPIST-1e', 'Europa', 'none', ..., 'P', 1, '01'],
       ['TRAPPIST-1e', 'Earth', 'none', ..., 'S', 1, '01'],
       ['TRAPPIST-1e', 'Europa', 'none', ..., 'S', 2, '01'],
       ...,
       ['TRAPPIST-1e', 'Earth', 'none', ..., 'S', 1, '01'],
       ['55 Cancri e', 'Europa', 'none', ..., 'S', 2, '01'],
       ['TRAPPIST-1e', 'Europa', 'none', ..., 'S', 2, '02']], dtype=object)

In [177]:
# Run the search on the raw training features (the pipeline's "prep" step does
# the preprocessing) with the integer-cast target.
grid.fit(X_train, y_train_prepd)

Fitting 5 folds for each of 15 candidates, totalling 75 fits



## Submission

In [228]:
# Apply the already-fitted preprocessing to the test table (transform only, no re-fit).
test_prepd = c.transform(test)

In [234]:
# Test-set predictions of `model` as a Series.
# NOTE(review): the name `eval` shadows the Python builtin; it is kept because
# a later cell reads this variable.
eval = pd.Series(model.predict(test_prepd))

In [171]:
import inspect

def retrieve_name(var):
    """Return the first name in the caller's namespace that is bound to ``var``.

    The lookup compares by identity (``is``) over the caller's local variables
    in their iteration order. If several names refer to the same object (for
    example a loop variable aliasing it), whichever comes first is returned.

    Raises
    ------
    IndexError
        If no variable in the caller's namespace refers to ``var``.
    """
    caller_frame = inspect.currentframe().f_back
    try:
        for var_name, var_val in caller_frame.f_locals.items():
            if var_val is var:
                return var_name
    finally:
        # Drop the frame reference promptly to avoid keeping the caller's
        # frame alive through a reference cycle.
        del caller_frame
    raise IndexError("no variable in the caller's namespace refers to the given object")


# Explicit file-name -> predictions mapping. Looking the name up by object
# identity is unreliable here: the loop variable is itself a name bound to the
# same object, so the lookup can return the loop variable's name instead of
# the intended one. A separate loop variable also avoids overwriting the
# `eval` Series that the final submission cell reads.
# The four prediction objects must already exist in the kernel; they are not
# created anywhere in the visible part of this notebook.
named_predictions = {
    "grid_cv_3_no_tune": grid_cv_3_no_tune,
    "grid_cv_5_no_tune": grid_cv_5_no_tune,
    "grid_cv_5_new_vars_no_tune": grid_cv_5_new_vars_no_tune,
    "grid_cv_5_slight_tune": grid_cv_5_slight_tune,
}
for submission_name, predictions in named_predictions.items():
    # bool("False") is True, so string labels are compared explicitly;
    # every other label type keeps the plain bool() conversion.
    transported = pd.Series(predictions).map(
        lambda x: x == "True" if isinstance(x, str) else bool(x)
    )
    pd.DataFrame(
        {"PassengerId": test["PassengerId"], "Transported": transported}
    ).set_index("PassengerId").to_csv(submission_name + ".csv")

In [237]:
# Write the submission file. Predictions can arrive as the strings
# "True"/"False" (the validation cells compare them against "True"), and
# bool("False") is True, so string labels are compared explicitly; any other
# label type keeps the plain bool() conversion.
pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": eval.map(lambda x: x == "True" if isinstance(x, str) else bool(x)),
}).set_index("PassengerId").to_csv("eval.csv")