In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pickle

# Carga de datos

In [2]:
train_data = pd.read_csv("data/processed_train.csv")
print(train_data.shape)
train_data.sample(3)

(8693, 15)


Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupSize,CabinDeck,CabinSide,TotalExpensesBinarized,AgeBinarized
4797,Earth,False,TRAPPIST-1e,False,0.0,464.0,321.0,0.0,90.0,True,1,F,S,MEDIUM,16-30
972,Earth,False,TRAPPIST-1e,False,756.0,84.0,0.0,8.0,99.0,False,1,F,S,MEDIUM,0-15
1221,Earth,False,55 Cancri e,False,1.0,299.0,1.0,380.0,13.0,False,1,F,S,LOW,16-30


In [3]:
train_data = pd.get_dummies(train_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'])
train_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_False','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_False','VIP_Ns/Nc','CabinSide_Ns/Nc','AgeBinarized_76-?'], inplace=True)

In [4]:
X = train_data.drop(columns=['Transported'])
y = train_data.Transported

In [5]:
X.sample(3)

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupSize,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,...,TotalExpensesBinarized_HIGH,TotalExpensesBinarized_LOW,TotalExpensesBinarized_MEDIUM,TotalExpensesBinarized_TOP,TotalExpensesBinarized_ZERO,AgeBinarized_0-15,AgeBinarized_16-30,AgeBinarized_31-45,AgeBinarized_46-60,AgeBinarized_61-75
1057,0.0,3.0,3505.0,199.0,17.0,2,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
4023,0.0,0.0,0.0,0.0,0.0,1,1,0,0,1,...,0,0,0,0,1,0,0,1,0,0
4031,0.0,0.0,0.0,0.0,0.0,3,1,0,0,1,...,0,0,0,0,1,1,0,0,0,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(X_test)

(6954, 1739)

In [7]:
submission_data = pd.read_csv("data/processed_test.csv")
submission_data.shape

(4277, 15)

In [8]:
submission_data = pd.get_dummies(submission_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'])
submission_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_False','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_False','VIP_Ns/Nc','CabinSide_Ns/Nc','AgeBinarized_76-?'], inplace=True)

In [9]:
X_submission = submission_data.drop(columns=['Transported'])

# Modelado

## Random forest

In [10]:
def random_forest_cross_val(X, y, params, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    acc = []
    
    for tr_ind, val_ind in skf.split(X, y):
        X_tr = X.iloc[tr_ind]
        y_tr = y.iloc[tr_ind]
        
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]
        
        model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                       max_depth=params['max_depth'],
                                       max_samples=params['max_samples'],
                                       n_jobs=-1, random_state=42)
        
        model.fit(X_tr, y_tr)        
        
        y_pred = model.predict(X_val)
        
        accuracy = accuracy_score(y_val, y_pred)
        acc.append(accuracy)
        
    return sum(acc) / n_splits

In [11]:
def grid_search_CV(X, y, params_GridSearch):
    df_acc_by_params = pd.DataFrame(columns=['n_estimators','max_depth','max_samples','accuracy'])
        
    for prms in tqdm( list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:' ):
                          
        acc = random_forest_cross_val(X, y, prms, n_splits=5)
        
        df_acc_by_params = pd.concat([df_acc_by_params,
                                      pd.DataFrame(data={'n_estimators':[prms['n_estimators']], 'max_depth':[prms['max_depth']], 'max_samples':[prms['max_samples']], 'accuracy':[acc]})], 
                                     ignore_index=True)
        
    return df_acc_by_params

In [12]:
params_GridSearch = {'n_estimators':[50,100,200],
                     'max_depth':[3,5,8,10,13],
                     'max_samples':[0.4,0.7,0.85,None]}

In [13]:
df_acc_by_params = grid_search_CV(X, y, params_GridSearch)

Params Tuning:: 100%|##########| 60/60 [02:20<00:00,  2.34s/it]


In [16]:
df_acc_by_params['n_estimators'] = df_acc_by_params.n_estimators.astype(int)
df_acc_by_params['max_depth'] = df_acc_by_params.max_depth.astype(int)

In [17]:
df_acc_by_params.corr()['accuracy']

n_estimators    0.010945
max_depth       0.919290
max_samples     0.005750
accuracy        1.000000
Name: accuracy, dtype: float64

In [18]:
best_params = df_acc_by_params[ df_acc_by_params.accuracy==df_acc_by_params.accuracy.max() ]
best_params

Unnamed: 0,n_estimators,max_depth,max_samples,accuracy
44,200,10,0.85,0.802599


In [19]:
model = RandomForestClassifier(n_estimators=int(best_params.n_estimators),
                               max_depth=int(best_params.max_depth),
                               max_samples=float(best_params.max_samples),
                               n_jobs=-1, random_state=42)
model = model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
accuracy_score(y_test, y_pred)

0.7901092581943646

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.79      0.78      0.79       861
        True       0.79      0.80      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [24]:
pickle.dump(model, open('../models/model_v2-randForest_n200_maxDepth10_maxSamples085.pkl', 'wb'))

## Catboost

Puede trabajar con variables categóricas.

In [8]:
def gradient_boosting_cross_val(X, y, params, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    acc = []
    
    for tr_ind, tst_ind in skf.split(X, y):
        X_tr = X.iloc[tr_ind]
        y_tr = y.iloc[tr_ind]
        
        X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.15, random_state=42)
        
        X_tst = X.iloc[tst_ind]
        y_tst = y.iloc[tst_ind]
        
        model = CatBoostClassifier(iterations=params['iterations'],
                                   learning_rate=params['learning_rate'],
                                   depth=params['depth'],
                                   rsm=params['rsm'],  # Subsample ratio of columns when constructing each tree
                                   use_best_model=params['use_best_model'],
                                   cat_features=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'],
                                   eval_metric='Accuracy', verbose=False, thread_count=-1, random_seed=42)
        
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val))        
        
        y_pred = model.predict(X_tst)
        
        accuracy = accuracy_score(y_tst, y_pred)
        acc.append(accuracy)
        
    return sum(acc) / n_splits

In [9]:
def grid_search_CV(X, y, params_GridSearch):
    df_acc_by_params = pd.DataFrame(columns=['iterations','learning_rate','depth','rsm','use_best_model','accuracy'])
        
    for prms in tqdm( list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:' ):
                          
        acc = gradient_boosting_cross_val(X, y, prms, n_splits=5)
        
        df_acc_by_params = pd.concat([df_acc_by_params,
                                      pd.DataFrame(data={'iterations':[prms['iterations']], 'learning_rate':[prms['learning_rate']],
                                                         'depth':[prms['depth']], 'rsm':[prms['rsm']],
                                                         'use_best_model':[prms['use_best_model']], 'accuracy':[acc]})], 
                                     ignore_index=True)
        
    return df_acc_by_params

In [10]:
params_GridSearch = {'iterations':[500],
                     'learning_rate':[0.05,0.1],
                     'depth':[6,8],
                     'rsm':[0.4,0.7,0.85,None],
                     'use_best_model':[False, True]}

In [11]:
df_acc_by_params = grid_search_CV(X, y.astype(str), params_GridSearch)

Params Tuning:: 100%|##########| 32/32 [11:44<00:00, 22.03s/it]


In [12]:
df_acc_by_params.corr()['accuracy']

learning_rate   -0.352525
rsm              0.051553
accuracy         1.000000
Name: accuracy, dtype: float64

In [13]:
df_acc_by_params.sort_values('accuracy', ascending=False).head(10)

Unnamed: 0,iterations,learning_rate,depth,rsm,use_best_model,accuracy
6,500,0.05,6,,False,0.806282
18,500,0.05,8,0.7,False,0.806166
4,500,0.05,6,0.85,False,0.805706
2,500,0.05,6,0.7,False,0.805016
10,500,0.1,6,0.7,False,0.804671
27,500,0.1,8,0.7,True,0.804441
0,500,0.05,6,0.4,False,0.80398
8,500,0.1,6,0.4,False,0.80398
9,500,0.1,6,0.4,True,0.803865
14,500,0.1,6,,False,0.80375


In [14]:
best_params = df_acc_by_params[ df_acc_by_params.accuracy==df_acc_by_params.accuracy.max() ]
best_params

Unnamed: 0,iterations,learning_rate,depth,rsm,use_best_model,accuracy
6,500,0.05,6,,False,0.806282


In [23]:
model = CatBoostClassifier(iterations=int(best_params.iterations),
                           learning_rate=float(best_params.learning_rate),
                           depth=int(best_params.depth),
                           rsm=None,  # Subsample ratio of columns when constructing each tree
                           use_best_model=bool(str(best_params.use_best_model)),
                           cat_features=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'],
                           eval_metric='Accuracy', verbose=False, thread_count=-1, random_seed=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

model.fit(X_train, y_train, eval_set=(X_val, y_val))

<catboost.core.CatBoostClassifier at 0x7fa0fccad4f0>

In [24]:
y_pred = model.predict(X_test)

In [25]:
accuracy_score(y_test.astype(str), y_pred)

0.7889591719378953

In [26]:
print(classification_report(y_test.astype(str), y_pred))

              precision    recall  f1-score   support

       False       0.81      0.74      0.78       861
        True       0.77      0.83      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [28]:
pickle.dump(model, open('../models/model_v2-catboost_n500_lr005_depth6_rsmNone_ubmFalse.pkl', 'wb'))

---
---
---

In [10]:
model = pickle.load(open("../models/model_v2-randForest_n200_maxDepth10_maxSamples085.pkl", 'rb'))

In [12]:
pd.read_csv("../spaceship-titanic/test.csv", usecols=['PassengerId']).merge( pd.DataFrame( {'Transported':model.fit(X,y).predict(X_submission)} ),
                                                                          left_index=True, right_index=True, how='left').\
    to_csv("../spaceship-titanic/submission.csv", index=False)