In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pickle

# Carga de datos

In [2]:
train_data = pd.read_csv("data/processed_train.csv")
print(train_data.shape)
train_data.sample(3)

(8693, 10)


Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,Transported,GroupSize,CabinDeck,CabinSide,TotalExpensesBinarized,AgeBinarized
7739,Earth,False,55 Cancri e,False,True,3,G,S,ZERO,0-15
1393,Earth,False,Ns/Nc,False,True,2,F,P,MEDIUM,16-30
6350,Mars,True,TRAPPIST-1e,False,True,1,F,P,ZERO,31-45


In [3]:
train_data = pd.get_dummies(train_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'])
train_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_False','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_False','VIP_Ns/Nc','CabinSide_Ns/Nc','AgeBinarized_76-?'], inplace=True)

In [3]:
X = train_data.drop(columns=['Transported'])
y = train_data.Transported

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(X_test)

(6954, 1739)

In [5]:
submission_data = pd.read_csv("data/processed_test.csv")
submission_data.shape

(4277, 10)

In [7]:
submission_data = pd.get_dummies(submission_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'])
submission_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_False','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_False','VIP_Ns/Nc','CabinSide_Ns/Nc','AgeBinarized_76-?'], inplace=True)

In [6]:
X_submission = submission_data.drop(columns=['Transported'])

# Modelado

## Random forest

### Primer modelo

In [9]:
model = RandomForestClassifier(n_estimators=100,
                               max_depth=3,
                               n_jobs=-1,
                               random_state=42,
                               max_samples=0.7)

In [10]:
model = model.fit(X_train, y_train)

In [11]:
y_pred = pd.DataFrame( {'Transported':model.predict(X_test)} )

In [12]:
accuracy_score(y_test, y_pred)

0.7400805060379528

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.71      0.81      0.75       861
        True       0.78      0.67      0.72       878

    accuracy                           0.74      1739
   macro avg       0.74      0.74      0.74      1739
weighted avg       0.75      0.74      0.74      1739



In [30]:
pickle.dump(model, open('./models/model-randForest_n100_maxDepth3_maxsamples07.pkl', 'wb'))

### Cross validation

In [9]:
def random_forest_cross_val(X, y, params, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    acc = []
    
    for tr_ind, val_ind in skf.split(X, y):
        X_tr = X.iloc[tr_ind]
        y_tr = y.iloc[tr_ind]
        
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]
        
        model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                       max_depth=params['max_depth'],
                                       max_samples=params['max_samples'],
                                       n_jobs=-1, random_state=42)
        
        model.fit(X_tr, y_tr)        
        
        y_pred = model.predict(X_val)
        
        accuracy = accuracy_score(y_val, y_pred)
        acc.append(accuracy)
        
    return sum(acc) / n_splits

In [10]:
def grid_search_CV(X, y, params_GridSearch):
    df_acc_by_params = pd.DataFrame(columns=['n_estimators','max_depth','max_samples','accuracy'])
        
    for prms in tqdm( list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:' ):
                          
        acc = random_forest_cross_val(X, y, prms, n_splits=5)
        
        df_acc_by_params = pd.concat([df_acc_by_params,
                                      pd.DataFrame(data={'n_estimators':[prms['n_estimators']], 'max_depth':[prms['max_depth']], 'max_samples':[prms['max_samples']], 'accuracy':[acc]})], 
                                     ignore_index=True)
        
    return df_acc_by_params

In [11]:
params_GridSearch = {'n_estimators':[50,100,200],
                     'max_depth':[3,5,8,10,13,17],
                     'max_samples':[0.4,0.7,0.85,None]}

In [12]:
df_acc_by_params = grid_search_CV(X, y, params_GridSearch)
df_acc_by_params['n_estimators'] = df_acc_by_params.n_estimators.astype(int)
df_acc_by_params['max_depth'] = df_acc_by_params.max_depth.astype(int)

Params Tuning:: 100%|##########| 72/72 [01:36<00:00,  1.34s/it]


In [13]:
df_acc_by_params.corr()['accuracy']

n_estimators    0.046972
max_depth      -0.482512
max_samples    -0.147332
accuracy        1.000000
Name: accuracy, dtype: float64

In [14]:
best_params = df_acc_by_params[ df_acc_by_params.accuracy==df_acc_by_params.accuracy.max() ]
best_params

Unnamed: 0,n_estimators,max_depth,max_samples,accuracy
38,200,10,0.4,0.748992


In [20]:
model = RandomForestClassifier(n_estimators=int(best_params.n_estimators),
                               max_depth=int(best_params.max_depth),
                               max_samples=float(best_params.max_samples),
                               n_jobs=-1, random_state=42)
model = model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)

In [22]:
accuracy_score(y_test, y_pred)

0.7412305922944221

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.71      0.81      0.76       861
        True       0.78      0.67      0.72       878

    accuracy                           0.74      1739
   macro avg       0.75      0.74      0.74      1739
weighted avg       0.75      0.74      0.74      1739



In [24]:
pickle.dump(model, open('./models/model-randForest_n200_maxDepth10_maxSamples04.pkl', 'wb'))

## Catboost

Puede trabajar con variables categóricas.

In [7]:
def gradient_boosting_cross_val(X, y, params, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    acc = []
    
    for tr_ind, tst_ind in skf.split(X, y):
        X_tr = X.iloc[tr_ind]
        y_tr = y.iloc[tr_ind]
        
        X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.15, random_state=42)
        
        X_tst = X.iloc[tst_ind]
        y_tst = y.iloc[tst_ind]
        
        model = CatBoostClassifier(iterations=params['iterations'],
                                   learning_rate=params['learning_rate'],
                                   depth=params['depth'],
                                   rsm=params['rsm'],  # Subsample ratio of columns when constructing each tree
                                   use_best_model=params['use_best_model'],
                                   cat_features=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'],
                                   eval_metric='Accuracy', verbose=False, thread_count=-1, random_seed=42)
        
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val))        
        
        y_pred = model.predict(X_tst)
        
        accuracy = accuracy_score(y_tst, y_pred)
        acc.append(accuracy)
        
    return sum(acc) / n_splits

In [8]:
def grid_search_CV(X, y, params_GridSearch):
    df_acc_by_params = pd.DataFrame(columns=['iterations','learning_rate','depth','rsm','use_best_model','accuracy'])
        
    for prms in tqdm( list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:' ):
                          
        acc = gradient_boosting_cross_val(X, y, prms, n_splits=5)
        
        df_acc_by_params = pd.concat([df_acc_by_params,
                                      pd.DataFrame(data={'iterations':[prms['iterations']], 'learning_rate':[prms['learning_rate']],
                                                         'depth':[prms['depth']], 'rsm':[prms['rsm']],
                                                         'use_best_model':[prms['use_best_model']], 'accuracy':[acc]})], 
                                     ignore_index=True)
        
    return df_acc_by_params

In [9]:
params_GridSearch = {'iterations':[50,100,500],
                     'learning_rate':[0.01,0.05,0.1,0.5],
                     'depth':[3,6,8,10,13],
                     'rsm':[0.4,0.7,0.85,None],
                     'use_best_model':[False, True]}

In [10]:
df_acc_by_params = grid_search_CV(X, y.astype(str), params_GridSearch)

Params Tuning:: 100%|##########| 480/480 [2:45:43<00:00, 20.72s/it]   


In [11]:
df_acc_by_params.corr()['accuracy']

learning_rate   -0.147367
rsm             -0.020262
accuracy         1.000000
Name: accuracy, dtype: float64

In [15]:
df_acc_by_params.sort_values('accuracy', ascending=False).head(10)

Unnamed: 0,iterations,learning_rate,depth,rsm,use_best_model,accuracy
275,500,0.1,8,0.7,True,0.750834
170,500,0.05,6,0.7,False,0.750604
80,500,0.1,3,0.4,False,0.750603
82,500,0.1,3,0.7,False,0.750143
179,500,0.1,6,0.7,True,0.749914
180,500,0.1,6,0.85,False,0.749683
432,100,0.1,13,0.4,False,0.749338
124,50,0.5,6,0.85,False,0.749338
174,500,0.05,6,,False,0.749338
454,500,0.01,13,,False,0.749223


In [12]:
best_params = df_acc_by_params[ df_acc_by_params.accuracy==df_acc_by_params.accuracy.max() ]
best_params

Unnamed: 0,iterations,learning_rate,depth,rsm,use_best_model,accuracy
275,500,0.1,8,0.7,True,0.750834


In [22]:
model = CatBoostClassifier(iterations=int(best_params.iterations),
                           learning_rate=float(best_params.learning_rate),
                           depth=int(best_params.depth),
                           rsm=float(best_params.rsm),  # Subsample ratio of columns when constructing each tree
                           use_best_model=bool(str(best_params.use_best_model)),
                           cat_features=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','TotalExpensesBinarized','AgeBinarized'],
                           eval_metric='Accuracy', verbose=False, thread_count=-1, random_seed=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

model.fit(X_train, y_train, eval_set=(X_val, y_val))

<catboost.core.CatBoostClassifier at 0x7fb7777f7580>

In [23]:
y_pred = model.predict(X_test)

In [24]:
accuracy_score(y_test.astype(str), y_pred)

0.7418056354226567

In [25]:
print(classification_report(y_test.astype(str), y_pred))

              precision    recall  f1-score   support

       False       0.71      0.81      0.76       861
        True       0.78      0.67      0.73       878

    accuracy                           0.74      1739
   macro avg       0.75      0.74      0.74      1739
weighted avg       0.75      0.74      0.74      1739



In [26]:
pickle.dump(model, open('../models/model-catboost_n500_lr01_depth8_rsm07_ubmTrue.pkl', 'wb'))

---
---
---

In [28]:
pd.read_csv("../spaceship-titanic/test.csv", usecols=['PassengerId']).merge( pd.DataFrame( {'Transported':model.predict(X_submission)} ),
                                                                          left_index=True, right_index=True, how='left').\
    to_csv("../spaceship-titanic/submission.csv", index=False)