In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pickle

# Carga de datos

In [2]:
# Read the preprocessed training table, report its dimensions and peek at
# three randomly drawn rows.
train_data = pd.read_csv(filepath_or_buffer="spaceship-titanic/processed_train.csv")
print(train_data.shape)
train_data.sample(n=3)

(8693, 10)


Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,Transported,GroupSize,CabinDeck,CabinSide,TotalExpensesBinarized,AgeBinarized
2987,Mars,True,TRAPPIST-1e,False,True,1,F,S,ZERO,31-45
5971,Earth,False,TRAPPIST-1e,False,False,1,F,P,LOW,16-30
2185,Mars,False,TRAPPIST-1e,False,False,1,F,P,MEDIUM,46-60


In [3]:
# One-hot encode the categorical columns, then discard the listed indicator
# columns in the same chained expression (no in-place mutation).
train_data = pd.get_dummies(
    train_data,
    columns=[
        'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
        'CabinDeck', 'CabinSide', 'TotalExpensesBinarized', 'AgeBinarized',
    ],
).drop(
    columns=[
        'HomePlanet_Ns/Nc', 'CryoSleep_False', 'CryoSleep_Ns/Nc', 'Destination_Ns/Nc',
        'VIP_False', 'VIP_Ns/Nc', 'CabinSide_Ns/Nc', 'AgeBinarized_76-?',
    ]
)

In [4]:
# Target column first, then every remaining column as the feature matrix.
y = train_data['Transported']
X = train_data.drop(columns='Transported')

In [5]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)
tuple(len(part) for part in (X_train, X_test, y_test))

(6954, 1739, 1739)

In [6]:
# Read the preprocessed table used for the final submission and show its dimensions.
submission_data = pd.read_csv(filepath_or_buffer="spaceship-titanic/processed_test.csv")
submission_data.shape

(4277, 10)

In [7]:
# Apply the same one-hot encoding and column removal as for the training table.
# NOTE(review): encoding the two tables separately only yields matching columns
# if both contain the same category levels — confirm.
submission_data = pd.get_dummies(
    submission_data,
    columns=[
        'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
        'CabinDeck', 'CabinSide', 'TotalExpensesBinarized', 'AgeBinarized',
    ],
).drop(
    columns=[
        'HomePlanet_Ns/Nc', 'CryoSleep_False', 'CryoSleep_Ns/Nc', 'Destination_Ns/Nc',
        'VIP_False', 'VIP_Ns/Nc', 'CabinSide_Ns/Nc', 'AgeBinarized_76-?',
    ]
)

In [8]:
# Feature matrix for the submission rows: everything except the target column.
X_submission = submission_data.drop(columns='Transported')

# Modelado

## Random forest

### Primer modelo

In [9]:
# Baseline forest: 100 shallow trees (depth 3), each fitted on a 70% bootstrap
# sample, using all available cores and a fixed seed.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    max_samples=0.7,
    random_state=42,
    n_jobs=-1,
)

In [10]:
# Fit the baseline forest on the training partition.
model = model.fit(X=X_train, y=y_train)

In [11]:
# Hold-out predictions wrapped in a one-column frame named after the target.
y_pred = pd.Series(model.predict(X_test), name='Transported').to_frame()

In [12]:
# Share of hold-out rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7400805060379528

In [13]:
# Per-class precision / recall / F1 on the hold-out partition.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       False       0.71      0.81      0.75       861
        True       0.78      0.67      0.72       878

    accuracy                           0.74      1739
   macro avg       0.74      0.74      0.74      1739
weighted avg       0.75      0.74      0.74      1739



In [30]:
# Persist the fitted baseline model. The context manager closes (and therefore
# flushes) the file handle even if pickling raises, instead of leaving an
# anonymous handle open until garbage collection.
with open('./models/model-randForest_n100_maxDepth3_maxsamples07.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### Cross validation

In [9]:
def random_forest_cross_val(X, y, params, n_splits=3):
    """Return the mean validation accuracy of a random forest over stratified folds.

    Parameters
    ----------
    X, y : pandas objects
        Features and target; both are sliced positionally with ``.iloc``.
    params : mapping
        Must provide the keys ``'n_estimators'``, ``'max_depth'`` and
        ``'max_samples'``, which are forwarded to ``RandomForestClassifier``.
    n_splits : int, default 3
        Number of folds for the shuffled ``StratifiedKFold`` (seed 42).

    Returns
    -------
    float
        Sum of the per-fold accuracies divided by ``n_splits``.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _fold_accuracy(fit_rows, holdout_rows):
        # A fresh, identically seeded forest is trained for every fold.
        forest = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            max_samples=params['max_samples'],
            n_jobs=-1,
            random_state=42,
        )
        forest.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_pred = forest.predict(X.iloc[holdout_rows])
        return accuracy_score(y.iloc[holdout_rows], holdout_pred)

    fold_scores = [
        _fold_accuracy(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in folds.split(X, y)
    ]
    return sum(fold_scores) / n_splits

In [10]:
def grid_search_CV(X, y, params_GridSearch, n_splits=5):
    """Cross-validate a random forest for every combination in a parameter grid.

    Parameters
    ----------
    X, y
        Features and target, passed unchanged to ``random_forest_cross_val``.
    params_GridSearch : mapping
        Grid specification expanded with ``ParameterGrid``. Every combination
        must contain the keys ``'n_estimators'``, ``'max_depth'`` and
        ``'max_samples'``.
    n_splits : int, default 5
        Number of cross-validation folds per combination (5 is the value that
        was previously hard-coded here).

    Returns
    -------
    pandas.DataFrame
        One row per combination, in ``ParameterGrid`` order, with the columns
        ``n_estimators``, ``max_depth``, ``max_samples`` and ``accuracy``.
    """
    result_columns = ['n_estimators', 'max_depth', 'max_samples', 'accuracy']

    # Collect plain records and build the frame once at the end. Concatenating
    # onto a growing (initially empty) DataFrame copies every previous row on
    # each iteration and relies on concat-with-empty behaviour that recent
    # pandas versions deprecate.
    records = []
    for prms in tqdm(list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:'):
        acc = random_forest_cross_val(X, y, prms, n_splits=n_splits)
        records.append({'n_estimators': prms['n_estimators'],
                        'max_depth': prms['max_depth'],
                        'max_samples': prms['max_samples'],
                        'accuracy': acc})

    # Passing `columns` keeps the expected header even when no record was produced.
    # NOTE(review): a `None` max_samples is expected to appear as a missing
    # value (NaN) in the resulting numeric column — confirm downstream handling.
    return pd.DataFrame(records, columns=result_columns)

In [11]:
# Search space: 3 forest sizes x 6 depths x 4 bootstrap fractions = 72 combinations.
# A max_samples of None leaves the bootstrap size at the estimator's default.
params_GridSearch = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 8, 10, 13, 17],
    max_samples=[0.4, 0.7, 0.85, None],
)

In [12]:
# Run the search and store the two integer hyper-parameters with an integer dtype.
# NOTE(review): the search runs on the full X, y, which includes the rows later
# used as the hold-out set (X_test); the hold-out score of the tuned model is
# therefore likely optimistic — consider searching on X_train, y_train.
df_acc_by_params = grid_search_CV(X, y, params_GridSearch).astype(
    {'n_estimators': int, 'max_depth': int}
)

Params Tuning:: 100%|##########| 72/72 [01:36<00:00,  1.34s/it]


In [13]:
# Correlation of every column with the cross-validated accuracy.
df_acc_by_params.corr().loc[:, 'accuracy']

n_estimators    0.046972
max_depth      -0.482512
max_samples    -0.147332
accuracy        1.000000
Name: accuracy, dtype: float64

In [14]:
# Keep every row whose accuracy equals the maximum (several rows if there is a tie).
best_params = df_acc_by_params.loc[
    df_acc_by_params['accuracy'].eq(df_acc_by_params['accuracy'].max())
]
best_params

Unnamed: 0,n_estimators,max_depth,max_samples,accuracy
38,200,10,0.4,0.748992


In [20]:
# `best_params` is a DataFrame that holds one row, or several when accuracies
# tie. Pick the first row explicitly: calling int()/float() on a Series is
# deprecated in recent pandas and raises outright when more than one row matched.
best_row = best_params.iloc[0]

# A missing max_samples (None / NaN) means the grid's "no limit" setting won;
# pass None through instead of attempting float(None) or handing NaN to sklearn.
best_max_samples = None if pd.isna(best_row['max_samples']) else float(best_row['max_samples'])

model = RandomForestClassifier(n_estimators=int(best_row['n_estimators']),
                               max_depth=int(best_row['max_depth']),
                               max_samples=best_max_samples,
                               n_jobs=-1, random_state=42)
model = model.fit(X_train, y_train)

In [21]:
# Hold-out predictions of the tuned forest.
y_pred = model.predict(X=X_test)

In [22]:
# Share of hold-out rows the tuned forest predicts correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7412305922944221

In [23]:
# Per-class precision / recall / F1 of the tuned forest on the hold-out partition.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       False       0.71      0.81      0.76       861
        True       0.78      0.67      0.72       878

    accuracy                           0.74      1739
   macro avg       0.75      0.74      0.74      1739
weighted avg       0.75      0.74      0.74      1739



In [24]:
# Persist the tuned model. The context manager closes (and therefore flushes)
# the file handle even if pickling raises, instead of leaving an anonymous
# handle open until garbage collection.
with open('./models/model-randForest_n200_maxDepth10_maxSamples04.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Catboost

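CatBoost puede trabajar directamente con variables categóricas, sin necesidad de codificarlas previamente (por ejemplo, con one-hot encoding).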

---
---
---

In [25]:
# Build the submission file: passenger ids from the raw test set, joined by row
# position with the predictions of the current `model` after refitting it on
# the complete labelled data (X, y). Note that this refit replaces the state of
# `model` that was fitted on X_train only.
passenger_ids = pd.read_csv("spaceship-titanic/test.csv", usecols=['PassengerId'])
submission_predictions = pd.DataFrame({'Transported': model.fit(X, y).predict(X_submission)})
(
    passenger_ids
    .merge(submission_predictions, left_index=True, right_index=True, how='left')
    .to_csv("spaceship-titanic/submission.csv", index=False)
)