In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pickle

# Carga de datos

In [2]:
train_data = pd.read_csv("data/processed_train.csv")
print(train_data.shape)
train_data.sample(3)

(8693, 12)


Unnamed: 0,CryoSleep,Destination,Age,VIP,Transported,GroupSize,HomePlanet,CabinDeck,CabinSide,AnyExpenses,EssencialBill,NonEssencialBill
3455,False,TRAPPIST-1e,0,False,True,4,Earth,G,P,False,0.0,0.0
1036,False,TRAPPIST-1e,1,False,False,4,Mars,E,P,True,370.0,658.0
7917,False,55 Cancri e,3,False,False,6,Europa,C,S,True,4332.0,6994.0


In [3]:
train_data = pd.get_dummies(train_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','AnyExpenses'])
train_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_Ns/Nc','AnyExpenses_False'], inplace=True)
train_data.sample(3)

Unnamed: 0,Age,Transported,GroupSize,EssencialBill,NonEssencialBill,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,AnyExpenses_True
4141,1,False,1,42.0,3602.0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1
3400,0,True,2,0.0,0.0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0
7285,2,False,1,368.0,1045.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,1


In [4]:
X = train_data.drop(columns=['Transported'])
y = train_data.Transported

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(X_test)

(6954, 1739)

In [6]:
submission_data = pd.read_csv("data/processed_test.csv")
submission_data.shape

(4277, 12)

In [7]:
submission_data = pd.get_dummies(submission_data, columns=['HomePlanet','CryoSleep','Destination','VIP','CabinDeck','CabinSide','AnyExpenses'])
submission_data.drop(columns=['HomePlanet_Ns/Nc','CryoSleep_Ns/Nc','Destination_Ns/Nc','VIP_Ns/Nc','AnyExpenses_False'], inplace=True)

In [8]:
X_submission = submission_data.drop(columns=['Transported'])

# Modelado

## Random forest

### Cross validation

In [9]:
def random_forest_cross_val(X, y, params, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    acc = []
    
    for tr_ind, val_ind in skf.split(X, y):
        X_tr = X.iloc[tr_ind]
        y_tr = y.iloc[tr_ind]
        
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]
        
        model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                       max_depth=params['max_depth'],
                                       max_samples=params['max_samples'],
                                       n_jobs=-1, random_state=42)
        
        model.fit(X_tr, y_tr)        
        
        y_pred = model.predict(X_val)
        
        accuracy = accuracy_score(y_val, y_pred)
        acc.append(accuracy)
        
    return sum(acc) / n_splits

In [10]:
def grid_search_CV(X, y, params_GridSearch):
    df_acc_by_params = pd.DataFrame(columns=['n_estimators','max_depth','max_samples','accuracy'])
        
    for prms in tqdm( list(ParameterGrid(params_GridSearch)), ascii=True, desc='Params Tuning:' ):
                          
        acc = random_forest_cross_val(X, y, prms, n_splits=5)
        
        df_acc_by_params = pd.concat([df_acc_by_params,
                                      pd.DataFrame(data={'n_estimators':[prms['n_estimators']], 'max_depth':[prms['max_depth']], 'max_samples':[prms['max_samples']], 'accuracy':[acc]})], 
                                     ignore_index=True)
        
    return df_acc_by_params

In [11]:
params_GridSearch = {'n_estimators':[100,200,500],
                     'max_depth':[3,5,8,10,13],
                     'max_samples':[0.4,0.7,0.85,None]}

In [12]:
df_acc_by_params = grid_search_CV(X, y, params_GridSearch)

Params Tuning:: 100%|##########| 60/60 [03:07<00:00,  3.12s/it]


In [13]:
df_acc_by_params.dtypes

n_estimators     object
max_depth        object
max_samples     float64
accuracy        float64
dtype: object

In [14]:
df_acc_by_params['n_estimators'] = df_acc_by_params.n_estimators.astype(int)
df_acc_by_params['max_depth'] = df_acc_by_params.max_depth.astype(int)

In [15]:
df_acc_by_params.corr()['accuracy']

n_estimators   -0.043158
max_depth       0.744899
max_samples     0.002174
accuracy        1.000000
Name: accuracy, dtype: float64

In [16]:
df_acc_by_params.sort_values('accuracy', ascending=False).head(10)

Unnamed: 0,n_estimators,max_depth,max_samples,accuracy
25,200,8,0.4,0.80651
26,500,8,0.4,0.80628
33,100,8,,0.80582
34,200,8,,0.80582
46,200,10,,0.80559
35,500,8,,0.805475
30,100,8,0.85,0.80467
31,200,8,0.85,0.80444
32,500,8,0.85,0.804439
28,200,8,0.7,0.804324


In [17]:
best_params = df_acc_by_params[ df_acc_by_params.accuracy==df_acc_by_params.accuracy.max() ]
best_params

Unnamed: 0,n_estimators,max_depth,max_samples,accuracy
25,200,8,0.4,0.80651


In [18]:
model = RandomForestClassifier(n_estimators=int(best_params.n_estimators),
                               max_depth=int(best_params.max_depth),
                               max_samples=float(best_params.max_samples),
                               n_jobs=-1, random_state=42)
model = model.fit(X_train, y_train)

In [19]:
y_pred = model.predict(X_test)

In [20]:
accuracy_score(y_test, y_pred)

0.7912593444508338

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.81      0.76      0.78       861
        True       0.78      0.82      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [23]:
pickle.dump(model, open('../models/model_v3-randForest_n200_maxDepth8_maxSamples04.pkl', 'wb'))

---
---
---

In [24]:
pd.read_csv("../spaceship-titanic/test.csv", usecols=['PassengerId']).merge( pd.DataFrame( {'Transported':model.fit(X,y).predict(X_submission)} ),
                                                                          left_index=True, right_index=True, how='left').\
    to_csv("../spaceship-titanic/submission.csv", index=False)

In [25]:
print(pd.read_csv("data/processed_train.csv").columns.tolist())

['CryoSleep', 'Destination', 'Age', 'VIP', 'Transported', 'GroupSize', 'HomePlanet', 'CabinDeck', 'CabinSide', 'AnyExpenses', 'EssencialBill', 'NonEssencialBill']
