In [8]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [9]:
tr_tr_new = joblib.load('../joblib/tr_tr_encoded.joblib')

In [10]:
te_tr_new = joblib.load('../joblib/te_tr_encoded.joblib')

In [11]:
tr_tr_new.fillna(tr_tr_new.median(), inplace=True)

# Fill missing values with median for testing data
te_tr_new.fillna(tr_tr_new.median(), inplace=True)

In [12]:
# Splitting the dataset into features and target
X = tr_tr_new.drop(columns=['isFraud'])
y = tr_tr_new['isFraud']

# Splitting the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [19]:
from sklearn.ensemble import RandomForestClassifier as model_constructor
from sklearn.metrics import roc_auc_score as metric

# Random Forest
n_estimators_values = [500,1000]
max_features_values = [12,14]
max_samples_values = [200, 600]

params_grid = {'max_features': max_features_values,
              'n_estimators': n_estimators_values,
               'max_samples': max_samples_values}

num_iter = 1
grid_results = pd.DataFrame(columns = ('max_features',
                                       'n_estimators',
                                       'max_samples',
                                       'metric_train',
                                       'metric_val'))

for max_features in params_grid['max_features']:
    for n_estimators in params_grid['n_estimators']:
        for max_samples in params_grid['max_samples']:

                        # Print trace
                        print('Iteracion = ' + str(num_iter))

                        # [3] Define model
                        model = model_constructor(max_features = max_features,
                                                  n_estimators = n_estimators,
                                                  max_samples = max_samples,
                                                  random_state = 0)

                        # [4] Train model
                        model.fit(X_train, y_train)


                        # [5] Predict
                        pred_train = model.predict(X_train) # predict!
                        pred_val = model.predict(X_val) # predict!

                        # [6] Compute metric
                        metric_train = metric(y_train, pred_train)
                        metric_val = metric(y_val, pred_val)

                        # print error
                        print('Metric train = %.2f - Metric validation = %.2f.'
                              % (metric_train, metric_val))

                        # Save iteration results
                        grid_results.loc[num_iter]=[ max_features,
                                                    n_estimators,
                                                    max_samples,
                                                 metric_train,
                                                 metric_val]
                        num_iter += 1




Iteracion = 1
Metric train = 0.51 - Metric validation = 0.50.
Iteracion = 2
Metric train = 0.55 - Metric validation = 0.54.
Iteracion = 3
Metric train = 0.51 - Metric validation = 0.50.
Iteracion = 4
Metric train = 0.55 - Metric validation = 0.54.
Iteracion = 5
Metric train = 0.51 - Metric validation = 0.51.
Iteracion = 6
Metric train = 0.55 - Metric validation = 0.55.
Iteracion = 7
Metric train = 0.51 - Metric validation = 0.51.
Iteracion = 8
Metric train = 0.55 - Metric validation = 0.55.


In [18]:
grid_results = grid_results.sort_values(by = ['metric_val', 'metric_train'], ascending = [False, False])
best_model = grid_results.iloc[0]
best_model

max_features     12.000000
n_estimators    500.000000
max_samples     600.000000
metric_train      0.549443
metric_val        0.543480
Name: 1, dtype: float64

In [None]:
import numpy as np

print('Old train data size = ' + str(X_train.shape))
print('Old train target size = ' + str(y_train.shape))

# Combine train and validación
X_train = np.concatenate((X_train, X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

In [None]:
# [3] Define model
model = model_constructor(criterion = 'gini',
                          max_depth = None,
                          min_samples_split = 2,
                          min_samples_leaf = 1,
                          max_features = int(best_model.max_features),
                          n_estimators =  int(best_model.n_estimators),
                          max_samples = int(best_model.max_samples),
                          random_state = 0) # Use same random_state as in training!!!

# [4] Train model
model.fit(X_train, y_train)

# [5] Predict
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# [6] Compute metric
metric_train = metric(y_train, pred_train)
metric_test = metric(y_test, pred_test)



In [None]:
# print error
print('AUC train = %.2f - AUC test = %.2f'
      % (metric_train, metric_test))