In [1]:
# Importação das bibliotecas
import pandas as pd
from sklearn import metrics as mt
from sklearn import ensemble as es
import numpy as np

In [2]:
def calculate_metrics(y, yhat):
    '''
        Recebe as targets original e prevista, calcula as métricas e retorna um dicionário
    '''
    metrics = {'accuracy': round(mt.accuracy_score( y, yhat ), 3),
           'precision': round(mt.precision_score( y, yhat), 3),
           'recall': round(mt.recall_score( y, yhat), 3),
           'f1_score': round(mt.f1_score( y, yhat), 3)}

    return metrics

In [3]:
def best_values(df_scores):
    '''
        Recebe um dataframe com as métricas encontradas para cada parâmetro
        Retorna um dataframe o melhor valor de cada métrica, e qual parâmetro utilizado para encontrá-lo
    '''
    df_best_values = pd.DataFrame(columns=['metric', 'performance', 'max_depth'])

    list_metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for metric in list_metrics:
        max_index = df_scores[metric].idxmax()
        best_value = df_scores.loc[max_index, metric]
        best_max_depth = df_scores.loc[max_index, 'max_depth']
        best_estimators = df_scores.loc[max_index, 'estimators']
        df_best_values = df_best_values._append({'metric': metric, 'estimators': best_estimators, 'performance': best_value, 'max_depth': best_max_depth},
                                                 ignore_index=True)

    return df_best_values


In [4]:
# Seleção de Features
features = ['customer_type', 'age', 'class', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'on_board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'gender_Female', 'gender_Male', 'type_of_travel_business_travel',
       'type_of_travel_personal_travel']

In [5]:
# Carregamento dos dados de treino
X_train = pd.read_csv( 'X_training.csv' )
y_train = pd.read_csv( 'y_training.csv' )

# Preparação dos dados de treino
X_train = X_train.loc[:, features]
y_train = y_train.values.ravel()

In [6]:
# Carregamento dos dados de validação:
X_val = pd.read_csv( 'X_validation.csv' )
y_val = pd.read_csv( 'y_validation.csv' )

# Preparação dos dados de validação
X_val = X_val.loc[:, features]
y_val = y_val.values.ravel()

In [7]:
# Carregamento dos dados de teste
X_test = pd.read_csv( 'X_test.csv' )
y_test = pd.read_csv( 'y_test.csv' )

# Preparação dos dados de teste
X_test = X_test.loc[:, features]
y_test = y_test.values.ravel()

In [8]:
# Treinamento do modelo com os parâmetros padrão do algoritmo
model = es.RandomForestClassifier()
model.fit( X_train,y_train )

In [9]:
# Cálculo das métricas para os dados de treinamento com os parametros padrão, a objetivo de comparação
df_scores_train_ini = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1_score'])

yhat_train = model.predict( X_train)
res = calculate_metrics(y_train, yhat_train)
df_scores_train_ini = df_scores_train_ini._append({'accuracy': res['accuracy'], 'precision': res['precision'], 
                                               'recall': res['recall'], 'f1_score': res['f1_score']}, ignore_index=True)

df_scores_train_ini

  df_scores_train_ini = df_scores_train_ini._append({'accuracy': res['accuracy'], 'precision': res['precision'],


Unnamed: 0,accuracy,precision,recall,f1_score
0,1.0,1.0,1.0,1.0


In [10]:
# Cálculo das métricas para os dados de validação com os parametros padrão, a objetivo de comparação
df_scores_val_ini = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1_score'])

yhat_val = model.predict( X_val)
res = calculate_metrics(y_val, yhat_val)
df_scores_val_ini= df_scores_val_ini._append({'accuracy': res['accuracy'], 'precision': res['precision'], 
                                               'recall': res['recall'], 'f1_score': res['f1_score']}, ignore_index=True)

df_scores_val_ini


  df_scores_val_ini= df_scores_val_ini._append({'accuracy': res['accuracy'], 'precision': res['precision'],


Unnamed: 0,accuracy,precision,recall,f1_score
0,0.962,0.972,0.941,0.956


In [11]:
# Treinar os modelos variandos os parâmetros e armazenar os valores em dataframes

df_scores_params_train = pd.DataFrame(columns=['max_depth','estimators','accuracy', 'precision', 'recall', 'f1_score'])
df_scores_params_val = pd.DataFrame(columns=['max_depth','estimators','accuracy', 'precision', 'recall', 'f1_score'])

list_max_depth = [3,5,13,21,35,50,100]
list_estimators = [10, 50, 200, 500,1000,2000]
for i in list_max_depth:
    for j in list_estimators:
        model = es.RandomForestClassifier( max_depth = i, n_estimators = j, n_jobs=-1)
        model.fit( X_train,y_train )    
    
        # Coletando as métricas para os dados de treinamento
        yhat_train = model.predict( X_train)
        res = calculate_metrics(y_train, yhat_train)   
    
        df_scores_params_train = df_scores_params_train._append({'max_depth': i, 'estimators': j, 'accuracy': res['accuracy'], 'precision': res['precision'],
                                                     'recall': res['recall'], 'f1_score': res['f1_score']}, ignore_index=True)
        
        
        # Coletando as métricas para os dados de validação
        yhat_val = model.predict( X_val)
        res = calculate_metrics(y_val, yhat_val)
        
        df_scores_params_val = df_scores_params_val._append({'max_depth': i, 'estimators': j, 'accuracy': res['accuracy'], 'precision': res['precision'],
                                                     'recall': res['recall'], 'f1_score': res['f1_score']}, ignore_index=True)


    


  df_scores_params_train = df_scores_params_train._append({'max_depth': i, 'estimators': j, 'accuracy': res['accuracy'], 'precision': res['precision'],
  df_scores_params_val = df_scores_params_val._append({'max_depth': i, 'estimators': j, 'accuracy': res['accuracy'], 'precision': res['precision'],


In [12]:
# Determinar as melhores performances para os dados de treino e validação
df_best_values_train = best_values(df_scores_params_train)
df_best_values_val = best_values(df_scores_params_val)
df_best_values_val

  df_best_values = df_best_values._append({'metric': metric, 'estimators': best_estimators, 'performance': best_value, 'max_depth': best_max_depth},
  df_best_values = df_best_values._append({'metric': metric, 'estimators': best_estimators, 'performance': best_value, 'max_depth': best_max_depth},


Unnamed: 0,metric,performance,max_depth,estimators
0,accuracy,0.963,21.0,200.0
1,precision,0.972,21.0,200.0
2,recall,0.942,21.0,50.0
3,f1_score,0.957,50.0,500.0


In [13]:
# Concatenar os dados de treino e validação para uso da técnica de validação holdout com os dados de teste
X_trainval = pd.concat([X_train, X_val], ignore_index=True)
y_trainval = np.concatenate((y_train, y_val), axis=None)

In [14]:
# Agora vamos medir a performance de teste individualmente para cada uma das métricas


# accuracy
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='accuracy']['max_depth'].item())
best_estimators = int(df_best_values_val[df_best_values_val['metric']=='accuracy']['estimators'].item())

# Performance de teste:
model = es.RandomForestClassifier( max_depth = best_max_depth, n_estimators = best_estimators)
model.fit( X_trainval, y_trainval )

yhat_test = model.predict( X_test)
res = calculate_metrics(y_test, yhat_test)
accuracy_test = res['accuracy']

In [15]:
# precision
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='precision']['max_depth'].item())
best_estimators = int(df_best_values_val[df_best_values_val['metric']=='precision']['estimators'].item())

# Performance de teste:
model = es.RandomForestClassifier( max_depth = best_max_depth, n_estimators = best_estimators)
model.fit( X_trainval, y_trainval )

yhat_test = model.predict( X_test)
res = calculate_metrics(y_test, yhat_test)
precision_test = res['precision']

In [16]:
# recall
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='recall']['max_depth'].item())
best_estimators = int(df_best_values_val[df_best_values_val['metric']=='recall']['estimators'].item())

# Performance de teste:
model = es.RandomForestClassifier( max_depth = best_max_depth, n_estimators = best_estimators)
model.fit( X_trainval, y_trainval )

yhat_test = model.predict( X_test)
res = calculate_metrics(y_test, yhat_test)
recall_test = res['recall']

In [17]:
# f1_score
best_max_depth = int(df_best_values_val[df_best_values_val['metric']=='f1_score']['max_depth'].item())
best_estimators = int(df_best_values_val[df_best_values_val['metric']=='f1_score']['estimators'].item())

# Performance de teste:
model = es.RandomForestClassifier( max_depth = best_max_depth, n_estimators = best_estimators)
model.fit( X_trainval, y_trainval )

yhat_test = model.predict( X_test)
res = calculate_metrics(y_test, yhat_test)
f1_score_test = res['f1_score']

In [18]:
# Construindo um dataframe com os valores finais de performance para facilitar a visualização

accuracy_train = df_best_values_train[df_best_values_train['metric'] == 'accuracy']['performance'].item()
precision_train = df_best_values_train[df_best_values_train['metric'] == 'precision']['performance'].item()
recall_train = df_best_values_train[df_best_values_train['metric'] == 'recall']['performance'].item()
f1_score_train = df_best_values_train[df_best_values_train['metric'] == 'f1_score']['performance'].item()

accuracy_val = df_best_values_val[df_best_values_val['metric'] == 'accuracy']['performance'].item()
precision_val = df_best_values_val[df_best_values_val['metric'] == 'precision']['performance'].item()
recall_val = df_best_values_val[df_best_values_val['metric'] == 'recall']['performance'].item()
f1_score_val = df_best_values_val[df_best_values_val['metric'] == 'f1_score']['performance'].item()


df_scores_final = pd.DataFrame(columns=['dataset','accuracy', 'precision', 'recall', 'f1_score'])

df_scores_final = df_scores_final._append({'dataset': 'Treinamento', 'accuracy': accuracy_train, 'precision': precision_train,
                                          'recall': recall_train, 'f1_score': f1_score_train}, ignore_index=True) 

df_scores_final = df_scores_final._append({'dataset': 'Validação', 'accuracy': accuracy_val, 'precision': precision_val,
                                          'recall': recall_val, 'f1_score': f1_score_val}, ignore_index=True) 

df_scores_final = df_scores_final._append({'dataset': 'Teste', 'accuracy': accuracy_test, 'precision': precision_test,
                                          'recall': recall_test, 'f1_score': f1_score_test}, ignore_index=True) 


df_scores_final

  df_scores_final = df_scores_final._append({'dataset': 'Treinamento', 'accuracy': accuracy_train, 'precision': precision_train,


Unnamed: 0,dataset,accuracy,precision,recall,f1_score
0,Treinamento,1.0,1.0,1.0,1.0
1,Validação,0.963,0.972,0.942,0.957
2,Teste,0.963,0.97,0.943,0.958
