# Ensaio de Machine Learning - Classificação

## 0. Bibliotecas e _Helper Functions_

In [1]:
import warnings
import numpy  as np
import pandas as pd
from sklearn.tree            import DecisionTreeClassifier
from sklearn.metrics         import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble        import RandomForestClassifier
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.linear_model    import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
def personal_settings():
    """Configure notebook-wide pandas display options and silence warnings.

    Lifts the pandas limits on displayed rows and columns, renders floats
    with two decimal places, and installs a blanket 'ignore' filter so that
    no warning is shown for the rest of the session.
    """
    def _two_decimals(value):
        # Fixed-point rendering with exactly two digits after the decimal point.
        return '%.2f' % value

    warnings.filterwarnings('ignore')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.options.display.float_format = _two_decimals


def model_avaliation(model_name, y_true, y_pred):
    """Summarise classification quality as a one-row DataFrame.

    Parameters
    ----------
    model_name : label stored in the 'Model' column.
    y_true     : ground-truth labels.
    y_pred     : labels predicted by the model.

    Returns
    -------
    pandas.DataFrame with index [0] and the columns 'Model', 'Accuracy',
    'Precison', 'Recall' and 'F1-Score'. Each score is computed with the
    corresponding scikit-learn metric function called with its defaults.
    """
    # NOTE: the 'Precison' column name is misspelled; it is kept unchanged
    # because it is part of the returned frame's schema.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precison', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )

    row = {'Model': model_name}
    for column, scorer in scorers:
        row[column] = scorer( y_true, y_pred )

    return pd.DataFrame( row, index=[0] )

# Apply the display options and the warning filter once, before any data is loaded.
personal_settings()

## 1. Carregando os Dados

### Treinamento

In [3]:
# Features and labels are stored in separate CSV files; the label frame is
# flattened into a 1-D array right after reading.
# NOTE(review): the sample output shows an `id` column among the 25 feature
# columns; presumably a row identifier -- confirm whether it should be dropped
# (consistently for train/validation/test) before modelling.
X_train = pd.read_csv('data/X_training.csv', low_memory=False)
y_train = pd.read_csv('data/y_training.csv', low_memory=False).values.ravel()

# Quick sanity check: dimensions plus a handful of random rows.
print(X_train.shape)
display(X_train.sample(5))

(72515, 25)


Unnamed: 0,id,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,on_board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
11913,75595,1,0.38,1.0,0.06,0.8,0.4,0.4,0.4,1.0,0.6,0.5,0.8,0.8,0.6,0.75,0.25,0.8,0.8,0.03,0.02,1.0,0.0,1.0,0.0
36441,115015,1,0.67,1.0,0.39,0.8,0.8,0.6,0.8,1.0,1.0,1.0,0.8,0.8,0.8,0.75,1.0,0.8,1.0,0.0,0.0,0.0,1.0,1.0,0.0
71315,84679,1,0.41,1.0,0.43,0.6,0.8,0.8,0.8,0.4,0.6,0.75,0.6,0.6,0.4,0.5,0.75,0.6,0.2,0.0,0.0,0.0,1.0,1.0,0.0
52308,106848,1,0.64,0.0,0.34,0.4,0.8,0.4,0.4,0.4,0.6,0.75,0.4,0.4,0.4,0.25,0.75,0.4,1.0,0.01,0.0,1.0,0.0,0.0,1.0
54913,116444,1,0.12,0.0,0.06,0.4,0.8,0.6,0.4,0.6,0.6,0.0,0.6,0.8,1.0,0.25,1.0,1.0,0.6,0.0,0.0,1.0,0.0,0.0,1.0


### Validação

In [4]:
# Validation split: features as a DataFrame, labels flattened to a 1-D array.
X_val = pd.read_csv('data/X_validation.csv', low_memory=False)
y_val = pd.read_csv('data/y_validation.csv', low_memory=False).values.ravel()

print(X_val.shape)

(31079, 25)


### Teste

In [5]:
# Test split: features as a DataFrame, labels flattened to a 1-D array.
X_test = pd.read_csv('data/X_test.csv', low_memory=False)
y_test = pd.read_csv('data/y_test.csv', low_memory=False).values.ravel()

print(X_test.shape)

(25893, 25)


## 2. Treinamento dos Algoritmos de Machine Learning

### 2.1 Sem Alteração nos Parâmetros e Treino + Teste

#### Logistic Regression

In [6]:
# Baseline logistic regression: library defaults, only the seed is fixed.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

#### Decision Tree

In [7]:
# Baseline decision tree: library defaults, only the seed is fixed.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

#### Random Forest

In [8]:
# Baseline random forest: library defaults, only the seed is fixed.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

#### KNN

In [9]:
# Baseline k-nearest-neighbours classifier with library defaults.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

#### Performance

In [11]:
# One single-row metrics frame per baseline model, all scored on the test split.
lr_metrics = model_avaliation(model_name='Logistic Regression', y_true=y_test, y_pred=y_pred_lr)
tree_metrics = model_avaliation(model_name='Decision Tree', y_true=y_test, y_pred=y_pred_tree)
rf_metrics = model_avaliation(model_name='Random Forest', y_true=y_test, y_pred=y_pred_rf)
knn_metrics = model_avaliation(model_name='KNN', y_true=y_test, y_pred=y_pred_knn)

# Stack the rows and show the models ranked from best to worst F1-Score.
baseline_frames = [lr_metrics, tree_metrics, rf_metrics, knn_metrics]
metrics = pd.concat(baseline_frames)
display(metrics.sort_values('F1-Score', ascending=False))

Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Random Forest,0.96,0.97,0.94,0.96
0,Decision Tree,0.94,0.93,0.94,0.94
0,KNN,0.67,0.63,0.6,0.62
0,Logistic Regression,0.56,0.0,0.0,0.0


### 2.2 Buscando os Melhores Parâmetros e Treino + Validação + Teste

#### Logistic Regression

In [13]:
# Base estimator; C, solver and max_iter are overridden by each grid candidate.
lr_model = LogisticRegression(C=1.0, solver='lbfgs', max_iter=100, random_state=42)

# Candidate values explored by the exhaustive search.
params = {
    "C": [0.5, 1, 2],
    "max_iter": [50, 100, 200],
    "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
}

# 3-fold cross-validation on the training split, ranked by accuracy.
lr_grid = GridSearchCV(estimator=lr_model, param_grid=params, scoring='accuracy', cv=3, verbose=0)

lr_grid.fit(X_train, y_train)

In [14]:
# Score the fitted grid search on the validation split.
y_pred_lr_grid = lr_grid.predict(X_val)

lr_grid_metrics = model_avaliation(
    model_name='Logistic Regression - Grid Search',
    y_true=y_val,
    y_pred=y_pred_lr_grid,
)

print(f"Logistic Regression - Best Parameters: {lr_grid.best_params_}")
display(lr_grid_metrics)

Logistic Regression - Best Parameters: {'C': 2, 'max_iter': 200, 'solver': 'newton-cg'}


Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Logistic Regression - Grid Search,0.87,0.87,0.84,0.85


#### Decision Tree

In [16]:
# Base estimator; max_depth and min_samples_leaf are overridden by each grid candidate.
tree_model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=42)

# Candidate values explored by the exhaustive search.
params = {
    "max_depth": [2, 5, 10, 20, 50],
    "min_samples_leaf": [25, 50, 100, 200, 500, 800],
}

# 3-fold cross-validation on the training split, ranked by accuracy.
tree_grid = GridSearchCV(estimator=tree_model, param_grid=params, scoring='accuracy', cv=3, verbose=0)

tree_grid.fit(X_train, y_train)

In [17]:
# Score the fitted grid search on the validation split.
y_pred_tree_grid = tree_grid.predict(X_val)

tree_grid_metrics = model_avaliation(
    model_name='Decision Tree - Grid Search',
    y_true=y_val,
    y_pred=y_pred_tree_grid,
)

print(f"Decision Tree - Best Parameters: {tree_grid.best_params_}")
display(tree_grid_metrics)

Decision Tree - Best Parameters: {'max_depth': 20, 'min_samples_leaf': 25}


Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Decision Tree - Grid Search,0.95,0.95,0.93,0.94


#### Random Forest

In [18]:
# Base estimator; n_estimators and min_samples_leaf are overridden by each grid
# candidate, while max_depth=5 stays fixed for every candidate.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=5, min_samples_leaf=50, random_state=42)

# Candidate values explored by the exhaustive search.
params = {
    "n_estimators": [50, 100, 200, 300, 500],
    "min_samples_leaf": [5, 10, 20, 50, 100],
}

# 3-fold cross-validation on the training split, ranked by accuracy.
rf_grid = GridSearchCV(estimator=rf_model, param_grid=params, scoring='accuracy', cv=3, verbose=0)

rf_grid.fit(X_train, y_train)

In [19]:
# Score the fitted grid search on the validation split.
y_pred_rf_grid = rf_grid.predict(X_val)

rf_grid_metrics = model_avaliation(
    model_name='Random Forest - Grid Search',
    y_true=y_val,
    y_pred=y_pred_rf_grid,
)

print(f"Random Forest - Best Parameters: {rf_grid.best_params_}")
display(rf_grid_metrics)

Random Forest - Best Parameters: {'min_samples_leaf': 10, 'n_estimators': 300}


Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Random Forest - Grid Search,0.93,0.92,0.91,0.91


#### KNN

In [20]:
# Base estimator; n_neighbors is overridden by each grid candidate.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Candidate neighbourhood sizes explored by the exhaustive search.
params = {
    "n_neighbors": [3, 5, 7, 10, 15, 20],
}

# 3-fold cross-validation on the training split, ranked by accuracy.
knn_grid = GridSearchCV(estimator=knn_model, param_grid=params, scoring='accuracy', cv=3, verbose=0)

knn_grid.fit(X_train, y_train)

In [21]:
# Score the fitted grid search on the validation split.
y_pred_knn_grid = knn_grid.predict(X_val)

knn_grid_metrics = model_avaliation(
    model_name='KNN - Grid Search',
    y_true=y_val,
    y_pred=y_pred_knn_grid,
)

print(f"KNN - Best Parameters: {knn_grid.best_params_}")
display(knn_grid_metrics)

KNN - Best Parameters: {'n_neighbors': 3}


Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,KNN - Grid Search,0.68,0.63,0.62,0.62


#### Performance

In [30]:
# Stack the four grid-search score rows and rank them by F1-Score, best first.
grid_frames = [lr_grid_metrics, tree_grid_metrics, rf_grid_metrics, knn_grid_metrics]
grid_metrics = pd.concat(grid_frames)
display(grid_metrics.sort_values('F1-Score', ascending=False))

Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Decision Tree - Grid Search,0.95,0.95,0.93,0.94
0,Random Forest - Grid Search,0.93,0.92,0.91,0.91
0,Logistic Regression - Grid Search,0.87,0.87,0.84,0.85
0,KNN - Grid Search,0.68,0.63,0.62,0.62


In [31]:
# Baseline and grid-search rows in a single table, ranked by F1-Score.
# The baseline rows were scored on the test split and the grid-search rows on
# the validation split, so the two groups are not measured on the same data.
join_metrics = pd.concat(objs=[metrics, grid_metrics])
display(join_metrics.sort_values('F1-Score', ascending=False))

Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Random Forest,0.96,0.97,0.94,0.96
0,Decision Tree - Grid Search,0.95,0.95,0.93,0.94
0,Decision Tree,0.94,0.93,0.94,0.94
0,Random Forest - Grid Search,0.93,0.92,0.91,0.91
0,Logistic Regression - Grid Search,0.87,0.87,0.84,0.85
0,KNN - Grid Search,0.68,0.63,0.62,0.62
0,KNN,0.67,0.63,0.6,0.62
0,Logistic Regression,0.56,0.0,0.0,0.0


### 2.3 Performance com os Dados de Teste + Melhores Parâmetros

In [24]:
# Merge the training and validation splits for the final fit.
# pd.concat keeps X_final a DataFrame, so the column names seen at fit time
# match those of the X_test DataFrame at predict time. np.concatenate would
# reduce the frames to a bare ndarray and scikit-learn would then warn about
# missing feature names (a warning hidden by the global 'ignore' filter).
# Assumes X_train and X_val share the same columns -- both report 25 above.
X_final = pd.concat( [ X_train, X_val ], ignore_index=True )
y_final = np.concatenate( ( y_train, y_val ) )

#### Logistic Regression

In [25]:
# Final logistic regression, rebuilt with the best configuration reported by
# the grid search in 2.2: {'C': 2, 'max_iter': 200, 'solver': 'newton-cg'}.
# max_iter was previously hard-coded to 100, which did not match that result.
final_lr = LogisticRegression( C=2, max_iter=200, solver='newton-cg', random_state=42 )
final_lr.fit( X_final, y_final )

y_pred_lr_final = final_lr.predict( X_test )

#### Decision Tree

In [26]:
# Final decision tree with the depth / leaf-size values reported as best in 2.2,
# trained on the merged train+validation data and scored on the test split.
final_tree = DecisionTreeClassifier(
    max_depth=20,
    min_samples_leaf=25,
    random_state=42,
).fit(X_final, y_final)
y_pred_tree_final = final_tree.predict(X_test)

#### Random Forest

In [27]:
# Final random forest, trained on the merged train+validation data.
# NOTE(review): this uses the library defaults, not the grid-search result from
# 2.2 ({'min_samples_leaf': 10, 'n_estimators': 300}); presumably intentional
# because the default forest scored higher in 2.1 -- confirm.
final_rf = RandomForestClassifier(random_state=42).fit(X_final, y_final)
y_pred_rf_final = final_rf.predict(X_test)

#### KNN

In [28]:
# Final KNN with the neighbourhood size reported as best in 2.2,
# trained on the merged train+validation data and scored on the test split.
final_knn = KNeighborsClassifier(n_neighbors=3).fit(X_final, y_final)
y_pred_knn_final = final_knn.predict(X_test)

#### Performance

In [29]:
# One single-row metrics frame per final model, all scored on the test split.
final_lr_metrics = model_avaliation(model_name='Logistic Regression - Final', y_true=y_test, y_pred=y_pred_lr_final)
final_tree_metrics = model_avaliation(model_name='Decision Tree - Final', y_true=y_test, y_pred=y_pred_tree_final)
final_rf_metrics = model_avaliation(model_name='Random Forest - Final', y_true=y_test, y_pred=y_pred_rf_final)
final_knn_metrics = model_avaliation(model_name='KNN - Final', y_true=y_test, y_pred=y_pred_knn_final)

# Stack the rows and show the models ranked from best to worst F1-Score.
final_frames = [final_lr_metrics, final_tree_metrics, final_rf_metrics, final_knn_metrics]
final_metrics = pd.concat(final_frames)
display(final_metrics.sort_values('F1-Score', ascending=False))

Unnamed: 0,Model,Accuracy,Precison,Recall,F1-Score
0,Random Forest - Final,0.96,0.97,0.95,0.96
0,Decision Tree - Final,0.95,0.96,0.93,0.94
0,Logistic Regression - Final,0.87,0.87,0.83,0.85
0,KNN - Final,0.69,0.65,0.64,0.64
