# Imports et récupération des données

In [2]:
# Default
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
import plotly.express as px
import time

# machine learning - scikit learn:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import GridSearchCV

# désactiver temporairement les avertissements
warnings.filterwarnings('ignore')

In [3]:
# Remote, semicolon-separated CSV holding the airline passenger satisfaction survey.
file_path = "https://raw.githubusercontent.com/remijul/dataset/master/Airline%20Passenger%20Satisfaction.csv"

# Read the file without promoting any column to the index, then discard the
# 'id' column in the same expression.
df = pd.read_csv(file_path, sep=";", index_col=False).drop(columns='id')

## Dataframe de base

In [4]:
# Show the freshly loaded frame as this cell's output.
df

Unnamed: 0,Satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,satisfied,Female,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,...,2,2,3,3,4,4,4,2,0,0.0
129876,neutral or dissatisfied,Male,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,...,1,3,2,3,3,1,2,1,174,172.0
129877,neutral or dissatisfied,Male,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,...,2,4,4,3,4,2,3,2,155,163.0
129878,neutral or dissatisfied,Male,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,...,2,3,3,2,3,2,1,2,193,205.0


## Changement des noms de colonne

In [5]:
# Replace the column labels positionally with underscore-separated names.
# The list must follow the frame's current column order.
df.columns = [
    # target and passenger / trip description
    'Satisfaction',
    'Gender',
    'Customer_Type',
    'Age',
    'Type_of_Travel',
    'Class',
    'Flight_Distance',
    # service ratings
    'Seat_comfort',
    'Departure_Arrival_time_convenient',
    'Food_and_drink',
    'Gate_location',
    'Inflight_wifi_service',
    'Inflight_entertainment',
    'Online_support',
    'Ease_of_Online_booking',
    'On_board_service',
    'Leg_room_service',
    'Baggage_handling',
    'Checkin_service',
    'Cleanliness',
    'Online_boarding',
    # delays
    'Departure_Delay_in_Minutes',
    'Arrival_Delay_in_Minutes',
]

# Modélisation

In [6]:
# Work on an independent copy of the frame and discard every row that
# contains at least one missing value.
df_pour_modelisation = df.copy().dropna()

# Split the column names by dtype: integer/float columns on one side,
# object (string) columns on the other.
colonnes_num = df_pour_modelisation.select_dtypes(include=['int', 'float']).columns.tolist()
colonnes_cat = df_pour_modelisation.select_dtypes(include=['object']).columns.tolist()

print(f'Colonnes numériques : {colonnes_num}')
print(f'Colonnes catégorielles : {colonnes_cat}')

Colonnes numériques : ['Age', 'Flight_Distance', 'Seat_comfort', 'Departure_Arrival_time_convenient', 'Food_and_drink', 'Gate_location', 'Inflight_wifi_service', 'Inflight_entertainment', 'Online_support', 'Ease_of_Online_booking', 'On_board_service', 'Leg_room_service', 'Baggage_handling', 'Checkin_service', 'Cleanliness', 'Online_boarding', 'Departure_Delay_in_Minutes', 'Arrival_Delay_in_Minutes']
Colonnes catégorielles : ['Satisfaction', 'Gender', 'Customer_Type', 'Type_of_Travel', 'Class']


## Définition de notre X (données) et y (target)

In [7]:
# Features: every column except the 'Satisfaction' label.
X = df_pour_modelisation.drop(columns='Satisfaction')
# Target: the 'Satisfaction' label itself.
y = df_pour_modelisation['Satisfaction']

# Hold out 30 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

# Report the row count of the full set and of each split.
print("La longueur du dataset de base :", X.shape[0])
print("La longueur du dataset d'entraînement :", X_train.shape[0])
print("La longueur du dataset de test :", X_test.shape[0])

La longueur du dataset de base : 129487
La longueur du dataset d'entraînement : 90640
La longueur du dataset de test : 38847


## Pré-processing

In [8]:
# Column-wise preprocessing applied before any model is fitted.
preparation = ColumnTransformer(
    transformers=[
        # One-hot encode the four categorical predictors; categories not seen
        # during fit are ignored at transform time instead of raising.
        (
            'data_cat',
            OneHotEncoder(handle_unknown='ignore'),
            ['Gender', 'Customer_Type', 'Type_of_Travel', 'Class'],
        ),
        # Standardise every numeric column listed in `colonnes_num`.
        (
            'data_num',
            StandardScaler(),
            colonnes_num,
        ),
    ],
)

## Test de plusieurs modèles

In [9]:
# Candidate classifiers to benchmark, keyed by display name.
# Trailing comments give the approximate GridSearchCV duration noted by the author.
models = {
    'LogisticRegression': LogisticRegression(),                          # ~6s
    # Seeded so the forest (bootstrap samples / feature sub-sampling) is
    # reproducible across runs, using the same seed as the train/test split.
    'RandomForestClassifier': RandomForestClassifier(random_state=10),   # ~105s
    'SVC': SVC(),                                                        # ~1149s
}

# Hyper-parameter grid explored by GridSearchCV for each model (same keys as `models`).
# An empty grid means the model is only cross-validated with its default settings.
params = {
    'LogisticRegression': {'C': [0.1, 1, 10]},
    'RandomForestClassifier': {'max_depth': [10, 50, 100]},
    'SVC': {},
}

# TODO(review): the original notes here read "kneibourclassifier" and
# "histrandomboosting" — presumably KNeighborsClassifier and
# HistGradientBoostingClassifier as further candidates; confirm with the author.

In [10]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split.
X_train_prepared = preparation.fit_transform(X_train)
X_test_prepared = preparation.transform(X_test)

# NOTE(review): the transformer is fitted once on the whole training split, so
# within each CV fold the validation rows have already contributed to the
# scaling/encoding. Wrapping `preparation` and the model in a Pipeline passed to
# GridSearchCV would avoid this — to be considered.

# Run a 5-fold grid search for every candidate model and report its scores and duration.
for p, (model_name, model) in enumerate(models.items(), start=1):
    print(f"\n-----------------------------Modèle {p}---------------------------------\n")
    print(f"GridSearchCV for {model_name}")

    # perf_counter() is a monotonic clock intended for measuring intervals;
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()

    gs = GridSearchCV(model, params[model_name], cv=5)
    gs.fit(X_train_prepared, y_train)

    end_time = time.perf_counter()
    training_time = end_time - start_time

    print(f"Best params: {gs.best_params_}")
    # best_score_ is the mean cross-validated score of the best parameter set,
    # not a score computed on the training data, hence the label.
    print(f"Mean CV score: {gs.best_score_*100:.3f}%")
    # Score of the refitted best estimator on the held-out test split.
    print(f"Test score: {gs.score(X_test_prepared, y_test)*100:.3f}%\n")
    print(f"\n------------------------------------------------------- {training_time:.2f} seconds")


-----------------------------Modèle 1---------------------------------

GridSearchCV for LogisticRegression
Best params: {'C': 1}
Train score: 83.460%
Test score: 83.801%


------------------------------------------------------- 11.82 seconds

-----------------------------Modèle 2---------------------------------

GridSearchCV for RandomForestClassifier
Best params: {'max_depth': 100}
Train score: 95.649%
Test score: 95.778%


------------------------------------------------------- 131.14 seconds

-----------------------------Modèle 3---------------------------------

GridSearchCV for SVC
Best params: {}
Train score: 94.266%
Test score: 94.383%


------------------------------------------------------- 1149.50 seconds
