# LIBRERIE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score 
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve

# DATASET

In [None]:
# Read the raw train and test splits from the local CSV files.
airline_test = pd.read_csv("/Users/emanuele/Desktop/Machine Learning/ProgettoML/airline-test.csv")
airline_train = pd.read_csv("/Users/emanuele/Desktop/Machine Learning/ProgettoML/airline-train.csv")

print('airline_test:', airline_test.shape, 'airline_train:', airline_train.shape)

# Working names: `data` is the training frame used for the exploratory analysis,
# `data_train` / `data_test` are the frames used for modelling. At this point all
# of them still refer to the raw frames loaded above (no copy is made).
data_train, data_test = airline_train, airline_test
data = airline_train

In [None]:
# Drop the first two columns (selected by position) from the exploration frame.
data = data.drop(columns=data.columns[[0, 1]])

# Positions, after the drop above, of the columns to treat as categorical.
categorical_indexes = [0, 1, 3, 4] + list(range(6, 20))
categorical_names = data.columns[categorical_indexes]
# Cast through `astype` with a column -> dtype mapping so the columns really end up
# with the `category` dtype. Assigning the converted values back through
# `data.iloc[:, idx] = ...` writes into the existing columns instead, which on
# recent pandas versions keeps their previous dtype and leaves nothing categorical.
data = data.astype({name: 'category' for name in categorical_names})

# Use underscores instead of spaces in the column names.
data.columns = [c.replace(' ', '_') for c in data.columns]

data.info()

In [None]:
# Split the column names by dtype: `category` columns on one side, everything else
# on the other. The target column 'satisfaction' is taken out of the second list.
col_categoriche = data.select_dtypes(include=['category']).columns.tolist()
col_numeriche = data.select_dtypes(exclude=['category']).columns.tolist()
col_numeriche.remove('satisfaction')
# Summary table of the categorical columns (read again by the next cell).
data_describe = data.describe(include=['category'])

separator = '-' * 40
blank_line = ' ' * 40

print(separator)
print('Numero colonne categorico: ', len(col_categoriche))
print(separator)
print('Nomi colonne categoriche:', col_categoriche)
print(blank_line)
print(separator)
print('Numero colonne numeriche: ', len(col_numeriche))
print(separator)
print('Nomi colonne numeriche: ', col_numeriche)

In [None]:
# Number of distinct values of each categorical column, taken from the 'unique'
# row of the describe() table built in the previous cell.
unique_counts = data_describe.loc['unique']
col_binarie = [name for name in col_categoriche if unique_counts[name] == 2]
col_nonbinarie = [name for name in col_categoriche if unique_counts[name] > 2]

separator = '-' * 40
blank_line = ' ' * 40

print(separator)
print('Numero colonne binarie: ', len(col_binarie))
print(separator)
print('Nome colonne binarie:', col_binarie)
print(blank_line)
print(separator)
print('Numero colonne non binarie: ', len(col_nonbinarie))
print(separator)
print('Nome colonne non binarie: ', col_nonbinarie)

# OPERAZIONI SUI DATASET

In [None]:
# Remove the first two columns, which hold the id.
data_train = data_train.drop(columns=data_train.columns[[0, 1]])
data_test = data_test.drop(columns=data_test.columns[[0, 1]])
# Rename the columns, replacing spaces with underscores.
data_train = data_train.rename(columns=lambda name: name.replace(' ', '_'))
data_test = data_test.rename(columns=lambda name: name.replace(' ', '_'))

In [None]:
# Count the missing values in each column of the training frame.
data_train.isna().sum()

In [None]:
# Count the missing values in each column of the test frame.
data_test.isna().sum()

In [None]:
# Summary statistics of the test frame (pandas `describe` with its defaults).
data_test.describe()

In [None]:
# Columns that are not used as features.
unused_cols = ['Gender', 'Gate_location', 'Departure/Arrival_time_convenient']
# `drop` without `inplace` returns new frames: `data_train` / `data_test` are left
# untouched, so later cells can still start from them and this cell can be re-run.
df_train = data_train.drop(columns=unused_cols)
df_test = data_test.drop(columns=unused_cols)

# Replace missing arrival delays with the median (not the mean) of the training
# data. The same training value is used for the test set, so both sets are imputed
# consistently and no statistic is computed on test data.
arrival_delay_median = df_train['Arrival_Delay_in_Minutes'].median()
df_train['Arrival_Delay_in_Minutes'] = df_train['Arrival_Delay_in_Minutes'].fillna(arrival_delay_median)
df_test['Arrival_Delay_in_Minutes'] = df_test['Arrival_Delay_in_Minutes'].fillna(arrival_delay_median)

# Integer-encode every text (object) column, the target included. Each encoder is
# fitted on the training column only and then applied to the test column, so a
# given category always maps to the same code in both frames. `transform` raises
# if the test data contains a category that never appears in the training data.
lencoders = {}
for col in df_train.select_dtypes(include=['object']).columns:
    lencoders[col] = LabelEncoder()
    df_train[col] = lencoders[col].fit_transform(df_train[col])
    df_test[col] = lencoders[col].transform(df_test[col])
# The test frame no longer has encoders of its own; the name is kept so that any
# code referring to `lencoders_t` keeps working.
lencoders_t = lencoders

In [None]:
# Preview the first rows of the frame prepared in the previous cell.
df_train.head()

# MODELLI


In [None]:
# Define y (the 'satisfaction' column) and X (every other column).
y_train = df_train['satisfaction'].to_numpy()
y_test = df_test['satisfaction'].to_numpy()
# Non in-place drop: `df_train` / `df_test` keep the target column, so this cell
# gives the same result when it is run again.
X_train = df_train.drop(columns=['satisfaction'])
X_test = df_test.drop(columns=['satisfaction'])

# Standardise the features. The scaler learns mean and variance from the training
# data only; the test data is transformed with those same statistics rather than
# being re-fitted, so both sets live on the same scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import time
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, plot_confusion_matrix, plot_roc_curve
from matplotlib import pyplot as plt 
def run_model(model, X_train, y_train, X_test, y_test, verbose=True):
    """Fit ``model``, evaluate it on the test split and print/plot the results.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``. ``predict_proba`` or
        ``decision_function`` is used for the ROC AUC when the estimator has one.
    X_train, X_test : feature matrices for fitting and evaluation.
    y_train : training labels; must provide ``ravel()``.
    y_test : true labels of the test split (binary).
    verbose : when equal to ``False``, ``fit`` is called with ``verbose=0``, so the
        estimator's ``fit`` must accept that keyword; otherwise ``fit`` is called
        without it.

    Returns
    -------
    tuple
        ``(model, accuracy, roc_auc, time_taken)`` where ``time_taken`` is the
        wall-clock time in seconds spent on fitting and predicting.
    """
    t0 = time.time()
    # Equality (not truthiness) is kept on purpose: only False/0 select the
    # silent branch, exactly as before.
    if verbose == False:
        model.fit(X_train, y_train.ravel(), verbose=0)
    else:
        model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    time_taken = time.time() - t0

    accuracy = accuracy_score(y_test, y_pred)

    # ROC AUC is a ranking metric: score it with continuous outputs whenever the
    # model provides them, so the printed value agrees with the curve drawn by
    # plot_roc_curve below. Hard class labels are only a last resort.
    if hasattr(model, "predict_proba"):
        # Second column = probability of the larger ("positive") class label.
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred
    roc_auc = roc_auc_score(y_test, y_score)

    print("Accuracy = {}".format(accuracy))
    print("ROC Area under Curve = {}".format(roc_auc))
    print("Time taken = {}".format(time_taken))
    print(classification_report(y_test, y_pred, digits=5))
    plot_confusion_matrix(model, X_test, y_test)
    plot_roc_curve(model, X_test, y_test)

    return model, accuracy, roc_auc, time_taken

In [None]:
# Names of the feature columns, in the order they were passed to the scaler: read
# them from the prepared frame and leave the target out if it is still present.
df_train.columns.drop('satisfaction', errors='ignore')

## MODELLO 1

In [None]:
import statsmodels.api as sm
# Fit a logit model of y_train on X_train with statsmodels and print its summary.
# NOTE(review): X_train is passed as-is and no constant column is added anywhere in
# this notebook; statsmodels' Logit presumably fits no intercept unless one is
# supplied (e.g. via sm.add_constant) -- confirm whether that is intended.
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit()
print(result.summary())

In [None]:
from sklearn.linear_model import LogisticRegression

# Elastic-net penalised logistic regression (l1_ratio 0.5) fitted with 'saga'.
# `random_state` is fixed so repeated runs of the solver give the same result.
params_lr = {'penalty': 'elasticnet', 'l1_ratio': 0.5, 'solver': 'saga', 'random_state': 42}

log = LogisticRegression(**params_lr)
log.fit(X_train, y_train.ravel())

y_pred = log.predict(X_test)
# Probability of the positive class (second column of predict_proba). ROC AUC is
# computed from these scores rather than from the hard 0/1 predictions, so the
# printed value matches the curve drawn by plot_roc_curve below.
y_score = log.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc))
print(classification_report(y_test, y_pred, digits=5))
plot_confusion_matrix(log, X_test, y_test)
plot_roc_curve(log, X_test, y_test)

# Final no-op statement: presumably here so the cell does not end on the plot call.
pass

## MODELLO 2

In [None]:
# Model 2 uses the same features as model 1 except 'Flight_Distance'.
#
# The frames are rebuilt from the raw CSV data instead of re-using `data_train` /
# `data_test`: earlier cells may already have removed columns from those frames,
# in which case dropping them again here would raise a KeyError.
df_train = airline_train.drop(columns=airline_train.columns[[0, 1]])  # id columns
df_test = airline_test.drop(columns=airline_test.columns[[0, 1]])
df_train.columns = [c.replace(' ', '_') for c in df_train.columns]
df_test.columns = [c.replace(' ', '_') for c in df_test.columns]

unused_cols = ['Gender', 'Gate_location', 'Departure/Arrival_time_convenient', 'Flight_Distance']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)
#-------------------------------------------------------------------------#
# Fill missing arrival delays with the training median, for both frames.
arrival_delay_median = df_train['Arrival_Delay_in_Minutes'].median()
df_train['Arrival_Delay_in_Minutes'] = df_train['Arrival_Delay_in_Minutes'].fillna(arrival_delay_median)
df_test['Arrival_Delay_in_Minutes'] = df_test['Arrival_Delay_in_Minutes'].fillna(arrival_delay_median)
#-------------------------------------------------------------------------#
# Encode the target explicitly: 0 = neutral or dissatisfied, 1 = satisfied.
# Any other label becomes missing and makes the integer cast fail loudly.
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}
df_train['satisfaction'] = df_train['satisfaction'].map(satisfaction_codes).astype('int64')
df_test['satisfaction'] = df_test['satisfaction'].map(satisfaction_codes).astype('int64')
#
# Integer-encode the remaining text columns. Encoders are fitted on the training
# data only and re-used on the test data so the codes agree between the two.
lencoders = {}
for col in df_train.select_dtypes(include=['object']).columns:
    lencoders[col] = LabelEncoder()
    df_train[col] = lencoders[col].fit_transform(df_train[col])
    df_test[col] = lencoders[col].transform(df_test[col])
# Kept so that code referring to `lencoders_t` keeps working.
lencoders_t = lencoders
#
# Separate the target from the features; afterwards `df_train` / `df_test` hold
# the feature columns only.
y_train = df_train['satisfaction'].to_numpy()
y_test = df_test['satisfaction'].to_numpy()
df_train = df_train.drop(columns=['satisfaction'])
df_test = df_test.drop(columns=['satisfaction'])
#
# Standardise: fit the scaler on the training features and apply the same
# statistics to the test features (no re-fit on test data).
scaler = StandardScaler()
X_train = scaler.fit_transform(df_train)
X_test = scaler.transform(df_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Elastic-net penalised logistic regression (l1_ratio 0.5) fitted with 'saga'.
# `random_state` is fixed so repeated runs of the solver give the same result.
params_lr = {'penalty': 'elasticnet', 'l1_ratio': 0.5, 'solver': 'saga', 'random_state': 42}

log = LogisticRegression(**params_lr)
log.fit(X_train, y_train.ravel())

y_pred = log.predict(X_test)
# Probability of the positive class (second column of predict_proba). ROC AUC is
# computed from these scores rather than from the hard 0/1 predictions, so the
# printed value matches the curve drawn by plot_roc_curve below.
y_score = log.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc))
print(classification_report(y_test, y_pred, digits=5))
plot_confusion_matrix(log, X_test, y_test)
plot_roc_curve(log, X_test, y_test)

# Final no-op statement: presumably here so the cell does not end on the plot call.
pass