# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from  sklearn  import  set_config
# Configure scikit-learn to render estimators as diagrams when they are displayed.
set_config(display='diagram')

# Custom functions

In [3]:
def save_dataset(nome_file, file):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    nome_file : str
        Destination path without extension; '.pkl' is appended to it.
    file : object
        The object to serialize (despite the name, this is the data itself,
        not a file handle).

    Returns
    -------
    None
    """
    destination = nome_file + '.pkl'
    # Binary mode is required because pickle emits bytes.
    with open(destination, 'wb') as handle:
        pickle.dump(file, handle)
    return None

In [4]:
def dataset_parameters(df,target):
    """Group the feature columns of ``df`` by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data, including the target column.
    target : str
        Name of the target column; it is excluded from every feature list.

    Returns
    -------
    tuple
        ``(categorical_features, numerical_features, date_features,
        all_features, target)`` where each ``*_features`` item is a list of
        column names in their original column order, ``all_features`` is
        numerical + categorical + date, and ``target`` is returned unchanged.
    """
    df_features = df.drop([target], axis=1)
    data_type_numerical = ['int64', 'float64']
    data_type_object = ['object', 'bool', 'category']

    categorical_features = []
    numerical_features = []
    date_features = []
    for column, dtype in df_features.dtypes.items():
        if dtype in data_type_object:
            categorical_features.append(column)
        elif dtype in data_type_numerical:
            numerical_features.append(column)
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            # Comparing against the bare string 'datetime64' never matches:
            # real columns are 'datetime64[ns]' (or tz-aware), which is not
            # equal to the unit-less dtype. Use the pandas dtype predicate.
            date_features.append(column)

    all_features = numerical_features + categorical_features + date_features
    return categorical_features, numerical_features, date_features, all_features, target

In [5]:
def dataframe_with_null(df):
    """Print the percentage of rows of ``df`` that contain at least one NaN.

    The percentage is rounded to two decimals. Nothing is returned.
    """
    # Boolean mask: True for every row holding at least one missing value.
    rows_with_nan = df.isnull().any(axis=1)
    affected = int(rows_with_nan.sum())
    total = df.shape[0]
    share = round(affected / total * 100, 2)
    print('The dataset has {}% of records with at least one NaN value'.format(share))

In [6]:
def unique_values_dataframe(df, categorical_features):
    """Build a table listing the distinct values of each given column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    categorical_features : iterable of str
        Names of the columns whose distinct values are collected.

    Returns
    -------
    pandas.DataFrame
        One column per requested feature, holding that feature's distinct
        values in order of first appearance. Columns with fewer distinct
        values than the longest one are padded with missing values.
    """
    distinct_by_column = {
        name: df[name].unique().tolist() for name in categorical_features
    }
    # orient='index' builds one row per feature, which accepts lists of
    # different lengths; transposing then yields one column per feature.
    return pd.DataFrame.from_dict(distinct_by_column, orient='index').transpose()

# Import dataset and visualize properties

In [7]:
# Load the raw CSV; index_col=0 uses the file's first column as the DataFrame index.
df = pd.read_csv('airline_passenger_satisfaction.csv', index_col=0)

In [8]:
# Preview 10 random rows; the fixed random_state makes the sample repeatable.
df.sample(10, random_state=13)

Unnamed: 0,Gender,customer_type,age,type_of_travel,customer_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
53411,Female,Loyal Customer,22,Business travel,Business,1562,2,3,3,3,...,2,4,4,3,3,3,2,34,20.0,neutral or dissatisfied
86138,Female,disloyal Customer,49,Business travel,Business,685,2,2,2,2,...,5,3,3,4,3,5,5,0,0.0,neutral or dissatisfied
87267,Male,Loyal Customer,26,Business travel,Eco,657,3,3,5,5,...,3,3,1,1,2,1,3,0,6.0,neutral or dissatisfied
122616,Male,Loyal Customer,26,Business travel,Business,565,2,2,2,2,...,4,4,3,4,3,2,4,65,61.0,satisfied
125564,Female,Loyal Customer,57,Business travel,Business,2569,3,3,3,3,...,3,3,2,3,4,3,3,0,0.0,satisfied
63596,Male,Loyal Customer,35,Business travel,Eco,297,2,1,3,1,...,2,1,4,3,2,4,2,0,18.0,neutral or dissatisfied
40140,Female,Loyal Customer,37,Business travel,Business,888,4,4,4,4,...,4,4,4,4,3,4,3,14,0.0,satisfied
44769,Female,Loyal Customer,45,Business travel,Eco,651,2,1,1,1,...,2,2,2,2,3,2,3,24,15.0,neutral or dissatisfied
40232,Female,Loyal Customer,63,Personal Travel,Eco Plus,134,3,0,3,2,...,4,4,3,4,5,4,5,1,0.0,neutral or dissatisfied
124789,Female,disloyal Customer,48,Business travel,Business,1635,2,3,3,3,...,1,3,4,5,3,4,1,0,14.0,neutral or dissatisfied


In [9]:
# Summarize column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129880 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             129880 non-null  object 
 1   customer_type                      129880 non-null  object 
 2   age                                129880 non-null  int64  
 3   type_of_travel                     129880 non-null  object 
 4   customer_class                     129880 non-null  object 
 5   flight_distance                    129880 non-null  int64  
 6   inflight_wifi_service              129880 non-null  int64  
 7   departure_arrival_time_convenient  129880 non-null  int64  
 8   ease_of_online_booking             129880 non-null  int64  
 9   gate_location                      129880 non-null  int64  
 10  food_and_drink                     129880 non-null  int64  
 11  online_boarding                    1298

# Remove records with missing values

In [10]:
# Group the feature columns by dtype, treating 'satisfaction' as the target column.
categorical_features, numerical_features, date_features, all_features, target = dataset_parameters(df,'satisfaction')

In [11]:
# Report the percentage of rows that contain at least one missing value.
dataframe_with_null(df)

The dataset has 0.3% of records with at least one NaN value


In [12]:
# Show the rows that contain at least one missing value.
df[df.isnull().any(axis=1)]

Unnamed: 0,Gender,customer_type,age,type_of_travel,customer_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
213,Female,Loyal Customer,38,Business travel,Eco,109,5,3,3,3,...,5,5,2,4,1,1,5,31,,satisfied
1124,Male,Loyal Customer,53,Personal Travel,Eco,1012,3,2,3,4,...,4,4,4,4,3,3,4,38,,neutral or dissatisfied
1529,Male,Loyal Customer,39,Business travel,Business,733,2,5,5,5,...,2,2,2,2,2,2,3,11,,neutral or dissatisfied
2004,Female,disloyal Customer,26,Business travel,Business,1035,3,3,3,1,...,2,3,3,4,5,5,2,41,,neutral or dissatisfied
2108,Female,Loyal Customer,24,Personal Travel,Eco,417,2,1,2,2,...,5,1,4,2,1,2,5,1,,neutral or dissatisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127976,Male,Loyal Customer,45,Business travel,Eco,352,5,1,1,1,...,5,1,1,1,3,2,5,26,,satisfied
128037,Female,Loyal Customer,29,Business travel,Business,3873,3,3,3,3,...,3,3,4,4,4,3,3,19,,neutral or dissatisfied
128205,Male,disloyal Customer,38,Business travel,Business,759,3,3,3,1,...,4,3,3,5,5,4,4,0,,satisfied
129032,Female,Loyal Customer,52,Business travel,Business,3659,5,5,5,5,...,5,5,5,5,3,5,3,0,,satisfied


In [13]:
# Drop every row that contains a missing value.
# NOTE: this rebinds `df`, so the raw frame is no longer available after this cell.
df = df.dropna()

In [14]:
# List the distinct values of each categorical feature; fillna('') blanks the
# padding cells of features that have fewer distinct values.
unique_values_dataframe(df, categorical_features).fillna('')

Unnamed: 0,Gender,customer_type,type_of_travel,customer_class
0,Male,Loyal Customer,Personal Travel,Eco Plus
1,Female,disloyal Customer,Business travel,Business
2,,,,Eco


In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes
count,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0
mean,39.428761,1190.210662,2.728544,3.057349,2.756786,2.976909,3.204685,3.25272,3.441589,3.358067,3.383204,3.351078,3.631886,3.306239,3.642373,3.286222,14.643385,15.091129
std,15.117597,997.560954,1.329235,1.526787,1.401662,1.278506,1.329905,1.350651,1.319168,1.334149,1.287032,1.316132,1.180082,1.266146,1.176614,1.313624,37.932867,38.46565
min,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,40.0,844.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [16]:
# Re-check dtypes and non-null counts after dropping the incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129487 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             129487 non-null  object 
 1   customer_type                      129487 non-null  object 
 2   age                                129487 non-null  int64  
 3   type_of_travel                     129487 non-null  object 
 4   customer_class                     129487 non-null  object 
 5   flight_distance                    129487 non-null  int64  
 6   inflight_wifi_service              129487 non-null  int64  
 7   departure_arrival_time_convenient  129487 non-null  int64  
 8   ease_of_online_booking             129487 non-null  int64  
 9   gate_location                      129487 non-null  int64  
 10  food_and_drink                     129487 non-null  int64  
 11  online_boarding                    1294

In [17]:
# Renumber the rows consecutively after the drop; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

In [18]:
# Display the cleaned DataFrame.
df

Unnamed: 0,Gender,customer_type,age,type_of_travel,customer_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,1,...,4,3,2,4,4,5,4,0,0.0,neutral or dissatisfied
129483,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
129484,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,5,...,2,4,3,4,5,4,2,0,0.0,neutral or dissatisfied
129485,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied


# Save DataFrame

In [19]:
# Persist the cleaned DataFrame to 'airline_df.pkl'.
save_dataset('airline_df', df)

# Transform categorical and numerical features

In [20]:
# (name, transformer, columns) triples consumed by ColumnTransformer:
# one-hot encode the categorical features and standardize the numerical ones.
transformers = [    
    ('one', OneHotEncoder(), categorical_features),
    ('scale', StandardScaler(), numerical_features)
]

In [21]:
# Combine both transformers. Columns not listed in `transformers` (here the
# target) are dropped, since ColumnTransformer's default is remainder='drop'.
ct = ColumnTransformer(transformers)

In [22]:
# Display the (not yet fitted) column transformer.
ct

In [23]:
# Fit the encoder and scaler on the cleaned data and build the transformed feature matrix.
df_transformed = ct.fit_transform(df)

# Apply labels to target

In [24]:
# Encoder that maps the target's class labels to integer codes.
le = LabelEncoder()

In [25]:
# Learn the target classes and encode the target column as integers.
df_target = le.fit_transform(df[target])

# Save X and y dataset

In [26]:
# Bundle features and encoded target as an (X, y) tuple.
dataset_X_y = df_transformed, df_target

In [27]:
# Persist the (X, y) tuple to 'airline_dataset.pkl'.
save_dataset('airline_dataset', dataset_X_y)

In [28]:
# Display the (X, y) tuple as a quick sanity check.
dataset_X_y

(array([[ 0.        ,  1.        ,  1.        , ...,  1.30462366,
          0.2730259 ,  0.07562286],
        [ 0.        ,  1.        ,  0.        , ..., -1.74039966,
         -0.35967326, -0.236345  ],
        [ 1.        ,  0.        ,  1.        , ...,  1.30462366,
         -0.38603572, -0.39232892],
        ...,
        [ 1.        ,  0.        ,  1.        , ..., -0.97914383,
         -0.38603572, -0.39232892],
        [ 0.        ,  1.        ,  1.        , ...,  0.54336783,
         -0.38603572, -0.39232892],
        [ 1.        ,  0.        ,  1.        , ..., -1.74039966,
         -0.38603572, -0.39232892]]),
 array([0, 0, 1, ..., 0, 1, 0]))

# Save processed DataFrame

In [29]:
# Wrap the transformed matrix in a DataFrame labeled with the transformer's output feature names.
df_processed = pd.DataFrame(df_transformed, columns=ct.get_feature_names_out())

In [30]:
# Strip the "<transformer name>__" prefix that ColumnTransformer adds to every
# output feature. Split only on the first "__" so that an original column name
# which itself contains "__" is kept whole rather than truncated.
df_processed.columns = [col.split('__', 1)[1] for col in df_processed.columns]

In [31]:
# Append the encoded target as a 'satisfaction' column. Both frames are built
# from arrays and so carry the default integer index, which keeps rows aligned.
df_processed = pd.concat([df_processed, pd.DataFrame(df_target, columns=['satisfaction'])], axis=1)

In [32]:
# Persist the processed DataFrame to 'airline_df_processed.pkl'.
save_dataset('airline_df_processed', df_processed)

In [33]:
# Preview 10 random processed rows. No random_state is given, so the rows
# shown differ from run to run.
df_processed.sample(10)

Unnamed: 0,Gender_Female,Gender_Male,customer_type_Loyal Customer,customer_type_disloyal Customer,type_of_travel_Business travel,type_of_travel_Personal Travel,customer_class_Business,customer_class_Eco,customer_class_Eco Plus,age,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
112716,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.963863,...,-0.268387,-0.297744,0.493055,1.159342,0.547933,1.153847,-0.217888,-0.386036,-0.366332,0
74515,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,-0.755994,...,-0.268387,0.47924,1.25286,1.159342,1.337734,1.153847,-0.217888,-0.386036,-0.28834,1
100816,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.748219,...,1.230701,1.256225,-1.026556,1.159342,0.547933,0.303947,1.304624,-0.280586,-0.340334,0
22923,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-0.954439,...,-1.017931,0.47924,0.493055,0.311941,-0.241868,1.153847,-0.979144,-0.386036,-0.392329,0
29241,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.086735,...,1.230701,-1.074728,1.25286,0.311941,-1.031669,-0.545953,1.304624,0.246663,-0.028366,0
23839,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.103935,...,0.481157,-1.851712,1.25286,-0.535461,-1.821471,-0.545953,0.543368,-0.359673,-0.392329,0
32352,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.103935,...,0.481157,1.256225,0.493055,0.311941,1.337734,1.153847,0.543368,-0.386036,-0.392329,1
4515,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.500825,...,0.481157,-1.851712,0.493055,0.311941,-1.031669,-1.395853,-0.979144,2.698373,3.143307,0
53831,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.822142,...,-0.268387,-0.297744,1.25286,1.159342,1.337734,0.303947,-0.217888,1.538424,1.661459,0
112621,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.831567,...,0.481157,0.47924,0.493055,0.311941,0.547933,0.303947,-0.217888,-0.122411,0.335596,1


# Save target classes dictionary

In [34]:
# Map each integer code to its class label. LabelEncoder assigns code i to
# classes_[i], so enumerating classes_ gives the same mapping without a
# redundant transform call. tolist() converts the labels to native Python
# values so the pickled dictionary contains no NumPy scalars.
class_dict = dict(enumerate(le.classes_.tolist()))

In [35]:
# Persist the code-to-label mapping to 'airline_satisfaction_classes.pkl'.
save_dataset('airline_satisfaction_classes', class_dict)

In [36]:
# Display the code-to-label mapping.
class_dict

{0: 'neutral or dissatisfied', 1: 'satisfied'}