# Dependencies

In [None]:
!pip install numpy==1.22.4
!pip install matplotlib==3.7.1
!pip install mglearn==0.2.0
!pip install pandas==2.0.3
!pip install scikit-learn==1.3.2
!pip install seaborn==0.13.1

In [None]:
!pip freeze

# Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans

In [2]:
df=pd.read_csv("airline_satisfaction.csv")

In [3]:
#target
y = df.satisfaction.values

#storing the values of the features in 3 variables: categorical, quantitative and ordinal
categorical_variables = ['Gender','Customer Type','Type of Travel','Class']

quantitative_variables = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes']

ordinal_variables = ['Inflight wifi service', 'Departure/Arrival time convenient','Ease of Online booking', 
                     'Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment',
                     'On-board service','Leg room service','Baggage handling','Checkin service','Inflight service',
                     'Cleanliness']

X = df[categorical_variables + quantitative_variables + ordinal_variables].values
X_quantitative = df[quantitative_variables].values
X_categorical = df[categorical_variables].values
X_ordinal = df[ordinal_variables].values

In [4]:
#display the 5 first rows of our dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


# Data Preprocessing

## Encoding Variables

### One-Hot Encoding
for categorical variables

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
one_hot_encoder = OneHotEncoder()

In [7]:
one_hot_encoder.fit(X_categorical)
one_hot_encoded_X = one_hot_encoder.transform(X_categorical)

In [8]:
one_hot_encoded_X.toarray()

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [9]:
one_hot_encoder.get_feature_names_out()

array(['x0_Female', 'x0_Male', 'x1_Loyal Customer',
       'x1_disloyal Customer', 'x2_Business travel', 'x2_Personal Travel',
       'x3_Business', 'x3_Eco', 'x3_Eco Plus'], dtype=object)

In [10]:
one_hot_encoded_X

<129880x9 sparse matrix of type '<class 'numpy.float64'>'
	with 519520 stored elements in Compressed Sparse Row format>

In [11]:
df_categorical=df[categorical_variables]

In [12]:
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols)

print("Number of new columns generated after one-hot encoding:", df_encoded.shape[1] - df.shape[1])

Number of new columns generated after one-hot encoding: 6


In [13]:
X_categorical_encoded = pd.concat([df_categorical, df_encoded], axis=1)

In [14]:
#display the categorical features, not encoded and encoded
X_categorical_encoded

Unnamed: 0.1,Gender,Customer Type,Type of Travel,Class,Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_neutral or dissatisfied,satisfaction_satisfied
0,Male,Loyal Customer,Personal Travel,Eco Plus,0,70172,13,460,3,4,...,True,True,False,False,True,False,False,True,True,False
1,Male,disloyal Customer,Business travel,Business,1,5047,25,235,3,2,...,True,False,True,True,False,True,False,False,True,False
2,Female,Loyal Customer,Business travel,Business,2,110028,26,1142,2,2,...,False,True,False,True,False,True,False,False,False,True
3,Female,Loyal Customer,Business travel,Business,3,24026,25,562,2,5,...,False,True,False,True,False,True,False,False,True,False
4,Male,Loyal Customer,Business travel,Business,4,119299,61,214,3,3,...,True,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,Male,disloyal Customer,Business travel,Business,25971,78463,34,526,3,3,...,True,False,True,True,False,True,False,False,True,False
129876,Male,Loyal Customer,Business travel,Business,25972,71167,23,646,4,4,...,True,True,False,True,False,True,False,False,False,True
129877,Female,Loyal Customer,Personal Travel,Eco,25973,37675,17,828,2,5,...,False,True,False,False,True,False,True,False,True,False
129878,Male,Loyal Customer,Business travel,Business,25974,90086,14,1127,3,3,...,True,True,False,True,False,True,False,False,False,True


### Ordinal Encoding

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
label_encoder = LabelEncoder()

In [17]:
X_ordinal_encoded = np.zeros_like(X_ordinal)

In [18]:
for i in range(X_ordinal.shape[1]):
    X_ordinal_encoded[:, i] = label_encoder.fit_transform(X_ordinal[:, i])

In [19]:
for i, col in enumerate(ordinal_variables):
    print(f"===== Encoded classes for {col}: {label_encoder.classes_}")

===== Encoded classes for Inflight wifi service: [0 1 2 3 4 5]
===== Encoded classes for Departure/Arrival time convenient: [0 1 2 3 4 5]
===== Encoded classes for Ease of Online booking: [0 1 2 3 4 5]
===== Encoded classes for Gate location: [0 1 2 3 4 5]
===== Encoded classes for Food and drink: [0 1 2 3 4 5]
===== Encoded classes for Online boarding: [0 1 2 3 4 5]
===== Encoded classes for Seat comfort: [0 1 2 3 4 5]
===== Encoded classes for Inflight entertainment: [0 1 2 3 4 5]
===== Encoded classes for On-board service: [0 1 2 3 4 5]
===== Encoded classes for Leg room service: [0 1 2 3 4 5]
===== Encoded classes for Baggage handling: [0 1 2 3 4 5]
===== Encoded classes for Checkin service: [0 1 2 3 4 5]
===== Encoded classes for Inflight service: [0 1 2 3 4 5]
===== Encoded classes for Cleanliness: [0 1 2 3 4 5]


In [20]:
X_ordinal_encoded

array([[3, 4, 3, ..., 4, 5, 5],
       [3, 2, 3, ..., 1, 4, 1],
       [2, 2, 2, ..., 4, 4, 5],
       ...,
       [2, 5, 1, ..., 5, 4, 2],
       [3, 3, 3, ..., 4, 5, 4],
       [2, 5, 2, ..., 1, 1, 1]])

In [21]:
#i was checking if the size is the same
X_ordinal_encoded.shape

(129880, 14)

In [22]:
X_ordinal.shape

(129880, 14)

### Creating encoded dataset

In [23]:
df_quantitative = pd.DataFrame(X_quantitative, columns=['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes'])

In [24]:
#display a datafram which contains the quantitative variables
df_quantitative

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes
0,13.0,460.0,25.0,18.0
1,25.0,235.0,1.0,6.0
2,26.0,1142.0,0.0,0.0
3,25.0,562.0,11.0,9.0
4,61.0,214.0,0.0,0.0
...,...,...,...,...
129875,34.0,526.0,0.0,0.0
129876,23.0,646.0,0.0,0.0
129877,17.0,828.0,0.0,0.0
129878,14.0,1127.0,0.0,0.0


In [25]:
df_categorical_encoded = pd.DataFrame(X_categorical_encoded, columns=['Gender_Male','Customer Type_Loyal Customer', 'Customer Type_disloyal Customer', 'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'satisfaction_neutral or dissatisfied', 'satisfaction_satisfied'])

In [26]:
#display a datafram which contains the categorical encoded variables
df_categorical_encoded

Unnamed: 0,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_neutral or dissatisfied,satisfaction_satisfied
0,True,True,False,False,True,False,False,True,True,False
1,True,False,True,True,False,True,False,False,True,False
2,False,True,False,True,False,True,False,False,False,True
3,False,True,False,True,False,True,False,False,True,False
4,True,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
129875,True,False,True,True,False,True,False,False,True,False
129876,True,True,False,True,False,True,False,False,False,True
129877,False,True,False,False,True,False,True,False,True,False
129878,True,True,False,True,False,True,False,False,False,True


In [27]:
df_ordinal_encoded = pd.DataFrame(X_ordinal_encoded, columns=['Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking','Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment','On-board service','Leg room service','Baggage handling','Checkin service','Inflight service','Cleanliness'])

In [28]:
#display a datafram which contains the ordinal encoded variables
df_ordinal_encoded

Unnamed: 0,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,3,4,3,1,5,3,5,5,4,3,3,4,5,5
1,3,2,3,3,1,3,1,1,1,5,2,1,4,1
2,2,2,2,2,5,5,5,5,4,3,3,4,4,5
3,2,5,5,5,2,2,2,2,2,5,2,1,4,2
4,3,3,3,3,4,5,5,3,3,4,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,3,3,3,1,4,3,4,4,3,2,3,4,5,4
129876,4,4,4,4,4,4,4,4,4,5,4,5,5,4
129877,2,5,1,5,2,1,2,2,4,3,3,5,4,2
129878,3,3,3,3,4,4,4,4,3,2,4,4,5,4


In [29]:
df_encoded = pd.concat([df_quantitative, df_categorical_encoded, df_ordinal_encoded], axis=1)

In [30]:
#display the whole dataframe
print(df_encoded)

         Age  Flight Distance  Departure Delay in Minutes  \
0       13.0            460.0                        25.0   
1       25.0            235.0                         1.0   
2       26.0           1142.0                         0.0   
3       25.0            562.0                        11.0   
4       61.0            214.0                         0.0   
...      ...              ...                         ...   
129875  34.0            526.0                         0.0   
129876  23.0            646.0                         0.0   
129877  17.0            828.0                         0.0   
129878  14.0           1127.0                         0.0   
129879  42.0            264.0                         0.0   

        Arrival Delay in Minutes  Gender_Male  Customer Type_Loyal Customer  \
0                           18.0         True                          True   
1                            6.0         True                         False   
2                            0

In [31]:
df_encoded.to_csv('/home/estelle/robotlearn/airline_satisfaction/airline_satisfaction_encoded.csv')

## Imputing Missing Values

In [32]:
nan_counts = df_encoded.isna().sum()

In [33]:
print(nan_counts)

Age                                       0
Flight Distance                           0
Departure Delay in Minutes                0
Arrival Delay in Minutes                393
Gender_Male                               0
Customer Type_Loyal Customer              0
Customer Type_disloyal Customer           0
Type of Travel_Business travel            0
Type of Travel_Personal Travel            0
Class_Business                            0
Class_Eco                                 0
Class_Eco Plus                            0
satisfaction_neutral or dissatisfied      0
satisfaction_satisfied                    0
Inflight wifi service                     0
Departure/Arrival time convenient         0
Ease of Online booking                    0
Gate location                             0
Food and drink                            0
Online boarding                           0
Seat comfort                              0
Inflight entertainment                    0
On-board service                

we notice that there are 393 missing values for the "Arrival Delay in Minutes" feature

### Univariate feature imputation (Static imputation)

In [34]:
from sklearn.impute import SimpleImputer

In [35]:
X_encoded = df_encoded.values

In [36]:
simple_imputer = SimpleImputer(strategy="mean")
simple_imputer.fit(X_encoded)

X_imputed = simple_imputer.transform(X_encoded)

In [37]:
print(X_imputed)

[[1.300e+01 4.600e+02 2.500e+01 ... 4.000e+00 5.000e+00 5.000e+00]
 [2.500e+01 2.350e+02 1.000e+00 ... 1.000e+00 4.000e+00 1.000e+00]
 [2.600e+01 1.142e+03 0.000e+00 ... 4.000e+00 4.000e+00 5.000e+00]
 ...
 [1.700e+01 8.280e+02 0.000e+00 ... 5.000e+00 4.000e+00 2.000e+00]
 [1.400e+01 1.127e+03 0.000e+00 ... 4.000e+00 5.000e+00 4.000e+00]
 [4.200e+01 2.640e+02 0.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]]


mean strategy : OK for quantitative var, but not for categorical var or ordinal var...

In [38]:
simple_imputer = SimpleImputer(strategy="mean")
simple_imputer.fit(X_quantitative)

X_imputed = simple_imputer.transform(X_quantitative)

In [39]:
X_imputed

array([[1.300e+01, 4.600e+02, 2.500e+01, 1.800e+01],
       [2.500e+01, 2.350e+02, 1.000e+00, 6.000e+00],
       [2.600e+01, 1.142e+03, 0.000e+00, 0.000e+00],
       ...,
       [1.700e+01, 8.280e+02, 0.000e+00, 0.000e+00],
       [1.400e+01, 1.127e+03, 0.000e+00, 0.000e+00],
       [4.200e+01, 2.640e+02, 0.000e+00, 0.000e+00]])

most frequent strategy for categorical var ?
- not working with X_categorical_encoded  
-> encoding after imputing missing values ?

In [40]:
simple_imputer = SimpleImputer(strategy="most_frequent")
simple_imputer.fit(X_categorical)

X_imputed = simple_imputer.transform(X_categorical)

In [41]:
X_imputed

array([['Male', 'Loyal Customer', 'Personal Travel', 'Eco Plus'],
       ['Male', 'disloyal Customer', 'Business travel', 'Business'],
       ['Female', 'Loyal Customer', 'Business travel', 'Business'],
       ...,
       ['Female', 'Loyal Customer', 'Personal Travel', 'Eco'],
       ['Male', 'Loyal Customer', 'Business travel', 'Business'],
       ['Female', 'Loyal Customer', 'Personal Travel', 'Eco']],
      dtype=object)

### KNN imputation

In [42]:
from sklearn.impute import KNNImputer

In [43]:
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
knn_imputer.fit(X_encoded)

X_imputed = knn_imputer.transform(X_encoded)

In [44]:
X_imputed

array([[1.300e+01, 4.600e+02, 2.500e+01, ..., 4.000e+00, 5.000e+00,
        5.000e+00],
       [2.500e+01, 2.350e+02, 1.000e+00, ..., 1.000e+00, 4.000e+00,
        1.000e+00],
       [2.600e+01, 1.142e+03, 0.000e+00, ..., 4.000e+00, 4.000e+00,
        5.000e+00],
       ...,
       [1.700e+01, 8.280e+02, 0.000e+00, ..., 5.000e+00, 4.000e+00,
        2.000e+00],
       [1.400e+01, 1.127e+03, 0.000e+00, ..., 4.000e+00, 5.000e+00,
        4.000e+00],
       [4.200e+01, 2.640e+02, 0.000e+00, ..., 1.000e+00, 1.000e+00,
        1.000e+00]])

In [45]:
X_imputed.shape

(129880, 28)

KNN imputation seems to be working with all variables

saving best imputation strategy in a df and new csv

In [46]:
df_imputed = pd.DataFrame(X_imputed, columns = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'Gender_Male','Customer Type_Loyal Customer', 'Customer Type_disloyal Customer', 'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'satisfaction_neutral or dissatisfied', 'satisfaction_satisfied', 'Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking','Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment','On-board service','Leg room service','Baggage handling','Checkin service','Inflight service','Cleanliness'])

In [47]:
nan_counts = df_imputed.isna().sum()

In [48]:
#we verify there are no longer missing values

print(nan_counts)

Age                                     0
Flight Distance                         0
Departure Delay in Minutes              0
Arrival Delay in Minutes                0
Gender_Male                             0
Customer Type_Loyal Customer            0
Customer Type_disloyal Customer         0
Type of Travel_Business travel          0
Type of Travel_Personal Travel          0
Class_Business                          0
Class_Eco                               0
Class_Eco Plus                          0
satisfaction_neutral or dissatisfied    0
satisfaction_satisfied                  0
Inflight wifi service                   0
Departure/Arrival time convenient       0
Ease of Online booking                  0
Gate location                           0
Food and drink                          0
Online boarding                         0
Seat comfort                            0
Inflight entertainment                  0
On-board service                        0
Leg room service                  

In [49]:
#display the dataframe which contains all features with no missing values
df_imputed

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,...,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,13.0,460.0,25.0,18.0,1.0,1.0,0.0,0.0,1.0,0.0,...,5.0,3.0,5.0,5.0,4.0,3.0,3.0,4.0,5.0,5.0
1,25.0,235.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,3.0,1.0,1.0,1.0,5.0,2.0,1.0,4.0,1.0
2,26.0,1142.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,5.0,5.0,5.0,5.0,4.0,3.0,3.0,4.0,4.0,5.0
3,25.0,562.0,11.0,9.0,0.0,1.0,0.0,1.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,5.0,2.0,1.0,4.0,2.0
4,61.0,214.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,4.0,5.0,5.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,34.0,526.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,4.0,3.0,4.0,4.0,3.0,2.0,3.0,4.0,5.0,4.0
129876,23.0,646.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,4.0,4.0,4.0,4.0,4.0,5.0,4.0,5.0,5.0,4.0
129877,17.0,828.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,2.0,1.0,2.0,2.0,4.0,3.0,3.0,5.0,4.0,2.0
129878,14.0,1127.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,4.0,4.0,4.0,4.0,3.0,2.0,4.0,4.0,5.0,4.0


In [50]:
df_imputed.to_csv('/home/estelle/robotlearn/airline_satisfaction/airline_satisfaction_imputed.csv')

## Feature Scaling

### MinMax scaling

In [74]:
from sklearn.preprocessing import MinMaxScaler

In [75]:
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X_imputed)

X_scaled = minmax_scaler.transform(X_imputed)

In [76]:
X_scaled

array([[7.69230769e-02, 8.66316640e-02, 1.57035176e-02, ...,
        8.00000000e-01, 1.00000000e+00, 1.00000000e+00],
       [2.30769231e-01, 4.11954766e-02, 6.28140704e-04, ...,
        2.00000000e-01, 8.00000000e-01, 2.00000000e-01],
       [2.43589744e-01, 2.24353796e-01, 0.00000000e+00, ...,
        8.00000000e-01, 8.00000000e-01, 1.00000000e+00],
       ...,
       [1.28205128e-01, 1.60945073e-01, 0.00000000e+00, ...,
        1.00000000e+00, 8.00000000e-01, 4.00000000e-01],
       [8.97435897e-02, 2.21324717e-01, 0.00000000e+00, ...,
        8.00000000e-01, 1.00000000e+00, 8.00000000e-01],
       [4.48717949e-01, 4.70516963e-02, 0.00000000e+00, ...,
        2.00000000e-01, 2.00000000e-01, 2.00000000e-01]])

In [77]:
df_scaled = pd.DataFrame(X_scaled, columns = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'Gender_Male','Customer Type_Loyal Customer', 'Customer Type_disloyal Customer', 'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'satisfaction_neutral or dissatisfied', 'satisfaction_satisfied', 'Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking','Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment','On-board service','Leg room service','Baggage handling','Checkin service','Inflight service','Cleanliness'])

In [78]:
df_scaled

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,...,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,0.076923,0.086632,0.015704,0.011364,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.6,1.0,1.0,0.8,0.6,0.75,0.8,1.0,1.0
1,0.230769,0.041195,0.000628,0.003788,1.0,0.0,1.0,1.0,0.0,1.0,...,0.2,0.6,0.2,0.2,0.2,1.0,0.50,0.2,0.8,0.2
2,0.243590,0.224354,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.8,0.6,0.75,0.8,0.8,1.0
3,0.230769,0.107229,0.006910,0.005682,0.0,1.0,0.0,1.0,0.0,1.0,...,0.4,0.4,0.4,0.4,0.4,1.0,0.50,0.2,0.8,0.4
4,0.692308,0.036955,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,1.0,1.0,0.6,0.6,0.8,0.75,0.6,0.6,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0.346154,0.099960,0.000000,0.000000,1.0,0.0,1.0,1.0,0.0,1.0,...,0.8,0.6,0.8,0.8,0.6,0.4,0.75,0.8,1.0,0.8
129876,0.205128,0.124192,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,0.8,0.8,0.8,0.8,1.0,1.00,1.0,1.0,0.8
129877,0.128205,0.160945,0.000000,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,...,0.4,0.2,0.4,0.4,0.8,0.6,0.75,1.0,0.8,0.4
129878,0.089744,0.221325,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,0.8,0.8,0.8,0.6,0.4,1.00,0.8,1.0,0.8


### Standard scaling

In [68]:
from sklearn.preprocessing import StandardScaler

In [69]:
standard_scaler = MinMaxScaler()
standard_scaler.fit(X_imputed)

X_scaled = standard_scaler.transform(X_imputed)

In [70]:
X_scaled

array([[7.69230769e-02, 8.66316640e-02, 1.57035176e-02, ...,
        8.00000000e-01, 1.00000000e+00, 1.00000000e+00],
       [2.30769231e-01, 4.11954766e-02, 6.28140704e-04, ...,
        2.00000000e-01, 8.00000000e-01, 2.00000000e-01],
       [2.43589744e-01, 2.24353796e-01, 0.00000000e+00, ...,
        8.00000000e-01, 8.00000000e-01, 1.00000000e+00],
       ...,
       [1.28205128e-01, 1.60945073e-01, 0.00000000e+00, ...,
        1.00000000e+00, 8.00000000e-01, 4.00000000e-01],
       [8.97435897e-02, 2.21324717e-01, 0.00000000e+00, ...,
        8.00000000e-01, 1.00000000e+00, 8.00000000e-01],
       [4.48717949e-01, 4.70516963e-02, 0.00000000e+00, ...,
        2.00000000e-01, 2.00000000e-01, 2.00000000e-01]])

In [71]:
df_scaled = pd.DataFrame(X_scaled, columns = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'Gender_Male','Customer Type_Loyal Customer', 'Customer Type_disloyal Customer', 'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'satisfaction_neutral or dissatisfied', 'satisfaction_satisfied', 'Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking','Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment','On-board service','Leg room service','Baggage handling','Checkin service','Inflight service','Cleanliness'])

In [72]:
df_scaled

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,...,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,0.076923,0.086632,0.015704,0.011364,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.6,1.0,1.0,0.8,0.6,0.75,0.8,1.0,1.0
1,0.230769,0.041195,0.000628,0.003788,1.0,0.0,1.0,1.0,0.0,1.0,...,0.2,0.6,0.2,0.2,0.2,1.0,0.50,0.2,0.8,0.2
2,0.243590,0.224354,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.8,0.6,0.75,0.8,0.8,1.0
3,0.230769,0.107229,0.006910,0.005682,0.0,1.0,0.0,1.0,0.0,1.0,...,0.4,0.4,0.4,0.4,0.4,1.0,0.50,0.2,0.8,0.4
4,0.692308,0.036955,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,1.0,1.0,0.6,0.6,0.8,0.75,0.6,0.6,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0.346154,0.099960,0.000000,0.000000,1.0,0.0,1.0,1.0,0.0,1.0,...,0.8,0.6,0.8,0.8,0.6,0.4,0.75,0.8,1.0,0.8
129876,0.205128,0.124192,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,0.8,0.8,0.8,0.8,1.0,1.00,1.0,1.0,0.8
129877,0.128205,0.160945,0.000000,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,...,0.4,0.2,0.4,0.4,0.8,0.6,0.75,1.0,0.8,0.4
129878,0.089744,0.221325,0.000000,0.000000,1.0,1.0,0.0,1.0,0.0,1.0,...,0.8,0.8,0.8,0.8,0.6,0.4,1.00,0.8,1.0,0.8


### Robust scaling

In [62]:
from sklearn.preprocessing import RobustScaler

In [63]:
robust_scaler = RobustScaler()
robust_scaler.fit(X_imputed)

X_scaled = robust_scaler.transform(X_imputed)

In [64]:
X_scaled

array([[-1.125     , -0.2887218 ,  2.08333333, ...,  1.        ,
         0.5       ,  1.        ],
       [-0.625     , -0.45789474,  0.08333333, ..., -2.        ,
         0.        , -1.        ],
       [-0.58333333,  0.22406015,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.95833333, -0.01203008,  0.        , ...,  2.        ,
         0.        , -0.5       ],
       [-1.08333333,  0.21278195,  0.        , ...,  1.        ,
         0.5       ,  0.5       ],
       [ 0.08333333, -0.43609023,  0.        , ..., -2.        ,
        -1.5       , -1.        ]])

In [65]:
df_scaled = pd.DataFrame(X_scaled, columns = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'Gender_Male','Customer Type_Loyal Customer', 'Customer Type_disloyal Customer', 'Type of Travel_Business travel', 'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'satisfaction_neutral or dissatisfied', 'satisfaction_satisfied', 'Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking','Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment','On-board service','Leg room service','Baggage handling','Checkin service','Inflight service','Cleanliness'])

In [66]:
df_scaled

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,...,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,-1.125000,-0.288722,2.083333,1.384615,1.0,0.0,0.0,-1.0,1.0,0.0,...,1.0,0.0,0.333333,0.5,0.0,-0.5,0.0,1.0,0.5,1.0
1,-0.625000,-0.457895,0.083333,0.461538,1.0,-1.0,1.0,0.0,0.0,1.0,...,-1.0,0.0,-1.000000,-1.5,-1.5,0.5,-0.5,-2.0,0.0,-1.0
2,-0.583333,0.224060,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.333333,0.5,0.0,-0.5,0.0,1.0,0.0,1.0
3,-0.625000,-0.212030,0.916667,0.692308,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.5,-0.5,-0.666667,-1.0,-1.0,0.5,-0.5,-2.0,0.0,-0.5
4,0.875000,-0.473684,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,0.5,1.0,0.333333,-0.5,-0.5,0.0,0.0,0.0,-0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,-0.250000,-0.239098,0.000000,0.000000,1.0,-1.0,1.0,0.0,0.0,1.0,...,0.5,0.0,0.000000,0.0,-0.5,-1.0,0.0,1.0,0.5,0.5
129876,-0.708333,-0.148872,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,0.5,0.5,0.000000,0.0,0.0,0.5,0.5,2.0,0.5,0.5
129877,-0.958333,-0.012030,0.000000,0.000000,0.0,0.0,0.0,-1.0,1.0,0.0,...,-0.5,-1.0,-0.666667,-1.0,0.0,-0.5,0.0,2.0,0.0,-0.5
129878,-1.083333,0.212782,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,1.0,...,0.5,0.5,0.000000,0.0,-0.5,-1.0,0.5,1.0,0.5,0.5


## Outlier Removal

## Feature Selection