# Customer Satisfaction Prediction for Invistico Airlines

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Data Collection
The data used in this project is the dataset of Invistico Airlines provided in the prompt. The dataset has 129879 rows and 23 columns. The columns are as follows:
- satisfaction: Whether the customer is satisfied or dissatisfied
- Gender: whether the customer is male or female
- Customer Type: whether the customer is loyal or disloyal
- Age: age of the customer
- Type of travel: whether the customer is travelling for business or personal reasons
- Class: class of the flight (business, eco, or eco plus)
- Flight Distance: distance of the flight
- Seat comfort: rating of seat comfort
- Departure/Arrival time convenient: rating of departure and arrival time convenience
- Food and drink: rating of food and drink
- Gate location: rating of gate location
- Inflight wifi service: rating of inflight wifi service
- Inflight entertainment: rating of inflight entertainment
- Online support: rating of online support
- Ease of online booking: rating of ease of online booking
- On-board service: rating of on-board service
- Leg room service: rating of leg room service
- Baggage handling: rating of baggage handling
- Check-in service: rating of check-in service
- Cleanliness: rating of cleanliness
- Online boarding: rating of online boarding
- Departure Delay in Minutes: delay in departure
- Arrival Delay in Minutes: delay in arrival

The data is stored in a CSV file name "Airline data.csv". The data is loaded into a pandas dataframe.

In [3]:
data = pd.read_csv(r'Arline data.csv', encoding='latin1')

## Data Preprocessing

The head, tail, and 5 rows in the middle of the dataframe are displayed. The data types of the columns are displayed. The number of missing values in each column is displayed.

In [4]:
data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [5]:
data.tail()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
129875,satisfied,Female,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,...,2,2,3,3,4,4,4,2,0,0.0
129876,dissatisfied,Male,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,...,1,3,2,3,3,1,2,1,174,172.0
129877,dissatisfied,Male,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,...,2,4,4,3,4,2,3,2,155,163.0
129878,dissatisfied,Male,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,...,2,3,3,2,3,2,1,2,193,205.0
129879,dissatisfied,Female,disloyal Customer,38,Personal Travel,Eco,4307,3,4,3,...,3,4,5,5,5,3,3,3,185,186.0


In [6]:
data[len(data)//2:len(data)//2+5]

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
64940,satisfied,Female,Loyal Customer,16,Business travel,Eco Plus,3857,0,3,0,...,2,2,3,2,4,3,4,2,0,0.0
64941,satisfied,Female,Loyal Customer,44,Business travel,Business,3966,0,3,0,...,5,3,3,3,3,3,3,2,0,0.0
64942,satisfied,Male,Loyal Customer,14,Business travel,Eco,1520,0,3,0,...,5,5,4,3,2,5,2,5,18,16.0
64943,satisfied,Male,Loyal Customer,50,Business travel,Business,3954,0,3,0,...,5,1,1,1,1,4,1,2,0,0.0
64944,satisfied,Male,Loyal Customer,39,Business travel,Business,72,0,3,0,...,4,2,2,3,3,5,2,4,0,0.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Gender                             129880 non-null  object 
 2   Customer Type                      129880 non-null  object 
 3   Age                                129880 non-null  int64  
 4   Type of Travel                     129880 non-null  object 
 5   Class                              129880 non-null  object 
 6   Flight Distance                    129880 non-null  int64  
 7   Seat comfort                       129880 non-null  int64  
 8   Departure/Arrival time convenient  129880 non-null  int64  
 9   Food and drink                     129880 non-null  int64  
 10  Gate location                      129880 non-null  int64  
 11  Inflight wifi service              1298

In [8]:
data.describe()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,39.427957,1981.409055,2.838597,2.990645,2.851994,2.990422,3.24913,3.383477,3.519703,3.472105,3.465075,3.485902,3.695673,3.340807,3.705759,3.352587,14.713713,15.091129
std,15.11936,1027.115606,1.392983,1.527224,1.443729,1.30597,1.318818,1.346059,1.306511,1.30556,1.270836,1.292226,1.156483,1.260582,1.151774,1.298715,38.071126,38.46565
min,7.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,1359.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,40.0,1925.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,0.0,0.0
75%,51.0,2544.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,4.0,12.0,13.0
max,85.0,6951.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [9]:
counts = data['satisfaction'].value_counts()
ratio = []
for i in counts:
    ratio.append(i/len(data))
string = ':'.join([str(round(elem, 2)) for elem in ratio])
print(f'Ratio of satisfaction to dissatisfaction is {string}')

Ratio of satisfaction to dissatisfaction is 0.55:0.45


The ratio of satisfied customers to dissatisfied customers is calculated. The ratio is 0.55:0.45. The data is slightly imbalanced. This may affect the performance of the model and at a later stage may require a balancing technique (e.g. oversampling, undersampling).

In [10]:
data.columns.to_list()

['satisfaction',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

### Imputing Missing Values

In [11]:
data.isnull().sum()

satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64

In [12]:
(393/129880)*100

0.3025870033877425

There are missing values in the columns "Arrival Delay in Minutes". The missing values are only 393, which is approximately 0.3% of the total data. The missing values are imputed with the mean of the column. I decided to impute the missing values with the mean because the column is numerical and the missing values are very few so using the mean will not be numerically significant.

In [13]:
data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype.kind in 'biufc' else x)

In [14]:
data.isnull().sum()

satisfaction                         0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

## Feature Engineering (Augmentation)

I take a slight deviation from the data preprocessing, to examine if ome new features can be created from existing features. I do it at this stage so that the new features can be included in the data preprocessing steps.

Intuitively, the total delay in minutes can be calculated by adding the departure delay in minutes and the arrival delay in minutes. This new feature can be useful in predicting customer satisfaction. The new feature is added to the dataframe. Passengers experiencing long delays may have lower satisfaction.


In [15]:
data['Total Delay'] = data['Departure Delay in Minutes'] + data['Arrival Delay in Minutes']

A new binary feature "Delay" is created. If either the "Arrival Delay in Minutes" or "Departure Delay in Minutes" is greater than 0, then the "Delay" feature is set to 1, otherwise 0.

In [16]:
data['Delay'] = data['Total Delay'].apply(lambda x: 1 if x > 0 else 0)

Additionally, agggregated ratings can be calculated by adding all the ratings columns. This new feature can be useful in predicting customer satisfaction. This can be separated into on-flight features, booking features, and ground features.

Ground Rating: Average ratings for Baggage handling, Checkin service, and Gate location. This may reflect satisfaction with ground services separately from in-flight experience.

Booking Rating: Average of Ease of Online booking, Online support, and Online boarding. This could capture the convenience of digital interactions.

In [17]:
data['On-Flight Rating'] = data['Inflight entertainment'] + data['Seat comfort'] + data['Food and drink'] + data['Inflight wifi service'] + data['Cleanliness'] + data['Leg room service']
data['On-Flight Rating'] = data['On-Flight Rating']/6

data['Booking Rating'] = data['Ease of Online booking'] + data['Online boarding'] + data['Online support']
data['Booking Rating'] = data['Booking Rating']/3

data['Ground Rating'] = data['Baggage handling'] + data['Checkin service'] + data['Gate location']
data['Ground Rating'] = data['Ground Rating']/3

data['Rating'] = data['On-Flight Rating'] + data['Booking Rating'] + data['Ground Rating']
data['Rating'] = data['Rating']/3

Also, computing delay per distance can be useful because being delayed for a long time on a short distance flight may be more frustrating than being delayed for the same time on a long distance flight. This new feature is added to the dataframe.

In [18]:
data['Delay Proportion'] = data['Total Delay']/data['Flight Distance']

Another augmented feature is age group. The age of the passengers can be grouped into age groups. This can be useful in predicting customer satisfaction as different age groups may have varying expectations and tolerance levels. The age groups are as follows:
- <= 20
- 21-40
- 41-60
- => 60

In [19]:
data['Age Group'] = data['Age'].apply(lambda x: '0-20' if x <= 20 else '21-40' if x <= 40 else '41-60' if x <= 60 else '61-100')

To capture any age-based differences in satisfaction, the age is multiplied by the average rating eature created earlier. This new feature is added to the dataframe.

In [20]:
data['Age_Satisfaction'] = data['Rating'] * data['Age']

Another thing to note is that if seats are uncomfortable, AND the flight is long, the passenger may be more dissatisfied. Long flights with poor comfort may weigh more heavily on satisfaction than short flights with similar comfort levels. This can be captured by dividing the seat comfort rating by the flight distance. This new feature is added to the dataframe.

In [21]:
data['Comfort to Distance'] = data['Seat comfort']/data['Flight Distance']

The difference between the departure and arrival delay can be calculated. A long departure delay with an on-time arrival could be a positive sign of efficient recovery during the flight which customers may appreciate.

In [22]:
data['Arrival-Departure-Delay'] = data['Arrival Delay in Minutes'] - data['Departure Delay in Minutes']

In [23]:
data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Delay,On-Flight Rating,Booking Rating,Ground Rating,Rating,Delay Proportion,Age Group,Age_Satisfaction,Comfort to Distance,Arrival-Departure-Delay
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,0,1.5,2.333333,3.333333,2.388889,0.0,61-100,155.277778,0.0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,1,1.5,2.333333,3.0,2.277778,0.249594,41-60,107.055556,0.0,-5.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,0,1.5,2.0,3.666667,2.388889,0.0,0-20,35.833333,0.0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,0,1.333333,2.333333,2.666667,2.111111,0.0,41-60,126.666667,0.0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,0,1.5,3.666667,3.0,2.722222,0.0,61-100,190.555556,0.0,0.0


In [24]:
data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype == 'float' or x.dtype == 'int' else x)

In [25]:
data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Delay,On-Flight Rating,Booking Rating,Ground Rating,Rating,Delay Proportion,Age Group,Age_Satisfaction,Comfort to Distance,Arrival-Departure-Delay
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,0,1.5,2.333333,3.333333,2.388889,0.0,61-100,155.277778,0.0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,1,1.5,2.333333,3.0,2.277778,0.249594,41-60,107.055556,0.0,-5.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,0,1.5,2.0,3.666667,2.388889,0.0,0-20,35.833333,0.0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,0,1.333333,2.333333,2.666667,2.111111,0.0,41-60,126.666667,0.0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,0,1.5,3.666667,3.0,2.722222,0.0,61-100,190.555556,0.0,0.0


In [26]:
data.isnull().sum()

satisfaction                         0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
Total Delay                          0
Delay                                0
On-Flight Rating         

### Encoding Categorical Variables

I opted to use a OneHotEncoder to encode the categorical variables, except for the "satisfaction" column. The "satisfaction" column is the target variable and is encoded using a LabelEncoder.

I chose the OneHotEncoder because it is suitable for encoding categorical variables with more than two categories. The OneHotEncoder creates a binary column for each category in the categorical variable. This is useful for columns like the "Class" column, which has three categories (business, eco, eco plus). The OneHotEncoder will create three columns, one for each category.

Even thought that is the only column with more than two categories, I decided to use the OneHotEncoder for all the categorical variables to maintain consistency. Also, the model's accuracy improved by 3% when using the OneHotEncoder compared to the LabelEncoder.

Although the OneHotEncoder is suitable for encoding categorical variables with more than two categories, it is not suitable for encoding the target variable. The target variable has two categories (satisfied, dissatisfied). The LabelEncoder is suitable for encoding the target variable because it assigns a unique integer to each category. This is useful for binary classification problems.

In [27]:
cat_data = data.select_dtypes(include=['object'])
data = data.drop(cat_data.columns, axis=1)

In [28]:
cat_data.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Type of Travel,Class,Age Group
0,satisfied,Female,Loyal Customer,Personal Travel,Eco,61-100
1,satisfied,Male,Loyal Customer,Personal Travel,Business,41-60
2,satisfied,Female,Loyal Customer,Personal Travel,Eco,0-20
3,satisfied,Female,Loyal Customer,Personal Travel,Eco,41-60
4,satisfied,Female,Loyal Customer,Personal Travel,Eco,61-100


In [29]:
cat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   satisfaction    129880 non-null  object
 1   Gender          129880 non-null  object
 2   Customer Type   129880 non-null  object
 3   Type of Travel  129880 non-null  object
 4   Class           129880 non-null  object
 5   Age Group       129880 non-null  object
dtypes: object(6)
memory usage: 5.9+ MB


In [30]:
cat_data.nunique()

satisfaction      2
Gender            2
Customer Type     2
Type of Travel    2
Class             3
Age Group         4
dtype: int64

In [31]:
n_ary_data = cat_data.loc[:, cat_data.nunique() > 0]
satisfactions = n_ary_data['satisfaction']
n_ary_data = n_ary_data.drop('satisfaction', axis=1)

In [32]:
label_enc = LabelEncoder()
one_hot_enc = OneHotEncoder()

In [33]:
encoded_columns = []
for col in n_ary_data.columns:
    encoded_col = one_hot_enc.fit_transform(n_ary_data[[col]]).toarray()
    encoded_df = pd.DataFrame(encoded_col, columns=one_hot_enc.get_feature_names_out([col]))
    encoded_columns.append(encoded_df)

n_ary_data = pd.concat(encoded_columns, axis=1)

satisfactions = satisfactions.map({'satisfied': 1, 'dissatisfied': 0})

In [34]:
n_ary_data.head()

Unnamed: 0,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,Age Group_0-20,Age Group_21-40,Age Group_41-60,Age Group_61-100
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [35]:
n_ary_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 13 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Gender_Female                    129880 non-null  float64
 1   Gender_Male                      129880 non-null  float64
 2   Customer Type_Loyal Customer     129880 non-null  float64
 3   Customer Type_disloyal Customer  129880 non-null  float64
 4   Type of Travel_Business travel   129880 non-null  float64
 5   Type of Travel_Personal Travel   129880 non-null  float64
 6   Class_Business                   129880 non-null  float64
 7   Class_Eco                        129880 non-null  float64
 8   Class_Eco Plus                   129880 non-null  float64
 9   Age Group_0-20                   129880 non-null  float64
 10  Age Group_21-40                  129880 non-null  float64
 11  Age Group_41-60                  129880 non-null  float64
 12  Ag

### Standardizing or Normalizing Numerical Variables (Scaling)

In [36]:
data.head()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Total Delay,Delay,On-Flight Rating,Booking Rating,Ground Rating,Rating,Delay Proportion,Age_Satisfaction,Comfort to Distance,Arrival-Departure-Delay
0,65,265,0,0,0,2,2,4,2,3,...,0.0,0,1.5,2.333333,3.333333,2.388889,0.0,155.277778,0.0,0.0
1,47,2464,0,0,0,3,0,2,2,3,...,615.0,1,1.5,2.333333,3.0,2.277778,0.249594,107.055556,0.0,-5.0
2,15,2138,0,0,0,3,2,0,2,2,...,0.0,0,1.5,2.0,3.666667,2.388889,0.0,35.833333,0.0,0.0
3,60,623,0,0,0,3,3,4,3,1,...,0.0,0,1.333333,2.333333,2.666667,2.111111,0.0,126.666667,0.0,0.0
4,70,354,0,0,0,3,4,3,4,2,...,0.0,0,1.5,3.666667,3.0,2.722222,0.0,190.555556,0.0,0.0


To choose between standardizing or normalizing the numerical variables, I used the Kashpirovsky criterion. The Kashpirovsky criterion states that if the data is normally distributed, then standardizing is preferred. If the data is not normally distributed, then normalizing is preferred. I checked the distribution of the numerical variables using the Kashpirosky test. The test showed that the data is not normally distributed.

In [37]:
for column in data.columns:
    feature = data[column].dropna()
    stat, p = stats.kstest(feature, 'norm', args=(feature.mean(), feature.std()))
    print(f'Feature: {column}')
    print(f'Statistics={stat:.3f}, p-value={p:.3f}')
    if p > 0.05:
        print('Result: Data is Gaussian\n')
    else:
        print('Result: Data is not Gaussian\n')

Feature: Age
Statistics=0.055, p-value=0.000
Result: Data is not Gaussian

Feature: Flight Distance
Statistics=0.047, p-value=0.000
Result: Data is not Gaussian

Feature: Seat comfort
Statistics=0.154, p-value=0.000
Result: Data is not Gaussian

Feature: Departure/Arrival time convenient
Statistics=0.180, p-value=0.000
Result: Data is not Gaussian

Feature: Food and drink
Statistics=0.153, p-value=0.000
Result: Data is not Gaussian

Feature: Gate location
Statistics=0.159, p-value=0.000
Result: Data is not Gaussian

Feature: Inflight wifi service
Statistics=0.180, p-value=0.000
Result: Data is not Gaussian

Feature: Inflight entertainment
Statistics=0.229, p-value=0.000
Result: Data is not Gaussian

Feature: Online support
Statistics=0.237, p-value=0.000
Result: Data is not Gaussian

Feature: Ease of Online booking
Statistics=0.227, p-value=0.000
Result: Data is not Gaussian

Feature: On-board service
Statistics=0.221, p-value=0.000
Result: Data is not Gaussian

Feature: Leg room servi

### Approach 1: Standardization
One effective approach to preparing numerical columns is to standardize them by centering and scaling. This technique ensures that all numerical features operate on the same scale, which is particularly suitable for datasets that follow a Gaussian (normal) distribution. The standardization process is performed using the following formula:

$$ z = \frac{x - \mu}{\sigma} $$

Where:
- $ z $ is the standardized value.
- $ x $ is the original data point.
- $ \mu $ represents the mean of the column.
- $ \sigma $ is the standard deviation of the column.

By applying this transformation, the numerical data is adjusted to have a mean of zero and a standard deviation of one, facilitating more effective performance of algorithms that assume normally distributed input features.

### Approach 2: Normalization
Normalization is another common technique used to prepare numerical data for machine learning algorithms. This process scales the numerical data to a fixed range, typically between 0 and 1. Normalization is particularly useful when the numerical features do not follow a Gaussian distribution. The normalization process is performed using the following formula:

$$ x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}} $$

Where:
- $ x_{norm} $ is the normalized value.
- $ x $ is the original data point.
- $ x_{min} $ represents the minimum value of the column.
- $ x_{max} $ is the maximum value of the column.

By applying this transformation, the numerical data is adjusted to a common scale, ensuring that all features contribute equally to the model's learning process.

### Approach Chosen: Normalization

The best approach to use in this case is normalization as it is shown that columns do not follow a Gaussian distribution.

In [38]:
data = (data - data.min()) / (data.max() - data.min())

In [39]:
data.head()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Total Delay,Delay,On-Flight Rating,Booking Rating,Ground Rating,Rating,Delay Proportion,Age_Satisfaction,Comfort to Distance,Arrival-Departure-Delay
0,0.74359,0.031155,0.0,0.0,0.0,0.4,0.4,0.8,0.4,0.6,...,0.0,0.0,0.222222,0.384615,0.583333,0.364865,0.0,0.393917,0.0,0.687545
1,0.512821,0.349804,0.0,0.0,0.0,0.6,0.0,0.4,0.4,0.6,...,0.19364,1.0,0.222222,0.384615,0.5,0.337838,0.062993,0.265134,0.0,0.680869
2,0.102564,0.302565,0.0,0.0,0.0,0.6,0.4,0.0,0.4,0.4,...,0.0,0.0,0.222222,0.307692,0.666667,0.364865,0.0,0.074926,0.0,0.687545
3,0.679487,0.083031,0.0,0.0,0.0,0.6,0.6,0.8,0.6,0.2,...,0.0,0.0,0.185185,0.384615,0.416667,0.297297,0.0,0.317507,0.0,0.687545
4,0.807692,0.044052,0.0,0.0,0.0,0.6,0.8,0.6,0.8,0.4,...,0.0,0.0,0.222222,0.692308,0.5,0.445946,0.0,0.488131,0.0,0.687545


In [40]:
data.describe()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Total Delay,Delay,On-Flight Rating,Booking Rating,Ground Rating,Rating,Delay Proportion,Age_Satisfaction,Comfort to Distance,Arrival-Departure-Delay
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,...,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0
mean,0.415743,0.279874,0.567719,0.598129,0.570399,0.598084,0.649826,0.676695,0.703941,0.694421,...,0.009384,0.543032,0.611661,0.641877,0.585575,0.598074,0.005638,0.333882,0.026975,0.688049
std,0.193838,0.148836,0.278597,0.305445,0.288746,0.261194,0.263764,0.269212,0.261302,0.261112,...,0.02384,0.498147,0.171873,0.264222,0.190544,0.16856,0.021284,0.159336,0.054978,0.014389
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25641,0.189683,0.4,0.4,0.4,0.4,0.4,0.4,0.6,0.4,...,0.0,0.0,0.481481,0.461538,0.5,0.472973,0.0,0.206825,0.008239,0.684875
50%,0.423077,0.2717,0.6,0.6,0.6,0.6,0.6,0.8,0.8,0.8,...,0.00063,1.0,0.62963,0.692308,0.583333,0.608108,0.000261,0.323145,0.01452,0.687545
75%,0.564103,0.361397,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0,...,0.007557,1.0,0.740741,0.846154,0.75,0.72973,0.003633,0.450148,0.024155,0.687545
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Now that the data is preprocessed appropriately (encoding of categorical data and standardization of numerical values), the separate datasets are put back together.

In [41]:
satisfactions = pd.DataFrame(satisfactions, columns=['satisfaction'])

In [42]:
satisfactions.head()

Unnamed: 0,satisfaction
0,1
1,1
2,1
3,1
4,1


In [43]:
data = pd.concat([data, n_ary_data, satisfactions], axis=1)

In [44]:
data.head()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,...,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,Age Group_0-20,Age Group_21-40,Age Group_41-60,Age Group_61-100,satisfaction
0,0.74359,0.031155,0.0,0.0,0.0,0.4,0.4,0.8,0.4,0.6,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1,0.512821,0.349804,0.0,0.0,0.0,0.6,0.0,0.4,0.4,0.6,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,0.102564,0.302565,0.0,0.0,0.0,0.6,0.4,0.0,0.4,0.4,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
3,0.679487,0.083031,0.0,0.0,0.0,0.6,0.6,0.8,0.6,0.2,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
4,0.807692,0.044052,0.0,0.0,0.0,0.6,0.8,0.6,0.8,0.4,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


## Model Building

Now that the data is in appropriate form for model training, the data is split into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate the model. The target variable is the "satisfaction" column. The data is split into 80% training and 20% testing.

A logistic regression model is built to predict customer satisfaction. The model is trained on the training set and evaluated on the testing set. The accuracy of the model is calculated.

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

class LogisticClassifier:
    """
    This class implements a logistic regression classifier using gradient descent.
    
    Parameters:
    - learning_rate: float, the learning rate of the algorithm for updating weights.
    - num_iterations: int, the number of iterations for training.
    """

    def __init__(self, learning_rate: float = 0.01, num_iterations: int = 1000):
        if num_iterations <= 0:
            raise ValueError("Number of iterations must be a positive integer.")
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.X_train, self.y_train = None, None
        self.X_test, self.y_test = None, None

    def load(self, data, target, frac=0.8, random_state=9, shuffle=True):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.drop(target, axis=1), data[target], test_size=1 - frac, 
            random_state=random_state, shuffle=shuffle
        )

    def __sigmoid(self, z: float) -> float:
        return np.where(z >= 0, 1 / (1 + np.exp(-z)), np.exp(z) / (1 + np.exp(z)))

    def __initialize_parameters(self, n_features: int) -> None:
        self.weights = np.zeros(n_features + 1)

    def __add_bias(self, X) -> np.array:
        return np.insert(X, 0, 1, axis=1)

    def train(self) -> None:
        if self.X_train is None or self.y_train is None:
            raise ValueError("Training data not loaded.")
        
        self.X_train = self.__add_bias(self.X_train)
        self.__initialize_parameters(self.X_train.shape[1] - 1)
        self.__gradient_descent()
        
        
    def __gradient_descent(self) -> None:
        prev_cost = float('inf')
        for i in range(self.num_iterations):
            z = np.dot(self.X_train, self.weights)
            h = self.__sigmoid(z)
            gradient = np.dot(self.X_train.T, (h - self.y_train)) / self.y_train.size
            self.weights -= self.learning_rate * gradient

            cost = -np.mean(self.y_train * np.log(h) + (1 - self.y_train) * np.log(1 - h))
            if np.abs(prev_cost - cost) < 1e-6:
                print(f"Early stopping at iteration {i+1}")
                break
            prev_cost = cost
        

    def predict_class(self, threshold=0.55) -> np.array:
        if self.weights is None:
            raise ValueError("Weights not initialized.")
        X_test_bias = self.__add_bias(self.X_test)
        return (self.__sigmoid(np.dot(X_test_bias, self.weights)) >= threshold).astype(int)

    def evaluate(self, metric='accuracy') -> float:
        y_pred = self.predict_class()
        
        if metric == 'accuracy':
            # Calculate model accuracy
            model_accuracy = np.mean(y_pred == self.y_test) * 100
            
            # Calculate majority class baseline accuracy
            majority_class = np.bincount(self.y_test).argmax()
            baseline_accuracy = np.mean(self.y_test == majority_class) * 100
            
            print(f"Model Accuracy: {model_accuracy:.2f}%")
            print(f"Baseline Accuracy (Majority Class): {baseline_accuracy:.2f}%")
            
            return model_accuracy
        else:
            raise ValueError(f"Unsupported metric: {metric}")

        

In [46]:
import numpy as np
from sklearn.model_selection import train_test_split

class DecisionTreeClassifier:
    """
    A simple decision tree classifier implementing binary splits based on Gini impurity.
    
    Parameters:
    - max_depth: int, the maximum depth of the tree to control overfitting.
    - min_samples_split: int, the minimum number of samples required to split a node.
    """
    
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None
        self.X_train, self.y_train = None, None
        self.X_test, self.y_test = None, None

    def load(self, data, target, frac=0.8, random_state=9, shuffle=True):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.drop(target, axis=1), data[target], test_size=1 - frac, 
            random_state=random_state, shuffle=shuffle
        )

    def __gini(self, y):
        """
        Calculate Gini impurity for a set of labels y.
        """
        m = len(y)
        if m == 0:
            return 0
        class_counts = np.bincount(y)
        return 1 - sum((count / m) ** 2 for count in class_counts if count > 0)

    def __split_dataset(self, X, y, feature_index, threshold):
        """
        Split dataset into left and right branches based on a threshold for a given feature.
        """
        left_mask = X[:, feature_index] < threshold
        right_mask = ~left_mask
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def __best_split(self, X, y):
        """
        Determine the best split for a dataset by testing all features and possible thresholds.
        Returns the feature index, threshold, and the split.
        """
        best_gini = float('inf')
        best_feature, best_threshold = None, None
        best_splits = None

        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, y_left, X_right, y_right = self.__split_dataset(X, y, feature_index, threshold)
                
                # Calculate weighted Gini impurity
                m, m_left, m_right = len(y), len(y_left), len(y_right)
                gini_split = (m_left / m) * self.__gini(y_left) + (m_right / m) * self.__gini(y_right)
                
                # Update best split if this one is better
                if gini_split < best_gini:
                    best_gini = gini_split
                    best_feature = feature_index
                    best_threshold = threshold
                    best_splits = (X_left, y_left, X_right, y_right)
        
        return best_feature, best_threshold, best_splits

    def __build_tree(self, X, y, depth):
        """
        Recursive function to build the tree.
        """
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        
        # Check stopping criteria
        if (depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split):
            leaf_value = self.__majority_vote(y)
            return {'leaf': True, 'value': leaf_value}
        
        # Find the best split
        feature_index, threshold, splits = self.__best_split(X, y)
        if splits is None:  # Stop if no further splitting is possible
            return {'leaf': True, 'value': self.__majority_vote(y)}
        
        # Recursively build the left and right subtrees
        X_left, y_left, X_right, y_right = splits
        left_subtree = self.__build_tree(X_left, y_left, depth + 1)
        right_subtree = self.__build_tree(X_right, y_right, depth + 1)
        
        # Return node information
        return {
            'leaf': False,
            'feature_index': feature_index,
            'threshold': threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def train(self):
        if self.X_train is None or self.y_train is None:
            raise ValueError("Training data not loaded.")
        self.tree = self.__build_tree(self.X_train.to_numpy(), self.y_train.to_numpy(), 0)

    def __majority_vote(self, y):
        """
        Returns the majority class label in an array.
        """
        return np.bincount(y).argmax()

    def __predict_sample(self, node, sample):
        """
        Predict the class for a single sample by traversing the tree.
        """
        if node['leaf']:
            return node['value']
        feature_index = node['feature_index']
        threshold = node['threshold']
        
        if sample[feature_index] < threshold:
            return self.__predict_sample(node['left'], sample)
        else:
            return self.__predict_sample(node['right'], sample)

    def predict_class(self):
        if self.tree is None:
            raise ValueError("Model not trained.")
        return np.array([self.__predict_sample(self.tree, sample) for sample in self.X_test.to_numpy()])

    def evaluate(self, metric='accuracy') -> float:
        y_pred = self.predict_class()
        
        if metric == 'accuracy':
            model_accuracy = np.mean(y_pred == self.y_test) * 100
            majority_class = np.bincount(self.y_test).argmax()
            baseline_accuracy = np.mean(self.y_test == majority_class) * 100
            
            print(f"Model Accuracy: {model_accuracy:.2f}%")
            print(f"Baseline Accuracy (Majority Class): {baseline_accuracy:.2f}%")
            
            return model_accuracy
        else:
            raise ValueError(f"Unsupported metric: {metric}")

In [49]:
import numpy as np
from sklearn.model_selection import train_test_split

class MultinomialNB:
    """
    A simple Multinomial Naive Bayes classifier implementing basic functionality.
    
    Parameters:
    - alpha: float, Laplace smoothing parameter.
    """
    
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.classes = None
        self.class_prior = None
        self.feature_likelihoods = None
        self.X_train, self.y_train = None, None
        self.X_test, self.y_test = None, None

    def load(self, data, target, frac=0.8, random_state=9, shuffle=True):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.drop(target, axis=1), data[target], test_size=1 - frac, 
            random_state=random_state, shuffle=shuffle
        )
        
    def train(self) -> None:
        """
        Fit the model to the training data.
        """
        self.classes = np.unique(self.y_train)
        num_samples, num_features = self.X_train.shape
        
        # Calculate class prior probabilities
        self.class_prior = np.zeros(len(self.classes))
        for idx, c in enumerate(self.classes):
            self.class_prior[idx] = np.sum(self.y_train == c) / num_samples
            
        # Initialize feature likelihoods
        self.feature_likelihoods = {}
        
        for c in self.classes:
            # Filter samples belonging to class c
            X_c = self.X_train[self.y_train == c]
            feature_counts = X_c.sum(axis=0) + self.alpha  # Add alpha for smoothing
            total_count = X_c.sum() + self.alpha * num_features  # Add alpha * num_features for normalization
            self.feature_likelihoods[c] = feature_counts / total_count

    def predict_class(self) -> np.array:
        """
        Predict class labels for the test data.
        """
        if self.feature_likelihoods is None:
            raise ValueError("Model not fitted yet.")
        
        predictions = []
        
        for sample in self.X_test.to_numpy():
            posteriors = []
            for idx, c in enumerate(self.classes):
                prior = np.log(self.class_prior[idx])
                likelihood = np.sum(np.log(self.feature_likelihoods[c]) * sample)
                posterior = prior + likelihood
                posteriors.append(posterior)
            predictions.append(self.classes[np.argmax(posteriors)])
        
        return np.array(predictions)

    def evaluate(self, metric='accuracy') -> float:
        y_pred = self.predict_class()
        
        if metric == 'accuracy':
            model_accuracy = np.mean(y_pred == self.y_test) * 100
            majority_class = np.bincount(self.y_test).argmax()
            baseline_accuracy = np.mean(self.y_test == majority_class) * 100
            
            print(f"Model Accuracy: {model_accuracy:.2f}%")
            print(f"Baseline Accuracy (Majority Class): {baseline_accuracy:.2f}%")
            
            return model_accuracy
        else:
            raise ValueError(f"Unsupported metric: {metric}")

    def tune(self, alpha_values: list) -> dict:
        """
        Tune the model by testing different alpha values for Laplace smoothing.
        
        Parameters:
        - alpha_values: list of alpha values to test.
        
        Returns:
        A dictionary of alpha values and their corresponding accuracies.
        """
        results = {}
        for alpha in alpha_values:
            self.alpha = alpha
            self.train()
            accuracy = self.evaluate(metric='accuracy')
            results[alpha] = accuracy
        return results

test_model = MultinomialNB(alpha=0.1)
test_model.load(data, 'satisfaction')
test_model.train()
accuracy = test_model.evaluate(metric='accuracy')

print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 54.51%
Baseline Accuracy (Majority Class): 54.51%
Model Accuracy: 54.51%


## Testing the Model

In [None]:
# def evaluate_learning_rate(learning_rate):
#     model = LogisticClassifier(learning_rate=learning_rate, num_iterations=1000)
#     model.load(data, 'satisfaction')
#     model.train()
#     accuracy = model.evaluate()
#     return learning_rate, accuracy

# learning_range = np.linspace(0.00001, 1, 50)
# performance = {}

# for learning_rate in learning_range:
#     print(f"\nEvaluating learning rate: {learning_rate}\n")
#     results = evaluate_learning_rate(learning_rate)
#     print(f"Learning rate: {results[0]}, Accuracy: {results[1]}")
#     performance[learning_rate] = results[1]

# best_learning_rate = max(performance, key=performance.get)
# print(f"Best learning rate: {best_learning_rate}")


Evaluating learning rate: 1e-05

Model Accuracy: 45.49%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 1e-05, Accuracy: 45.49199260856175

Evaluating learning rate: 0.020417959183673468

Model Accuracy: 80.73%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 0.020417959183673468, Accuracy: 80.73221435170927

Evaluating learning rate: 0.04082591836734694

Model Accuracy: 82.03%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 0.04082591836734694, Accuracy: 82.02571604558054

Evaluating learning rate: 0.06123387755102041

Model Accuracy: 82.66%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 0.06123387755102041, Accuracy: 82.65706806282722

Evaluating learning rate: 0.08164183673469387

Model Accuracy: 83.00%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 0.08164183673469387, Accuracy: 83.00354173082846

Evaluating learning rate: 0.10204979591836734

Model Accuracy: 83.25%
Baseline Accuracy (Majority Class): 54.51%
Learning rate: 0.10204

In [None]:
# model = LogisticClassifier(learning_rate=best_learning_rate, num_iterations=1000)
# model.load(data, 'satisfaction')
# model.train()
# accuracy = model.evaluate()

# print(f"Accuracy: {accuracy:.2f}%")

Model Accuracy: 83.90%
Baseline Accuracy (Majority Class): 54.51%
Accuracy: 83.90%
