# Data cleaning and understanding

In [1]:
# Loading the raw dataset
import pandas as pd
import numpy as np
df = pd.read_csv("../data/raw/original_dataset.csv")

## 1. First observations

In [2]:
df.shape

(103904, 25)

In [3]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  str    
 3   Customer Type                      103904 non-null  str    
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  str    
 6   Class                              103904 non-null  str    
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      103904 non-null

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [5]:
#dropping unnamed columns
df.drop(columns=["Unnamed: 0"], inplace=True)

In [6]:
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [7]:
df.describe()

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
count,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103594.0
mean,64924.210502,39.379706,1189.448375,2.729683,3.060296,2.756901,2.976883,3.202129,3.250375,3.439396,3.358158,3.382363,3.351055,3.631833,3.30429,3.640428,3.286351,14.815618,15.178678
std,37463.812252,15.114964,997.147281,1.327829,1.525075,1.398929,1.277621,1.329533,1.349509,1.319088,1.332991,1.288354,1.315605,1.180903,1.265396,1.175663,1.312273,38.230901,38.698682
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,32533.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,64856.5,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,97368.25,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


## 2. Dataset cleaning

In [8]:
# column name standardization
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
      .str.replace("/", "_")
)

In [9]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   gender                             103904 non-null  str    
 2   customer_type                      103904 non-null  str    
 3   age                                103904 non-null  int64  
 4   type_of_travel                     103904 non-null  str    
 5   class                              103904 non-null  str    
 6   flight_distance                    103904 non-null  int64  
 7   inflight_wifi_service              103904 non-null  int64  
 8   departure_arrival_time_convenient  103904 non-null  int64  
 9   ease_of_online_booking             103904 non-null  int64  
 10  gate_location                      103904 non-null  int64  
 11  food_and_drink                     103904 non-null

In [10]:
df.head()

Unnamed: 0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


### 2.2. Categorical columns inspection and cleaning

In [11]:
# Identifying categorical columns
categorical_columns = df.select_dtypes(include="object").columns
categorical_columns

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_columns = df.select_dtypes(include="object").columns


Index(['gender', 'customer_type', 'type_of_travel', 'class', 'satisfaction'], dtype='str')

In [12]:
# Identifying unique values of categorical columns
for column in categorical_columns:
    print(f"\n{column.upper()}")
    print(df[column].value_counts())


GENDER
gender
Female    52727
Male      51177
Name: count, dtype: int64

CUSTOMER_TYPE
customer_type
Loyal Customer       84923
disloyal Customer    18981
Name: count, dtype: int64

TYPE_OF_TRAVEL
type_of_travel
Business travel    71655
Personal Travel    32249
Name: count, dtype: int64

CLASS
class
Business    49665
Eco         46745
Eco Plus     7494
Name: count, dtype: int64

SATISFACTION
satisfaction
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64


In [13]:
# Refinning categorical columns' values
for col in categorical_columns:
    df[col] = df[col].str.strip().str.title()

In [14]:
#checking:
for column in categorical_columns:
    print(f"\n{column.upper()}")
    print(df[column].value_counts())


GENDER
gender
Female    52727
Male      51177
Name: count, dtype: int64

CUSTOMER_TYPE
customer_type
Loyal Customer       84923
Disloyal Customer    18981
Name: count, dtype: int64

TYPE_OF_TRAVEL
type_of_travel
Business Travel    71655
Personal Travel    32249
Name: count, dtype: int64

CLASS
class
Business    49665
Eco         46745
Eco Plus     7494
Name: count, dtype: int64

SATISFACTION
satisfaction
Neutral Or Dissatisfied    58879
Satisfied                  45025
Name: count, dtype: int64


## Missing Values

In [15]:
df.isna().sum().sort_values(ascending=False)

arrival_delay_in_minutes             310
id                                     0
customer_type                          0
gender                                 0
type_of_travel                         0
class                                  0
flight_distance                        0
age                                    0
inflight_wifi_service                  0
departure_arrival_time_convenient      0
gate_location                          0
ease_of_online_booking                 0
online_boarding                        0
seat_comfort                           0
inflight_entertainment                 0
food_and_drink                         0
on-board_service                       0
leg_room_service                       0
checkin_service                        0
baggage_handling                       0
inflight_service                       0
cleanliness                            0
departure_delay_in_minutes             0
satisfaction                           0
dtype: int64

In [16]:
# Missing values as percentages
missing_summary = (
    df.isna()
      .mean()
      .mul(100)
      .round(2)
      .sort_values(ascending=False)
)

missing_summary

arrival_delay_in_minutes             0.3
id                                   0.0
customer_type                        0.0
gender                               0.0
type_of_travel                       0.0
class                                0.0
flight_distance                      0.0
age                                  0.0
inflight_wifi_service                0.0
departure_arrival_time_convenient    0.0
gate_location                        0.0
ease_of_online_booking               0.0
online_boarding                      0.0
seat_comfort                         0.0
inflight_entertainment               0.0
food_and_drink                       0.0
on-board_service                     0.0
leg_room_service                     0.0
checkin_service                      0.0
baggage_handling                     0.0
inflight_service                     0.0
cleanliness                          0.0
departure_delay_in_minutes           0.0
satisfaction                         0.0
dtype: float64

< 1% missing is considered negligible.

In [17]:
df[df.isna().any(axis=1)].head()

Unnamed: 0,id,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
213,49608,Female,Loyal Customer,38,Business Travel,Eco,109,5,3,3,...,5,5,2,4,1,1,5,31,,Satisfied
1124,73442,Male,Loyal Customer,53,Personal Travel,Eco,1012,3,2,3,...,4,4,4,4,3,3,4,38,,Neutral Or Dissatisfied
1529,71178,Male,Loyal Customer,39,Business Travel,Business,733,2,5,5,...,2,2,2,2,2,2,3,11,,Neutral Or Dissatisfied
2004,72940,Female,Disloyal Customer,26,Business Travel,Business,1035,3,3,3,...,2,3,3,4,5,5,2,41,,Neutral Or Dissatisfied
2108,116374,Female,Loyal Customer,24,Personal Travel,Eco,417,2,1,2,...,5,1,4,2,1,2,5,1,,Neutral Or Dissatisfied


The missing values are limited exclusively to arrival_delay_minutes, while all other features (including departure delays, service ratings, and satisfaction labels) are complete, indicating that missingness is isolated and not associated with passenger satisfaction.

What can we do with the missing value now? it means we can impute using departure delay. we can do this because in airline operations, arrival delay is often closely tied to departure delay and departure delay is available so we can simply impute and it is consistent.

In [18]:
# Impute missing arrival delays
df["arrival_delay_in_minutes"] = df["arrival_delay_in_minutes"].fillna(
    df["departure_delay_in_minutes"]
)

In [19]:
#checking
df.isna().sum()

id                                   0
gender                               0
customer_type                        0
age                                  0
type_of_travel                       0
class                                0
flight_distance                      0
inflight_wifi_service                0
departure_arrival_time_convenient    0
ease_of_online_booking               0
gate_location                        0
food_and_drink                       0
online_boarding                      0
seat_comfort                         0
inflight_entertainment               0
on-board_service                     0
leg_room_service                     0
baggage_handling                     0
checkin_service                      0
inflight_service                     0
cleanliness                          0
departure_delay_in_minutes           0
arrival_delay_in_minutes             0
satisfaction                         0
dtype: int64

In [20]:
df[["arrival_delay_in_minutes", "departure_delay_in_minutes"]].dtypes

arrival_delay_in_minutes      float64
departure_delay_in_minutes      int64
dtype: object

arrival_delay_in_minutes was previously missing and pandas uses float. We imputed values so type stayed float. All values are whole numbers. In order to be on the safe side I change the float to int.

In [21]:
df["arrival_delay_in_minutes"] = df["arrival_delay_in_minutes"].astype(int)

## Duplication

In [22]:
df.duplicated().sum()

np.int64(0)

In [23]:
#checking for id duplication as it is a unique identifier
df.duplicated(subset=df.columns.drop("id")).sum()

np.int64(0)

A duplicate check confirmed that no fully duplicated records are present in this dataset.

## Validating numerical ranges & outliers

In [24]:
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns
numerical_columns

Index(['id', 'age', 'flight_distance', 'inflight_wifi_service',
       'departure_arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes'],
      dtype='str')

In [25]:
# Excluding id column from numeric columns
numerical_columns_new = numerical_columns.drop("id")
numerical_columns_new

Index(['age', 'flight_distance', 'inflight_wifi_service',
       'departure_arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on-board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes'],
      dtype='str')

In [26]:
#defining rating columns
rating_columns = [
    "inflight_wifi_service",
    "departure_arrival_time_convenient",
    "ease_of_online_booking",
    "gate_location",
    "food_and_drink",
    "online_boarding",
    "seat_comfort",
    "inflight_entertainment",
    "on-board_service",
    "leg_room_service",
    "baggage_handling",
    "checkin_service",
    "inflight_service",
    "cleanliness"
]

In [27]:
#validating rating columns
df[rating_columns].agg(["min", "max"])

Unnamed: 0,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness
min,0,0,0,0,0,0,0,0,0,0,1,0,0,0
max,5,5,5,5,5,5,5,5,5,5,5,5,5,5


All service rating variables fall within the expected 0–5 scale, and no out-of-range values were identified.

In [28]:
#Validating delays
df[["departure_delay_in_minutes", "arrival_delay_in_minutes"]].describe()

Unnamed: 0,departure_delay_in_minutes,arrival_delay_in_minutes
count,103904.0,103904.0
mean,14.815618,15.245072
std,38.230901,38.808674
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,12.0,13.0
max,1592.0,1584.0


Departure and arrival delays are non-negative and highly right-skewed, with most flights on time or minimally delayed (median = 0) and a small number of extreme but plausible delays that should be retained as real operational events. 

In [29]:
#validating age
df["age"].describe()

count    103904.000000
mean         39.379706
std          15.114964
min           7.000000
25%          27.000000
50%          40.000000
75%          51.000000
max          85.000000
Name: age, dtype: float64

Most values are within a realistic human range. Age values were validated and found to lie within a realistic human range (7–85 years), indicating no invalid or implausible entries.

In [30]:
#validating flight_distance
df["flight_distance"].describe()


count    103904.000000
mean       1189.448375
std         997.147281
min          31.000000
25%         414.000000
50%         843.000000
75%        1743.000000
max        4983.000000
Name: flight_distance, dtype: float64

Flight distance values fall within realistic operational ranges, with no zero or negative distances and a right-skewed distribution reflecting a mix of short- and long-haul flights.

In [31]:
#checking
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype
---  ------                             --------------   -----
 0   id                                 103904 non-null  int64
 1   gender                             103904 non-null  str  
 2   customer_type                      103904 non-null  str  
 3   age                                103904 non-null  int64
 4   type_of_travel                     103904 non-null  str  
 5   class                              103904 non-null  str  
 6   flight_distance                    103904 non-null  int64
 7   inflight_wifi_service              103904 non-null  int64
 8   departure_arrival_time_convenient  103904 non-null  int64
 9   ease_of_online_booking             103904 non-null  int64
 10  gate_location                      103904 non-null  int64
 11  food_and_drink                     103904 non-null  int64
 12  online_boardi

In [32]:
#Exporting
df.to_csv(
    "../data/clean/clean_dataset.csv",
    index=False,
    sep=";",
    encoding="utf-8"
)