In [10]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [11]:
file = 'airline_passenger_satisfaction.csv'
airline_data = pd.read_csv(file)

In [12]:
airline_data.head()

Unnamed: 0.1,Unnamed: 0,Gender,customer_type,age,type_of_travel,customer_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [13]:
# Here we can see that the arrival_delay_in_minutes has 393 rows with blank data. These should be removed. 
print(airline_data.isna().sum())

Unnamed: 0                             0
Gender                                 0
customer_type                          0
age                                    0
type_of_travel                         0
customer_class                         0
flight_distance                        0
inflight_wifi_service                  0
departure_arrival_time_convenient      0
ease_of_online_booking                 0
gate_location                          0
food_and_drink                         0
online_boarding                        0
seat_comfort                           0
inflight_entertainment                 0
onboard_service                        0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
inflight_service                       0
cleanliness                            0
departure_delay_in_minutes             0
arrival_delay_in_minutes             393
satisfaction                           0
dtype: int64


In [14]:
pre_clean_length = len(airline_data)
print(f"The data length pre-cleaning was {pre_clean_length}")
airline_data_clean = airline_data.dropna()
print(f"The data length post-cleaning was {len(airline_data_clean)}")
difference = pre_clean_length - len(airline_data_clean)
print(f"{difference} rows were removed.")

The data length pre-cleaning was 129880
The data length post-cleaning was 129487
393 rows were removed.


In [15]:
airline_data_clean.to_csv('Tableu_Ready_Airline_Data', index=False)

# Prep data for ML

In [16]:
# Transform binary data into 0s and 1s
airline_data_clean[['Gender']] = airline_data_clean[['Gender']].replace(['Male', 'Female'], [0,1])
airline_data_clean[['customer_type']] = airline_data_clean[['customer_type']].replace(['Loyal Customer', 'disloyal Customer'], [0,1])
airline_data_clean[['type_of_travel']] = airline_data_clean[['type_of_travel']].replace(['Personal Travel', 'Business travel'], [0,1])
airline_data_clean[['satisfaction']] = airline_data_clean[['satisfaction']].replace(['neutral or dissatisfied', 'satisfied'], [0,1])
airline_data_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0.1,Unnamed: 0,Gender,customer_type,age,type_of_travel,customer_class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,...,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
0,0,0,0,13,0,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,0
1,1,0,1,25,1,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,0
2,2,1,0,26,1,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,3,1,0,25,1,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,4,0,0,61,1,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129875,0,1,34,1,Business,526,3,3,3,...,4,3,2,4,4,5,4,0,0.0,0
129876,129876,0,0,23,1,Business,646,4,4,4,...,4,4,5,5,5,5,4,0,0.0,1
129877,129877,1,0,17,0,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,0
129878,129878,0,0,14,1,Business,1127,3,3,3,...,4,3,2,5,4,5,4,0,0.0,1


In [17]:
# One hot encode the object column (customer_class)
airline_data_clean = pd.get_dummies(airline_data_clean)

In [18]:
# Drop extra column
airline_data_clean = airline_data_clean.drop('Unnamed: 0', axis=1)
airline_data_clean

Unnamed: 0,Gender,customer_type,age,type_of_travel,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction,customer_class_Business,customer_class_Eco,customer_class_Eco Plus
0,0,0,13,0,460,3,4,3,1,5,...,4,4,5,5,25,18.0,0,0,0,1
1,0,1,25,1,235,3,2,3,3,1,...,3,1,4,1,1,6.0,0,1,0,0
2,1,0,26,1,1142,2,2,2,2,5,...,4,4,4,5,0,0.0,1,1,0,0
3,1,0,25,1,562,2,5,5,5,2,...,3,1,4,2,11,9.0,0,1,0,0
4,0,0,61,1,214,3,3,3,3,4,...,4,3,3,3,0,0.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,0,1,34,1,526,3,3,3,1,4,...,4,4,5,4,0,0.0,0,1,0,0
129876,0,0,23,1,646,4,4,4,4,4,...,5,5,5,4,0,0.0,1,1,0,0
129877,1,0,17,0,828,2,5,1,5,2,...,4,5,4,2,0,0.0,0,0,1,0
129878,0,0,14,1,1127,3,3,3,3,4,...,5,4,5,4,0,0.0,1,1,0,0


# Supervised ML

In [19]:
airline_data_clean['satisfaction'].value_counts()

0    73225
1    56262
Name: satisfaction, dtype: int64

In [20]:
airline_data_clean.columns

Index(['Gender', 'customer_type', 'age', 'type_of_travel', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'onboard_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'satisfaction', 'customer_class_Business', 'customer_class_Eco',
       'customer_class_Eco Plus'],
      dtype='object')

In [21]:
 # Assign the data to X and y
# Note: Sklearn requires a two-dimensional array of values
# so we use reshape() to create this

X = airline_data_clean[['Gender', 'customer_type', 'age', 'type_of_travel', 'flight_distance',
       'inflight_wifi_service', 'departure_arrival_time_convenient',
       'ease_of_online_booking', 'gate_location', 'food_and_drink',
       'online_boarding', 'seat_comfort', 'inflight_entertainment',
       'onboard_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'inflight_service', 'cleanliness',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes', 
        'customer_class_Business', 'customer_class_Eco',
       'customer_class_Eco Plus']]
y = airline_data_clean['satisfaction']

print("Shape: ", X.shape, y.shape)

Shape:  (129487, 24) (129487,)


In [22]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Train a Logistic Regression model print the model score
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8040570457704783
Testing Data Score: 0.8057271716298036


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 0.9999897029295165
Testing Score: 0.9610156925738292
