# Dependencies

In [None]:
!pip install numpy==1.22.4
!pip install matplotlib==3.7.1
!pip install mglearn==0.2.0
!pip install pandas==2.0.3
!pip install scikit-learn==1.3.2
!pip install seaborn==0.13.1

In [None]:
!pip freeze

# Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans

In [2]:
df=pd.read_csv("airline_satisfaction.csv")

In [3]:
y = df.satisfaction.values

categorical_variables = ["Gender","Customer Type","Type of Travel","Class"]

quantitative_variables = ['Age','Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes']

ordinal_variables = ["Inflight wifi service", "Departure/Arrival time convenient","Ease of Online booking", 
                     "Gate location","Food and drink","Online boarding","Seat comfort","Inflight entertainment",
                     "On-board service","Leg room service","Baggage handling","Checkin service","Inflight service",
                     "Cleanliness"]

X = df[categorical_variables + quantitative_variables + ordinal_variables].values
X_quantitative = df[quantitative_variables].values
X_categorical = df[categorical_variables].values
X_ordinal = df[ordinal_variables].values

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


# Data Preprocessing

## Encoding Variables

### One-Hot Encoding
for quantitative variables

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
one_hot_encoder = OneHotEncoder()

In [7]:
one_hot_encoder.fit(X_categorical)
one_hot_encoded_X = one_hot_encoder.transform(X_categorical)

In [8]:
one_hot_encoded_X.toarray()

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [9]:
one_hot_encoder.get_feature_names_out()

array(['x0_Female', 'x0_Male', 'x1_Loyal Customer',
       'x1_disloyal Customer', 'x2_Business travel', 'x2_Personal Travel',
       'x3_Business', 'x3_Eco', 'x3_Eco Plus'], dtype=object)

In [10]:
one_hot_encoded_X

<129880x9 sparse matrix of type '<class 'numpy.float64'>'
	with 519520 stored elements in Compressed Sparse Row format>

In [11]:
df_quantitative=df[quantitative_variables]

In [12]:
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols)

print("Number of new columns generated after one-hot encoding:", df_encoded.shape[1] - df.shape[1])

Number of new columns generated after one-hot encoding: 6


In [13]:
X_encoded = pd.concat([df_quantitative, df_encoded], axis=1)

### Ordinal Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
label_encoder = LabelEncoder()

In [16]:
encoded_vars = []
for ordinal_var in X_ordinal.T:
    encoded_vars.append(label_encoder.fit_transform(ordinal_var))
    print(f"===== Encoded classes {ordinal_var}: {label_encoder.classes_}")

===== Encoded classes [3 3 2 ... 2 3 2]: [0 1 2 3 4 5]
===== Encoded classes [4 2 2 ... 5 3 5]: [0 1 2 3 4 5]
===== Encoded classes [3 3 2 ... 1 3 2]: [0 1 2 3 4 5]
===== Encoded classes [1 3 2 ... 5 3 5]: [0 1 2 3 4 5]
===== Encoded classes [5 1 5 ... 2 4 4]: [0 1 2 3 4 5]
===== Encoded classes [3 3 5 ... 1 4 2]: [0 1 2 3 4 5]
===== Encoded classes [5 1 5 ... 2 4 2]: [0 1 2 3 4 5]
===== Encoded classes [5 1 5 ... 2 4 1]: [0 1 2 3 4 5]
===== Encoded classes [4 1 4 ... 4 3 1]: [0 1 2 3 4 5]
===== Encoded classes [3 5 3 ... 3 2 2]: [0 1 2 3 4 5]
===== Encoded classes [4 3 4 ... 4 5 1]: [1 2 3 4 5]
===== Encoded classes [4 1 4 ... 5 4 1]: [0 1 2 3 4 5]
===== Encoded classes [5 4 4 ... 4 5 1]: [0 1 2 3 4 5]
===== Encoded classes [5 1 5 ... 2 4 1]: [0 1 2 3 4 5]


In [17]:
X_ordinal = np.array(encoded_vars).T

In [18]:
X_ordinal

array([[3, 4, 3, ..., 4, 5, 5],
       [3, 2, 3, ..., 1, 4, 1],
       [2, 2, 2, ..., 4, 4, 5],
       ...,
       [2, 5, 1, ..., 5, 4, 2],
       [3, 3, 3, ..., 4, 5, 4],
       [2, 5, 2, ..., 1, 1, 1]])

In [19]:
ordinal_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in ordinal_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [20]:
X_labelled_categories = np.concatenate([df[col].values.reshape(-1, 1) for col in ordinal_cols], axis=1)

In [21]:
X_encoded=np.concatenate([X_ordinal, X_labelled_categories],axis=1)

In [22]:
print(X_encoded)

[[3 4 3 ... 1 2 0]
 [3 2 3 ... 0 0 0]
 [2 2 2 ... 0 0 1]
 ...
 [2 5 1 ... 1 1 0]
 [3 3 3 ... 0 0 1]
 [2 5 2 ... 1 1 0]]


## Imputing Missing Values

In [23]:
from sklearn.impute import SimpleImputer

In [24]:
simple_imputer = SimpleImputer(strategy="mean")
simple_imputer.fit(X_quantitative)

X_imputed = simple_imputer.transform(X_quantitative)

In [25]:
X_imputed

array([[1.300e+01, 4.600e+02, 2.500e+01, 1.800e+01],
       [2.500e+01, 2.350e+02, 1.000e+00, 6.000e+00],
       [2.600e+01, 1.142e+03, 0.000e+00, 0.000e+00],
       ...,
       [1.700e+01, 8.280e+02, 0.000e+00, 0.000e+00],
       [1.400e+01, 1.127e+03, 0.000e+00, 0.000e+00],
       [4.200e+01, 2.640e+02, 0.000e+00, 0.000e+00]])

trash

In [None]:
model = KMeans(n_clusters=3)

In [None]:
model.fit(X_quantitative)

In [None]:
all_predictions = model.predict(df.data)

In [None]:
print(all_predictions)

In [None]:
from sklearn import datasets
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
model = TSNE(learning_rate=100)

In [None]:
transformed = model.fit_transform(df)

In [None]:
x_axis = transformed[:, 0]
y_axis = transformed[:, 1]

plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()