In [1]:
import itertools
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input

In [2]:
# Load the provided train/test splits and stack them into one frame so the
# whole dataset goes through the same preprocessing steps.
train = pd.read_csv("Dataset/train.csv", encoding="ISO-8859-1")
test = pd.read_csv("Dataset/test.csv", encoding="ISO-8859-1")
# ignore_index=True: each CSV is read with its own 0-based index, so stacking
# them as-is would leave repeated row labels in `df` (and in the index column
# of the exported file), which makes label-based selection and alignment
# ambiguous later on.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.to_csv("Dataset/raw_dataset.csv")

In [3]:
# (rows, columns) of the combined train + test frame
df.shape

(129880, 25)

In [4]:
# Summary statistics of the numeric columns before any cleaning
df.describe()

Unnamed: 0.1,Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,44158.7,64940.5,39.427957,1190.316392,2.728696,3.057599,2.756876,2.976925,3.204774,3.252633,3.441361,3.358077,3.383023,3.350878,3.632114,3.306267,3.642193,3.286326,14.713713,15.091129
std,31207.377062,37493.270818,15.11936,997.452477,1.32934,1.526741,1.40174,1.27852,1.329933,1.350719,1.319289,1.334049,1.287099,1.316252,1.180025,1.266185,1.176669,1.313682,38.071126,38.46565
min,0.0,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,16234.75,32470.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,38963.5,64940.5,40.0,844.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,71433.25,97410.25,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,103903.0,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [5]:
# 1. Drop the columns that are not useful as features
#    ("Unnamed: 0" looks like a row counter saved with the CSV; "id" is an identifier).
useless_columns = ["Unnamed: 0", "id"]
df.drop(columns=useless_columns, inplace=True)

In [6]:
# 2. Discard every row that contains at least one missing value
df.dropna(axis=0, how="any", inplace=True)

In [7]:
# 3. Encode the target column: 1 = satisfied, 0 = neutral or dissatisfied
satisfaction_codes = {"satisfied": 1, "neutral or dissatisfied": 0}
df["satisfaction"] = df["satisfaction"].map(satisfaction_codes)


In [8]:
# 4. Pearson correlation between the numeric columns.
# numeric_only=True is passed explicitly because the frame still holds text
# columns (Gender, Customer Type, Type of Travel, Class) at this point; newer
# pandas versions no longer drop them silently when the argument is omitted.
# The matrix is the cell's last expression so it renders as a table instead
# of line-wrapped printed text.
df.corr(method="pearson", numeric_only=True)

  print(df.corr(method ='pearson'))


                                        Age  Flight Distance  \
Age                                1.000000         0.099863   
Flight Distance                    0.099863         1.000000   
Inflight wifi service              0.015779         0.006554   
Departure/Arrival time convenient  0.036780        -0.018901   
Ease of Online booking             0.022294         0.064959   
Gate location                     -0.000709         0.005378   
Food and drink                     0.023283         0.057136   
Online boarding                    0.207485         0.215082   
Seat comfort                       0.159229         0.157825   
Inflight entertainment             0.074990         0.130518   
On-board service                   0.056743         0.111224   
Leg room service                   0.038992         0.134548   
Baggage handling                  -0.048192         0.064810   
Checkin service                    0.033182         0.073635   
Inflight service                  -0.051

In [9]:
# 4. Drop these features because they show little correlation with the value
#    to predict (satisfaction). TODO: add the correlation values to back this up
low_correlation_columns = [
    "Departure/Arrival time convenient",
    "Gate location",
    "Departure Delay in Minutes",
    "Arrival Delay in Minutes",
]
df.drop(columns=low_correlation_columns, inplace=True)

In [10]:
# For every remaining column, print its name, number of distinct values and dtype
for col_name, col_values in df.items():
    print(f"columna {col_name}", col_values.nunique(), col_values.dtype)

columna Gender 2 object
columna Customer Type 2 object
columna Age 75 int64
columna Type of Travel 2 object
columna Class 3 object
columna Flight Distance 3821 int64
columna Inflight wifi service 6 int64
columna Ease of Online booking 6 int64
columna Food and drink 6 int64
columna Online boarding 6 int64
columna Seat comfort 6 int64
columna Inflight entertainment 6 int64
columna On-board service 6 int64
columna Leg room service 6 int64
columna Baggage handling 5 int64
columna Checkin service 6 int64
columna Inflight service 6 int64
columna Cleanliness 6 int64
columna satisfaction 2 int64


In [11]:
# 5. One-hot encoding of the categorical features
categorical_columns = ["Gender", "Customer Type", "Type of Travel", "Class"]

# One indicator column per category value, named after the value itself.
encoded_columns = pd.concat(
    [pd.get_dummies(df[name]) for name in categorical_columns],
    axis=1,
)
satisfaction = df["satisfaction"]

# Rebuild the frame as: remaining features, indicator columns, target last.
df.drop(columns=categorical_columns + ["satisfaction"], inplace=True)
df = pd.concat([df, encoded_columns, satisfaction], axis=1)


In [12]:
# 6. Outlier handling
def outliers_removal(column, frame=None, lower=0.01, upper=0.99):
    """Return the rows whose value in ``column`` lies between two quantiles.

    Parameters
    ----------
    column : label
        Name of the column used to decide which rows are kept.
    frame : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``df`` is used, which
        keeps existing ``outliers_removal(column)`` calls working unchanged.
    lower, upper : float, optional
        Quantiles (between 0 and 1) that bound the kept range, both inclusive.
        Defaults keep the values between the 1st and the 99th percentile.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the input frame itself is not modified.

    Raises
    ------
    ValueError
        If the quantiles are not ordered within [0, 1].
    """
    if frame is None:
        frame = df  # fall back to the notebook-level frame
    if not 0.0 <= lower <= upper <= 1.0:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower <= upper <= 1, got {lower} and {upper}"
        )

    values = frame[column]
    low_bound = values.quantile(lower)
    high_bound = values.quantile(upper)
    return frame[(values <= high_bound) & (values >= low_bound)]


# Apply the percentile filter column by column. Each pass works on the rows
# that survived the previous one, so the bounds are recomputed as the frame shrinks.
for column in list(df.columns):
    # Skip indicator-like columns (one-hot dummies and the 0/1 target): a
    # percentile cut has no meaning there, and if one value covered less than
    # 1% of the rows the filter would silently drop every row carrying it.
    if df[column].nunique() <= 2:
        continue
    df = outliers_removal(column)

In [13]:
# Summary statistics after encoding and outlier filtering
df.describe()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,...,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus,satisfaction
count,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,...,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0,124855.0
mean,39.229482,1174.666693,2.737271,2.772096,3.209547,3.252997,3.441913,3.360666,3.387169,3.367298,...,0.505667,0.494333,0.813512,0.186488,0.692083,0.307917,0.478203,0.449874,0.071923,0.435089
std,14.712553,961.19153,1.324824,1.388592,1.326763,1.351838,1.321537,1.333619,1.285225,1.299235,...,0.49997,0.49997,0.389502,0.389502,0.461634,0.461634,0.499527,0.497483,0.258362,0.495771
min,8.0,101.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,421.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,40.0,846.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,1726.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
max,70.0,3884.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# 5. Min-max normalisation of every column
# NOTE(review): the scaler is fitted on the whole dataset; if a train/test
# split is done downstream, consider fitting it on the training part only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_normalized = pd.DataFrame(scaled_values, columns=df.columns)

In [15]:
# Display the normalised frame
df_normalized

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,...,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus,satisfaction
0,0.080645,0.094898,0.6,0.6,1.00,0.6,1.00,1.00,0.75,0.50,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.274194,0.035422,0.6,0.6,0.00,0.6,0.00,0.00,0.00,1.00,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.290323,0.275178,0.4,0.4,1.00,1.0,1.00,1.00,0.75,0.50,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0.274194,0.121861,0.4,1.0,0.25,0.4,0.25,0.25,0.25,1.00,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.854839,0.029870,0.6,0.6,0.75,1.0,1.00,0.50,0.50,0.75,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124850,0.419355,0.112345,0.6,0.6,0.75,0.6,0.75,0.75,0.50,0.25,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
124851,0.241935,0.144066,0.8,0.8,0.75,0.8,0.75,0.75,0.75,1.00,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
124852,0.145161,0.192176,0.4,0.2,0.25,0.2,0.25,0.25,0.75,0.50,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
124853,0.096774,0.271213,0.6,0.6,0.75,0.8,0.75,0.75,0.50,0.25,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [16]:
# Number of rows per target class (counted on the un-normalised frame)
df["satisfaction"].value_counts()

0    70532
1    54323
Name: satisfaction, dtype: int64

In [17]:
# Write the normalised dataset to disk without the index column
df_normalized.to_csv('df.csv', index=False)