In [15]:
import pandas as pd

# Load the raw flights dataset from the CSV in the working directory
DATA_PATH = 'airlines_flights_data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,index,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
5,5,Vistara,UK-945,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.33,1,5955
6,6,Vistara,UK-927,Delhi,Morning,zero,Morning,Mumbai,Economy,2.08,1,6060
7,7,Vistara,UK-951,Delhi,Afternoon,zero,Evening,Mumbai,Economy,2.17,1,6060
8,8,GO_FIRST,G8-334,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.17,1,5954
9,9,GO_FIRST,G8-336,Delhi,Afternoon,zero,Evening,Mumbai,Economy,2.25,1,5954


In [16]:
# Show the dtype pandas inferred for each column of the loaded frame
df.dtypes

index                 int64
airline              object
flight               object
source_city          object
departure_time       object
stops                object
arrival_time         object
destination_city     object
class                object
duration            float64
days_left             int64
price                 int64
dtype: object

In [17]:
# Drop the redundant 'index' column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# the column has already been removed.
df = df.drop(columns='index', errors='ignore')

# Keep only the numeric columns
df_num = df.select_dtypes(include='number')

df_num.head(10)

Unnamed: 0,duration,days_left,price
0,2.17,1,5953
1,2.33,1,5953
2,2.17,1,5956
3,2.25,1,5955
4,2.33,1,5955
5,2.33,1,5955
6,2.08,1,6060
7,2.17,1,6060
8,2.17,1,5954
9,2.25,1,5954


In [18]:
# Collect the object-typed (string) columns into their own frame
df_obj = df.select_dtypes(include=['object'])

# Count the distinct values in each of those columns
df_obj.nunique()

airline                6
flight              1561
source_city            6
departure_time         6
stops                  3
arrival_time           6
destination_city       6
class                  2
dtype: int64

In [19]:
# Keep the target column on its own
y = df_obj['class']

# Give every distinct class label an integer code, assigned in sorted label order
classes = y.unique()
mapping = dict(zip(sorted(classes), range(len(classes))))
print("Mapeamento aplicado ao target:", mapping)

# Replace the original labels with their integer codes
y_mapped = y.map(mapping)

Mapeamento aplicado ao target: {'Business': 0, 'Economy': 1}


In [20]:
# 'flight' is left out because of its high cardinality; 'class' is the target
df_cat = df_obj.drop(columns=['flight', 'class'])

# One-hot encode the remaining categorical columns (keeping every level),
# cast the indicator columns to integer 0/1, and place them to the right of
# the numeric columns to form the feature matrix.
X = pd.concat(
    [df_num, pd.get_dummies(df_cat, drop_first=False).astype(int)],
    axis=1,
)

X.head(10)

Unnamed: 0,duration,days_left,price,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,...,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,2.17,1,5953,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
1,2.33,1,5953,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2,2.17,1,5956,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2.25,1,5955,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,2.33,1,5955,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
5,2.33,1,5955,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
6,2.08,1,6060,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
7,2.17,1,6060,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
8,2.17,1,5954,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9,2.25,1,5954,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [21]:
# Check index alignment.
# X must line up row-for-row not only with the numeric frame it was built from
# but also with the target it is paired with in the train/test split, so both
# indexes are compared here.
indexes_aligned = X.index.equals(df_num.index) and X.index.equals(y_mapped.index)
print("Indexes are aligned!" if indexes_aligned else "Indexes are not aligned!")

Indexes are aligned!


In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y_mapped, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training split only
scaler = MinMaxScaler().fit(X_train)

# Apply the training-derived scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Wrap each scaled array in a DataFrame with the original feature names and
# append the matching target values as the last column
train_df = pd.DataFrame(X_train_scaled, columns=X.columns).assign(target=y_train.values)
test_df = pd.DataFrame(X_test_scaled, columns=X.columns).assign(target=y_test.values)

# Write both splits to CSV without the row index
train_df.to_csv('dados_treino.csv', index=False)
test_df.to_csv('dados_teste.csv', index=False)

In [None]:
# Show (rows, columns) of the training frame, target column included
train_df.shape

(240122, 37)

In [28]:
# Preview the first 15 rows of the scaled training frame
train_df.head(15)

Unnamed: 0,duration,days_left,price,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,...,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,target
0,0.379388,0.104167,0.101823,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,0.125918,0.25,0.072438,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,0.415102,0.895833,0.449945,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
3,0.192245,0.208333,0.055934,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,0.523878,0.083333,0.446001,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
5,0.096939,0.479167,0.033624,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
6,0.083469,0.479167,0.033837,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
7,0.440612,0.708333,0.50082,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
8,0.154898,0.625,0.050891,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
9,0.479592,0.291667,0.025794,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [25]:
# Show (rows, columns) of the test frame, target column included
test_df.shape

(60031, 37)

In [30]:
# Preview the first 15 rows of the scaled test frame
test_df.head(15)

Unnamed: 0,duration,days_left,price,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,...,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,target
0,0.386122,0.8125,0.051334,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.183673,0.854167,0.52249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,0.197347,0.833333,0.041733,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,0.27898,0.270833,0.484192,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,0.151429,0.395833,0.044873,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
5,0.204082,0.083333,0.028287,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
6,0.023878,0.125,0.186388,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
7,0.141224,0.9375,0.022588,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
8,0.520408,0.895833,0.255194,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
9,0.394694,0.020833,0.62096,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
