### VIVA-DataSky


In [None]:
#%% MODULES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
#%%
# FIRST MODEL: find which product types correspond to each flight
# STEP 1: Load the raw flights-and-sales table
data = pd.read_csv('Flights and Sales.csv')
#%%
# STEP 2: One-hot encode the categorical columns, then keep only the rows
# that have a PASSENGERS value (per the original note, the rows without one
# are the 2024 flights).
data_encoded = (
    pd.get_dummies(data, columns=['PRODUCTTYPE', 'DESTINATION_TYPE'])
    .dropna(subset=["PASSENGERS"])
)
#%%
# Every one-hot PRODUCTTYPE column is used, in turn, as a binary target.
# The names are read from data_encoded (in column order) instead of being
# rebuilt from a set of raw values: the run order is then deterministic and a
# missing (NaN) product type cannot break the string concatenation.
columns_to_predict = [column for column in data_encoded.columns if column.startswith("PRODUCTTYPE_")]
differences = []  # One list per target with the (actual, predicted) pairs that disagree

# STEP 3: Setting our features (X) - Only numerical columns
# All PRODUCTTYPE dummies are removed, not just the current target: the
# remaining dummies determine the target exactly (it is 1 precisely when all
# the others are 0), which would leak the answer into the model.
# NOTE(review): QUANTITY is still used as a feature here - confirm that is intended.
non_feature_columns = ['AERONAVE', 'FLIGHT_ID', "PRODUCTNAME", "DEPARTURESTATION", "FECHA_SALIDA"] + columns_to_predict
# The feature frame is the same for every target, so it is built once.
X = data_encoded.drop(columns=non_feature_columns)

for column in columns_to_predict:
    # STEP 4: Setting our target variable for predicting
    y = data_encoded[column]

    # STEP 5: Splitting Training (80%) and Testing (20%) dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # STEP 6: Creating and training logistic regression model with the required iterations and training data
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # STEP 7: Making the prediction for the target variable
    y_pred = model.predict(X_test)

    # STEP 8: Setting metrics for analyzing results
    # Keep only the mismatching (actual, predicted) pairs for later inspection
    differences.append([(a, b) for a, b in zip(y_test, y_pred) if a != b])
    print(f"Classification Report for {column}:")
    print(classification_report(y_test, y_pred))
    print(f'Accuracy: {model.score(X_test, y_test)}')

In [None]:

# SECOND MODEL FOR GETTING THE QUANTITY FOR EACH PRODUCT
# Using a Gradient Boosting model (an ensemble of decision trees).
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#%%
# STEP 1 : Getting the data
# STEP 2 : Setting features (numerical variables) and target variable
# The PRODUCTTYPE dummies are found by prefix rather than by a hard-coded
# list, so a category that is absent (or a new one) cannot raise KeyError or
# slip into the features unnoticed.
product_type_columns = [c for c in data_encoded.columns if c.startswith("PRODUCTTYPE_")]
# QUANTITY is the target, so it must not also be a feature: leaving it in
# lets the model simply copy the answer and makes the reported error meaningless.
featureColumns = data_encoded.drop(columns=product_type_columns + [
    "QUANTITY", 'AERONAVE', 'FLIGHT_ID', "PRODUCTNAME", "DEPARTURESTATION", "FECHA_SALIDA"])
X = featureColumns
y = data_encoded["QUANTITY"]
# STEP 3: Splitting training (80%) and testing (20%) data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#%%
# predictsFor2024 = data_encoded[data_encoded["PASSENGERS"].isna()] # Excluding all 2024 flights
#%%
# STEP 4 : Defining and training the Gradient Boosting model
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_regressor.fit(X_train, y_train)
#%%
# STEP 5: Predicting the target variable
y_pred = gb_regressor.predict(X_test)
#%%
# STEP 6 : Rounding the predictions to the nearest whole number
y_pred_rounded = np.round(y_pred)
print("Rounded Predictions:", y_pred_rounded)
#%%
# STEP 7: Evaluating the model's performance on the rounded predictions
mse = mean_squared_error(y_test, y_pred_rounded)
print("Mean Squared Error:", mse)
# %%
# READING INPUT FLIGHTS
data = pd.read_csv('predicciones_2024.csv')
# %%
# PREPROCESSING DATA FOR MODEL
# The regressor must receive exactly the columns it was trained on
# (featureColumns.columns), in the same order.
# NOTE(review): PASSENGERS is overwritten with 0 for every input flight -
# confirm this placeholder is what the model should see at inference time.
data["PASSENGERS"] = 0
# A QUANTITY placeholder is only needed when the trained feature set contains it.
if "QUANTITY" in featureColumns.columns:
    data["QUANTITY"] = 0
data = pd.get_dummies(data, columns=['DESTINATION_TYPE'])
# Destination categories seen in training but absent from this input file get
# an all-zero dummy column. The list comes from the trained feature set, so it
# cannot drift out of sync with the training data.
missing_columns = [
    column for column in featureColumns.columns
    if str(column).startswith("DESTINATION_TYPE_") and column not in data.columns
]
for column in missing_columns:
    data[column] = 0
# Any other training column missing from the input still raises KeyError here,
# instead of being silently filled in.
data = data[featureColumns.columns]
#%%
# PREDICTING 5 DAYS FOR 2024
y_pred = gb_regressor.predict(data)
# %%