In [None]:
# Instalación de dependencias necesarias
!pip install kagglehub[pandas-datasets] plotly pandas scikit-learn xgboost tensorflow shap

In [None]:
# importación de librerías
import kagglehub
from kagglehub import KaggleDatasetAdapter
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU
import shap
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Configuration: Kaggle dataset handle and the CSV files to read from it
kaggle_handler = "jaklinmalkoc/predict-future-sales-retail-dataset-en"

# Logical table name -> CSV file name inside the dataset (every file is "<table>.csv")
kaggle_paths = {
    table: f"{table}.csv"
    for table in ("item_categories", "items", "sales_train", "shops", "test")
}

# Container for the loaded DataFrames; starts empty
data_sets: dict[str, DataFrame] = dict()

In [None]:
# Load every CSV of the Kaggle dataset into a pandas DataFrame.
# `dataset_load` is the current kagglehub entry point; `load_dataset` is its deprecated alias.
# Entries stay keyed by file name (e.g. "sales_train.csv") because later cells look them up that way.
for kaggle_path in kaggle_paths.values():
  data_sets[kaggle_path] = kagglehub.dataset_load(
      KaggleDatasetAdapter.PANDAS,
      kaggle_handler,
      path=kaggle_path
  )

# Preview the first rows of each dataset (rich notebook rendering instead of plain text)
for dataset_name, dataset in data_sets.items():
  print(f"{dataset_name}:")
  display(dataset.head())

In [None]:
# Build the working sales table with 'date' parsed as datetime.
# `.assign` returns a new DataFrame, so the raw frame kept in `data_sets` is left untouched
# and re-running this cell always starts from the same input.
# `dayfirst=True` makes pandas read ambiguous day/month strings with the day first.
sales_train = data_sets["sales_train.csv"].assign(
    date=lambda raw_sales: pd.to_datetime(raw_sales['date'], dayfirst=True)
)


# Gráficos

## Histograma de precios (item_price)

In [None]:
# Distribution of 'item_price' over all sales rows (at most 50 bins)
fig_price = (
    px.histogram(
        sales_train,
        x='item_price',
        nbins=50,
        title='Distribución de Precios de los Productos (item_price)',
        labels={'item_price': 'Precio del producto', 'count': 'Frecuencia'},
        color_discrete_sequence=['#4A90E2'],  # soft blue
    )
    # Print each bar's count above the bar
    .update_traces(texttemplate='%{y}', textposition='outside')
    # Explicit axis titles; title_x=0.5 centers the figure title
    .update_layout(
        title_x=0.5,
        xaxis_title='Precio del producto',
        yaxis_title='Frecuencia de transacciones',
    )
)

fig_price.show()

## Histograma de cantidad vendida por día (item_cnt_day)

In [None]:
# Distribution of 'item_cnt_day' over all sales rows (at most 50 bins)
fig_cnt = (
    px.histogram(
        sales_train,
        x='item_cnt_day',
        nbins=50,
        title='Distribución de Cantidad Vendida por Día (item_cnt_day)',
        labels={'item_cnt_day': 'Cantidad vendida por día', 'count': 'Frecuencia'},
        color_discrete_sequence=['#4A90E2'],
    )
    # Print each bar's count above the bar
    .update_traces(texttemplate='%{y}', textposition='outside')
    # Explicit axis titles; title_x=0.5 centers the figure title
    .update_layout(
        title_x=0.5,
        xaxis_title='Unidades vendidas por día (item_cnt_day)',
        yaxis_title='Frecuencia de observaciones',
    )
)

fig_cnt.show()

## Tendencia temporal de ventas por fecha

In [None]:
# Total units sold per calendar date: one row per date, with 'date' kept as a regular column
daily_sales = sales_train.groupby('date', as_index=False)['item_cnt_day'].sum()

# Line chart of the daily totals; title_x=0.5 centers the figure title
fig_trend = px.line(
    daily_sales,
    x='date',
    y='item_cnt_day',
    title='Tendencia Temporal de Ventas Diarias',
    labels={'date': 'Fecha', 'item_cnt_day': 'Cantidad total vendida'},
).update_layout(
    title_x=0.5,
    xaxis_title='Fecha',
    yaxis_title='Unidades totales vendidas por día',
)

fig_trend.show()

## Matriz de correlación entre item_price y item_cnt_day

In [None]:
# Pairwise correlation (pandas default method) between unit price and units sold
corr = sales_train[['item_price', 'item_cnt_day']].corr()

# Human-readable axis labels, in the same order as the columns selected above
corr_labels = ['Precio (item_price)', 'Cantidad (item_cnt_day)']

# Annotated heatmap.
# The cell text is passed explicitly, rounded to 3 decimals: when `annotation_text`
# is omitted the raw float values are printed with all their digits.
fig_heatmap = ff.create_annotated_heatmap(
    z=corr.values,
    x=corr_labels,
    y=corr_labels,
    annotation_text=corr.round(3).values.tolist(),
    showscale=True  # show the colour bar so the colours can be read as values
)

fig_heatmap.update_layout(
    title='Mapa de Calor de Correlación entre Precio y Cantidad',
    xaxis_title='Variables',
    yaxis_title='Variables',
    title_x=0.5
)

fig_heatmap.show()

# Métricas y análisis

In [None]:
# Build the feature matrix X and the target y, then hold out the last 20% of rows for validation.
# shuffle=False splits purely by row position, so the rows are ordered by date first
# (stable sort: rows sharing a date keep their original relative order). That makes the
# hold-out the most recent period regardless of how the source CSV happened to be ordered.
sales_sorted = sales_train.sort_values('date', kind='stable')
X = sales_sorted.drop(columns=['item_cnt_day'])
y = sales_sorted['item_cnt_day']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Data preparation
# - 'date' is dropped because it is not numeric and the estimators below need numeric input.
# - If temporal information is needed, derive features from it (year, month, day, weekday).
# - Check for and handle missing values before training (imputation or removal).
# - Scale-sensitive models (SVR, MLP, LSTM, GRU) are fed standardized features; the scaler
#   is fitted on the training split only, so no validation statistics leak into training.
X_train_numeric = X_train.drop(columns=['date'])
X_val_numeric = X_val.drop(columns=['date'])

# Imports needed only by this cell
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.layers import Input

SEED = 0                     # shared seed, same value the estimators below already used
SVR_MAX_TRAIN_ROWS = 10_000  # cap on the rows used to fit the SVR (see the SVR section)

# Seeds Python, NumPy and TensorFlow so the Keras models are reproducible between runs
tf.keras.utils.set_random_seed(SEED)

# Standardized copies of the features (float32 is what Keras computes in by default)
scaler = StandardScaler().fit(X_train_numeric)
X_train_scaled = scaler.transform(X_train_numeric).astype('float32')
X_val_scaled = scaler.transform(X_val_numeric).astype('float32')
y_train_array = y_train.to_numpy(dtype='float32')
n_features = X_train_scaled.shape[1]


def _build_recurrent_regressor(recurrent_layer, n_inputs):
    """Return a compiled single-timestep regressor: recurrent_layer(64, relu) -> Dense(1).

    recurrent_layer: Keras recurrent layer class to instantiate (LSTM or GRU here).
    n_inputs: number of features per timestep.
    The model is compiled with the Adam optimizer and mean-squared-error loss.
    """
    model = Sequential([
        Input(shape=(1, n_inputs)),  # explicit Input instead of input_shape= on the first layer
        recurrent_layer(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model


# -------------------------
# LINEAR REGRESSION
# -------------------------
lr = LinearRegression()
lr.fit(X_train_numeric, y_train)
pred_lr = lr.predict(X_val_numeric)

# -------------------------
# SVR (Support Vector Regressor)
# -------------------------
# Kernel SVR training time grows more than quadratically with the number of samples,
# so it is fitted on a reproducible random subsample of the training split.
n_train_rows = len(X_train_scaled)
if n_train_rows > SVR_MAX_TRAIN_ROWS:
    svr_rows = np.random.default_rng(SEED).choice(
        n_train_rows, size=SVR_MAX_TRAIN_ROWS, replace=False
    )
else:
    svr_rows = np.arange(n_train_rows)

svr = SVR()
svr.fit(X_train_scaled[svr_rows], y_train.to_numpy()[svr_rows])
pred_svr = svr.predict(X_val_scaled)

# -------------------------
# RANDOM FOREST
# -------------------------
# n_jobs=-1 builds the trees on all CPU cores
rf = RandomForestRegressor(n_estimators=200, random_state=SEED, n_jobs=-1)
rf.fit(X_train_numeric, y_train)
pred_rf = rf.predict(X_val_numeric)

# -------------------------
# XGBOOST
# -------------------------
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED
)
xgb_model.fit(X_train_numeric, y_train)
pred_xgb = xgb_model.predict(X_val_numeric)

# -------------------------
# MLP (dense neural network)
# -------------------------
mlp = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=200, random_state=SEED)
mlp.fit(X_train_scaled, y_train)
pred_mlp = mlp.predict(X_val_scaled)

# -------------------------
# LSTM
# -------------------------
# Recurrent layers expect (samples, timesteps, features); each row becomes a 1-step sequence
X_train_lstm = X_train_scaled.reshape(len(X_train_scaled), 1, n_features)
X_val_lstm = X_val_scaled.reshape(len(X_val_scaled), 1, n_features)

model_lstm = _build_recurrent_regressor(LSTM, n_features)
model_lstm.fit(X_train_lstm, y_train_array, epochs=10, batch_size=32, verbose=0)
pred_lstm = model_lstm.predict(X_val_lstm).flatten()

# -------------------------
# GRU
# -------------------------
# Same 3-D inputs as the LSTM
model_gru = _build_recurrent_regressor(GRU, n_features)
model_gru.fit(X_train_lstm, y_train_array, epochs=10, batch_size=32, verbose=0)
pred_gru = model_gru.predict(X_val_lstm).flatten()

## Cálculo de métricas y tabla

In [None]:
# Metric computation
def calcular_metricas(y_true, y_pred):
    """Score predictions against the true target values.

    Parameters:
        y_true: true target values.
        y_pred: predicted values, aligned with ``y_true``.

    Returns:
        Tuple ``(mae, rmse, mape, r2)``: mean absolute error, root mean squared
        error (square root of the MSE), mean absolute percentage error exactly
        as scikit-learn returns it, and the R² score.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
        mean_absolute_percentage_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Score every model on the validation split, in the order the table lists them
nombres_modelos = ['Regresión Lineal', 'SVR', 'Random Forest', 'XGBoost', 'MLP', 'LSTM', 'GRU']
predicciones = [pred_lr, pred_svr, pred_rf, pred_xgb, pred_mlp, pred_lstm, pred_gru]
resultados = [calcular_metricas(y_val, prediccion) for prediccion in predicciones]

# Expose each model's (MAE, RMSE, MAPE, R²) under its own variable names
(
    (mae_lr, rmse_lr, mape_lr, r2_lr),
    (mae_svr, rmse_svr, mape_svr, r2_svr),
    (mae_rf, rmse_rf, mape_rf, r2_rf),
    (mae_xgb, rmse_xgb, mape_xgb, r2_xgb),
    (mae_mlp, rmse_mlp, mape_mlp, r2_mlp),
    (mae_lstm, rmse_lstm, mape_lstm, r2_lstm),
    (mae_gru, rmse_gru, mape_gru, r2_gru),
) = resultados

# Metrics table: transpose the per-model tuples into one list per metric
col_mae, col_rmse, col_mape, col_r2 = (list(columna) for columna in zip(*resultados))
metricas = {
    'Modelo': nombres_modelos,
    'MAE': col_mae,
    'RMSE': col_rmse,
    'MAPE': col_mape,
    'R²': col_r2,
}

# Lowest RMSE first
df_metricas = pd.DataFrame(metricas).sort_values(by='RMSE')
print(df_metricas)