# ML MODELS

## Ridge

In [79]:
# Imports básicos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Imports de sklearn
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Visualization settings: apply the 'seaborn-v0_8-darkgrid' matplotlib style
# sheet, then set seaborn's colour palette to "husl". Both calls change global
# plotting state, so their order is left exactly as written.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [80]:
# Load the saturation dataset from the processed-data folder
df = pd.read_csv('../data/processed/weekly_sales_saturation.csv')

# Horizontal rule reused by every section banner below
rule = "=" * 50

# Preview of the first rows
print(rule, "PRIMERAS 5 FILAS DEL DATASET", rule, sep="\n")
print(df.head())

# Overall size and the list of available columns
n_rows, n_cols = df.shape
print("\n" + rule)
print("INFORMACIÓN DEL DATASET")
print(rule)
print(f"Filas: {n_rows}")
print(f"Columnas: {n_cols}")
print(f"\nColumnas disponibles:\n{df.columns.tolist()}")

# Column dtypes, then the per-column count of missing values
for title, report in (
    ("TIPOS DE DATOS", df.dtypes),
    ("VALORES NULOS", df.isna().sum()),
):
    print("\n" + rule)
    print(title)
    print(rule)
    print(report)

PRIMERAS 5 FILAS DEL DATASET
   Digital_sat  TV_sat  OOH_sat  promo  trend  Month  holiday_a  holiday_b  \
0     0.142295     0.0      0.0      0      1      1          1          0   
1     0.296500     0.0      0.0      1      2      1          0          0   
2     0.383901     0.0      0.0      0      3      1          0          0   
3     0.523305     0.0      0.0      1      4      1          0          0   
4     0.461213     0.0      0.0      0      5      2          0          0   

   holiday_c     Sales        Date  
0          0  26129335  2013-01-06  
1          0  49275222  2013-01-13  
2          0  34377765  2013-01-20  
3          0  46040169  2013-01-27  
4          0  38466029  2013-02-03  

INFORMACIÓN DEL DATASET
Filas: 135
Columnas: 11

Columnas disponibles:
['Digital_sat', 'TV_sat', 'OOH_sat', 'promo', 'trend', 'Month', 'holiday_a', 'holiday_b', 'holiday_c', 'Sales', 'Date']

TIPOS DE DATOS
Digital_sat    float64
TV_sat         float64
OOH_sat        float64
pro

### model prep

In [81]:
# Columns used as model inputs. 'holiday_b' and 'holiday_c' are present in the
# loaded frame but are not selected here.
feature_cols = ['Digital_sat', 'TV_sat', 'OOH_sat', 'promo', 'trend', 'Month', 'holiday_a']

# Feature matrix, target, and the date column (kept separately for reporting)
X, y, dates = df[feature_cols], df['Sales'], df['Date']

In [82]:
# Sanity check: report the dimensions of the features, target and dates
for label, obj in (("X", X), ("y", y), ("dates", dates)):
    print(f"{label} shape: {obj.shape}")


X shape: (135, 7)
y shape: (135,)
dates shape: (135,)


In [83]:
# Ordered hold-out split (no shuffling): the first 80% of rows are used for
# training and the remaining 20% for testing.
# Assumes the rows of df are already sorted by date — TODO confirm.
split_idx = int(len(X) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]              # first 80% / last 20% of weeks
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_train, dates_test = dates.iloc[:split_idx], dates.iloc[split_idx:]

# Show the resulting partition sizes
for part in (X_train, X_test):
    print(part.shape)

(108, 7)
(27, 7)


In [84]:
# Standardise the features: the scaler learns its statistics from the training
# rows only, and that same fitted transform is applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))




### Ridge

In [85]:

# Candidate regularisation strengths for Ridge
param_grid = {'alpha': [50, 100, 200, 500, 1000]}

# Time-series cross-validation with 5 splits (validation rows always come
# after the rows used for fitting, in row order)
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the alpha grid, scored by R².
# NOTE(review): X_train_scaled was standardised with a scaler fit on ALL of
# X_train, so every CV fold has seen scaling statistics from its own
# validation rows. Putting the scaler inside a Pipeline would avoid that.
# NOTE(review): in the recorded run the winning alpha is the smallest value in
# the grid, so the optimum may lie below 50 — consider widening the grid.
grid_search = GridSearchCV(Ridge(), param_grid, scoring='r2', n_jobs=1, cv=tscv)
grid_search.fit(X_train_scaled, y_train)

# Winning alpha and its mean cross-validated R²
chosen = grid_search.best_params_
print(f"Mejor alpha: {chosen['alpha']}")
print(f"Mejor R² (CV): {grid_search.best_score_:.4f}")

# Per-alpha summary of the cross-validation scores
results_df = pd.DataFrame(grid_search.cv_results_)
summary_cols = ['param_alpha', 'mean_test_score', 'std_test_score']
print(results_df[summary_cols])


Mejor alpha: 50
Mejor R² (CV): 0.4248
   param_alpha  mean_test_score  std_test_score
0           50         0.424776        0.181724
1          100         0.318765        0.147923
2          200         0.196289        0.122319
3          500         0.058580        0.098097
4         1000        -0.009463        0.091587


In [86]:
# Final model: refit Ridge on the full scaled training set using the alpha the
# grid search selected. Reading it from grid_search (instead of hard-coding a
# copy of the value) keeps this cell correct if the grid or the data change.
# Run this cell directly after the grid-search cell above: a later cell
# reassigns `grid_search` to a different search.
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train_scaled, y_train)

# In-sample and out-of-sample predictions
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

# R² on the held-out test rows
r2 = r2_score(y_test, y_pred_test)
print(r2)

0.26942304368170034


In [87]:
 # 1. Actual vs. predicted values (and their difference) for the first test rows
print("Test Set - Primeras 10 observaciones:")
actual_head = y_test[:10].values
predicted_head = y_pred_test[:10]
comparison = pd.DataFrame({
    'y_real': actual_head,
    'y_pred': predicted_head,
    'error': actual_head - predicted_head,
})
print(comparison)

# 2. Range of the actual test values vs. range of the predictions
actual_lo, actual_hi = y_test.min(), y_test.max()
pred_lo, pred_hi = y_pred_test.min(), y_pred_test.max()
print(f"\nRango y_test: {actual_lo:.0f} - {actual_hi:.0f}")
print(f"Rango y_pred_test: {pred_lo:.0f} - {pred_hi:.0f}")

# 3. Training R² next to the test R² computed earlier
r2_train = r2_score(y_train, y_pred_train)
print(f"\nR² Training: {r2_train:.4f}")
print(f"R² Test: {r2:.4f}")

# 4. Naive baseline: predict the training-set mean for every test row
baseline_pred = np.full(shape=len(y_test), fill_value=y_train.mean())
r2_baseline = r2_score(y_test, baseline_pred)
print(f"R² Baseline (media): {r2_baseline:.4f}")

Test Set - Primeras 10 observaciones:
     y_real        y_pred         error
0  50783355  4.602515e+07  4.758206e+06
1  50548507  4.597827e+07  4.570240e+06
2  39497832  3.631331e+07  3.184524e+06
3  48295825  4.500307e+07  3.292750e+06
4  39936680  3.642484e+07  3.511837e+06
5  52123970  4.556520e+07  6.558767e+06
6  39436926  3.575331e+07  3.683618e+06
7  50662424  4.408906e+07  6.573364e+06
8  39585055  3.549100e+07  4.094058e+06
9  55923085  4.596077e+07  9.962310e+06

Rango y_test: 34879894 - 55923085
Rango y_pred_test: 34914677 - 46025149

R² Training: 0.6719
R² Test: 0.2694
R² Baseline (media): -0.2153


In [88]:
# Experiment: Ridge on a reduced feature set ('holiday_a' is dropped relative
# to feature_cols) and with no standardisation step.
feature_cols_simple = ['Digital_sat', 'TV_sat', 'OOH_sat', 'promo', 'trend', 'Month']
X_simple = df[feature_cols_simple]

# Same ordered 80/20 cut as the earlier split, so y_train / y_test still align
split_idx = int(len(X_simple) * 0.8)
X_train_simple, X_test_simple = X_simple.iloc[:split_idx], X_simple.iloc[split_idx:]

# No scaling is applied in this run.
# NOTE(review): the original rationale was that the features are already
# normalised, but that only holds for the *_sat columns; 'trend' and 'Month'
# are on much larger scales (train means of 54.5 and 6.35 in the recorded
# output), so the Ridge penalty acts unevenly across coefficients — TODO revisit.

# Alpha grid for this run (note it extends down to 10) with 5-split
# time-series cross-validation, scored by R²
param_grid = {'alpha': [10, 50, 100, 200, 500]}
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, scoring='r2', cv=tscv)
grid_search.fit(X_train_simple, y_train)

# Evaluate the refit best estimator on the held-out rows
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test_simple)
r2_test_simple = r2_score(y_test, y_pred_test)

print(f"Mejor alpha: {grid_search.best_params_['alpha']}")
print(f"R² CV: {grid_search.best_score_:.4f}")
print(f"R² Test: {r2_test_simple:.4f}")

Mejor alpha: 10
R² CV: 0.3133
R² Test: -0.1239


In [89]:
# Compare the training and test partitions to look for distribution shift
print("=" * 60)
print("COMPARACIÓN TRAIN vs TEST")
print("=" * 60)

# 1. Basic statistics of the target in each partition
for header, sales in (("\nSales - Train:", y_train), ("\nSales - Test:", y_test)):
    print(header)
    print(f"  Media: {sales.mean():,.0f}")
    print(f"  Std: {sales.std():,.0f}")
    print(f"  Min: {sales.min():,.0f}")
    print(f"  Max: {sales.max():,.0f}")

# 2. Per-feature mean in train vs. test, with the relative change in percent
print("\n" + "=" * 60)
print("FEATURES - Comparación de medias")
print("=" * 60)
for col in feature_cols_simple:
    train_mean = X_train_simple[col].mean()
    test_mean = X_test_simple[col].mean()
    if train_mean != 0:
        diff_pct = (test_mean - train_mean) / train_mean * 100
    elif test_mean == 0:
        # Both means are zero: there is genuinely no change
        diff_pct = 0.0
    else:
        # A relative change from a zero baseline is undefined; show NaN
        # instead of 0 so a real shift is not reported as "no change"
        diff_pct = np.nan
    print(f"{col:15} | Train: {train_mean:8.4f} | Test: {test_mean:8.4f} | Diff: {diff_pct:+6.1f}%")

# 3. Date span covered by each partition
print("\n" + "=" * 60)
print("PERÍODO DEL TEST SET")
print("=" * 60)
dates_train = dates[:split_idx]
dates_test = dates[split_idx:]
print(f"Train: {dates_train.iloc[0]} a {dates_train.iloc[-1]}")
print(f"Test:  {dates_test.iloc[0]} a {dates_test.iloc[-1]}")

COMPARACIÓN TRAIN vs TEST

Sales - Train:
  Media: 42,888,583
  Std: 8,020,733
  Min: 26,129,335
  Max: 75,371,329

Sales - Test:
  Media: 45,970,878
  Std: 6,769,637
  Min: 34,879,894
  Max: 55,923,085

FEATURES - Comparación de medias
Digital_sat     | Train:   0.4615 | Test:   0.4754 | Diff:   +3.0%
TV_sat          | Train:   0.2781 | Test:   0.0885 | Diff:  -68.2%
OOH_sat         | Train:   0.3046 | Test:   0.0000 | Diff: -100.0%
promo           | Train:   0.5278 | Test:   0.5556 | Diff:   +5.3%
trend           | Train:  54.5000 | Test: 122.0000 | Diff: +123.9%
Month           | Train:   6.3519 | Test:   4.5926 | Diff:  -27.7%

PERÍODO DEL TEST SET
Train: 2013-01-06 a 2015-01-25
Test:  2015-02-01 a 2015-08-02


**TODO:** revisar mi notebook 01 para ver por qué en TV y OOH solo puse 3 spends.