# ML MODELS

## Ridge

In [80]:
# Imports básicos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Imports de sklearn
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Plot styling: apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet first,
# then set seaborn's default colour palette to "husl".
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases (3.6+, to be confirmed against the project's environment).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [81]:
# Load the saturation dataset from the processed-data folder
df = pd.read_csv('../data/processed/weekly_sales_saturation.csv')

# Section 1: preview of the first rows
print("=" * 50, "PRIMERAS 5 FILAS DEL DATASET", "=" * 50, sep="\n")
print(df.head())

# Section 2: dimensions and column names
print("\n" + "=" * 50, "INFORMACIÓN DEL DATASET", "=" * 50, sep="\n")
print(f"Filas: {df.shape[0]}", f"Columnas: {df.shape[1]}", sep="\n")
print(f"\nColumnas disponibles:\n{df.columns.tolist()}")

# Section 3: dtype of every column
print("\n" + "=" * 50, "TIPOS DE DATOS", "=" * 50, sep="\n")
print(df.dtypes)

# Section 4: count of missing values per column
print("\n" + "=" * 50, "VALORES NULOS", "=" * 50, sep="\n")
print(df.isna().sum())

PRIMERAS 5 FILAS DEL DATASET
   Digital_sat  TV_sat  OOH_sat  promo  trend  Month  holiday_a  holiday_b  \
0     0.142295     0.0      0.0      0      1      1          1          0   
1     0.296500     0.0      0.0      1      2      1          0          0   
2     0.383901     0.0      0.0      0      3      1          0          0   
3     0.523305     0.0      0.0      1      4      1          0          0   
4     0.461213     0.0      0.0      0      5      2          0          0   

   holiday_c     Sales        Date  
0          0  26129335  2013-01-06  
1          0  49275222  2013-01-13  
2          0  34377765  2013-01-20  
3          0  46040169  2013-01-27  
4          0  38466029  2013-02-03  

INFORMACIÓN DEL DATASET
Filas: 135
Columnas: 11

Columnas disponibles:
['Digital_sat', 'TV_sat', 'OOH_sat', 'promo', 'trend', 'Month', 'holiday_a', 'holiday_b', 'holiday_c', 'Sales', 'Date']

TIPOS DE DATOS
Digital_sat    float64
TV_sat         float64
OOH_sat        float64
pro

### model prep

In [82]:
# Columns used as model inputs.
# NOTE(review): 'holiday_b' and 'holiday_c' exist in the dataset but are not
# listed here — confirm the omission is intentional.
feature_cols = ['Digital_sat', 'TV_sat', 'OOH_sat', 'promo', 'trend', 'Month', 'holiday_a']

# Feature matrix, target series and the matching date labels
X = df.loc[:, feature_cols]
y, dates = df['Sales'], df['Date']

In [83]:
# Sanity check: features, target and dates must have the same number of rows
print(
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    f"dates shape: {dates.shape}",
    sep="\n",
)


X shape: (135, 7)
y shape: (135,)
dates shape: (135,)


In [84]:
# Chronological hold-out (no shuffling): the first 80% of rows form the
# training set and the remaining 20% form the test set.
split_idx = int(len(X) * 0.8)

# Explicit positional slicing; each pair is (train, test)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_train, dates_test = dates.iloc[:split_idx], dates.iloc[split_idx:]

print(X_train.shape, X_test.shape, sep="\n")

(108, 7)
(27, 7)


In [85]:
# Fit the standardiser on the training rows only, then apply the same
# fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))




### Ridge

In [86]:

# Candidate regularisation strengths for Ridge
param_grid = {
    'alpha': [1, 5, 10, 20, 50, 100, 200],
}

# Time-ordered cross-validation splitter with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Grid search over alpha, scored by R², run in a single process.
# NOTE(review): X_train_scaled was standardised with statistics computed on the
# whole training window, so every CV fold's scaling has already seen its
# validation rows. Wrapping StandardScaler + Ridge in a Pipeline would remove
# that leak — confirm whether it matters for this analysis.
grid_search = GridSearchCV(Ridge(), param_grid, cv=tscv, scoring='r2', n_jobs=1)
grid_search.fit(X_train_scaled, y_train)

# Selected alpha and its mean cross-validated R²
print(
    f"Mejor alpha: {grid_search.best_params_['alpha']}",
    f"Mejor R² (CV): {grid_search.best_score_:.4f}",
    sep="\n",
)

# Per-alpha summary: mean and standard deviation of the validation score
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df.loc[:, ['param_alpha', 'mean_test_score', 'std_test_score']])


Mejor alpha: 10
Mejor R² (CV): 0.5426
   param_alpha  mean_test_score  std_test_score
0            1         0.512095        0.473932
1            5         0.540688        0.416775
2           10         0.542584        0.358778
3           20         0.520612        0.283879
4           50         0.433179        0.196374
5          100         0.322956        0.154940
6          200         0.198101        0.124336


In [90]:
# Final model: refit Ridge on the full scaled training set using the alpha
# selected by the grid search. Reading it from `grid_search.best_params_`
# (instead of hard-coding the value printed by the previous cell) keeps this
# cell correct whenever the alpha grid, the data or the CV scheme changes.
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train_scaled, y_train)

# Predictions on the training rows and on the held-out test rows
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

# R² on the held-out test set; `r2` is reused by the diagnostics cell below
r2 = r2_score(y_test, y_pred_test)
print(r2)

0.7263623332535649


In [88]:
 # 1. Side-by-side view of actual vs predicted values (first 10 test rows)
print("Test Set - Primeras 10 observaciones:")
comparison = pd.DataFrame({
    'y_real': y_test.iloc[:10].to_numpy(),
    'y_pred': y_pred_test[:10],
}).assign(error=lambda frame: frame['y_real'] - frame['y_pred'])
print(comparison)

# 2. Compare the spread of actual and predicted values on the test set
print(
    f"\nRango y_test: {y_test.min():.0f} - {y_test.max():.0f}",
    f"Rango y_pred_test: {y_pred_test.min():.0f} - {y_pred_test.max():.0f}",
    sep="\n",
)

# 3. R² on the training set (expected to be high) next to the test R².
#    `r2` is the test score computed in the final-model cell, so that cell
#    must have been run before this one.
r2_train = r2_score(y_train, y_pred_train)
print(f"\nR² Training: {r2_train:.4f}", f"R² Test: {r2:.4f}", sep="\n")

# 4. Naive baseline: predict the training-set mean for every test row
baseline_pred = np.full(shape=len(y_test), fill_value=y_train.mean())
r2_baseline = r2_score(y_test, baseline_pred)
print(f"R² Baseline (media): {r2_baseline:.4f}")

Test Set - Primeras 10 observaciones:
     y_real        y_pred         error
0  50783355  4.662046e+07  4.162897e+06
1  50548507  4.611041e+07  4.438094e+06
2  39497832  3.332889e+07  6.168941e+06
3  48295825  4.885339e+07 -5.575609e+05
4  39936680  3.779249e+07  2.144193e+06
5  52123970  4.994583e+07  2.178141e+06
6  39436926  3.891493e+07  5.219953e+05
7  50662424  5.099743e+07 -3.350023e+05
8  39585055  3.994663e+07 -3.615753e+05
9  55923085  5.248337e+07  3.439715e+06

Rango y_test: 34879894 - 55923085
Rango y_pred_test: 32727452 - 52483370

R² Training: 0.7531
R² Test: 0.7264
R² Baseline (media): -0.2153


# Features to implement
1. remove linear trend -> fourier terms
2. calcular ROI por canal
3. elasticnet
4. lasso
5. bayesian ridge
6. probar Robyn / LightweightMMM
7. XGBoost?
