# Linear Model

In [17]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression

In [18]:
# Definir rutas en Drive
BASE_DATOS = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\datasets'
BASE_INTERMEDIOS   = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\intermedios'
SALIDAS   = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\output'

In [19]:
# Carga archivo CSVsep="\t")
df = pd.read_csv(os.path.join(BASE_DATOS, "sell-in.txt"),sep="\t")
df = df[['product_id', 'periodo', 'tn']]
df_limpio = pd.read_csv(os.path.join(BASE_INTERMEDIOS, "df_limpio_product_id.csv"),sep="\t")

In [21]:
# Agrupamos por Product ID
df_grouped = (
    df.groupby(['product_id', 'periodo'], as_index=False)['tn']
      .sum()
)

df_grouped = df_grouped.sort_values(['product_id', 'periodo'])
df_grouped['clase'] = (
    df_grouped.groupby('product_id')['tn'].shift(-2)
)

In [22]:
# Calculamos Lags
for lag in range(1, 12):
    df_grouped[f'tn_{lag}'] = df_grouped.groupby('product_id')['tn'].shift(lag)
# df_grouped[df_grouped['product_id'] == 20089]

In [23]:
# 1. Crear el rango completo de periodos
periodos_completos = sorted(df_grouped['periodo'].unique())
set_periodos = set(periodos_completos)

# 2. Crear una lista para guardar los productos con datos faltantes
productos_con_faltantes = []

# 3. Iterar por producto y verificar si cubre todos los períodos
for product_id, grupo in df_grouped.groupby('product_id'):
    periodos_producto = set(grupo['periodo'])
    if periodos_producto != set_periodos:
        productos_con_faltantes.append(product_id)

# 4. Mostrar el resultado
print(f"🔎 Productos con al menos un periodo faltante: {len(productos_con_faltantes)}")
print(productos_con_faltantes)

🔎 Productos con al menos un periodo faltante: 697
[20032, 20034, 20036, 20040, 20049, 20060, 20064, 20083, 20085, 20089, 20098, 20104, 20110, 20112, 20126, 20127, 20128, 20130, 20131, 20135, 20141, 20143, 20147, 20149, 20150, 20154, 20156, 20159, 20164, 20170, 20172, 20174, 20186, 20191, 20192, 20195, 20199, 20202, 20203, 20210, 20213, 20214, 20217, 20218, 20221, 20223, 20229, 20236, 20237, 20243, 20245, 20247, 20248, 20257, 20258, 20260, 20261, 20262, 20266, 20274, 20286, 20287, 20293, 20294, 20297, 20298, 20306, 20313, 20318, 20319, 20323, 20331, 20333, 20334, 20337, 20339, 20340, 20343, 20344, 20347, 20348, 20351, 20355, 20363, 20364, 20368, 20369, 20370, 20371, 20373, 20377, 20378, 20387, 20389, 20390, 20391, 20392, 20393, 20395, 20397, 20402, 20403, 20405, 20408, 20414, 20415, 20417, 20420, 20423, 20425, 20426, 20427, 20430, 20431, 20436, 20437, 20439, 20440, 20441, 20442, 20444, 20445, 20446, 20447, 20448, 20451, 20452, 20453, 20455, 20456, 20457, 20458, 20459, 20460, 20461, 2046

In [24]:
# MODELO
magicos = [20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
           20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046,
           20049, 20051, 20052, 20053, 20055, 20008, 20001, 20017,
           20086, 20180, 20193, 20320, 20532, 20612, 20637, 20807, 20838]

#dataset para entrenamiento
train_df = df_grouped[(df_grouped['periodo'] == 201812) & (df_grouped['product_id'].isin(magicos))].copy()

features = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
train_df = train_df.dropna(subset=['clase'] + features)

X_train = train_df[features]
y_train = train_df['clase']

model = LinearRegression().fit(X_train, y_train)

# Lista de features usadas
features = ['tn'] + [f'tn_{i}' for i in range(1, 12)]

# Coeficientes asociados a cada variable
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef:.6f}")

# Intercepto del modelo
print(f"\nIntercepto: {model.intercept_:.6f}")

# Dataset para prediccion
test_df = df_grouped[df_grouped['periodo'] == 201912].copy()
test_df['complete'] = test_df[features].notna().all(axis=1)

# Aplicar modelo solo a registros completos
df_complete = test_df[test_df['complete']].copy()
df_complete['pred'] = model.predict(df_complete[features])

# Calcular promedio para incompletos
avg_incompletos = test_df.loc[~test_df['complete'], 'tn'].sum()

print("✅ Registros completos:", len(df_complete))
print("🔢 TN total de completos:", df_complete['tn'].sum())
print("❌ Registros incompletos:", (~test_df['complete']).sum())
print("🔢 TN total de incompletos:", avg_incompletos)

tn: -0.001339
tn_1: 0.236558
tn_2: 0.178208
tn_3: -0.060031
tn_4: -0.161875
tn_5: -0.007775
tn_6: 0.151936
tn_7: 0.043933
tn_8: 0.142839
tn_9: 0.103804
tn_10: 0.119211
tn_11: 0.073671

Intercepto: 0.441467
✅ Registros completos: 751
🔢 TN total de completos: 24181.9075
❌ Registros incompletos: 176
🔢 TN total de incompletos: 2035.15978


In [25]:
df_complete.shape

(751, 17)

In [None]:
# === 1. Cargar productos objetivo ===
productos_pred = pd.read_csv(os.path.join(BASE_DATOS, "productos_pred.txt"), sep="\t")

# === 2. Filtrar df_grouped a periodo 201912 ===
test_df = df_grouped[df_grouped['periodo'] == 201912].copy()

# === 3. Calcular campo 'complete' (sin nulos en tn, tn_1...tn_11) ===
features = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
test_df['complete'] = test_df[features].notna().all(axis=1)

# === 4. Predecir con regresión para los completos ===
df_complete = test_df[test_df['complete']].copy()
df_complete['pred'] = model.predict(df_complete[features])

# Nueva estrategia: para cada producto incompleto, usar el promedio de los últimos 12 meses disponibles (ignorando NaN)
df_incomplete = test_df[~test_df['complete']].copy()

# Calcular promedio de las columnas tn, tn_1 ... tn_11 en la fila
df_incomplete['pred'] = df_incomplete[features].mean(axis=1, skipna=True)
df_incomplete['metodo'] = 'promedio_12m'

# Marcar método modelo para los completos
df_complete['metodo'] = 'modelo'

# Unir completos e incompletos
df_pred_todos = pd.concat([df_complete, df_incomplete], axis=0)

# === 7. Hacer merge con productos_pred (por product_id)
df_final = productos_pred.merge(df_pred_todos[['product_id', 'pred']], on='product_id', how='left')

# Crear carpeta si no existe
os.makedirs(SALIDAS, exist_ok=True)

# Guardar CSV
df_final.to_csv(
    os.path.join(SALIDAS, "pred_modelo_RL.csv"),
    index=False
)

print("✅ Archivo generado:.../pred_modelo_RL.csv")

✅ Archivo generado:.../predicciones_productos_clase81.csv


In [27]:
df_final.shape

(780, 2)

In [34]:
# ===  Filtrar df_grouped a periodo 201910 ===
valid_df = df_grouped[df_grouped['periodo'] == 201910].copy()

# === Calcular campo 'complete' (sin nulos en tn, tn_1...tn_11) ===
features = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
valid_df['complete'] = valid_df[features].notna().all(axis=1)

# === Predecir con regresión para los completos ===
df_complete_valid = valid_df[valid_df['complete']].copy()
df_complete_valid['tn_pred'] = model.predict(df_complete_valid[features])

df_incomplete_valid = valid_df[~valid_df['complete']].copy()
df_incomplete_valid['tn_pred'] = df_incomplete_valid[features].mean(axis=1, skipna=True)

# Unir completos e incompletos
df_pred_valid = pd.concat([df_complete_valid, df_incomplete_valid], axis=0)



In [38]:
df_pred_valid.head()
df_pred_valid.shape[0]

952

In [39]:
# ===  Filtrar df_grouped a periodo 201910 ===
valid_df_201912 = df_grouped[df_grouped['periodo'] == 201912].copy()

In [41]:
# 1) Asegúrate de que valid_df_201912 sólo tenga product_id y tn real
df_real = valid_df_201912[['product_id','tn']].rename(columns={'tn':'tn_real'})

In [43]:
# 2) Merge en df_pred_valid para traer tn_real
df_pred_valid = df_pred_valid.merge(
    df_real,
    on='product_id',
    how='left'    # si algún product_id de pred no está en real, quedará NaN
)

In [44]:
# === calcular errores absolutos y relativos ===
df_pred_valid['abs_error'] = (df_pred_valid['tn_real'] - df_pred_valid['tn_pred']).abs()
df_pred_valid['rel_error'] = df_pred_valid['abs_error'] / df_pred_valid['tn_real']

# === Calcular Total Forecast Error (TFE) ===
total_error = df_pred_valid['abs_error'].sum()
total_sales = df_pred_valid['tn_real'].sum()
tfe = total_error / total_sales
print(f"\n📉 Total Forecast Error (TFE): {tfe:.4f}")


📉 Total Forecast Error (TFE): 0.3020


In [50]:
# === Construir el DataFrame resumen con product_id, modelo, tn_pred, abs_error y rel_error ===
df_errors = pd.DataFrame({
  
    'product_id': df_pred_valid['product_id'],
    'tn_pred_RL': df_pred_valid['tn_pred'],
    'abs_error_RL': df_pred_valid['abs_error']
})

# 6. Mostrar resultado
print(df_errors.head())

   product_id   tn_pred_RL  abs_error_RL
0       20001  1260.707460    243.981100
1       20002   952.585776    134.722774
2       20003   689.528570    202.972720
3       20004   470.124722    167.775298
4       20005   400.887363    192.357067


In [51]:
# Guardar CSV
df_errors.to_csv(
    os.path.join(SALIDAS, "error_modelo_RL.csv"),
    index=False
)