# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *

In [2]:
df = pd.read_csv('sell_in_filtrado_con_tn.csv', sep='\t')
df.shape

(7781619, 6)

In [3]:
df.head()

Unnamed: 0,period,product_id,customer_id,tn,cat1,target
0,2017-01,20001,10001,99.43861,HC,92.46537
1,2017-02,20001,10001,198.84365,HC,13.29728
2,2017-03,20001,10001,92.46537,HC,101.00563
3,2017-04,20001,10001,13.29728,HC,128.04792
4,2017-05,20001,10001,101.00563,HC,101.20711


In [4]:
# 1) Extraer el mapeo único product_id → cat1 usando solo filas donde cat1 NO sea NaN
df_prod_cat1 = (
    df[['product_id','cat1']]
    .dropna(subset=['cat1'])                # descartamos filas donde cat1 es NaN
    .drop_duplicates(subset=['product_id'])  # nos quedamos con la primera ocurrencia por product_id
    .reset_index(drop=True)
)

# Verificamos si, para algún product_id, hay más de un cat1 distinto (inconsistencias)
inconsistencias = (
    df[['product_id','cat1']]
    .dropna(subset=['cat1'])
    .drop_duplicates()
    .groupby('product_id')['cat1']
    .nunique()
    .loc[lambda s: s > 1]
)
if len(inconsistencias) > 0:
    print("Estos product_id tienen más de una categoría cat1 registrada:")
    print(inconsistencias)
else:
    print("Todos los product_id tienen una sola cat1 consistente (ignorando NaN).")

# Ahora df_prod_cat1 tiene dos columnas: ['product_id','cat1'], con un único cat1 por producto.
print("\nDataFrame product_id → cat1 (sin NaN):")
print(df_prod_cat1.head())

Todos los product_id tienen una sola cat1 consistente (ignorando NaN).

DataFrame product_id → cat1 (sin NaN):
   product_id   cat1
0       20001     HC
1       20002     HC
2       20003  FOODS
3       20004  FOODS
4       20005  FOODS


In [5]:
# 2) Convertir este mapeo en un diccionario para hacer un .map() más adelante
mapeo_cat1 = dict(zip(df_prod_cat1['product_id'], df_prod_cat1['cat1']))

# 3) Rellenar los NaN de df['cat1'] usando ese mapeo
#    Solo afectará a filas donde df['cat1'] es NaN pero product_id existe en mapeo_cat1
df['cat1'] = df['cat1'].fillna(df['product_id'].map(mapeo_cat1))

# 4) (Opcional) Verificar cuántos NaN quedaron en cat1 tras rellenar
nan_restantes = df['cat1'].isna().sum()
print(f"\nCantidad de filas que todavía tienen cat1 = NaN: {nan_restantes}")


Cantidad de filas que todavía tienen cat1 = NaN: 0


###  Variables de calendario / estacionales

In [6]:
df['periodo'] = df['period']

In [7]:
# Convertir 'periodo' a PeriodIndex mensual
df['period'] = pd.to_datetime(df['period'], format='%Y-%m').dt.to_period('M')

# Extraer mes y año como features numéricas
df['year']  = df['period'].dt.year
df['month'] = df['period'].dt.month
df['days_in_month'] = df['period'].dt.days_in_month
df['semester'] = ((df['month'] - 1) // 6) + 1
df['quarter'] = df['period'].dt.quarter
df['month_q'] = ((df['period'].dt.month - 1) % 3) + 1

# Orden secuencial global
first_period = df['period'].min()
df['period_ordinal'] = ((df['year'] - first_period.year) * 12) + (df['month'] - first_period.month + 1)

In [8]:
# Continuidad temporal (cíclica) usando seno y coseno
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

# Estacionalidad global 
N = df['period_ordinal'].max()  # O ponés 36 si es fijo
df['period_ordinal_sin'] = np.sin(2 * np.pi * df['period_ordinal'] / N)
df['period_ordinal_cos'] = np.cos(2 * np.pi * df['period_ordinal'] / N)

In [9]:
# Indicar “temporada alta”/“temporada baja” (ejemplo generalista)
df['is_summer'] = df['month'].isin([12, 1, 2]).astype(int)
df['is_winter'] = df['month'].isin([6, 7, 8]).astype(int)

# 3.4. Indicador de “pre-campaña” (suponiendo picos en marzo y octubre)
df['pre_march']   = (df['month'] == 2).astype(int)
df['pre_october'] = (df['month'] == 9).astype(int)

### OneHotEncoder Cat1

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Defino el encoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Ajusto y transformo
cat1_ohe = ohe.fit_transform(df[['cat1']])

# Defino el nombre “cat1_HC”, “cat1_FC”, etc.
ohe_cols = ohe.get_feature_names_out(['cat1'])

# Creo un DataFrame con los nombres de las columnas
cat1_df = pd.DataFrame(cat1_ohe, columns=ohe_cols, index=df.index)

# Uno al DataFrame original
df = pd.concat([df, cat1_df], axis=1)

In [11]:
df['cat1'].value_counts(dropna=False)

cat1
PC       4398482
HC       1952471
FOODS    1389007
REF        41659
Name: count, dtype: int64

In [12]:
df.drop(columns=['cat1'], inplace=True)

### Lags y rolling-windows sobre tn
#### Lags simples

In [13]:
# Asegurarnos de que df_full esté ordenado correctamente
df = df.sort_values(['product_id','customer_id','period'])

# Lags 1 a 36
for lag in range(1, 37):
    df[f'tn_lag{lag}'] = (
        df
        .groupby(['product_id', 'customer_id'])['tn']
        .shift(lag)
    )

#### Rolling-mean y rolling-std (ventanas móviles)

In [14]:
for window in range(2, 37):
    # Media móvil
    df[f'tn_roll{window}'] = (
        df
        .groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1)
        .mean()
        .reset_index(level=[0,1], drop=True)
    )
    # Desvío estándar móvil
    df[f'tn_rollstd{window}'] = (
        df
        .groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1)
        .std()
        .reset_index(level=[0,1], drop=True)
    )
    # Mínimo móvil
    df[f'tn_rollmin{window}'] = (
        df
        .groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1)
        .min()
        .reset_index(level=[0,1], drop=True)
    )
    # Máximo móvil
    df[f'tn_rollmax{window}'] = (
        df
        .groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1)
        .max()
        .reset_index(level=[0,1], drop=True)
    )


  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (
  df[f'tn_rollstd{window}'] = (
  df[f'tn_rollmin{window}'] = (
  df[f'tn_rollmax{window}'] = (
  df[f'tn_roll{window}'] = (


In [15]:
# Indicador binario de nuevo maximo y nuevo minimo
for window in range(2, 37):
    # Nuevo máximo en la ventana: 1 si el valor actual es igual al máximo móvil
    df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
    
    # Nuevo mínimo en la ventana: 1 si el valor actual es igual al mínimo móvil
    df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)

  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)
  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)
  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)
  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)
  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_rollmin{window}'] = (df['tn'] == df[f'tn_rollmin{window}']).astype(int)
  df[f'tn_is_new_rollmax{window}'] = (df['tn'] == df[f'tn_rollmax{window}']).astype(int)
  df[f'tn_is_new_roll

### Ratio entre el valor actual y el promedio móvil

In [16]:
for window in range(2, 37):
    df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)

  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'] / (df[f'tn_roll{window}'] + 1e-6)
  df[f'tn_div_roll{window}'] = df['tn'

### Delta lags

In [17]:
# for lag in range(1, 37):  # Por ejemplo, los últimos 12 meses
#     df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']

# # Segunda diferenciacion (delta del delta)
# for lag in range(2, 13):
#     df[f'tn_accel{lag}'] = df[f'tn_delta{lag-1}'] - df[f'tn_delta{lag}']

# # Ratios del delta
# for lag in range(1, 13):
#     df[f'tn_rel_delta{lag}'] = df['tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)

In [18]:
# Primer paso: calcular los delta y delta relativos
for lag in range(1, 13):
    df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
    df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)

# Segundo paso: calcular los delta del delta
for lag in range(1, 12):  # Hasta 11 porque el 12 no tiene siguiente
    df[f'tn_accel{lag}'] = df[f'tn_delta{lag}'] - df[f'tn_delta{lag+1}']

  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']
  df[f'tn_rel_delta{lag}'] = df[f'tn_delta{lag}'] / (df[f'tn_lag{lag}'] + 1e-6)
  df[f'tn_delta{lag}'] = df['tn'] - df[f'tn_lag{lag}']


### Tendencia

In [19]:
def rolling_slope(x, window):
    idx = np.arange(window)
    def _slope(arr):
        if len(arr) < window:
            arr = arr[-window:]
            idx2 = np.arange(len(arr))
            if len(arr) < 2:
                return np.nan
            return np.polyfit(idx2, arr, 1)[0]
        return np.polyfit(idx, arr, 1)[0]
    return x.rolling(window).apply(_slope, raw=True)

for window in [2, 3, 6, 9, 12, 18, 24, 36]:
    df[f'tn_trend_slope{window}'] = (
        df
        .groupby(['product_id','customer_id'])['tn']
        .apply(lambda x: rolling_slope(x, window))
        .reset_index(level=[0,1], drop=True)
    )

  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (
  df[f'tn_trend_slope{window}'] = (


In [20]:
df['tn_vs_ma12'] = df['tn'] - df['tn_roll12']

  df['tn_vs_ma12'] = df['tn'] - df['tn_roll12']


### Edad del producto y del cliente (feature “edad”)

In [48]:
# Primer periodo (nacimiento) de cada producto y cada cliente
prod_nac = (
    df
    .groupby('product_id')['period']
    .min()
    .reset_index()
    .rename(columns={'period': 'prod_start'})
)

cust_nac = (
    df
    .groupby('customer_id')['period']
    .min()
    .reset_index()
    .rename(columns={'period': 'cust_start'})
)

In [49]:
# Suponiendo que ya tienes prod_nac y cust_nac
# Unir a df_full las fechas de nacimiento
df = df.merge(prod_nac, on='product_id', how='left')
df = df.merge(cust_nac, on='customer_id', how='left')

# Edad en meses de product_id en cada fila
df['prod_age'] = (df['period'] - df['prod_start']).apply(lambda x: x.n)

# Edad en meses de customer_id en cada fila
df['cust_age'] = (df['period'] - df['cust_start']).apply(lambda x: x.n)

: 

### Indicadores de “actividad” / “inactividad prolongada”

In [22]:
# Función que calcula la racha de meses consecutivos sin vender
def calcular_racha_no_ventas(serie_no_sale: pd.Series) -> pd.Series:
    """
    Dada una Serie booleana/0-1 que indica para cada mes si no hubo venta (1) o sí hubo (0),
    devuelve otra Serie del mismo tamaño donde cada posición es el número de meses consecutivos
    sin venta que lleva hasta esa fila (incluyéndola). Reinicia a 0 cuando hay venta.
    """
    racha = []
    cuenta = 0
    for valor in serie_no_sale:
        if valor == 1:
            cuenta += 1
        else:
            cuenta = 0
        racha.append(cuenta)
    return pd.Series(racha, index=serie_no_sale.index)

df = df.sort_values(['product_id','customer_id','period']).copy()

# Calcular no_sale (1 si tn == 0, ó 0 si tn > 0)
df['no_sale'] = (df['tn'] == 0).astype(int)

# 4) Aplicar la función por cada grupo <product_id, customer_id>
df['no_sale_streak'] = (
    df
    .groupby(['product_id','customer_id'])['no_sale']
    .apply(calcular_racha_no_ventas)
    .reset_index(level=[0,1], drop=True)
)

In [20]:
# 5) Verificar manualmente un ejemplo para confirmar que ahora sí reinicia
print(df.loc[
    (df['product_id']==21266) & (df['customer_id']==10040),
    ['period','tn','no_sale','no_sale_streak']
])

          period       tn  no_sale  no_sale_streak
7780615  2019-03  0.00000        1               1
7780616  2019-04  0.05801        0               0
7780617  2019-05  0.00000        1               1
7780618  2019-06  0.01479        0               0
7780619  2019-07  0.00000        1               1
7780620  2019-08  0.00114        0               0
7780621  2019-09  0.00000        1               1
7780622  2019-10  0.00000        1               2
7780623  2019-11  0.00000        1               3
7780624  2019-12  0.00000        1               4


In [22]:
df.head()

Unnamed: 0,period,product_id,customer_id,tn,target,cat1_FOODS,cat1_HC,cat1_PC,cat1_REF,tn_lag1,...,share_cliente,prod_start,cust_start,prod_age,cust_age,no_sale,no_sale_streak,tn_slope3,tn_slope6,tn_slope12
0,2017-01,20001,10001,99.43861,92.46537,0.0,1.0,0.0,0.0,,...,0.052969,2017-01,2017-01,0,0,0,0,,,
1,2017-02,20001,10001,198.84365,13.29728,0.0,1.0,0.0,0.0,99.43861,...,0.085162,2017-01,2017-01,1,1,0,0,99.40504,99.40504,99.40504
2,2017-03,20001,10001,92.46537,101.00563,0.0,1.0,0.0,0.0,198.84365,...,0.032619,2017-01,2017-01,2,2,0,0,-3.48662,-3.48662,-3.48662
3,2017-04,20001,10001,13.29728,128.04792,0.0,1.0,0.0,0.0,92.46537,...,0.00535,2017-01,2017-01,3,3,0,0,-92.773185,-36.480227,-36.480227
4,2017-05,20001,10001,101.00563,101.20711,0.0,1.0,0.0,0.0,13.29728,...,0.036501,2017-01,2017-01,4,4,0,0,4.27013,-18.241233,-18.241233


## Validamos posibles inf o NaN

In [25]:
# Lista para acumular resultados
resultados = []

for col in df.columns:
    nulos = df[col].isnull().sum()
    # Solo contamos infinitos si la columna es numérica
    if pd.api.types.is_numeric_dtype(df[col]):
        infs = np.isinf(df[col]).sum()
    else:
        infs = 0
    resultados.append({
        'column': col,
        'null_count': nulos,
        'inf_count': infs
    })

resumen = pd.DataFrame(resultados)
pd.set_option('display.max_rows', None)  # Mostrar todas las filas
resumen

Unnamed: 0,column,null_count,inf_count
0,period,0,0
1,product_id,0,0
2,customer_id,0,0
3,tn,0,0
4,target,525526,0
5,periodo,0,0
6,year,0,0
7,month,0,0
8,days_in_month,0,0
9,semester,0,0


### Manejo de posibles inf o NaN

In [23]:
# Convertir todos los infinitos (+Inf y -Inf) en NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# # Ejemplo: ver cuántos infinitos quedan (debería ser cero)
# inf_count_post = {col: np.isinf(df[col]).sum() for col in df.columns if pd.api.types.is_numeric_dtype(df[col])}
# print("Infinitos por columna tras imputar:", inf_count_post)

In [24]:
df.to_csv('df_with_target_and_features.csv', index=False, sep='\t')