## Modelo Base tutorial

https://www.kaggle.com/code/falrrema/exercise-trend/edit

#### Seteando entorno

In [0]:
# Funciones claves
import importlib
import subprocess

def load_install_package(packages):
    """Import each named package, installing it with pip when the import fails.

    :param packages: iterable of module names as they are written in an
        ``import`` statement (e.g. ``'sklearn'``, not ``'scikit-learn'``).
    :return: None. Progress is reported on stdout.
    :raises subprocess.CalledProcessError: if the pip command exits non-zero.

    Only ``ImportError`` triggers an installation. Any other exception raised
    while importing a package (for example an error thrown by the package's
    own import-time code) propagates to the caller unchanged.
    """
    import sys  # local import: the notebook's import cells do not bring in sys

    # Import names whose PyPI distribution is published under a different name.
    pip_names = {"sklearn": "scikit-learn"}

    for package in packages:
        try:
            importlib.import_module(package)
            print(f"{package} está instalada y lista para usar.")
        except ImportError:
            print(f"{package} no está instalada. Instalando...")
            # Run pip through the running interpreter so the package lands in
            # the kernel's environment rather than whichever `pip` is on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", pip_names.get(package, package)]
            )
            # Let the import system notice the freshly installed files.
            importlib.invalidate_caches()
            print(f"{package} ha sido instalada exitosamente.")

# Clase para hacer CV en bloques (creado con chatGPT)
class BlockingTimeSeriesSplit():
    """Splitter that yields contiguous, time-ordered train/test index blocks.

    Two modes are supported:

    * Sliding window: when both ``train_size`` and ``test_size`` are given,
      fold ``i`` starts at ``i * step``, trains on the next ``train_size``
      positions and tests on the ``test_size`` positions right after them.
    * Even partition: when only ``n_splits`` is given, the samples are cut
      into ``n_splits`` equal consecutive chunks and each chunk is halved into
      a train part followed by a test part.

    :param train_size: number of positions in each training block.
    :param test_size: number of positions in each test block.
    :param n_splits: number of folds; optional in sliding-window mode, where
        it may only reduce the maximum number of folds that fit.
    :param step: offset between the starts of consecutive folds; falls back
        to ``test_size`` when not given (or falsy).
    """

    def __init__(self, train_size=None, test_size=None, n_splits=None, step=None):
        self.train_size = train_size
        self.test_size = test_size
        self.n_splits = n_splits
        self.step = step if step else test_size  # Default step size is test_size

    def get_n_splits(self, X, y=None, groups=None):
        """Return how many folds ``split`` will produce for ``X``.

        :param X: any sized object; only ``len(X)`` is used.
        :param y: ignored.
        :param groups: ignored.
        :raises ValueError: if ``X`` is shorter than one train+test window,
            if ``n_splits`` exceeds what fits, or if neither sizes nor
            ``n_splits`` were configured.
        """
        n_samples = len(X)

        # Sliding-window mode: both block sizes are known.
        if self.train_size and self.test_size:
            window = self.train_size + self.test_size
            if n_samples < window:
                # Without this guard the floor division below goes to zero or
                # negative and split() would silently yield no folds.
                raise ValueError(
                    f"Not enough samples ({n_samples}) for train_size={self.train_size} "
                    f"and test_size={self.test_size}."
                )
            max_splits = 1 + (n_samples - window) // self.step
            if self.n_splits:
                if self.n_splits > max_splits:
                    raise ValueError(f"Cannot have n_splits > {max_splits} with the provided train and test sizes and step. Consider reducing n_splits.")
                return self.n_splits
            return max_splits

        # Even-partition mode: only the number of folds is known.
        if self.n_splits:
            return self.n_splits
        raise ValueError("Either train/test sizes or n_splits should be provided.")

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` arrays of positions into ``X``.

        :param X: any sized object; only ``len(X)`` is used.
        :param y: ignored.
        :param groups: ignored.
        """
        n_samples = len(X)
        indices = np.arange(n_samples)

        n_splits = self.get_n_splits(X)

        if self.train_size and self.test_size:
            # get_n_splits guarantees every window ends at or before n_samples,
            # so no clamping of the upper bound is needed here.
            for fold in range(n_splits):
                start = fold * self.step
                mid = start + self.train_size
                stop = mid + self.test_size
                yield indices[start: mid], indices[mid: stop]
        else:
            k_fold_size = n_samples // n_splits
            for fold in range(n_splits):
                start = fold * k_fold_size
                stop = start + k_fold_size
                # First half of the chunk trains, second half tests.
                mid = int(0.5 * (stop - start)) + start
                yield indices[start: mid], indices[mid: stop]

# Función para hacer CV generando una lista de DF con splits de train y test
def sliding_period(df, btss, period):
    """Materialise the folds of a splitter as a list of annotated DataFrames.

    :param df: DataFrame whose index holds the period of every row; the index
        may contain repeated values (several rows per period).
    :param btss: object exposing ``split(sequence)`` that yields
        ``(train_positions, test_positions)`` into the sequence of unique
        index values, e.g. a ``BlockingTimeSeriesSplit``.
    :param period: one of ``"day"``, ``"week"``, ``"month"``, ``"quarter"``,
        ``"year"``. For these values the index must be a ``pd.PeriodIndex``
        with the matching frequency. Any other value skips the check.
    :return: list with one DataFrame per fold: the train rows followed by the
        test rows, with an extra ``'split'`` column set to ``'train'`` or
        ``'test'``.
    :raises ValueError: if the index type or frequency does not match
        ``period``.
    """
    # Accepted PeriodIndex.freqstr values per keyword. The first entry is the
    # one quoted in the error message. Yearly periods are reported as 'A-DEC'
    # by older pandas releases and as 'Y-DEC' by newer ones, so both pass.
    expected_freqs = {
        "day": ("D",),
        "week": ("W-SUN",),
        "month": ("M",),
        "quarter": ("Q-DEC",),
        "year": ("A-DEC", "Y-DEC"),
    }
    accepted = expected_freqs.get(period)
    if accepted is not None:
        if not isinstance(df.index, pd.PeriodIndex) or df.index.freqstr not in accepted:
            raise ValueError(
                f"Index should be of type PeriodIndex with '{accepted[0]}' frequency for '{period}' period."
            )

    # The splitter works on distinct periods, not on individual rows.
    unique_periods = df.index.unique()

    results = []
    for train_periods, test_periods in btss.split(unique_periods):
        # Select every row whose period falls in the fold's train/test block.
        train_df = df[df.index.isin(unique_periods[train_periods])].copy()
        train_df['split'] = 'train'

        test_df = df[df.index.isin(unique_periods[test_periods])].copy()
        test_df['split'] = 'test'

        results.append(pd.concat([train_df, test_df]))

    return results
  
# Funcion de chequeo de BlockingTimeSeriesSplit para experimentar splits
def check_BlockTimeSeriesSplit(df, train_size=None, test_size=None, n_splits=None, step=None):
    """Preview the folds a ``BlockingTimeSeriesSplit`` would produce on ``df``.

    Prints a short report (first/last period, maximum possible splits, number
    of splits used, periods left unused after the last test block) and returns
    a table describing each fold.

    :param df: DataFrame whose index holds the period of every row.
    :param train_size: periods per training block.
    :param test_size: periods per test block.
    :param n_splits: number of folds. When given without sizes, the sizes are
        derived by cutting the unique periods into ``n_splits`` chunks and
        giving one third of each chunk (rounded down) to test.
    :param step: periods between the starts of consecutive folds; falls back
        to ``test_size``.
    :return: DataFrame with columns ``Fold``, ``Split``, ``Length``,
        ``Initial_period`` and ``Ending_period``, two rows per fold.
    :raises ValueError: on an incomplete parameter combination, when the
        requested folds do not fit, or when the derived sizes are too small.
    """
    unique_periods = df.index.unique()
    n_periods = len(unique_periods)

    # Resolve the block sizes first. They must be known before max_splits can
    # be computed; doing the arithmetic on None is what used to break the
    # n_splits-only mode.
    if n_splits:
        if train_size is None and test_size is None:
            split_size = n_periods // n_splits
            train_size = split_size - (split_size // 3)
            test_size = split_size // 3
            if not test_size:
                raise ValueError(
                    f"Cannot derive train/test sizes: {n_periods} periods are too few for n_splits={n_splits}."
                )
        elif not (train_size and test_size):
            raise ValueError("If n_splits is provided along with train_size or test_size, both train_size and test_size must be provided.")
    elif not (train_size and test_size):
        raise ValueError("Provide either n_splits or both train_size and test_size.")

    stride = step or test_size
    max_splits = 1 + (n_periods - train_size - test_size) // stride
    if max_splits < 1:
        raise ValueError(
            f"Not enough periods ({n_periods}) for train_size={train_size} and test_size={test_size}."
        )

    if n_splits:
        if n_splits > max_splits:
            raise ValueError(f"Cannot have n_splits > {max_splits}. Adjust train/test sizes or step.")
    else:
        n_splits = max_splits

    # Create an instance of BlockingTimeSeriesSplit
    btss = BlockingTimeSeriesSplit(train_size=train_size, test_size=test_size, n_splits=n_splits, step=stride)

    # Collect split details; positions returned by the splitter index directly
    # into unique_periods.
    results = []
    last_test_pos = -1
    for fold_number, (train_periods, test_periods) in enumerate(btss.split(unique_periods), start=1):
        results.append([fold_number, 'Train', len(train_periods), unique_periods[train_periods][0], unique_periods[train_periods][-1]])
        results.append([fold_number, 'Test', len(test_periods), unique_periods[test_periods][0], unique_periods[test_periods][-1]])
        last_test_pos = int(test_periods[-1])

    splits_df = pd.DataFrame(results, columns=['Fold', 'Split', 'Length', 'Initial_period', 'Ending_period'])

    # Periods after the last test block are never seen by any fold.
    unused_periods = n_periods - (last_test_pos + 1)

    # Print report
    print(f"Dataframe Initial Period: {unique_periods[0]}")
    print(f"Dataframe Max Period: {unique_periods[-1]}")
    print(f"Maximum possible splits: {max_splits}")
    if n_splits:
        print(f"n_splits: {n_splits}")
    print(f"Number of unused periods: {unused_periods}")

    return splits_df

# Funcion de medicion
def RMSLE(y_true: list, y_pred: list) -> float:
    """
    The Root Mean Squared Log Error (RMSLE) metric using only NumPy

    Computed as ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    :param y_true: The ground truth labels given in the dataset
    :param y_pred: Our predictions
    :return: The RMSLE score

    When both arguments are pandas Series the subtraction aligns them on their
    index, so they should share the same index.
    """
    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_error)))

In [0]:
# Module names (as used in `import` statements) that this notebook needs;
# load_install_package imports each one and pip-installs it when the import fails.
librerias = ['numpy', 'pandas', 'matplotlib', 'pathlib', 'seaborn', 'sklearn', 'statsmodels', 'kaggle']
load_install_package(librerias)

numpy está instalada y lista para usar.
pandas está instalada y lista para usar.
matplotlib está instalada y lista para usar.
pathlib está instalada y lista para usar.
seaborn está instalada y lista para usar.
sklearn está instalada y lista para usar.
statsmodels está instalada y lista para usar.


[0;31m---------------------------------------------------------------------------[0m
[0;31mOSError[0m                                   Traceback (most recent call last)
[0;32m<command-288323088679126>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0mlibrerias[0m [0;34m=[0m [0;34m[[0m[0;34m'numpy'[0m[0;34m,[0m [0;34m'pandas'[0m[0;34m,[0m [0;34m'matplotlib'[0m[0;34m,[0m [0;34m'pathlib'[0m[0;34m,[0m [0;34m'seaborn'[0m[0;34m,[0m [0;34m'sklearn'[0m[0;34m,[0m [0;34m'statsmodels'[0m[0;34m,[0m [0;34m'kaggle'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mload_install_package[0m[0;34m([0m[0mlibrerias[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-288323088679125>[0m in [0;36mload_install_package[0;34m(packages)[0m
[1;32m      6[0m     [0;32mfor[0m [0mpackage[0m [0;32min[0m [0mpackages[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      7[0m         [0;32mtry[0m[0;34m:[0m[0;34m

In [0]:
# Load the libraries
import pandas as pd
import numpy as np
# `plt` must be the pyplot interface; `import matplotlib as plt` would bind the
# top-level package, which has no plot()/subplots() functions.
import matplotlib.pyplot as plt
import seaborn as sns
import kaggle
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.metrics import mean_squared_log_error

[0;31m---------------------------------------------------------------------------[0m
[0;31mOSError[0m                                   Traceback (most recent call last)
[0;32m<command-288323088679127>[0m in [0;36m<cell line: 6>[0;34m()[0m
[1;32m      4[0m [0;32mimport[0m [0mmatplotlib[0m [0;32mas[0m [0mplt[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0;32mimport[0m [0mseaborn[0m [0;32mas[0m [0msns[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 6[0;31m [0;32mimport[0m [0mkaggle[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      7[0m [0;32mfrom[0m [0msklearn[0m[0;34m.[0m[0mlinear_model[0m [0;32mimport[0m [0mLinearRegression[0m[0;34m[0m[0;34m[0m[0m
[1;32m      8[0m [0;32mfrom[0m [0mstatsmodels[0m[0;34m.[0m[0mtsa[0m[0;34m.[0m[0mdeterministic[0m [0;32mimport[0m [0mCalendarFourier[0m[0;34m,[0m [0mDeterministicProcess[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__

#### Cargando data

In [0]:
# Load the holidays/events table from Spark into pandas
holidays_events = spark.sql('select * from analytics_inversiones.kgl_holidays_events').toPandas()
# Target dtype for every column of the table.
# NOTE(review): casting to 'bool' turns any non-empty string into True —
# confirm `transferred` already arrives as a boolean column from Spark.
dtype={
        'date' : 'datetime64[ns]',
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    }
holidays_events = holidays_events.astype(dtype)
# Index by date and convert the index to daily periods
holidays_events = holidays_events.set_index('date').to_period('D')

In [0]:
# Load the training table from Spark into pandas
store_sales = spark.sql('select * from analytics_inversiones.kgl_train').toPandas()
# Target dtype for each column that needs an explicit cast
dtype = {
    'date': 'datetime64[ns]',
    'store_nbr': 'category',
    'family': 'category',
    'sales': 'float32',
}
# Cast, convert the dates to daily periods, then index and sort by date
store_sales = (
    store_sales
    .astype(dtype)
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index('date')
    .sort_index()
)
#store_sales

In [0]:
# Load the test table from Spark into pandas
df_test = spark.sql('select * from analytics_inversiones.kgl_test').toPandas()
# Target dtype for each column that needs an explicit cast
dtype = {
    'date': 'datetime64[ns]',
    'store_nbr': 'category',
    'family': 'category',
    'onpromotion': 'uint32',
}
# Cast, convert the dates to daily periods, then index and sort by date
df_test = (
    df_test
    .astype(dtype)
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index('date')
    .sort_index()
)

#### Generando tablones CV

Fabriqué 3 funciones:

**BlockingTimeSeriesSplit**: La clase que fabrica el generador de splits. Puede recibir:
  - `n_splits` = número de splits de train y test que se quiere hacer
  - `train_size` = el tamaño de train en periodos
  - `test_size` = el tamaño de test en periodos
  - `step` = cuántos periodos se salta hasta el siguiente split; por defecto es el tamaño del test

**check_BlockTimeSeriesSplit**: Permite experimentar con los splits que se quieren hacer; entrega un pequeño informe basado en los parámetros que se le pasan.

**sliding_period**: Implementa los splits en el dataframe que se le pase. Exige que el dataframe tenga un índice de fecha de tipo *PeriodIndex*. Devuelve una lista de dataframes, uno por cada split, con una columna que diferencia el train del test.

In [0]:
# Experiment with the split settings; the goal is to cover the whole dataset, ideally with about 10 splits
ventana_train = 100
ventana_test = 15
df_check = check_BlockTimeSeriesSplit(df = store_sales, train_size=ventana_train, test_size=ventana_test, n_splits=None, step=ventana_train+ventana_test)
df_check

Dataframe Initial Period: 2013-01-01
Dataframe Max Period: 2017-08-15
Maximum possible splits: 14
n_splits: 14
Number of unused periods: 74


Unnamed: 0,Fold,Split,Length,Initial_period,Ending_period
0,1,Train,100,2013-01-01,2013-04-10
1,1,Test,15,2013-04-11,2013-04-25
2,2,Train,100,2013-04-26,2013-08-03
3,2,Test,15,2013-08-04,2013-08-18
4,3,Train,100,2013-08-19,2013-11-26
5,3,Test,15,2013-11-27,2013-12-11
6,4,Train,100,2013-12-12,2014-03-22
7,4,Test,15,2014-03-23,2014-04-06
8,5,Train,100,2014-04-07,2014-07-15
9,5,Test,15,2014-07-16,2014-07-30


In [0]:
# Build the cross-validation splits: each fold advances by a full train+test window
ventana_train = 100
ventana_test = 15
btss = BlockingTimeSeriesSplit(train_size=ventana_train, test_size=ventana_test, step=ventana_train + ventana_test)
df_cv = sliding_period(df = store_sales, btss = btss, period = "day")
df_cv[0]

Unnamed: 0_level_0,id,store_nbr,family,sales,onpromotion,split
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-01,0,1,AUTOMOTIVE,0.00000,0,train
2013-01-01,1,1,BABY CARE,0.00000,0,train
2013-01-01,2,1,BEAUTY,0.00000,0,train
2013-01-01,3,1,BEVERAGES,0.00000,0,train
2013-01-01,4,1,BOOKS,0.00000,0,train
...,...,...,...,...,...,...
2013-04-25,204925,9,POULTRY,395.60199,0,test
2013-04-25,204926,9,PREPARED FOODS,65.00000,0,test
2013-04-25,204927,9,PRODUCE,0.00000,0,test
2013-04-25,204928,9,SCHOOL AND OFFICE SUPPLIES,0.00000,0,test


#### Armando modelo de seasonality

Armaré una función que hace CV sobre cada split armado arriba y le aplica una regresión lineal a los features construidos en el notebook del tutorial. Estos son primordialmente features de seasonality y uno de feriado.

In [0]:
def cv_predictions(splits):
    """Fit and score the seasonal linear regression on every cross-validation fold.

    For each fold the training rows are pivoted to one column per
    (store_nbr, family) series, a ``DeterministicProcess`` (constant, linear
    trend, seasonal indicators and monthly Fourier terms of order 4) plus a
    ``NewYear`` flag is built on the training dates, a ``LinearRegression`` is
    fitted, and predictions (clipped at zero) are scored with ``RMSLE`` on
    both the train and the test rows. Per-fold and average scores are printed.

    :param splits: iterable of per-fold DataFrames. Each must carry a
        ``'split'`` column with values ``'train'``/``'test'`` and the columns
        ``id``, ``onpromotion``, ``store_nbr``, ``family`` and ``sales``; the
        index is expected to be named ``date`` (it is reset and re-used as an
        index level).
    :return: ``(results, rmsle_reports)`` where ``results`` is a list with one
        long DataFrame per fold (columns from the reset index plus ``sales``,
        ``predicted``, ``SLE``, ``split``) and ``rmsle_reports`` is a list of
        ``(rmsle_train, rmsle_test)`` tuples.
    """
    series_levels = ["store_nbr", "family"]
    helper_columns = ['id', 'onpromotion', 'split']

    def _to_wide(frame):
        # One row per date, one column per (store_nbr, family) series.
        return (
            frame.reset_index()
            .set_index(series_levels + ["date"])
            .sort_index()
            .unstack(series_levels)
        )

    def _scored_long(actual, predicted, label):
        # Back to long format with actual vs. predicted and the squared log error.
        scored = pd.concat(
            [actual.stack(series_levels), predicted.stack(series_levels)], axis=1
        )
        scored.columns = ["sales", "predicted"]
        scored["SLE"] = np.square(
            np.log1p(scored["predicted"]) - np.log1p(scored["sales"])
        )
        scored["split"] = label
        return scored

    results = []
    rmsle_reports = []

    # Iterate the folds that were passed in, not a notebook global.
    for i, split_df in enumerate(splits, start=1):
        y_train = _to_wide(split_df[split_df["split"] == "train"].drop(columns=helper_columns))
        y_test = _to_wide(split_df[split_df["split"] == "test"].drop(columns=helper_columns))

        # Generate deterministic features for training set
        fourier = CalendarFourier(freq="M", order=4)
        dp = DeterministicProcess(
            index=y_train.index,
            constant=True,
            order=1,
            seasonal=True,
            additional_terms=[fourier],
            drop=True,
        )
        X_train = dp.in_sample()
        X_train["NewYear"] = X_train.index.dayofyear == 1

        # The constant column comes from the DeterministicProcess, hence no intercept.
        model = LinearRegression(fit_intercept=False)
        model.fit(X_train, y_train)

        y_train_pred = pd.DataFrame(
            model.predict(X_train), index=X_train.index, columns=y_train.columns
        ).clip(lower=0)  # Set negative predictions to zero

        # NOTE(review): out_of_sample continues the training index for
        # len(y_test) steps; this presumably matches y_test.index only when
        # the fold has no missing dates — confirm.
        X_test = dp.out_of_sample(steps=len(y_test))
        X_test["NewYear"] = X_test.index.dayofyear == 1

        y_test_pred = pd.DataFrame(
            model.predict(X_test), index=X_test.index, columns=y_test.columns
        ).clip(lower=0)  # Set negative predictions to zero

        train_df = _scored_long(y_train, y_train_pred, "train")
        test_df = _scored_long(y_test, y_test_pred, "test")

        # Compute RMSLE for both train and test
        rmsle_train = RMSLE(train_df["sales"], train_df["predicted"])
        rmsle_test = RMSLE(test_df["sales"], test_df["predicted"])
        rmsle_reports.append((rmsle_train, rmsle_test))

        results.append(pd.concat([train_df, test_df]).reset_index())

        print(f"FOLD: {i}")
        print(f" Train RMSLE: {rmsle_train}")
        print(f" Test RMSLE: {rmsle_test}")

    # Summarise the per-fold scores
    df_rmsle = pd.DataFrame(rmsle_reports, columns=['RMSLE_train', 'RMSLE_test'])
    print("")
    print(f" Average Train RMSLE: {df_rmsle['RMSLE_train'].mean()}")
    print(f" Average Test RMSLE: {df_rmsle['RMSLE_test'].mean()}")

    return results, rmsle_reports

In [0]:
# Run the cross-validation over the folds built above
results, rmsle_report = cv_predictions(df_cv)

FOLD: 1
 Train RMSLE: 0.3428391437350007
 Test RMSLE: 0.4006097157114463
FOLD: 2
 Train RMSLE: 0.37202749149322806
 Test RMSLE: 0.4251332980684654
FOLD: 3
 Train RMSLE: 0.3111557311968022
 Test RMSLE: 0.3987280663806099
FOLD: 4
 Train RMSLE: 1.1003787393679654
 Test RMSLE: 1.3539705994407443
FOLD: 5
 Train RMSLE: 0.9396662918540035
 Test RMSLE: 0.6643089300850437
FOLD: 6
 Train RMSLE: 0.8121079239127986
 Test RMSLE: 0.5177957286050113
FOLD: 7
 Train RMSLE: 0.9400491073771143
 Test RMSLE: 0.9911693380949543
FOLD: 8
 Train RMSLE: 0.904308109943702
 Test RMSLE: 0.5251585713897807
FOLD: 9
 Train RMSLE: 0.5956821151054202
 Test RMSLE: 0.5811083247023155
FOLD: 10
 Train RMSLE: 0.5568356526157304
 Test RMSLE: 0.6391310068828472
FOLD: 11
 Train RMSLE: 0.451123634783162
 Test RMSLE: 0.5809267845017297
FOLD: 12
 Train RMSLE: 0.5153094483943136
 Test RMSLE: 0.5993963475123538
FOLD: 13
 Train RMSLE: 0.5779931157528012
 Test RMSLE: 0.6336906391468273
FOLD: 14
 Train RMSLE: 0.5603823406264516
 Test 

#### Resultados

Es muy posible que el hecho de que el train tenga mayor error que el test esté relacionado con la cantidad de días que se predicen en cada uno. Esto se podría explorar con mayor profundidad; voy a jugar con varios valores en las funciones de arriba:

---
* Folds = 25
* Train = 50 días - Average Train RMSLE: 0.491
* Test = 15 días - Average Test RMSLE: 0.756

---
* Folds = 17
* Train = 80 días - Average Train RMSLE: 0.577
* Test = 15 días - Average Test RMSLE: 0.788

---
* Folds = 14
* Train = 100 días - Average Train RMSLE: 0.64
* Test = 15 días - Average Test RMSLE: 0.63

---
* Folds = 12
* Train = 120 días - Average Train RMSLE: 0.683
* Test = 15 días - Average Test RMSLE: 0.712

---
* Folds = 10
* Train = 150 días - Average Train RMSLE: 0.704
* Test = 15 días - Average Test RMSLE: 0.665

---
* Folds = 7
* Train = 200 días - Average Train RMSLE: 0.794
* Test = 15 días - Average Test RMSLE: 0.765

---
* Folds = 4
* Train = 360 días - Average Train RMSLE: 0.791
* Test = 15 días - Average Test RMSLE: 1.12


Se observa que una ventana de train inferior a 100 días perjudica el test; hay un sweet spot entre 100 y 150 días, pero luego un train más grande solo empeora el test.

In [0]:
# Average the per-fold scores returned by the cross-validation run
df_rmsle = pd.DataFrame(rmsle_report, columns=['RMSLE_train', 'RMSLE_test'])
for label, column in (("Train", "RMSLE_train"), ("Test", "RMSLE_test")):
    print(f" Average {label} RMSLE: {df_rmsle[column].mean()}")

 Average Train RMSLE: 0.641418489011321
 Average Test RMSLE: 0.6355501962424995


#### Replicar el submission

En el notebook del tutorial se ocupó el año 2017 para armar el modelo final, que son exactamente 227 periodos:

In [0]:
# Number of distinct periods in the 2017 slice of store_sales
len(store_sales.loc["2017"].index.unique())

Out[83]: 227

In [0]:
# Replicate the tutorial submission.
# Training target: 2017 sales pivoted to one column per (store_nbr, family) series
y = (
  store_sales.loc["2017"].reset_index().drop(columns=['id', 'onpromotion'])
            .set_index(["store_nbr", "family", "date"])
            .sort_index()
            .unstack(["store_nbr", "family"])
)


# Training features: constant, linear trend, seasonal indicators and monthly Fourier terms
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()
X['NewYear'] = (X.index.dayofyear == 1)

# The constant column is already among the features, hence no intercept
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

# RMSLE on the training set (negative predictions are clipped to zero first)
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)
y_pred = y_pred.clip(lower=0)
train_df = pd.concat([y.stack(["store_nbr", "family"]), y_pred.stack(["store_nbr", "family"])],axis=1)
train_df.columns = ["sales", "predicted"]
train_df["SLE"] = np.square(np.log1p(train_df["predicted"]) - np.log1p(train_df["sales"]))

print(f"Train RMSLE: {RMSLE(train_df['sales'], train_df['predicted'])}")

Train RMSLE: 0.5488001493903649


In [0]:
# Forecast the test horizon with the model fitted above.
# NOTE(review): 16 steps is presumably the number of days in df_test — confirm.
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = (X_test.index.dayofyear == 1)

# Submission ids indexed by (store_nbr, family, date) so they can be joined
# onto the stacked forecast through the shared index level names.
test_ids = df_test.reset_index().set_index(["store_nbr", "family", "date"]).id

y_submit = (
    pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
    .clip(lower=0)
    .stack(['store_nbr', 'family'])
    .join(test_ids)
    .reindex(columns=['id', 'sales'])
)

# Convert to a Spark DataFrame and overwrite the submission table
spark_df = spark.createDataFrame(y_submit)
spark_df.write.format("delta").mode("overwrite").saveAsTable("analytics_inversiones.fr_kgl_submission")

#### Submission con mejoras

Mejoras con una ventana de entrenamiento más pequeña, de 100 días

In [0]:
# Take the last 100 distinct periods of the training data and show their date range
ult_100 = store_sales.index.unique()[-100:]
print(f"Fecha Inicio: {np.min(ult_100)}")
print(f"Fecha Fin: {np.max(ult_100)}")

Fecha Inicio: 2017-05-08
Fecha Fin: 2017-08-15


In [0]:
# Training target: the last 100 periods pivoted to one column per (store_nbr, family) series
y = (
  store_sales.loc[ult_100].reset_index().drop(columns=['id', 'onpromotion'])
            .set_index(["store_nbr", "family", "date"])
            .sort_index()
            .unstack(["store_nbr", "family"])
)

# Training features: constant, linear trend, seasonal indicators and monthly Fourier terms
fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()
X['NewYear'] = (X.index.dayofyear == 1)

# The constant column is already among the features, hence no intercept
model = LinearRegression(fit_intercept=False)
model.fit(X, y)

# RMSLE on the training set (negative predictions are clipped to zero first)
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)
y_pred = y_pred.clip(lower=0)
train_df = pd.concat([y.stack(["store_nbr", "family"]), y_pred.stack(["store_nbr", "family"])],axis=1)
train_df.columns = ["sales", "predicted"]
train_df["SLE"] = np.square(np.log1p(train_df["predicted"]) - np.log1p(train_df["sales"]))

print(f"Train RMSLE: {RMSLE(train_df['sales'], train_df['predicted'])}")

Train RMSLE: 0.417717979829191


El RMSLE es más bajo, como esperábamos dado el CV.

In [0]:
# Forecast the test horizon with the model fitted on the last 100 periods.
# NOTE(review): 16 steps is presumably the number of days in df_test — confirm.
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = (X_test.index.dayofyear == 1)

# Predict, clip negatives to zero, go back to long format and attach the submission ids
y_submit = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
y_submit = y_submit.clip(lower=0)
y_submit = y_submit.stack(['store_nbr', 'family'])
y_submit = y_submit.join(df_test.reset_index().set_index(["store_nbr", "family", "date"]).id).reindex(columns=['id', 'sales'])

# Convert to a Spark DataFrame and overwrite the submission table
# (the same table written by the earlier 2017-based submission cell)
spark_df = spark.createDataFrame(y_submit)
spark_df.write.format("delta").mode("overwrite").saveAsTable("analytics_inversiones.fr_kgl_submission")