In [7]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/dataset.csv')
df = df.sort_values(by=['shop_id', 'item_id', 'date'])

In [3]:
columns = ['day','month','year','quarter','weekday','is_month_start','is_month_end','acc_month_num','category_id','price','quantity','pbi','OPEP_oil_price','unemployment_rate','IPC_rate']
grouped_data = df.groupby(['date', 'shop_id', 'item_id'])[columns].first().reset_index()
grouped_data.head()

Unnamed: 0,date,shop_id,item_id,day,month,year,quarter,weekday,is_month_start,is_month_end,acc_month_num,category_id,price,quantity,pbi,OPEP_oil_price,unemployment_rate,IPC_rate
0,2013-10-31,2,1495,31,10,2013,4,3,0,1,0,30,699.0,0,341693.0,106.75,5.5,6.3
1,2013-10-31,2,1555,31,10,2013,4,3,0,1,0,28,1299.0,0,341693.0,106.75,5.5,6.3
2,2013-10-31,2,1556,31,10,2013,4,3,0,1,0,20,2999.0,0,341693.0,106.75,5.5,6.3
3,2013-10-31,2,1855,31,10,2013,4,3,0,1,0,30,1199.0,0,341693.0,106.75,5.5,6.3
4,2013-10-31,2,1857,31,10,2013,4,3,0,1,0,20,2599.0,0,341693.0,106.75,5.5,6.3


In [4]:
unique_dates = df['date'].unique()
unique_shop_item = df[['shop_id', 'item_id']].drop_duplicates()
columns = ['day','month','year','quarter','weekday','is_month_start','is_month_end','acc_month_num','category_id','price','quantity','pbi','OPEP_oil_price','unemployment_rate','IPC_rate']
num_steps = len(unique_dates)
num_features = len(df.columns) - 3  # Excluyendo 'date', 'shop_id', 'item_id'

# Agrupa los datos por fecha, tienda y artículo
grouped_data = df.groupby(['date', 'shop_id', 'item_id'])[columns].first().reset_index()

# Crea un diccionario para mapear las fechas a índices en el tensor
date_index_mapping = {date: i for i, date in enumerate(unique_dates)}

# Inicializa un tensor con ceros
tensor = torch.zeros((len(unique_shop_item), num_steps, num_features))



# Llena el tensor con los valores correspondientes
for index, row in grouped_data.iterrows():
    date_index = date_index_mapping[row['date']]
    shop_item_index = unique_shop_item.index[(unique_shop_item['shop_id'] == row['shop_id']) & (unique_shop_item['item_id'] == row['item_id'])][0]
    numeric_data = row[columns].apply(pd.to_numeric, errors='coerce').values
    tensor[shop_item_index, date_index, :] = torch.tensor(numeric_data)



# Necesitamos llenar este tensor con los valores de la base de datos + las APIs
# Necesitamos llenar este tensor con los valores de la base de datos + las APIs
# Necesitamos llenar este tensor con los valores de la base de datos + las APIs
# Necesitamos llenar este tensor con los valores de la base de datos + las APIs



tensor.shape

torch.Size([6254, 731, 15])

In [5]:
pd.DataFrame(tensor[0, :, :].numpy(), columns=columns).head()

Unnamed: 0,day,month,year,quarter,weekday,is_month_start,is_month_end,acc_month_num,category_id,price,quantity,pbi,OPEP_oil_price,unemployment_rate,IPC_rate
0,31.0,10.0,2013.0,4.0,3.0,0.0,1.0,0.0,30.0,699.0,0.0,341693.0,106.75,5.5,6.3
1,1.0,11.0,2013.0,4.0,4.0,1.0,0.0,1.0,30.0,699.0,0.0,341693.0,105.529999,5.4,6.5
2,2.0,11.0,2013.0,4.0,5.0,0.0,0.0,1.0,30.0,699.0,0.0,341693.0,105.529999,5.4,6.5
3,3.0,11.0,2013.0,4.0,6.0,0.0,0.0,1.0,30.0,699.0,0.0,341693.0,105.529999,5.4,6.5
4,4.0,11.0,2013.0,4.0,0.0,0.0,0.0,1.0,30.0,699.0,0.0,341693.0,104.220001,5.4,6.5


In [13]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        return self.data.shape[0] - self.sequence_length + 1

    def __getitem__(self, idx):
        return {
            'data': self.data[idx, :self.sequence_length, :],
            'label': self.data[idx, self.sequence_length, :]
        }


x_train, x_test = train_test_split(tensor, test_size=0.1, shuffle=False)
x_train, x_test = x_train[:-1], x_test[:-1]

window_size = 30
train_dataset = TimeSeriesDataset(x_train, sequence_length=window_size)
test_dataset = TimeSeriesDataset(x_test, sequence_length=window_size)

batch_size = 61 # 1, 2, 41, 61, 82, 122, 2501 (divisores enteros de 5002)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [9]:
######## Ahora toca definir el modelo ######### 
######## Ahora toca definir el modelo ######### 
######## Ahora toca definir el modelo ######### 
######## Ahora toca definir el modelo ######### 


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True)
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # La entrada x debe tener dimensiones (batch_size, seq_len, input_size)
        lstm_out, _ = self.lstm(x)
        
        # Tomamos el último timestep de la salida de la LSTM
        last_timestep = lstm_out[:, -1, :]
        
        # Pasamos la salida del último timestep a través de una capa lineal
        output = self.fc(last_timestep)
        
        return output


In [20]:

# Definimos las dimensiones de entrada y salida de la red
input_size = 15  # Número de variables en cada timestep
hidden_size = 512  # Tamaño del estado oculto de la LSTM
num_layers = 5  # Número de capas en la LSTM
output_size = 1  # Tamaño de la salida (pronóstico para el próximo timestep)

# Creamos una instancia del modelo
model = LSTMModel(input_size, hidden_size, num_layers, output_size)

# Definir la función de pérdida y el optimizador
criterion = nn.MSELoss()  # Usar la pérdida del error cuadrático medio para la tarea de regresión
optimizer = torch.optim.Adam(model.parameters(), lr=1)  # Usar el optimizador Adam

# Definir el número de épocas
num_epochs = 10

# Bucle de entrenamiento
for epoch in tqdm(range(num_epochs)):
    model.train()  # Poner el modelo en modo de entrenamiento
    for batch in tqdm(train_dataloader):
        # Obtener los datos y las etiquetas del batch
        data = batch['data']
        labels = batch['label']

        # Pasar los datos a través del modelo
        outputs = model(data)

        # Calcular la pérdida
        loss = criterion(outputs, labels)

        # Retropropagar y optimizar
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Bucle de prueba
    model.eval()  # Poner el modelo en modo de evaluación
    with torch.no_grad():  # Desactivar el cálculo de gradientes para mejorar la eficiencia
        total_loss = 0
        total_samples = 0
        for batch in tqdm(test_dataloader):
            # Obtener los datos y las etiquetas del batch
            data = batch['data']
            labels = batch['label']

            # Pasar los datos a través del modelo
            outputs = model(data)

            # Calcular la pérdida
            loss = criterion(outputs, labels)

            total_loss += loss.item() * data.size(0)
            total_samples += data.size(0)

        # Calcular la pérdida media
        avg_loss = total_loss / total_samples

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


100%|██████████| 92/92 [00:29<00:00,  3.14it/s]
100%|██████████| 10/10 [00:01<00:00,  9.10it/s]
 10%|█         | 1/10 [00:30<04:33, 30.37s/it]

Epoch [1/10], Loss: 7291065597.4228


100%|██████████| 92/92 [00:28<00:00,  3.24it/s]
100%|██████████| 10/10 [00:01<00:00,  9.24it/s]
 20%|██        | 2/10 [00:59<03:58, 29.82s/it]

Epoch [2/10], Loss: 7255390898.6846


100%|██████████| 92/92 [00:29<00:00,  3.17it/s]
100%|██████████| 10/10 [00:01<00:00,  9.17it/s]
 30%|███       | 3/10 [01:29<03:29, 29.96s/it]

Epoch [3/10], Loss: 7255069850.6309


100%|██████████| 92/92 [00:29<00:00,  3.12it/s]
100%|██████████| 10/10 [00:01<00:00,  9.10it/s]
 40%|████      | 4/10 [02:00<03:01, 30.19s/it]

Epoch [4/10], Loss: 7255069955.4362


100%|██████████| 92/92 [00:29<00:00,  3.14it/s]
100%|██████████| 10/10 [00:01<00:00,  9.14it/s]
 50%|█████     | 5/10 [02:30<02:31, 30.27s/it]

Epoch [5/10], Loss: 7255071281.8255


100%|██████████| 92/92 [00:29<00:00,  3.15it/s]
100%|██████████| 10/10 [00:01<00:00,  8.62it/s]
 60%|██████    | 6/10 [03:01<02:01, 30.32s/it]

Epoch [6/10], Loss: 7255071072.2148


100%|██████████| 92/92 [00:29<00:00,  3.13it/s]
100%|██████████| 10/10 [00:01<00:00,  8.12it/s]
 70%|███████   | 7/10 [03:31<01:31, 30.41s/it]

Epoch [7/10], Loss: 7255070427.0604


100%|██████████| 92/92 [00:31<00:00,  2.92it/s]
100%|██████████| 10/10 [00:01<00:00,  8.53it/s]
 80%|████████  | 8/10 [04:04<01:02, 31.13s/it]

Epoch [8/10], Loss: 7255070677.0470


100%|██████████| 92/92 [00:30<00:00,  3.01it/s]
100%|██████████| 10/10 [00:01<00:00,  8.67it/s]
 90%|█████████ | 9/10 [04:36<00:31, 31.33s/it]

Epoch [9/10], Loss: 7255069903.0336


100%|██████████| 92/92 [00:30<00:00,  2.97it/s]
100%|██████████| 10/10 [00:01<00:00,  8.98it/s]
100%|██████████| 10/10 [05:08<00:00, 30.84s/it]

Epoch [10/10], Loss: 7255069443.4362



