In [1]:
import os
import zipfile
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import torch as torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from brevage_sales import brevage_preprocessing
from training_functions import train_model

In [2]:
# Kaggle dataset slugs ("<owner>/<dataset>") used by this notebook.
kaggle_datasets = [
    "rockyt07/stock-market-sensex-nifty-all-time-dataset",
    "minahilfatima12328/performance-trends-in-education",
    "alessiocorrado99/animals10",
    "sebastianwillmann/beverage-sales",
]

# Root folder that receives the downloaded datasets.
data_dir = "data/"

# Switch to True to run the Kaggle download cell.
download = False

In [3]:
if download:
    !mkdir -p {data_dir}
    for dataset in kaggle_datasets:
        if not os.path.exists(os.path.join(data_dir, dataset.split("/")[-1])):    
            !mkdir -p {data_dir/dataset}
            !kaggle datasets download -d {dataset} -p {data_dir}/{dataset} --unzip


In [4]:
# Load the MNIST train and test splits through Keras.
mnist_train_split, mnist_test_split = keras.datasets.mnist.load_data()
mnist_X_train_full, mnist_y_train_full = mnist_train_split
mnist_X_test, mnist_y_test = mnist_test_split

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Processing

In [7]:
# Read the beverage-sales CSV, then print its first rows followed by its shape.
beverage_csv_path = './data/sebastianwillmann/beverage-sales/synthetic_beverage_sales_data.csv'
df = pd.read_csv(beverage_csv_path)
print(df.head(), df.shape, sep="\n")


  Order_ID Customer_ID Customer_Type             Product     Category  \
0     ORD1     CUS1496           B2B          Vio Wasser        Water   
1     ORD1     CUS1496           B2B               Evian        Water   
2     ORD1     CUS1496           B2B              Sprite  Soft Drinks   
3     ORD1     CUS1496           B2B  Rauch Multivitamin       Juices   
4     ORD1     CUS1496           B2B        Gerolsteiner        Water   

   Unit_Price  Quantity  Discount  Total_Price             Region  Order_Date  
0        1.66        53      0.10        79.18  Baden-Württemberg  2023-08-23  
1        1.56        90      0.10       126.36  Baden-Württemberg  2023-08-23  
2        1.17        73      0.05        81.14  Baden-Württemberg  2023-08-23  
3        3.22        59      0.10       170.98  Baden-Württemberg  2023-08-23  
4        0.87        35      0.10        27.40  Baden-Württemberg  2023-08-23  
(8999910, 11)


In [8]:
# Keep only 1,000,000 randomly sampled rows (fixed random_state).
# NOTE(review): `df` is reassigned here, so re-running this cell samples from
# the already-sampled frame rather than from the full CSV.
df = df.sample(n=1_000_000, random_state=42).reset_index(drop=True)

# brevage_preprocessing returns the six train/val/test objects unpacked below.
(
    X_train_scaled,
    X_val_scaled,
    X_test_scaled,
    y_train,
    y_val,
    y_test,
) = brevage_preprocessing(df, test_size=0.2, val_size=0.2)


In [9]:
from brevage_sales import brevage_model

In [10]:
# Training hyperparameters.
learning_rate = 0.001
num_epochs = 50
batch_size = 128

# Network (ReLU variant), loss and optimizer.
model = brevage_model(X_train_scaled.shape[1], mode='relu').to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def _to_device_dataset(features, targets):
    """Build a TensorDataset of float32 tensors placed on `device`.

    `features` is converted as-is; `targets` must expose `.values` and is
    reshaped into a single column.
    """
    feature_tensor = torch.tensor(features, dtype=torch.float32).to(device)
    target_tensor = torch.tensor(targets.values, dtype=torch.float32).view(-1, 1).to(device)
    return TensorDataset(feature_tensor, target_tensor)


train_dataset = _to_device_dataset(X_train_scaled, y_train)
val_dataset = _to_device_dataset(X_val_scaled, y_val)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [11]:
# Run training. `history` holds whatever train_model returns; a later cell
# indexes it with string keys such as "final_test_loss".
history = train_model(
    model,
    criterion,
    optimizer,
    num_epochs,
    train_loader,
    val_loader,
)

50
Epoch 1/50, Training Loss: 76762.4790, Validation Loss: 13465.6400
Epoch 2/50, Training Loss: 4917.4804, Validation Loss: 2914.6772
Epoch 3/50, Training Loss: 2385.2716, Validation Loss: 2020.1267
Epoch 4/50, Training Loss: 1355.4029, Validation Loss: 859.7517
Epoch 5/50, Training Loss: 546.2673, Validation Loss: 348.0062
Epoch 6/50, Training Loss: 285.8397, Validation Loss: 249.3131
Epoch 7/50, Training Loss: 208.8318, Validation Loss: 183.0360
Epoch 8/50, Training Loss: 173.3266, Validation Loss: 168.9972
Epoch 9/50, Training Loss: 150.5946, Validation Loss: 155.3831
Epoch 10/50, Training Loss: 134.2525, Validation Loss: 132.5959
Epoch 11/50, Training Loss: 122.5016, Validation Loss: 110.8246
Epoch 12/50, Training Loss: 116.3562, Validation Loss: 114.6704
Epoch 13/50, Training Loss: 112.6145, Validation Loss: 103.2811
Epoch 14/50, Training Loss: 107.0414, Validation Loss: 107.8675
Epoch 15/50, Training Loss: 103.7634, Validation Loss: 112.3664
Epoch 16/50, Training Loss: 99.6613, 

In [13]:
from sklearn.metrics import mean_squared_error

# Score the model on the test split and store the result in `history`.
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Inference must not run in training mode (dropout / batch-norm layers, if the
# model has any, behave differently) and needs no autograd graph.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        y_pred = model(X_test_tensor).cpu().numpy()
finally:
    # Leave the model in the mode it was in, so later training cells are unaffected.
    model.train(was_training)

mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {mse}')
history["final_test_loss"] = mse

Test MSE: 46.5607867816725


In [12]:
# Print the current contents of `history` in full.
print(history)

{'train_loss': [76762.47895996094, 4917.480402141927, 2385.2715601302084, 1355.4028592252605, 546.267305398763, 285.83973342610676, 208.83184728515624, 173.3265512467448, 150.59456974527995, 134.25248311035156, 122.50164867024739, 116.35622235677083, 112.61448102539063, 107.04140088378907, 103.76343618326823, 99.66126768229167, 95.9821802360026, 94.33911462320964, 91.18894970499674, 88.02515171468099, 84.88403221150716, 83.3407439436849, 81.36441092081705, 79.31362923502604, 78.35122577555339, 76.03748635701497, 73.95446583658854, 72.45813270914714, 70.24104186238607, 69.9657667972819, 66.68204318888347, 65.17111615193684, 64.77353048421224, 61.88393656494141, 61.30457855102539, 59.284649084472655, 57.61360815063477, 56.9302004699707, 55.20521035746256, 54.72456991923014, 54.28219326944987, 53.1172882039388, 51.10087702718099, 49.87973960306803, 51.0055470296224, 49.062449031982425, 48.737331572672524, 47.79843918273926, 47.24748483378092, 47.339593358357746], 'val_loss': [13465.639988

In [None]:
# What needs to be recorded for each run:
# - train loss for each epoch
# - val loss for each epoch
# - training time
# - final test loss
# - relu or gelu
# - model parameters (depends on the dataset)
# - training parameters:
#     - number of epochs
#     - batch size
#     - learning rate
#
# NOTE(review): training time is listed above, but no key below stores it.
# NOTE(review): this assignment replaces any `history` already bound, e.g. the
# one returned by train_model.

history = dict(
    train_loss=[],
    val_loss=[],
    final_test_loss=None,
    activation_function='relu',
    model_parameters=model.state_dict(),
    training_parameters=dict(
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
    ),
)