In [None]:
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 




In [None]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [None]:
SEED = 1038893

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

root = "../../data/ml-25m"

Device: cpu


In [None]:
## Data Acquisition
def get_data_from_csv(file: str, nrows=None):
    if nrows:
        df = pd.read_csv(f"{root}/{file}", nrows=nrows)
    else:
        df = pd.read_csv(f"{root}/{file}")
    print(f"Loaded ml-25m data: {root}/{file}")

    return df

In [None]:
def get_ratings_from_csv(nrows=None):
    path = "ratings.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)

    data.drop("timestamp", axis=1, inplace=True)
    # todo: drop user id
    #data.drop("userId", axis=1, inplace=True)

    return data

In [None]:
def get_tag_relevances_from_csv(nrows=None):
    path = "genome-scores.csv"

    return get_data_from_csv(path, nrows)
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"
ratings = get_ratings_from_csv()
print(ratings)
genome_scores = get_tag_relevances_from_csv()
print(genome_scores)
# films = get_data_from_csv(f"{root}/{ratings}")[]

Loaded ml-25m data: ../../data/ml-25m/ratings.csv
          userId  movieId  rating
0              1      296     5.0
1              1      306     3.5
2              1      307     5.0
3              1      665     5.0
4              1      899     3.5
...          ...      ...     ...
25000090  162541    50872     4.5
25000091  162541    55768     2.5
25000092  162541    56176     2.0
25000093  162541    58559     4.0
25000094  162541    63876     5.0

[25000095 rows x 3 columns]
Loaded ml-25m data: ../../data/ml-25m/genome-scores.csv
          movieId  tagId  relevance
0               1      1    0.02875
1               1      2    0.02375
2               1      3    0.06250
3               1      4    0.07575
4               1      5    0.14075
...           ...    ...        ...
15584443   206499   1124    0.11000
15584444   206499   1125    0.04850
15584445   206499   1126    0.01325
15584446   206499   1127    0.14025
15584447   206499   1128    0.03350

[15584448 rows x 3 colum

In [None]:
# DATA VISUALIZATION

            # # Fill in missing values with zeros
            # X.fillna(0, inplace=True)

# FARE TEST CON AVG, STD_DEV
def addColumnOperation(ratings,X):
     # Compute the mean rating for each user
     count_rating = ratings.groupby('movieId', as_index=False)['rating'].count()
     std= ratings.groupby('movieId', as_index=False)['rating'].std()
     std.fillna(0, inplace=True)
     min_ratings= ratings.groupby('movieId', as_index=False)['rating'].min()
     max_ratings= ratings.groupby('movieId', as_index=False)['rating'].max()
     median= ratings.groupby('movieId', as_index=False)['rating'].median()
     operation = pd.DataFrame({'movieId':count_rating['movieId'],'count_rating': count_rating['rating'], 'std': std['rating'], 'min': min_ratings['rating'], 'max': max_ratings['rating'], 'median': median['rating']}) 
     X = pd.merge(X, operation, on='movieId')
     X.drop("movieId", axis=1, inplace=True)
     return X

In [39]:
class Model:
    def __init__(self, ratings, relevance, seed=42):
        #! df['rating'] = df['rating'].astype('float16')

        # Reduce genome-score size

        # Pivot the relevance DataFrame to create a matrix of tag relevance scores for each movie
        relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)

        
        # Merge the ratings and relevance data
        mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
        X = mean_ratings.merge(relevance_matrix, on='movieId')
        X = addColumnOperation(ratings,X)
        X.columns = X.columns.astype(str)
        ratings = None  
        # mescolare le righe del DataFrame
        #X = X.sample(frac=1).reset_index(drop=True)
        
        self.df = X
        
        print(X.columns)
        # Split the data into training and testing sets
        self.training, self.test = train_test_split(self.df, test_size=0.2, random_state=42)

        # Split training data into training and validation sets
        self.traininig, self.val = train_test_split(self.training, test_size=0.2, random_state=42)

        data_config = DataConfig(
        target=[
            "rating"
        ],  # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
        continuous_cols=self.df.columns.tolist(),
        )
        trainer_config = TrainerConfig(
            auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
            batch_size=1024,
            max_epochs=100,
            accelerator="auto",
        )
        optimizer_config = OptimizerConfig()

        model_config = CategoryEmbeddingModelConfig(
            task="regression",
            layers="1024-512-512",  # Number of nodes in each layer
            activation="LeakyReLU",  # Activation between each layers
            learning_rate=1e-3,
        )

    
        self.tabularModel = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
            
        )
                       
    def train(self):
        self.tabularModel.fit(
            train  = self.training,
            validation = self.val,
        )

        result = self.tabularModel.evaluate(self.test)
        #self.tabularModel.save_model("examples/basic")
        #loaded_model = TabularModel.load_from_checkpoint("examples/basic")
        
                


In [40]:
model = Model(ratings, genome_scores)

Index(['rating', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1124', '1125', '1126', '1127', '1128', 'count_rating', 'std', 'min',
       'max', 'median'],
      dtype='object', length=1134)


2023-02-27 13:56:25,126 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off


In [41]:
model.train()

Global seed set to 42
2023-02-27 13:56:25,396 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-02-27 13:56:25,479 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for regression task
2023-02-27 13:56:28,850 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: CategoryEmbeddingModel
2023-02-27 13:56:29,247 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-02-27 13:56:29,625 - {pytorch_tabular.tabular_model:558} - INFO - Auto LR Find Started
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=82` reached.
LR finder stopped early after 82 steps due to diverging loss.
Learning rate set to 0.0002511886431509582
Restoring states from the checkpoint path at c:\Users\simon\Desktop\unibo-data-analytics\src\Fun_3\.lr_find_a7c94326-332e-4bb6-b0ac-47ba425a7374.ckpt
Restored all states from the checkpoint file at c:\Users\simon\Desktop\unibo-data-analytics\src\Fun_3\.lr_find_a7c94326-332e-4bb6-b0ac-47ba425a7374.ckpt
2023-02-27 13:56:46,095 - {pytorch_tabular.tabular_model:560} - INFO - Suggested LR: 0.0002511886431509582. For plot and detailed analysis, use `find_learning_rate` method.
2023-02-27 13:56:46,103 - {pytorch_tabular.tabular_model:566} - INFO - Training Started


Output()

2023-02-27 13:58:02,606 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
2023-02-27 13:58:02,606 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model
  rank_zero_warn(


Output()