In [34]:
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [35]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [36]:
SEED = 1038893

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

root = "../../data/ml-25m"

Device: cpu


In [37]:
## Data Acquisition
def get_data_from_csv(file: str, nrows=None):
    if nrows:
        df = pd.read_csv(f"{root}/{file}", nrows=nrows)
    else:
        df = pd.read_csv(f"{root}/{file}")
    print(f"Loaded ml-25m data: {root}/{file}")

    return df

In [38]:
def get_ratings_from_csv(nrows=None):
    path = "ratings.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)

    data.drop("timestamp", axis=1, inplace=True)
    # todo: drop user id
    #data.drop("userId", axis=1, inplace=True)

    return data

In [39]:
def get_tag_relevances_from_csv(nrows=None):
    path = "genome-scores.csv"

    return get_data_from_csv(path, nrows)
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"
ratings = get_ratings_from_csv()
print(ratings)
genome_scores = get_tag_relevances_from_csv()
print(genome_scores)
# films = get_data_from_csv(f"{root}/{ratings}")[]

Loaded ml-25m data: ../../data/ml-25m/ratings.csv
          userId  movieId  rating
0              1      296     5.0
1              1      306     3.5
2              1      307     5.0
3              1      665     5.0
4              1      899     3.5
...          ...      ...     ...
25000090  162541    50872     4.5
25000091  162541    55768     2.5
25000092  162541    56176     2.0
25000093  162541    58559     4.0
25000094  162541    63876     5.0

[25000095 rows x 3 columns]
Loaded ml-25m data: ../../data/ml-25m/genome-scores.csv
          movieId  tagId  relevance
0               1      1    0.02875
1               1      2    0.02375
2               1      3    0.06250
3               1      4    0.07575
4               1      5    0.14075
...           ...    ...        ...
15584443   206499   1124    0.11000
15584444   206499   1125    0.04850
15584445   206499   1126    0.01325
15584446   206499   1127    0.14025
15584447   206499   1128    0.03350

[15584448 rows x 3 colum

In [40]:
# DATA VISUALIZATION

            # # Fill in missing values with zeros
            # X.fillna(0, inplace=True)

# FARE TEST CON AVG, STD_DEV
def addColumnOperation(ratings,X):
     # Compute the mean rating for each user
     count_rating = ratings.groupby('movieId', as_index=False)['rating'].count()
     std= ratings.groupby('movieId', as_index=False)['rating'].std()
     std.fillna(0, inplace=True)
     min_ratings= ratings.groupby('movieId', as_index=False)['rating'].min()
     max_ratings= ratings.groupby('movieId', as_index=False)['rating'].max()
     median= ratings.groupby('movieId', as_index=False)['rating'].median()
     operation = pd.DataFrame({'movieId':count_rating['movieId'],'count_rating': count_rating['rating'], 'std': std['rating'], 'min': min_ratings['rating'], 'max': max_ratings['rating'], 'median': median['rating']}) 
     X = pd.merge(X, operation, on='movieId')
     X.drop("movieId", axis=1, inplace=True)
     return X

In [45]:
from sklearn.metrics import mean_squared_error, r2_score
from pytorch_tabnet.augmentations import RegressionSMOTE
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
           

class Model:
    def __init__(self, ratings, relevance, seed=42):
        self.aug = RegressionSMOTE(p=0.2)
        #! df['rating'] = df['rating'].astype('float16')

        # Reduce genome-score size

        # Pivot the relevance DataFrame to create a matrix of tag relevance scores for each movie
        relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)
        # Merge the ratings and relevance data
        mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
        X = mean_ratings.merge(relevance_matrix, on='movieId')
        #X = addColumnOperation(ratings,X)
        X.columns = X.columns.astype(str)
        ratings = None  
        train = X
        # mescolare le righe del DataFrame
        #X = X.sample(frac=1).reset_index(drop=True)
        
        if "Set" not in train.columns:
            train["Set"] = np.random.choice(["train", "valid", "test"], p =[.8, .1, .1], size=(train.shape[0],))

        features = [ col for col in train.columns if col not in ["rating", "Set"]]
        target = "rating"
        
        train_indices = train[train.Set=="train"].index
        valid_indices = train[train.Set=="valid"].index
        test_indices = train[train.Set=="test"].index

        self.X_train = train[features].values[train_indices]
        self.y_train = train[target].values[train_indices].reshape(-1, 1)

        self.X_valid = train[features].values[valid_indices]
        self.y_valid = train[target].values[valid_indices].reshape(-1, 1)

        self.X_test = train[features].values[test_indices]
        self.y_test = train[target].values[test_indices].reshape(-1, 1)
        

        print(self.X_train)
        # Split the training data into training and validation sets    
        self.clf = TabNetRegressor(seed=SEED)  #TabNetRegressor()
        
    def train(self):
        self.clf.fit(
            X_train=self.X_train, y_train=self.y_train,
            eval_set=[(self.X_train,self.y_train), (self.X_valid, self.y_valid)],
            eval_name=['train', 'valid'],
            eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
            max_epochs=150,
            patience=20,
            batch_size=1024, virtual_batch_size=1024,
            num_workers=0,
            drop_last=False,
            augmentations=self.aug, #aug
        ) 

    def test(self):
        # Predict the labels of the test set: y_pred
        y_pred = self.clf.predict(self.X_test)

        # Compute the mean squared error
        mse = mean_squared_error(self.y_test, y_pred)
        rmse = mean_squared_error(self.y_test, y_pred, squared=False)
        r2 = r2_score(self.y_test, y_pred)
        mae = mean_absolute_error(self.y_test, y_pred)

        print(f"MSE: {mse} RMSE: {rmse} R2: {r2} MAE: {mae}")
        print("=====================================")

    

In [46]:
model = Model(ratings, genome_scores)

[[1.00000e+00 2.87500e-02 2.37500e-02 ... 2.97500e-02 8.47500e-02
  2.20000e-02]
 [2.00000e+00 4.12500e-02 4.05000e-02 ... 1.10000e-02 1.05250e-01
  1.97500e-02]
 [3.00000e+00 4.67500e-02 5.55000e-02 ... 1.80000e-02 9.10000e-02
  1.77500e-02]
 ...
 [2.05383e+05 4.10000e-02 4.02500e-02 ... 2.90000e-02 1.17250e-01
  3.92500e-02]
 [2.05425e+05 4.52500e-02 4.12500e-02 ... 1.50000e-02 1.10500e-01
  2.85000e-02]
 [2.06499e+05 1.00500e-01 9.32500e-02 ... 1.32500e-02 1.40250e-01
  3.35000e-02]]




In [47]:
model.train()

epoch 0  | loss: 11.67023| train_rmsle: 0.30197 | train_mae: 1.78833 | train_rmse: 1.85349 | train_mse: 3.43544 | valid_rmsle: 0.30387 | valid_mae: 1.79529 | valid_rmse: 1.85979 | valid_mse: 3.4588  |  0:00:11s
epoch 1  | loss: 3.34823 | train_rmsle: 0.12104 | train_mae: 1.21853 | train_rmse: 1.30273 | train_mse: 1.6971  | valid_rmsle: 0.12201 | valid_mae: 1.22289 | valid_rmse: 1.30869 | valid_mse: 1.71266 |  0:00:22s
epoch 2  | loss: 1.09356 | train_rmsle: 0.05416 | train_mae: 0.83401 | train_rmse: 0.92877 | train_mse: 0.86261 | valid_rmsle: 0.05436 | valid_mae: 0.835   | valid_rmse: 0.93227 | valid_mse: 0.86913 |  0:00:35s
epoch 3  | loss: 0.52863 | train_rmsle: 0.02066 | train_mae: 0.50827 | train_rmse: 0.59618 | train_mse: 0.35543 | valid_rmsle: 0.02058 | valid_mae: 0.50914 | valid_rmse: 0.59875 | valid_mse: 0.35851 |  0:00:45s
epoch 4  | loss: 0.34933 | train_rmsle: 0.01448 | train_mae: 0.41231 | train_rmse: 0.49493 | train_mse: 0.24496 | valid_rmsle: 0.01426 | valid_mae: 0.41482 



In [48]:
model.test()

MSE: 0.009059816208471365 RMSE: 0.0951830668158542 R2: 0.9598956845605661 MAE: 0.07284897751167234


AttributeError: 'TabNetRegressor' object has no attribute 'score'