In [281]:
import random
import torch
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, accuracy_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [63]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [64]:
SEED = 1038893

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

root = "../data/ml-25m"

Device: cpu


## Data Acquisition

In [65]:
def get_data_from_csv(file: str, nrows=None):
    if nrows:
        df = pd.read_csv(f"{root}/{file}", nrows=nrows)
    else:
        df = pd.read_csv(f"{root}/{file}")
    print(f"Loaded ml-25m data: {root}/{file}")

    return df

def get_ratings_from_csv(nrows=None):
    path = "ratings.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)

    data.drop("timestamp", axis=1, inplace=True)
    # todo: drop user id
    #data.drop("userId", axis=1, inplace=True)

    return data

def get_tag_relevances_from_csv(nrows=None):
    path = "genome-scores.csv"

    return get_data_from_csv(path, nrows)

In [66]:
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"

In [67]:
ratings = get_ratings_from_csv()
print(ratings)

Loaded ml-25m data: ../data/ml-25m/ratings.csv
          userId  movieId  rating
0              1      296     5.0
1              1      306     3.5
2              1      307     5.0
3              1      665     5.0
4              1      899     3.5
...          ...      ...     ...
25000090  162541    50872     4.5
25000091  162541    55768     2.5
25000092  162541    56176     2.0
25000093  162541    58559     4.0
25000094  162541    63876     5.0

[25000095 rows x 3 columns]


In [68]:
genome_scores = get_tag_relevances_from_csv()
print(genome_scores)

Loaded ml-25m data: ../data/ml-25m/genome-scores.csv
          movieId  tagId  relevance
0               1      1    0.02875
1               1      2    0.02375
2               1      3    0.06250
3               1      4    0.07575
4               1      5    0.14075
...           ...    ...        ...
15584443   206499   1124    0.11000
15584444   206499   1125    0.04850
15584445   206499   1126    0.01325
15584446   206499   1127    0.14025
15584447   206499   1128    0.03350

[15584448 rows x 3 columns]


In [69]:
# films = get_data_from_csv(f"{root}/{ratings}")[]


# DATA VISUALIZATION


In [None]:
            # # Fill in missing values with zeros
            # X.fillna(0, inplace=True)


# FARE TEST CON AVG, STD_DEV

In [282]:
class Bayes:
    def __init__(self, ratings, relevance, seed=42):
            #! df['rating'] = df['rating'].astype('float16')

            # Reduce genome-score size

            # Pivot the relevance DataFrame to create a matrix of tag relevance scores for each movie
            relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)

            # Compute the mean rating for each user
            mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
            ratings = None

            
            # Merge the ratings and relevance data
            X = mean_ratings.merge(relevance_matrix, on='movieId')
            X.columns = X.columns.astype(str)
            
            y = np.round(X['rating'], decimals=0)
            #y = X['rating']
            X = X.drop('rating', axis=1)

            print(y.head())

           # Convert X to a PyTorch tensor
            #self.X = X.to_numpy()
            #self.X = torch.tensor(X.values, dtype=torch.float16)

            # Create the target variable (y) as a PyTorch tensor
            #self.y = y.to_numpy()
            #self.y = torch.tensor(y.values, dtype=torch.float16)

            # Split the data into training and testing sets
            from sklearn.model_selection import train_test_split
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
            print(self.X_train.shape, self.X_test.shape, self.y_train.shape, self.y_test.shape)
            # Initialize the Gaussian Naive Bayes model
            self.nb = GaussianNB()


    # Training
    def train(self):
        nb = GaussianNB()
        nb.fit(self.X_train, self.y_train) 
        self.nb = nb

    def test(self):
        # Predict ratings for the test data
        y_pred = self.nb.predict(self.X_test)

        # Compute the mean squared error
        mse = mean_squared_error(self.y_test, y_pred)
        accuracy = accuracy_score(self.y_test, y_pred)

        print(f"Mean squared error: {mse}")
        print(f"Accuracy: {accuracy}")
        


In [283]:

# class Bayes:
#     def __init__(self, ratings, relevance, seed = 1038893) -> None:
#         self.ratings = ratings
#         #self.ratings = self.ratings.drop(['timestamp'], axis=1, inplace=True)
#         #self.ratings.drop(['userId'], axis=1, inplace=True)
#         mean_ratings = self.ratings.groupby('movieId', as_index=False)['rating'].mean()

#         #print(movie_genome_scores.head())

#         tmp_rels = relevance.groupby('movieId', as_index = False)
#         #print(relevancessss.head())
        
#         movies = []

        
#         #! le colonne sono nominate con int (tranne movieID)
#         scores = pd.DataFrame(index=np.arange(1), columns=np.arange(1129))
#         scores.rename(columns={0: 'movieId'}, inplace=True)
#         #scores = pd.DataFrame(columns=['movieId', tags])

#         # tmp_arr = []
#         # for mid, movie in enumerate(tmp_rels):
#         #     rels = []
#         #     # print(movie)
#         #     scores = movie[1]['relevance']

#         #     for x in range(1128):
#         #         # rels.append(movie[1]['relevance'][x])
#         #         print(movie[1]['relevance'][x])
#         #         # tmp_arr = [scores]

            
#             # print(f'\n\n{scores}')
#             # for item in movie:
#             #     pass
#                 # tmp_tag = item['tagId']
#                 # tmp_rel = item['relevance']
#                 # print(f'dd {tmp_tag} - {tmp_rel}')

#         # print(tmp_arr)



#         # d_tmp = {'movieId': movie[0], 'scores': rels}
#         # scores = scores.append(d_tmp, ignore_index = True)

#         # for mId, movie in enumerate(tmp_rels):
#         #     rels = []
#         #     for relevance in movie[1]['relevance']:
#         #         rels.append(relevance)

#         #     d_tmp = {'movieId': movie[0], 'scores': rels}
#         #     scores = scores.append(d_tmp, ignore_index = True)

#         # self.movie_genome_scores = pd.merge(mean_ratings, scores, on='movieId')
#         # self.movie_genome_scores.drop(['movieId'], axis=1, inplace=True)        

        

#         # self.x = self.movie_genome_scores['scores']
#         # self.y = mean_ratings['rating']
#         # print(self.x.head())
#         # print(self.y.head())

#         # self.X_sparse = csr_matrix(self.x)

#         # from sklearn.model_selection import train_test_split
#         # X_train, X_test, y_train, y_test = train_test_split(self.X_sparse, self.y, test_size=0.2, random_state=seed)

        
#         # self.nb = GaussianNB

#     #*  Training
#     def train(self):
#         self.nb.fit(self.X_train.toarray(), self.y_train)

#     #*  Testing
#     def testing(self):
#         y_pred = self.nb.predict(self.X_test.toarray())

#         #*  Calculating Mean Squared Error
#         mse = mean_squared_error(self.y_test, y_pred)

#         print(f'Mean Squared Error: {mse}')



In [284]:
bs = Bayes(ratings, genome_scores, SEED)

0    4.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: rating, dtype: float64
(11052, 1129) (2764, 1129) (11052,) (2764,)


In [285]:
bs.train()

In [286]:
bs.test()

Mean squared error: 0.32778581765557163
Accuracy: 0.6732995658465991
