In [8]:
import random
import torch
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error

In [9]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [18]:
SEED = 1038893

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

root = "../data/ml-25m"

Device: cpu


## Data Acquisition

In [11]:
def get_data_from_csv(file: str):
    df = pd.read_csv(file)
    return df

In [12]:
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"

In [20]:
ratings = get_data_from_csv(f"{root}/{ratings_path}")[:10000]
print(ratings)

      userId  movieId  rating   timestamp
0          1      296     5.0  1147880044
1          1      306     3.5  1147868817
2          1      307     5.0  1147868828
3          1      665     5.0  1147878820
4          1      899     3.5  1147868510
...      ...      ...     ...         ...
9995      75      736     4.0  1537207939
9996      75      778     3.0  1537208100
9997      75      783     3.0  1537348896
9998      75      805     3.5  1537348872
9999      75      832     3.0  1537208657

[10000 rows x 4 columns]


In [30]:
genome_scores = get_data_from_csv(f"{root}/{genome_scores_path}")[:10152]
print(genome_scores)

       movieId  tagId  relevance
0            1      1    0.02875
1            1      2    0.02375
2            1      3    0.06250
3            1      4    0.07575
4            1      5    0.14075
...        ...    ...        ...
10147        9   1124    0.01600
10148        9   1125    0.01200
10149        9   1126    0.00625
10150        9   1127    0.09475
10151        9   1128    0.01750

[10152 rows x 3 columns]


In [31]:

class Bayes:
    def __init__(self, ratings, scores, seed = 1038893) -> None:
        movie_genome_scores = pd.merge(ratings, scores, on='movieId')

        self.x = movie_genome_scores.pivot(index='movieId', columns='tagId', values='relevance')
        self.y = ratings.groupby('movieId').mean()['rating']

        self.X_sparse = csr_matrix(self.x)

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(self.X_sparse, self.y, test_size=0.2, random_state=seed)

        
        self.nb = GaussianNB

    #*  Training
    def train(self):
        self.nb.fit(self.X_train.toarray(), self.y_train)

    #*  Testing
    def testing(self):
        y_pred = self.nb.predict(self.X_test.toarray())

        #*  Calculating Mean Squared Error
        mse = mean_squared_error(self.y_test, y_pred)

        print(f'Mean Squared Error: {mse}')



In [32]:
bs = Bayes(ratings, genome_scores, SEED)

ValueError: Index contains duplicate entries, cannot reshape