In [15]:
print("ciao")

ciao


In [16]:
import random
import torch
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score, mean_squared_error
from sklearn.svm import LinearSVR, SVR
from sklearn.model_selection import train_test_split,GridSearchCV
        

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [17]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [18]:
SEED = 1038893

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))



root = "../../data/ml-25m"

Device: cuda


## Data Acquisition

In [19]:
def get_data_from_csv(file: str, nrows=None):
    if nrows:
        df = pd.read_csv(f"{root}/{file}", nrows=nrows)
    else:
        df = pd.read_csv(f"{root}/{file}")
    print(f"Loaded ml-25m data: {root}/{file}")

    return df

def get_ratings_from_csv(nrows=None):
    path = "ratings.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)

    data.drop("timestamp", axis=1, inplace=True)
    # todo: drop user id
    #data.drop("userId", axis=1, inplace=True)

    return data

def get_tag_relevances_from_csv(nrows=None):
    path = "genome-scores.csv"

    return get_data_from_csv(path, nrows)

In [20]:
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"

In [21]:
ratings = get_ratings_from_csv()
print(ratings)

Loaded ml-25m data: ../../data/ml-25m/ratings.csv
          userId  movieId  rating
0              1      296     5.0
1              1      306     3.5
2              1      307     5.0
3              1      665     5.0
4              1      899     3.5
...          ...      ...     ...
25000090  162541    50872     4.5
25000091  162541    55768     2.5
25000092  162541    56176     2.0
25000093  162541    58559     4.0
25000094  162541    63876     5.0

[25000095 rows x 3 columns]


In [22]:
genome_scores = get_tag_relevances_from_csv()
print(genome_scores)

Loaded ml-25m data: ../../data/ml-25m/genome-scores.csv
          movieId  tagId  relevance
0               1      1    0.02875
1               1      2    0.02375
2               1      3    0.06250
3               1      4    0.07575
4               1      5    0.14075
...           ...    ...        ...
15584443   206499   1124    0.11000
15584444   206499   1125    0.04850
15584445   206499   1126    0.01325
15584446   206499   1127    0.14025
15584447   206499   1128    0.03350

[15584448 rows x 3 columns]


In [23]:
# films = get_data_from_csv(f"{root}/{ratings}")[]


# DATA VISUALIZATION


In [24]:
            # # Fill in missing values with zeros
            # X.fillna(0, inplace=True)


In [25]:
def addColumnOperation(ratings,X):
     # Compute the mean rating for each user
     count_rating = ratings.groupby('movieId', as_index=False)['rating'].count()
     std= ratings.groupby('movieId', as_index=False)['rating'].std()
     std.fillna(0, inplace=True)
     min_ratings= ratings.groupby('movieId', as_index=False)['rating'].min()
     max_ratings= ratings.groupby('movieId', as_index=False)['rating'].max()
     median= ratings.groupby('movieId', as_index=False)['rating'].median()
     operation = pd.DataFrame({'movieId':count_rating['movieId'],'count_rating': count_rating['rating'], 'std': std['rating'], 'min': min_ratings['rating'], 'max': max_ratings['rating'], 'median': median['rating']}) 
     X = pd.merge(X, operation, on='movieId')
     X.drop("movieId", axis=1, inplace=True)
     return X

# FARE TEST CON AVG, STD_DEV

In [26]:
class SVM:
    def __init__(self, ratings, relevance, seed=1038893):
        # Pivot the relevance DataFrame to create a matrix of tag relevance scores for each movie
        relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)

        # Compute the mean rating for each user
        mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
        X = mean_ratings.merge(relevance_matrix, on='movieId')
        X = addColumnOperation(ratings,X)
        ratings = None
        #X = X.drop('movieId', axis=1)
        # Merge the ratings and relevance data
   
        X.columns = X.columns.astype(str)
        print(X.columns)
        y = X['rating']
        X = X.drop('rating', axis=1)

        # Split the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        #print(self.X_train.shape, self.X_test.shape, self.y_train.shape, self.y_test.shape)

        # Initialize the SVM model
        self.svm = LinearSVR()

        self.params = {'C': [0.1, 1, 10, 100],
            'epsilon': [0.01, 0.1, 1, 10],
            'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
            'max_iter': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 200)],
            'random_state': [seed]
          }

    # Training
    def train(self, num_epochs=10):
        for epoch in range(num_epochs):
            self.svm.fit(self.X_train, self.y_train)
            self.test()
        print('training finished.')

    def tuning(self):
        grid = GridSearchCV(estimator = self.svm, param_grid = self.params, cv = 5, n_jobs = -1, verbose = 10)
        grid.fit(self.X_train, self.y_train)
        print(grid.best_params_)
        self.svm = grid.best_estimator_
        

    def test(self):
        # Predict ratings for the test data
        y_pred = self.svm.predict(self.X_test)

        # Compute the mean squared error
        mse = mean_squared_error(self.y_test, y_pred)
        # Compute R^2
        r2 = r2_score(self.y_test, y_pred)
        # Compute the mean absolute error
        mae = mean_absolute_error(self.y_test, y_pred)

        print(f"Mean squared error: {mse}")
        print(f"R^2: {r2}")
        print(f"Mean absolute error: {mae}")

In [27]:
bs = SVM(ratings, genome_scores, SEED)

Index(['rating', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1124', '1125', '1126', '1127', '1128', 'count_rating', 'std', 'min',
       'max', 'median'],
      dtype='object', length=1134)


In [28]:
bs.tuning()

Fitting 5 folds for each of 6400 candidates, totalling 32000 fits


KeyboardInterrupt: 

In [None]:
bs.train(5)

In [None]:
bs.test()