In [12]:
import random
import torch
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [13]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

In [14]:
SEED = 1038893

fix_random(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: {}".format(device))

root = "../../data/ml-25m"

Device: cpu


## Data Acquisition

In [15]:
def get_data_from_csv(file: str, nrows=None):
    if nrows:
        df = pd.read_csv(f"{root}/{file}", nrows=nrows)
    else:
        df = pd.read_csv(f"{root}/{file}")
    print(f"Loaded ml-25m data: {root}/{file}")

    return df

def get_movies_from_csv(nrows=None):
    path = "movies.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)
    print(data.head())
    return data

def get_ratings_from_csv(nrows=None):
    path = "ratings.csv"

    data = get_data_from_csv(path, nrows)
    # data = get_data_from_csv(path)

    data.drop("timestamp", axis=1, inplace=True)
    # todo: drop user id
    #data.drop("userId", axis=1, inplace=True)

    return data

def get_tag_relevances_from_csv(nrows=None):
    path = "genome-scores.csv"

    return get_data_from_csv(path, nrows)

In [16]:
genome_scores_path = "genome-scores.csv"
ratings_path = "ratings.csv"

In [17]:
ratings = get_ratings_from_csv()
movies = get_movies_from_csv()
print(ratings)

Loaded ml-25m data: ../../data/ml-25m/ratings.csv
Loaded ml-25m data: ../../data/ml-25m/movies.csv
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
          userId  movieId  rating
0              1      296     5.0
1              1      306     3.5
2              1      307     5.0
3              1      665     5.0
4              1      899     3.5
...          ...      ...     ...
25000090  162541    50872     4.5
25000091  162541    55768     2.5


In [18]:
genome_scores = get_tag_relevances_from_csv()
print(genome_scores)

Loaded ml-25m data: ../../data/ml-25m/genome-scores.csv
          movieId  tagId  relevance
0               1      1    0.02875
1               1      2    0.02375
2               1      3    0.06250
3               1      4    0.07575
4               1      5    0.14075
...           ...    ...        ...
15584443   206499   1124    0.11000
15584444   206499   1125    0.04850
15584445   206499   1126    0.01325
15584446   206499   1127    0.14025
15584447   206499   1128    0.03350

[15584448 rows x 3 columns]


# DATA VISUALIZATION


# FARE TEST CON AVG, STD_DEV

In [19]:
def addColumnOperation(ratings,X):
     # Compute the mean rating for each user
     count_rating = ratings.groupby('movieId', as_index=False)['rating'].count()
     std= ratings.groupby('movieId', as_index=False)['rating'].std()
     std.fillna(0, inplace=True)
     min_ratings= ratings.groupby('movieId', as_index=False)['rating'].min()
     max_ratings= ratings.groupby('movieId', as_index=False)['rating'].max()
     median= ratings.groupby('movieId', as_index=False)['rating'].median()
     operation = pd.DataFrame({'movieId':count_rating['movieId'],'count_rating': count_rating['rating'], 'std': std['rating'], 'min': min_ratings['rating'], 'max': max_ratings['rating'], 'median': median['rating']}) 
     X = pd.merge(X, operation, on='movieId')
     return X
    
def preprocessing1(relevance, ratings):
     # Reduce genome-score size
     # Pivot the relevance DataFrame to create a matrix of tag relevance scores for each movie
     relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)
     counting = relevance.groupby('movieId', as_index=False)['relevance'].count()
     print(counting)
     # Merge the ratings and relevance data
     mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
     X = mean_ratings.merge(relevance_matrix, on='movieId')
     #X = addColumnOperation(ratings,X)
     X.columns = X.columns.astype(str)
     ratings = None  
     # mescolare le righe del DataFrame
     #X = X.sample(frac=1).reset_index(drop=True)
     
     y = X['rating']
     X.drop("movieId", axis=1, inplace=True)
     X = X.drop('rating', axis=1)
     return X,y

def preprocessing2(movies,relevance, ratings):
     # Merge the ratings and movies data
     #X = ratings.merge(movies, on='movieId')
     movies_genres = movies['genres'].str.split( '|')
     m = pd.get_dummies(movies_genres.apply(pd.Series).stack()).sum(level=0)
     movies_genres = pd.concat([movies['movieId'], m], axis=1)

     relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)
     mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
     
     X = mean_ratings.merge(movies_genres, on='movieId')
     X = X.merge(relevance_matrix, on='movieId')
     X.columns = X.columns.astype(str)
     ratings = None  
     # mescolare le righe del DataFrame
     #X = X.sample(frac=1).reset_index(drop=True)
     
     y = X['rating']
     X.drop("movieId", axis=1, inplace=True)
     X = X.drop('rating', axis=1)
     
     # Apply PCA if n_components is specified
     pca = PCA()
     X = pca.fit_transform(X)
     return X,y

In [26]:
class LinearRegressionModel:
    def __init__(self, ratings, relevance,movies, seed=SEED):
        import matplotlib.pyplot as plt
        from sklearn.preprocessing import StandardScaler,scale
        from sklearn.pipeline import Pipeline

        relevance_matrix = relevance.pivot_table(index='movieId', columns='tagId', values='relevance', fill_value=0)
        print(relevance_matrix.shape)
        #Create One Hot Encoding for genres
        movies_genres = movies['genres'].str.split( '|')
        m = pd.get_dummies(movies_genres.apply(pd.Series).stack()).sum(level=0)
        movies_genres = pd.concat([movies['movieId'], m], axis=1)

        # Merge the ratings and relevance data
        mean_ratings = ratings.groupby('movieId', as_index=False)['rating'].mean()
        X = mean_ratings.merge(relevance_matrix, on='movieId')
        X = X.merge(movies_genres, on='movieId')
    
        X.columns = X.columns.astype(str)
        
        ratings = None  

        # mescolare le righe del DataFrame
        #X = X.sample(frac=1).reset_index(drop=True)
        
        X.drop("movieId", axis=1, inplace=True)
        self.y = X['rating']
        self.X = X.drop('rating', axis=1)
        
        # Apply PCA if n_components is specified
       
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=seed)
        print(self.X_train.shape, self.X_test.shape, self.y_train.shape, self.y_test.shape)

        self.pipelineScaled = Pipeline([('Standard Scaler',StandardScaler()),('pca',PCA(n_components=0.95)),('Linear Regression',LinearRegression())],verbose=True)
        self.pipeline = Pipeline([('pca',PCA(n_components=0.95)),('Linear Regression',LinearRegression())],verbose=True)
        self.onlinereg = Pipeline([('Linear Regression',LinearRegression())],verbose=True)
       
       #self.model = LinearRegression()
        '''pca = PCA()
        pca.fit(self.X_train)
        self.X_train_t = pca.transform(self.X_train)
        self.X_test_t = pca.transform(self.X_test)
        print(self.X_train)
        print(self.X_train_t)
        plt.scatter(self.X_train_t[:, 0], self.X_train_t[:, 1], c=self.y_train, cmap='viridis')
        plt.show()
        
        plt.scatter(self.X_test_t[:, 0], self.X_test_t[:, 1], c=self.y_test, cmap='viridis')
        plt.show()
        self.model = LinearRegression()
        self.params = {
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'copy_X': [True, False]
            }
            '''
       
            
    def train(self):
        for model in [self.pipelineScaled,self.pipeline,self.onlinereg]:
            model.fit(self.X_train, self.y_train)
    
        
     # Hyperparameter tuning
    def tuning(self,X,y):
        grid_search = GridSearchCV(self.model, param_grid=self.params, cv=3, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)
        grid_search.fit(X, y)
        self.model = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best score: {grid_search.best_score_:.4f}")
        
        # Re-initialize the model with the best hyperparameters
        #self.model = LinearRegression(**grid_search.best_params_)


    def test(self):
        #model = self.model
        # Predict ratings for the test data
        for model in [self.pipelineScaled, self.pipeline, self.onlinereg]:
            y_pred = model.predict(self.X_test)

            # Compute the mean squared error
            mse = mean_squared_error(self.y_test, y_pred)
            rmse = mean_squared_error(self.y_test, y_pred, squared=False)
            r2 = r2_score(self.y_test, y_pred)
            mae = mean_absolute_error(self.y_test, y_pred)

            print(f"MSE: {mse} RMSE: {rmse} R2: {r2} MAE: {mae}")
            score = model.score(self.X_test, self.y_test)
            print(f"Score:{score}")
            print("=====================================")
    

        


In [27]:
lr = LinearRegressionModel(ratings,genome_scores,movies, SEED)

(13816, 1128)


  m = pd.get_dummies(movies_genres.apply(pd.Series).stack()).sum(level=0)


(11052, 1148) (2764, 1148) (11052,) (2764,)


In [28]:
lr.train()

[Pipeline] ... (step 1 of 3) Processing Standard Scaler, total=   0.7s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   8.9s
[Pipeline] . (step 3 of 3) Processing Linear Regression, total=   2.9s
[Pipeline] ............... (step 1 of 2) Processing pca, total=   9.1s
[Pipeline] . (step 2 of 2) Processing Linear Regression, total=   2.3s
[Pipeline] . (step 1 of 1) Processing Linear Regression, total=   5.7s


In [29]:
lr.test()

MSE: 0.006597745042071569 RMSE: 0.08122650455406516 R2: 0.9713379618311104 MAE: 0.0633443088299459
Score:0.9713379618311104
MSE: 0.006559006068277205 RMSE: 0.08098769084420919 R2: 0.9715062523513468 MAE: 0.06231471983066634
Score:0.9715062523513468
MSE: 0.005221310694310264 RMSE: 0.0722586375066003 R2: 0.977317491740335 MAE: 0.05602101705311702
Score:0.977317491740335
MSE: 0.005221310694310264 RMSE: 0.0722586375066003 R2: 0.977317491740335 MAE: 0.05602101705311702
Score:0.977317491740335
