In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pivot-table3-csv/pivot_table (1).csv


In [1]:
from itertools import product

from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
import pandas as pd
import time
import gc
import pickle  

# Single seed shared by the train/test split and every SVD fit, so a
# "Restart & Run All" reproduces the same split, the same metrics and the same
# winning parameter set, and all candidates start from the same initialisation.
RANDOM_STATE = 42

# Load the pivot table from the CSV file; the first CSV column becomes the index.
# Presumably one row per customer and one column per movie -- confirm against the
# code that produced the CSV.
df_pivot = pd.read_csv('/kaggle/input/pivot-table3-csv/pivot_table (1).csv', index_col=0)

# Reshape to the long (user, item, rating) layout that Surprise expects.
# The axes are named up front so the resulting columns do not depend on the
# auto-generated 'level_1' / 0 labels or on how the CSV header names its first
# column. Missing cells are dropped explicitly so unrated pairs are never passed
# to Surprise, whatever NA handling the installed pandas applies in stack().
df_melt = (
    df_pivot
    .rename_axis(index='Cust_Id', columns='Movie_Id')
    .stack()
    .dropna()
    .rename('Rating')
    .reset_index()
)

# Reader declaring that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the three columns: Cust_Id, Movie_Id and Rating.
data = Dataset.load_from_df(df_melt[['Cust_Id', 'Movie_Id', 'Rating']], reader)

# Hold out 25% of the ratings as the test set (seeded for reproducibility).
trainset, testset = train_test_split(data, test_size=0.25, random_state=RANDOM_STATE)

# Hyperparameter values explored by the grid search.
n_factors_values = [10, 20, 30]  # number of latent factors
n_epochs_values = [10, 20, 30]   # number of training epochs
lr_all_values = [0.002, 0.005, 0.01]  # learning rate applied to all parameters
reg_all_values = [0.02, 0.1, 0.2]     # regularisation term applied to all parameters

# Trackers for the best model seen so far.
best_rmse = float('inf')  
best_params = None  
best_model = None  

# NOTE(review): candidates are ranked by RMSE on `testset`, so the reported best
# RMSE is an optimistic estimate. A separate validation split or cross-validation
# would give an unbiased figure -- left unchanged here to keep the experiment
# design and runtime as they were.

# Exhaustive grid search. product() yields the combinations in the same order as
# four nested loops would (the last list varies fastest).
for n_factors, n_epochs, lr_all, reg_all in product(
        n_factors_values, n_epochs_values, lr_all_values, reg_all_values):

    # SVD model for the current parameter combination.
    algo = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
               reg_all=reg_all, random_state=RANDOM_STATE)

    # Fit the model and time it; perf_counter() is monotonic, so the measured
    # interval cannot be distorted by wall-clock adjustments.
    start_time = time.perf_counter()
    algo.fit(trainset)
    end_time = time.perf_counter()
    print(f"Parametri: n_factors={n_factors}, n_epochs={n_epochs}, lr_all={lr_all}, reg_all={reg_all}")
    print(f"Vreme fitovanja: {end_time - start_time:.2f} sekundi")

    # Predict on the held-out ratings and time the prediction pass.
    start_time = time.perf_counter()
    predictions = algo.test(testset)
    end_time = time.perf_counter()
    print(f"Vreme predviđanja: {end_time - start_time:.2f} sekundi")

    # Error metrics for this combination.
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)
    print(f'RMSE: {rmse}, MAE: {mae}')

    # Keep the model with the lowest RMSE seen so far.
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = (n_factors, n_epochs, lr_all, reg_all)
        best_model = algo

    # Drop the loop's reference. The current best stays alive through
    # `best_model`; a superseded or losing model becomes collectable here.
    del algo
    gc.collect()

    print('-' * 50)

# Persist the best model to disk. The check is against None explicitly, so the
# branch does not depend on the truthiness of the model object.
if best_model is not None:
    with open('best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)
    
    # Report the winning parameters and their RMSE.
    print(f"Najbolji parametri: n_factors={best_params[0]}, n_epochs={best_params[1]}, lr_all={best_params[2]}, reg_all={best_params[3]}")
    print(f"Najbolji RMSE: {best_rmse}")



Parametri: n_factors=10, n_epochs=10, lr_all=0.002, reg_all=0.02
Vreme fitovanja: 55.16 sekundi
Vreme predviđanja: 50.80 sekundi
RMSE: 0.9184846570747963, MAE: 0.724866228921748
--------------------------------------------------
Parametri: n_factors=10, n_epochs=10, lr_all=0.002, reg_all=0.1
Vreme fitovanja: 49.04 sekundi
Vreme predviđanja: 39.41 sekundi
RMSE: 0.9206949178587659, MAE: 0.7290269577357712
--------------------------------------------------
Parametri: n_factors=10, n_epochs=10, lr_all=0.002, reg_all=0.2
Vreme fitovanja: 49.17 sekundi
Vreme predviđanja: 39.09 sekundi
RMSE: 0.9239681142103124, MAE: 0.7343532094625029
--------------------------------------------------
Parametri: n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.02
Vreme fitovanja: 49.69 sekundi
Vreme predviđanja: 39.28 sekundi
RMSE: 0.8895935464179726, MAE: 0.6959249290135513
--------------------------------------------------
Parametri: n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.1
Vreme fitovanja: 49