<a href="https://colab.research.google.com/github/dimitrijemarkovic/Parallel-Collaborative-Filtering-at-Scale-for-the-Netflix-Prize/blob/main/ALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Učitaj CSV fajl iz direktorijuma /content
df_pivot = pd.read_csv('/content/drive/MyDrive/pivot_table.csv')
!pip install implicit




In [None]:
import pandas as pd
import numpy as np
import gc
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

# Pretpostavljam da je df_pivot već definisan
user_movie_matrix = df_pivot.fillna(0).values
user_movie_matrix_sparse = csr_matrix(user_movie_matrix)

# Funkcija za podelu podataka na trening i test
def train_test_split(sparse_matrix, test_percentage=0.25):
    num_users, num_items = sparse_matrix.shape
    num_test_entries = int(test_percentage * sparse_matrix.nnz)
    rows, cols = sparse_matrix.nonzero()
    indices = np.arange(len(rows))
    np.random.shuffle(indices)
    test_indices = indices[:num_test_entries]
    train_indices = indices[num_test_entries:]
    test_matrix = sparse_matrix.copy()
    train_matrix = sparse_matrix.copy()
    test_data = test_matrix.data.copy()
    train_data = train_matrix.data.copy()
    test_data[test_indices] = 0
    train_data[train_indices] = 0
    test_matrix.data = test_data
    train_matrix.data = train_data
    return train_matrix, test_matrix

train, test = train_test_split(user_movie_matrix_sparse, test_percentage=0.25)

# Funkcija za evaluaciju modela
def evaluate_model(model, train_matrix, test_matrix, k=10):
    test_users, test_items = test_matrix.nonzero()
    predicted_ratings = []
    actual_ratings = []
    for user, item in zip(test_users, test_items):
        user_vector = model.user_factors[user]
        item_vector = model.item_factors[item]
        predicted_rating = np.dot(user_vector, item_vector)
        actual_rating = test_matrix[user, item]
        predicted_ratings.append(predicted_rating)
        actual_ratings.append(actual_rating)
    predicted_ratings = np.array(predicted_ratings)
    actual_ratings = np.array(actual_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    precision_at_k_score = 0
    num_users = train_matrix.shape[0]
    for user in range(num_users):
        recommendations = model.recommend(user, train_matrix[user], N=k, filter_already_liked_items=False)
        recommended_items = [rec[0] for rec in recommendations]
        user_test_items = test_matrix[user].indices
        relevant_items = np.intersect1d(user_test_items, recommended_items)
        if len(recommended_items) > 0:
            precision_at_k_score += len(relevant_items) / len(recommended_items)
    precision_at_k_score /= num_users
    return rmse, mae, precision_at_k_score

# Definiši parametre za eksperimentisanje
factors_list = [20, 50, 100]
regularization_list = [0.01, 0.1, 1]
iterations_list = [10, 20]

best_rmse = float('inf')
best_mae = float('inf')
best_precision_at_k = 0
best_params = {}

for factors in factors_list:
    for regularization in regularization_list:
        for iterations in iterations_list:
            model = AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
            start_time = time.time()
            model.fit(train)
            end_time = time.time()
            print(f"Fitovanje za faktore={factors}, regularizacija={regularization}, iteracije={iterations}")
            rmse, mae, precision_at_k_score = evaluate_model(model, train, test, k=10)
            print(f'RMSE: {rmse}, MAE: {mae}, Precision@10: {precision_at_k_score}')

            # Provera i ažuriranje najboljih rezultata
            if precision_at_k_score > best_precision_at_k:
                best_rmse = rmse
                best_mae = mae
                best_precision_at_k = precision_at_k_score
                best_params = {
                    'factors': factors,
                    'regularization': regularization,
                    'iterations': iterations
                }

            # Oslobađanje memorije nakon svake iteracije
            gc.collect()

print("Najbolji parametri:")
print(best_params)
print(f'Najbolji RMSE: {best_rmse}, Najbolji MAE: {best_mae}, Najbolji Precision@10: {best_precision_at_k}')

# Očisti memoriju na kraju
gc.collect()


  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

Fitovanje za faktore=20, regularizacija=0.01, iteracije=10
RMSE: 138219.7884836849, MAE: 10833.40772467476, Precision@10: 0.47827587168369834


  0%|          | 0/20 [00:00<?, ?it/s]

Fitovanje za faktore=20, regularizacija=0.01, iteracije=20
RMSE: 138219.7875230861, MAE: 10833.403628861324, Precision@10: 0.47856515495824564


  0%|          | 0/10 [00:00<?, ?it/s]

Fitovanje za faktore=20, regularizacija=0.1, iteracije=10
RMSE: 138219.78999094994, MAE: 10833.404132801865, Precision@10: 0.4762543741025248


  0%|          | 0/20 [00:00<?, ?it/s]