In [6]:
from memory_profiler import profile
import pandas as pd
import numpy as np
import time

def cosine_similarity(vector_a, vector_b):
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return dot_product / (norm_a * norm_b)

def predict_rating(user_id, movie_id, user_movie_matrix, movie_similarity, treshold):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index
    similarities = movie_similarity.loc[movie_id, rated_movies]
    similarities = similarities[similarities > treshold]
    weighted_ratings = 0
    similarity_sum = 0
    for rated_movie in similarities.index:
        similarity = similarities[rated_movie]
        rating = user_ratings[rated_movie]
        weighted_ratings += similarity * rating
        similarity_sum += similarity
    if similarity_sum == 0:
        return 0
    return weighted_ratings / similarity_sum

def predict_missing_ratings(user_movie_matrix, movie_similarity, treshold):
    predictions = {}
    for user_id in user_movie_matrix.index:
        predictions[user_id] = {}
        for movie_id in user_movie_matrix.columns:
            if user_movie_matrix.loc[user_id, movie_id] == 0:
                predicted_rating = predict_rating(user_id, movie_id, user_movie_matrix, movie_similarity, treshold)
                predictions[user_id][movie_id] = predicted_rating
    return predictions

def gen_recommendations(predicted_ratings, top_n=5):
    recommendations = {}
    for user_id, movie_ratings in predicted_ratings.items():
        sorted_ratings = sorted(movie_ratings.items(), key=lambda x: x[1], reverse=True)
        top_recommendations = sorted_ratings[:top_n]
        recommendations[user_id] = top_recommendations
    return recommendations


def main():
    start_time = time.time()
    file_path = 'train_ratings.csv'
    ratings_df = pd.read_csv(file_path)
    num_users = 1000
    num_movies = 100
    user_movie_matrix = ratings_df.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
    user_movie_matrix = user_movie_matrix.iloc[:num_users, :]

    movie_ids = user_movie_matrix.columns
    movie_similarity = pd.DataFrame(index=movie_ids, columns=movie_ids)

    for i in movie_ids:
        for j in movie_ids:
            if i != j:
                movie_similarity.loc[i, j] = cosine_similarity(user_movie_matrix[i], user_movie_matrix[j])
            else:
                movie_similarity.loc[i, j] = 1

    treshold = 0.5
    predicted_ratings = predict_missing_ratings(user_movie_matrix, movie_similarity, treshold)

    top_n = 25
    recommendations = gen_recommendations(predicted_ratings, top_n=top_n)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Recommandations générées en", execution_time, "secondes.")
%memit main()

Recommandations générées en 2860.1883413791656 secondes.
peak memory: 1041.49 MiB, increment: 706.08 MiB
