# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import sys
# add root folder to path
folder = "../../"
sys.path.append(folder)
import src.utils as utils
from src.config import MOVIES_PATH, RATINGS_PATH
from src.utils import load_data
# Load the two data sets through the project helper.
movies, ratings = load_data(MOVIES_PATH, RATINGS_PATH)

# NOTE(review): if 'Timestamp' holds Unix epoch seconds, pd.to_datetime needs
# unit='s' to yield real dates; the ordering used for the split below is
# the same either way -- confirm against load_data.
ratings['Timestamp'] = pd.to_datetime(ratings['Timestamp'])

# Chronological split: ratings up to the 80th-percentile timestamp are used
# for training, everything later is held out for testing.
cutoff_date = ratings['Timestamp'].quantile(0.8)
train_ratings = ratings.loc[ratings['Timestamp'] <= cutoff_date]
test_ratings = ratings.loc[ratings['Timestamp'] > cutoff_date]

train_movie_ids = train_ratings['MovieID'].unique()
test_movie_ids = test_ratings['MovieID'].unique()

# Keep only the movies rated in each split, renumbering rows from 0 so that
# row labels coincide with row positions.
train_movies = movies.loc[movies['MovieID'].isin(train_movie_ids)].reset_index(drop=True)
test_movies = movies.loc[movies['MovieID'].isin(test_movie_ids)].reset_index(drop=True)

# Map each title to its row position in train_movies. When a title occurs
# more than once only its first row is kept, so a lookup always yields a
# single integer. (Series.drop_duplicates() would compare the *values* -- the
# row positions, which are already unique -- and leave duplicate titles in.)
title_to_index = pd.Series(train_movies.index, index=train_movies['Title'])
title_to_index = title_to_index[~title_to_index.index.duplicated(keep='first')]

tfidf = TfidfVectorizer(stop_words='english')
# The vectorizer cannot tokenize NaN, so missing genres become empty strings.
train_movies['Genres'] = train_movies['Genres'].fillna('')
tfidf_matrix = tfidf.fit_transform(train_movies['Genres'])
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the training movies most similar to ``title``.

    Args:
        title: Title to look up in ``title_to_index``.
        cosine_sim: Square similarity matrix whose rows and columns are row
            positions in ``train_movies``. Defaults to the module-level
            genre TF-IDF similarity matrix.
        top_n: Maximum number of recommendations to return.

    Returns:
        A ``pandas.Series`` of up to ``top_n`` titles taken from
        ``train_movies``, ordered from most to least similar, or an empty
        list when ``title`` is not present in ``title_to_index``.
    """
    if title not in title_to_index:
        return []

    idx = title_to_index[title]
    if isinstance(idx, pd.Series):
        # Several training rows share this title; use the first one.
        idx = idx.iloc[0]

    # Drop the query movie by position instead of assuming it sorts first:
    # movies with identical genres tie on similarity, and the stable sort
    # would otherwise discard a different movie and may list the query itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return train_movies['Title'].iloc[movie_indices]


# Smoke check: print the recommendations produced for one sample title.
print(get_recommendations('Toy Story (1995)'))


def evaluate_model(min_rating=4):
    """Score the recommender against the held-out ratings.

    For every title in ``test_movies`` the recommendations are fetched and
    matched, by title, to movies in ``test_movies``. The test ratings of the
    matched movies are then counted.

    NOTE(review): the numerators count rating rows while the "precision"
    denominator counts recommendations, so "precision" can exceed 1. The
    definitions are kept as they were; confirm they are the intended metrics.

    Args:
        min_rating: Lowest rating that counts as a positive rating.

    Returns:
        A ``(precision, recall, f1)`` tuple where
        precision = positive ratings / number of recommendations made,
        recall = positive ratings / all ratings of the matched movies, and
        f1 is their harmonic mean. Each value is 0 when its denominator is 0.
    """
    # Tally the test ratings per movie once, instead of re-filtering the
    # whole ratings frame for every title.
    rated_ids = test_ratings['MovieID']
    ratings_per_movie = rated_ids.value_counts().to_dict()
    liked_per_movie = (
        rated_ids[test_ratings['Rating'] >= min_rating].value_counts().to_dict()
    )

    # Title -> MovieIDs carrying that title in the test catalogue.
    ids_by_title = {}
    for movie_title, movie_id in zip(test_movies['Title'], test_movies['MovieID']):
        ids_by_title.setdefault(movie_title, set()).add(movie_id)

    relevant_movies = 0
    recommended_relevant_movies = 0
    total_recommendations = 0

    for title in test_movies['Title']:
        recommendations = get_recommendations(title)
        total_recommendations += len(recommendations)

        # A set, so a movie's ratings are counted once per title even if it
        # is reached through several recommended titles.
        matched_ids = set()
        for recommended_title in recommendations:
            matched_ids.update(ids_by_title.get(recommended_title, ()))

        relevant_movies += sum(ratings_per_movie.get(m, 0) for m in matched_ids)
        recommended_relevant_movies += sum(liked_per_movie.get(m, 0) for m in matched_ids)

    precision = recommended_relevant_movies / total_recommendations if total_recommendations > 0 else 0
    recall = recommended_relevant_movies / relevant_movies if relevant_movies > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1


# Run the evaluation once and print each metric rounded to two decimals.
precision, recall, f1 = evaluate_model()
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
