# Sistema de Recomendación con MovieLens (Dataset Small) - Canaza Tito, Eddy Wilmer

## Datos de la Práctica
Autor: Canaza Tito, Eddy Wilmer

Dataset:
These datasets will change over time, and are not appropriate for reporting research results. We will keep the download links stable for automated downloads. We will not archive or make available previously released versions.

Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

## Carga de datos

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np

# Load the data: mount Google Drive so the MovieLens CSV files are reachable.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Maestría/Recuperación de información/SR/ds/'

# NOTE: file_path already ends with '/', so the paths built below contain '//'.
ratings_df = pd.read_csv(f'{file_path}/ratings.csv')
movies_df = pd.read_csv(f'{file_path}/movies.csv')

# Attach the movie columns to every rating row; the left join keeps ratings
# whose movieId has no match in movies.csv.
merged_df = ratings_df.merge(movies_df, on='movieId', how='left')

# Build the userId x movieId rating matrix used by the distance computations.
# (user, movie) pairs without a rating are filled with 0.
pivot_table = pd.pivot_table(
    merged_df,
    index='userId',
    columns='movieId',
    values='rating',
    fill_value=0,
)

merged_df.head(10)

ValueError: mount failed

## Funciones para los cálculos de distancia o similitudes

In [None]:
def euclidean_distance(vec1, vec2):
    """Return the Euclidean distance between two vectors.

    Args:
        vec1: First vector; must support element-wise subtraction with
            ``vec2`` (the callers in this file pass NumPy arrays).
        vec2: Second vector.

    Returns:
        The norm of ``vec1 - vec2`` as computed by ``np.linalg.norm``.
    """
    difference = vec1 - vec2
    return np.linalg.norm(difference)

def manhattan_distance(vec1, vec2):
    """Return the Manhattan (sum of absolute differences) distance.

    Args:
        vec1: First vector; must support element-wise subtraction with
            ``vec2`` (the callers in this file pass NumPy arrays).
        vec2: Second vector.

    Returns:
        The sum of the absolute element-wise differences.
    """
    absolute_gaps = np.abs(vec1 - vec2)
    return np.sum(absolute_gaps)

def pearson_similarity(vec1, vec2):
    """Return the Pearson correlation between two vectors.

    Args:
        vec1: First vector (the callers in this file pass NumPy arrays).
        vec2: Second vector.

    Returns:
        The correlation coefficient, or 0 when the denominator is zero
        (for example when either vector is constant).
    """
    # Centre each vector on its own mean once and reuse the result.
    centered1 = vec1 - np.mean(vec1)
    centered2 = vec2 - np.mean(vec2)

    covariance = np.sum(centered1 * centered2)
    scale = np.sqrt(np.sum(centered1 ** 2) * np.sum(centered2 ** 2))

    # A zero denominator would divide by zero; report "no correlation".
    return 0 if scale == 0 else covariance / scale

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (the callers in this file pass NumPy arrays).
        vec2: Second vector.

    Returns:
        The dot product divided by the product of the norms, or 0 when
        that product of norms is zero (either vector is all zeros).
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard clause: a zero-length vector has no direction to compare.
    return 0 if norm_product == 0 else dot_product / norm_product

## KNN para identificar las películas recomendadas

El modelo de recomendación determina las películas que le pueden gustar a un usuario específico a partir de sus n vecinos más cercanos, calculados con una distancia o similitud ('euclidean', 'manhattan', 'pearson' o 'cosine'). Para que una película sea considerada en la recomendación, su calificación debe superar el rating mínimo establecido.

In [None]:
def knn_recommendation(user_id, n_neighbors, distance_type='euclidean', min_rating=0):
    """Print the movies rated highly by the users closest to ``user_id``.

    Reads the module-level ``pivot_table`` (one row per user, one column
    per movie) and ``movies_df`` (``movieId`` / ``title`` columns).

    Every other user is ranked by distance to ``user_id``; the
    ``n_neighbors`` closest are kept. For each of them, the movies whose
    rating is strictly greater than ``min_rating`` are collected. A movie
    rated by several neighbours is reported once, under the closest one.
    Output is grouped by neighbour (ascending id) and, inside each group,
    ordered by rating from highest to lowest.

    Args:
        user_id: Label of the target user in ``pivot_table.index``.
        n_neighbors: Number of nearest users to take recommendations from.
        distance_type: One of ``'euclidean'``, ``'manhattan'``,
            ``'pearson'`` or ``'cosine'``. The two similarities are
            converted to distances as ``1 - similarity``.
        min_rating: A neighbour's rating must exceed this value for the
            movie to be listed.

    Raises:
        ValueError: If ``user_id`` is not in ``pivot_table`` or
            ``distance_type`` is not a supported metric.

    Returns:
        None. Results are printed.
    """
    if user_id not in pivot_table.index:
        raise ValueError("El user_id no está en la base de datos.")

    # Resolve the metric once, before the loop, so an invalid name is
    # rejected even when there is no other user to compare against.
    metrics = {
        'euclidean': euclidean_distance,
        'manhattan': manhattan_distance,
        'pearson': lambda a, b: 1 - pearson_similarity(a, b),
        'cosine': lambda a, b: 1 - cosine_similarity(a, b),
    }
    if distance_type not in metrics:
        raise ValueError("Tipo de distancia no válido. Use 'euclidean', 'manhattan', 'pearson' o 'cosine'.")
    metric = metrics[distance_type]

    user_ratings = pivot_table.loc[user_id].values
    distances = [
        (other_user_id, metric(user_ratings, pivot_table.loc[other_user_id].values))
        for other_user_id in pivot_table.index
        if other_user_id != user_id
    ]

    # Smallest distance first; keep the n_neighbors closest users.
    neighbors = sorted(distances, key=lambda pair: pair[1])[:n_neighbors]

    # Build the movieId -> title lookup once instead of scanning movies_df
    # for every recommended movie. keep='first' matches taking the first
    # matching row.
    title_by_id = movies_df.drop_duplicates(subset='movieId', keep='first').set_index('movieId')['title']

    # Collect the movies each neighbour rated above min_rating.
    recommendations = []
    for neighbor_id, distance in neighbors:
        neighbor_ratings = pivot_table.loc[neighbor_id]
        liked = neighbor_ratings[neighbor_ratings > min_rating]

        if liked.empty:
            print(f"id_neighbor: {neighbor_id}: Ninguna película supera el min_rating de {min_rating}.")
            continue

        for movie_id, rating in liked.items():
            # A rating may reference a movie missing from movies_df; use a
            # placeholder title rather than aborting the whole run.
            movie_title = title_by_id.get(movie_id, "(título desconocido)")
            recommendations.append((neighbor_id, distance, movie_id, movie_title, rating))

    # Nothing to report: avoid rounding/grouping an empty, untyped frame.
    if not recommendations:
        return

    recommendations_df = pd.DataFrame(
        recommendations,
        columns=['neighbor_id', 'distance', 'movie_id', 'movie_title', 'rating'],
    )
    # Neighbours were appended closest-first, so keep='first' attributes a
    # shared movie to the closest neighbour.
    recommendations_df = recommendations_df.drop_duplicates(subset='movie_id', keep='first')
    recommendations_df['distance'] = recommendations_df['distance'].round(4)

    # Order by neighbour, then by rating (highest first). A plain multi-key
    # sort replaces groupby().apply(sort_values), which depends on the
    # grouping columns being passed to apply (deprecated in recent pandas).
    recommendations_df = recommendations_df.sort_values(
        by=['neighbor_id', 'rating'], ascending=[True, False]
    ).reset_index(drop=True)

    # Print the results, one section per neighbour.
    for (id_neighbor, distance), group_df in recommendations_df.groupby(['neighbor_id', 'distance']):
        print(f"id_neighbor: {id_neighbor}, distance: {distance}:")
        for row in group_df.itertuples(index=False):
            print(f"\t{row.movie_id}, {row.movie_title}, {row.rating}")


# Test parameters
user_id = 2  # Target user
distance_type = 'pearson'  # Distance or similarity metric
n_neighbors = 3  # Number of neighbours
min_rating = 4.5  # Ratings must exceed this score to be recommended

# Run the test
knn_recommendation(
    user_id=user_id,
    n_neighbors=n_neighbors,
    distance_type=distance_type,
    min_rating=min_rating,
)