In [1]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
import joblib

# Raise pandas' display limits so wide/long frames print without truncation.
for option, value in (
    ('display.max_rows', 500),
    ('display.max_columns', 504),
    ('display.width', 1000),
):
    pd.set_option(option, value)

In [2]:
# Location of the merged ratings dataset, relative to this notebook.
DATASET_PATH = '../DatasetMergeFinalCortado(1M).csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
#print(df)

In [4]:
# Keep a separate lookup table mapping anime_id -> title and image_url.
# De-duplicate on anime_id only: if the same anime appears with differing
# title/image_url values, a plain drop_duplicates() would keep several rows
# for it and the left-merge in get_recommendations would then emit that
# recommendation more than once.
anime_titles = (
    df[['anime_id', 'title', 'image_url']]
    .drop_duplicates(subset='anime_id')
    .reset_index(drop=True)
)

In [5]:
# Part 1: collaborative filtering with SVD.
# Ratings are declared to Surprise as living on a 1-10 scale; predictions are
# clipped to this range.
# NOTE(review): if my_score uses 0 for "not scored", those rows should be
# filtered out before loading — confirm against the dataset.
reader = Reader(rating_scale=(1, 10))
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(df[['username', 'anime_id', 'my_score']], reader)
# SVD initialises its factors randomly; pin the seed so that
# Restart & Run All reproduces the same model and the same scores.
svd = SVD(random_state=42)

In [6]:
# Train the model on every available rating (no hold-out split is made here).
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c1fc30ec10>

In [7]:
# Mean my_score per (anime_id, genre) pair; the grouping keys are kept as
# ordinary columns rather than becoming the index. Title is not a grouping key.
df_grouped = (
    df
    .groupby(['anime_id', 'genre'], as_index=False)
    .agg(my_score=('my_score', 'mean'))
)

In [8]:
# Part 2: content-based filtering on a binarized genre matrix.
def _split_genres(raw):
    """Turn a comma-separated genre string into a list of clean labels.

    Surrounding whitespace is stripped from every label and empty labels are
    dropped, so "Action, Comedy" yields ["Action", "Comedy"] instead of
    ["Action", " Comedy"] (which would make " Comedy" a separate genre).
    A list/tuple is accepted as-is (and cleaned) so that re-running this cell
    after the column was already split does not raise.
    """
    parts = raw if isinstance(raw, (list, tuple)) else str(raw).split(',')
    return [label.strip() for label in parts if label.strip()]


# Drop incomplete rows and renumber so that row positions line up with the
# rows of genre_matrix built below.
df_grouped = df_grouped.dropna().reset_index(drop=True)
df_grouped['genre'] = df_grouped['genre'].apply(_split_genres)

# One binary column per distinct genre label, one row per df_grouped row.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df_grouped['genre'])

In [9]:
# Nearest-neighbour index over the genre matrix; Jaccard distance compares
# the sets of genres two anime carry. fit() returns the estimator itself.
nn = NearestNeighbors(metric='jaccard').fit(genre_matrix)

In [10]:
# Función para obtener recomendaciones
def get_recommendations(anime_id, username='username_example', n_neighbors=20):
    """Recommend anime with similar genres, ranked by a hybrid score.

    The genre nearest neighbours of ``anime_id`` are looked up in ``nn``; each
    candidate gets the SVD rating estimate for ``username`` and that estimate
    is divided by ``1 + genre distance`` so closer genre matches rank higher.

    Parameters
    ----------
    anime_id
        Identifier of the reference anime; must be present in ``df_grouped``.
    username : str, optional
        User id passed to ``svd.predict``. The default keeps the original
        behaviour of scoring for the placeholder ``'username_example'``.
    n_neighbors : int, optional
        Number of genre neighbours to retrieve (default 20). Clamped to the
        number of rows in ``genre_matrix``, since ``kneighbors`` rejects a
        request for more neighbours than fitted samples.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id``, ``svd_score``, ``hybrid_score`` plus the columns
        merged in from ``anime_titles``, sorted by ``hybrid_score`` descending.
        The reference anime itself is normally among the rows (distance 0).

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df_grouped``.
    """
    # Locate the anime by row *position*: genre_matrix is a plain array, so it
    # must be addressed positionally rather than by index label.
    positions = (df_grouped['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df_grouped")
    idx = positions[0]

    k = min(n_neighbors, genre_matrix.shape[0])
    distances, nearest_indices = nn.kneighbors(
        genre_matrix[idx].reshape(1, -1), n_neighbors=k
    )

    # Copy the slice so the column assignments below act on an independent
    # frame (avoids SettingWithCopyWarning and touching df_grouped).
    recommendations = df_grouped.iloc[nearest_indices[0]][['anime_id']].copy()

    # SVD rating estimate of each candidate for the requested user.
    recommendations['svd_score'] = [
        svd.predict(username, candidate).est
        for candidate in recommendations['anime_id']
    ]

    # Rows are still in neighbour order here, so distances[0] aligns by position.
    recommendations['hybrid_score'] = recommendations['svd_score'] / (1 + distances[0])

    recommendations = recommendations.sort_values(by='hybrid_score', ascending=False)

    # Attach title and image via the lookup table; a left merge keeps the
    # sorted order and keeps candidates that have no metadata row.
    recommendations = recommendations.merge(anime_titles, on='anime_id', how='left')

    return recommendations

In [11]:
# Handy anime_id values for manual experiments:
# Slam Dunk = 170
# Neon Genesis Evangelion = 30
# Shingeki no Kyojin = 16498
# Pokemon = 1565
# One Piece = 21
# Tokyo Ghoul = 22319
# Shigatsu wa Kimi no Uso = 23273

# Query the recommender for the selected anime and print the resulting frame.
anime_id = 16498
print(get_recommendations(anime_id))

    anime_id  svd_score  hybrid_score                                              title                                          image_url
0      25777         10     10.000000                        Shingeki no Kyojin Season 2  https://cdn.myanimelist.net/images/anime/4/841...
1      16498         10     10.000000                                 Shingeki no Kyojin  https://cdn.myanimelist.net/images/anime/10/47...
2      23775         10      7.777778        Shingeki no Kyojin Movie 1: Guren no Yumiya  https://cdn.myanimelist.net/images/anime/7/632...
3      23777         10      7.777778       Shingeki no Kyojin Movie 2: Jiyuu no Tsubasa  https://cdn.myanimelist.net/images/anime/2/725...
4      36702         10      7.777778      Shingeki no Kyojin Movie 3: Kakusei no Houkou  https://cdn.myanimelist.net/images/anime/2/887...
5      18397         10      7.777778                             Shingeki no Kyojin OVA  https://cdn.myanimelist.net/images/anime/9/592...
6      19285        



In [13]:
# Persist the fitted models with joblib.
for model, model_path in ((svd, 'svd_model.pkl'), (nn, 'nn_model.pkl')):
    joblib.dump(model, model_path)

# Persist the lookup frames as CSV without the index column.
for frame, csv_path in ((df_grouped, 'df_grouped.csv'), (anime_titles, 'anime_titles.csv')):
    frame.to_csv(csv_path, index=False)