### Sistema de Recomendación basado en contenido

In [1]:
# Importar las librerías que necesitaremos
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Import warnings to silence warning messages (this hides warnings only; errors are still raised)
import warnings
warnings.filterwarnings("ignore")

# Librerías adicionales
from ast import literal_eval  # Convierte string a list

# Lift pandas' display limits so every column and every row is rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the movie dataset; the relative path is resolved against the notebook's working directory
df = pd.read_csv('movies.csv')

In [3]:
# Preview the first row to check which columns were loaded
df.head(1)

Unnamed: 0,id,budget,overview,popularity,revenue,runtime,title,vote_average,vote_count,collections,company,country,language,director,genre,actor,year,return
0,862,30000000.0,"Led by Woody, Andy's toys live happily in his ...",21.946943,373554033.0,81.0,Toy Story,7.7,5415.0,Toy Story Collection,Pixar Animation Studios,United States of America,English,John Lasseter,"['Animation', 'Comedy', 'Family']","['Tom Hanks', 'Tim Allen', 'Don Rickles']",1995,12.45


In [4]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5300 entries, 0 to 5299
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            5300 non-null   int64  
 1   budget        5300 non-null   float64
 2   overview      5300 non-null   object 
 3   popularity    5300 non-null   float64
 4   revenue       5300 non-null   float64
 5   runtime       5300 non-null   float64
 6   title         5300 non-null   object 
 7   vote_average  5300 non-null   float64
 8   vote_count    5300 non-null   float64
 9   collections   5300 non-null   object 
 10  company       5300 non-null   object 
 11  country       5300 non-null   object 
 12  language      5293 non-null   object 
 13  director      5300 non-null   object 
 14  genre         5300 non-null   object 
 15  actor         5300 non-null   object 
 16  year          5300 non-null   int64  
 17  return        5300 non-null   float64
dtypes: float64(7), int64(2), object(9)

In [5]:
# Build TF-IDF features from the plot overviews, ignoring English stop words
tfidf = TfidfVectorizer(stop_words="english")
# Replace missing overviews with empty strings so the vectorizer only sees text
df['overview'] = df['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df["overview"])

# Reverse lookup: movie title -> row label in df.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct), so it never removed repeated titles. Filter on the index
# instead so that every title maps to a single scalar (its first occurrence).
indices = pd.Series(df.index, index=df["title"])
indices = indices[~indices.index.duplicated(keep="first")]

In [6]:
# Pairwise dot products between the TF-IDF vectors of every pair of overviews.
# NOTE(review): TfidfVectorizer is created with its default normalisation; if that
# is L2 (as the scikit-learn docs state), this dot product equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row ``i`` holds the scores of movie
        ``i`` against every movie. Defaults to the matrix that existed when
        this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles taken from ``df['title']``, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # When several rows share the title the lookup yields a Series; use the
    # first match so the row selection below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the queried movie by position instead of assuming it sorts first:
    # an empty overview scores 0 against itself, and an identical overview on
    # another movie ties with it, so "skip element 0" can discard the wrong row.
    # assumes the labels stored in `indices` equal row positions — TODO confirm
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # The sort is stable, so tied scores keep their original row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return df['title'].iloc[movie_indices]

In [9]:
# Example query: list the movies recommended for 'Jumanji'
get_recommendations('Jumanji')

2390                             Quintet
5129                        Snowed Under
3468                              DeVour
4576                              Pixels
2157                  The Last of Sheila
2185                The Last Starfighter
1602                         5 Card Stud
4073             Guardians of the Galaxy
3703                         Geri's Game
5052    How to Hook Up Your Home Theater
Name: title, dtype: object