In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Read the movie catalogue and add a `combined` text column made of the
# lowercase title, a single space, and the lowercase genres string.
movies = (
    pd.read_csv('ml-latest-small/movies.csv')
    .assign(
        combined=lambda frame: (
            frame['title'].str.lower() + ' ' + frame['genres'].str.lower()
        )
    )
)

In [18]:
# Turn each movie's `combined` text into a TF-IDF vector (English stop words
# removed), then compare every movie with every other movie.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined'])

# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, giving one row/column per movie.
cosine_sim = cosine_similarity(tfidf_matrix)

In [22]:
def recommend_movies(title, cosine_sim, movies, n=5):
    """Return the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``movies['title']``; the comparison is
        case-insensitive and the first matching row is used.
    cosine_sim : 2-D indexable (e.g. ``numpy.ndarray``)
        Pairwise similarity scores where row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``movies``.
    movies : pandas.DataFrame
        Catalogue containing at least a ``title`` column.
    n : int, default 5
        Maximum number of recommendations. Fewer rows are returned when the
        catalogue has fewer than ``n`` other movies; values below zero are
        treated as zero.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows of ``movies`` (original index preserved),
        ordered by descending similarity, with an added ``sim_score`` column.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the requested title.
    """
    wanted = title.lower()

    # Work with the row *position*, not the index label: `cosine_sim` is
    # addressed positionally, so this stays correct for any DataFrame index.
    query_pos = next(
        (
            pos
            for pos, name in enumerate(movies['title'])
            if isinstance(name, str) and name.lower() == wanted
        ),
        None,
    )
    if query_pos is None:
        raise IndexError(f"no movie titled {title!r} found in `movies`")

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another movie can tie with it at the top score and would otherwise push
    # the query itself into the recommendations. `sorted` is stable, so ties
    # keep catalogue order.
    ranked = sorted(
        (
            (pos, score)
            for pos, score in enumerate(cosine_sim[query_pos])
            if pos != query_pos
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:max(n, 0)]

    recommended_movies = movies.iloc[[pos for pos, _ in top]].copy()
    # Scores come from the same `top` list as the rows, so the lengths always
    # agree even when fewer than `n` candidates exist.
    recommended_movies['sim_score'] = [score for _, score in top]

    return recommended_movies

In [28]:
# Example query: fetch the default number of neighbours for one title and
# print only the columns of interest.
recommendations = recommend_movies('Toy Story (1995)', cosine_sim, movies)
print(recommendations.loc[:, ['title', 'sim_score', 'genres']])
# print(recommend_movies('Jumanji (1995)', cosine_sim, movies)[['title']])

                                      title  sim_score  \
2355                     Toy Story 2 (1999)   0.880446   
7355                     Toy Story 3 (2010)   0.821047   
3595                        Toy, The (1982)   0.538018   
2539  We're Back! A Dinosaur's Story (1993)   0.456423   
26                      Now and Then (1995)   0.421651   

                                                genres  
2355       Adventure|Animation|Children|Comedy|Fantasy  
7355  Adventure|Animation|Children|Comedy|Fantasy|IMAX  
3595                                            Comedy  
2539              Adventure|Animation|Children|Fantasy  
26                                      Children|Drama  
