- https://www.datacamp.com/tutorial/recommender-systems-python
- https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system


In [9]:
import pathlib as pl

import kagglehub
import pandas as pd

# Fetch the TMDB dataset through kagglehub; the call returns the directory that holds it.
dataset_dir = pl.Path(kagglehub.dataset_download("asaniczka/tmdb-movies-dataset-2023-930k-movies"))
# Use the first CSV file found in that directory as the data source.
csv_candidates = iter(dataset_dir.glob("*.csv"))
tmdb_path = next(csv_candidates)

In [11]:
# Show which CSV file was picked up from the downloaded dataset.
print(tmdb_path)

/home/dodo/.cache/kagglehub/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/versions/430/TMDB_movie_dataset_v11.csv


In [2]:
# Parse only the columns this notebook uses instead of every column in the CSV;
# rows are keyed by the movie id.
movie_metadata = pd.read_csv(tmdb_path, index_col="id", usecols=["id", "title", "overview"])
# Keep only movies that have an overview, since that text is the sole input of the
# similarity models built below.
movie_metadata = movie_metadata.loc[movie_metadata["overview"].notna(), ["title", "overview"]]
print(movie_metadata.shape)
# Continue with a reproducible (fixed seed) random subset of 10,000 movies, ordered by id.
# NOTE(review): presumably sub-sampled so the dense pairwise similarity matrices fit in memory.
movie_metadata = movie_metadata.sample(n=10000, random_state=42).sort_index()
movie_metadata.head()

(915354, 2)


Unnamed: 0_level_0,title,overview
id,Unnamed: 1_level_1,Unnamed: 2_level_1
96,Beverly Hills Cop II,Axel heads for the land of sunshine and palm t...
177,The Fisher King,Two troubled men face their terrible destinies...
195,Trouble in Paradise,Thief Gaston Monescu and pickpocket Lily are p...
225,Man of Iron,"In Warsaw in 1980, the Communist Party sends W..."
281,Strange Days,A former cop turned street-hustler and his bod...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the overviews with TF-IDF, using the vectorizer's built-in "english" stop-word list.
tfidf = TfidfVectorizer(stop_words="english")
# Fit the vocabulary and transform in one step: one row per movie, one column per term.
tfidf_matrix = tfidf.fit_transform(movie_metadata["overview"])
print(tfidf_matrix.shape)

(10000, 36281)


In [4]:
# Pairwise similarity of every overview with every other one, as a dense square array.
# `@` is used because it always means matrix multiplication, whereas `*` is element-wise
# for SciPy's sparse *array* types and only a matrix product for the legacy matrix types.
# NOTE(review): the dot product equals cosine similarity only when the TF-IDF rows are
# L2-normalised (believed to be TfidfVectorizer's default) - confirm if `norm` is changed.
cosine_sim = (tfidf_matrix @ tfidf_matrix.T).toarray()
cosine_sim.shape

(10000, 10000)

In [5]:
import operator


def get_recommendations(idx, sim_matrix, top_n=30, metadata=None):
  """Return the titles of the movies most similar to the movie at position ``idx``.

  Args:
    idx: Positional (``iloc``) row of the query movie. The same position is used
      to select the movie's row of scores in ``sim_matrix``.
    sim_matrix: Indexable matrix of pairwise similarity scores in which
      ``sim_matrix[idx]`` yields one score per movie (higher = more similar).
    top_n: Maximum number of recommendations to return. Defaults to 30.
    metadata: DataFrame with a ``"title"`` column whose row order matches
      ``sim_matrix``. Defaults to the notebook-level ``movie_metadata``.

  Returns:
    A ``pandas.Series`` of titles ordered from most to least similar and indexed
    like ``metadata``. The query movie itself is never part of the result.
  """
  if metadata is None:
    metadata = movie_metadata
  movie_title = metadata.iloc[idx]["title"]
  print(f"Top recommendations for {movie_title}:")
  row = sim_matrix[idx]
  # Resolve negative positions so the self-exclusion below also works for e.g. idx=-1.
  query_pos = idx if idx >= 0 else idx + len(row)
  # Exclude the query movie by position instead of assuming it sorts first: with tied
  # scores (duplicate overviews) or an all-zero row another movie can come first, and
  # dropping "the first hit" would then keep the query movie in its own recommendations.
  # float() turns NumPy / tensor scalars into plain numbers for sorting.
  sim_scores = [(i, float(score)) for i, score in enumerate(row) if i != query_pos]
  # list.sort is stable, so equally scored movies keep their original relative order.
  sim_scores.sort(key=operator.itemgetter(1), reverse=True)
  movie_indices = [i for i, _ in sim_scores[:top_n]]
  return metadata["title"].iloc[movie_indices]


# Recommendations for the movie at position 1 of the sample, based on the TF-IDF similarity matrix.
print(get_recommendations(1, cosine_sim))

Top recommendations for The Fisher King:
id
740533                                               TormenT
485063                                           Masterpiece
380023                                 Figure in a Landscape
399734                                        The Samaritans
1403180                                           Spilt Milk
923367                          Through the Eyes of Children
1123751                                       Love Over Gold
134575                          Secrets of the Heavenly Book
1132360                                       Sexual Desires
1022983      Is There A Future For The Two Who Got Raped!? ?
1325372                              Semana Santa en Tolimán
44151                                          The Crusaders
628005                                     Khun Phaen Begins
1084646                                             Renegade
10446                                       The High Crusade
1370528                                  

In [6]:
# Load the pretrained "all-MiniLM-L6-v2" sentence-transformer model; the following cells
# use it to embed the movie overviews.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")



In [7]:
# Encode every overview with the sentence-transformer model (one embedding row per movie),
# showing a progress bar while the batches are processed.
embeddings = model.encode(movie_metadata["overview"].to_list(), show_progress_bar=True)
print(embeddings.shape)

Batches: 100%|██████████| 313/313 [00:14<00:00, 22.06it/s]

(10000, 384)





In [8]:
# Reuse the same recommender with the embedding-based pairwise similarity matrix, this
# time for the movie at position 0 of the sample.
# NOTE(review): model.similarity presumably computes cosine similarity by default - confirm.
print(get_recommendations(0, model.similarity(embeddings, embeddings)))

Top recommendations for Beverly Hills Cop II:
id
1314704                                        DIRT
738331                                Sunny Side Up
315610                           Blue Streak O'Neil
122740                                   Highway 13
94652                                Ninja Champion
636973                             Above the Clouds
613835                                    13 Graves
105660                                Double Murder
1396123                        the cry of the earth
12403                             A Perfect Getaway
778769                                  Haulin' Ass
1260893                               Hide And Seek
111499                  God's Country and the Woman
32243                                Crash and Burn
601019                           The Unknown Ranger
156363                      The Phantom of the West
990790        Nick Carter - Le mystère du lit blanc
106269                       Rise of the Scarecrows
860253         