In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [16]:
# Load the TMDb movie table from the CSV file into a DataFrame.
# low_memory=False is forwarded to pandas' CSV reader unchanged.
df = pd.read_csv(
    'dataset/TMDb_updated.CSV',
    low_memory=False,
)
# Preview the first three rows as the cell output.
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1


In [17]:
# Peek at the first five plot summaries (the text the recommender is built on).
df['overview'].head(n=5)

0    The near future, a time when both hope and har...
1    After he and his wife are murdered, marine Ray...
2    Marcus and Mike are forced to confront new thr...
3    Armed with the astonishing ability to shrink i...
4    In their quest to confront the ultimate evil, ...
Name: overview, dtype: object

In [18]:
# Replace missing overviews with the empty string so every row holds text
# before it is handed to the vectoriser.
df['overview'] = df['overview'].fillna(value='')

# TF-IDF vectoriser configured with the built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and encode them in one step;
# the result has one row per movie and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Show the matrix dimensions, then the learned vocabulary.
print(tfidf_matrix.shape)
print(tfidf.get_feature_names_out())

(10000, 28709)
['00' '000' '006' ... 'žižek' 'βwzvz' '運轉手之戀']


In [19]:
# Pairwise dot products between every pair of TF-IDF rows; entry [i, j] is
# the similarity score between movie i and movie j.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

# Show the matrix dimensions and, as a sample, the scores for row 1.
print(cosine_sim.shape)
print(cosine_sim[1])

(10000, 10000)
[0.00563717 1.         0.01311654 ... 0.00938582 0.         0.        ]


In [20]:
# Lookup table from movie title to its row label in df.
mapping = pd.Series(df.index, index=df['title'])
# De-duplicate on the *title* (the index), keeping the first occurrence.
# Series.drop_duplicates() would compare the values instead, and those are the
# already-unique row labels, so repeated titles would survive and
# mapping[title] would return several rows for them.
mapping = mapping[~mapping.index.duplicated(keep='first')]
mapping

title
Ad Astra                             0
Bloodshot                            1
Bad Boys for Life                    2
Ant-Man                              3
Percy Jackson: Sea of Monsters       4
                                  ... 
Cargo                             9995
The Good Night                    9996
The World Is Yours                9997
The Grand Seduction               9998
Woochi: The Demon Slayer          9999
Length: 10000, dtype: int64

In [21]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=14):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``mapping`` series.
    cosine_sim : array-like, optional
        Square similarity matrix whose row ``i`` holds the scores of movie
        ``i`` against every movie. Defaults to the module-level matrix that
        existed when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 14, the count the
        original implementation returned).

    Returns
    -------
    pandas.Series
        Titles taken from ``df['title']``, ordered from most to least
        similar. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``mapping``.
    """
    idx = mapping[title]
    # When several rows share the same title the lookup yields a Series
    # instead of a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending sort: ties keep ascending row order, exactly as the
    # previous sorted(..., reverse=True) did.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried movie by position rather than assuming it sorts first:
    # a movie with an identical overview ties at the top score, and a movie
    # with an empty overview scores 0 even against itself.
    order = order[order != idx][:top_n]

    return df['title'].iloc[order]

In [22]:
# Example query: list the movies whose overviews are closest to this title's.
get_recommendations(title='The Godfather')

233      The Godfather: Part II
1226    The Godfather: Part III
7814                 Blood Ties
9884                   Election
7451                 Proud Mary
2502              Live by Night
7303            Family Business
9689                     Eulogy
8864             American Movie
9201                       Made
6390                 On My Skin
9768        The Look of Silence
1272                   Sinister
21                        Joker
Name: title, dtype: object