In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Загрузка данных
movies_metadata = pd.read_csv('/home/gea/TMS_projects/MoviesDataset/movies_metadata.csv', low_memory=False)
movies = movies_metadata[['title', 'overview']].dropna().reset_index(drop=True)

In [2]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44506 entries, 0 to 44505
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44506 non-null  object
 1   overview  44506 non-null  object
dtypes: object(2)
memory usage: 695.5+ KB


In [3]:
movies = movies.head(20000)
#movies

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     20000 non-null  object
 1   overview  20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB


In [5]:
# Векторизация описаний
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['overview'])
tfidf_matrix.astype('float16')

<20000x47665 sparse matrix of type '<class 'numpy.float16'>'
	with 539811 stored elements in Compressed Sparse Row format>

In [6]:
movies.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [7]:
# Расчет косинусного сходства
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies.index[movies['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Получаем 10 самых похожих фильмов
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

# Пример использования:
print(get_recommendations('Toy Story'))

15282               Toy Story 3
2979                Toy Story 2
10271    The 40 Year Old Virgin
8303                  The Champ
1058      Rebel Without a Cause
11367    For Your Consideration
1916                  Condorman
3039            Man on the Moon
483                      Malice
11573              Factory Girl
Name: title, dtype: object
