In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Загрузка данных
movies_metadata = pd.read_csv('MoviesDataset/movies_metadata.csv', low_memory=False)
movies = movies_metadata[['title', 'overview']].dropna().reset_index(drop=True)

In [2]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44506 entries, 0 to 44505
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44506 non-null  object
 1   overview  44506 non-null  object
dtypes: object(2)
memory usage: 695.5+ KB


In [3]:
movies = movies.head(20000)
#movies

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     20000 non-null  object
 1   overview  20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB


In [5]:
# Векторизация описаний
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['overview'])
tfidf_matrix.astype('float16')

<20000x47665 sparse matrix of type '<class 'numpy.float16'>'
	with 539811 stored elements in Compressed Sparse Row format>

In [6]:
movies.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [7]:
# Расчет косинусного сходства
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies.index[movies['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Получаем 10 самых похожих фильмов
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

# Пример использования:
print(get_recommendations('Toy Story'))

15282               Toy Story 3
2979                Toy Story 2
10271    The 40 Year Old Virgin
8303                  The Champ
1058      Rebel Without a Cause
11367    For Your Consideration
1916                  Condorman
3039            Man on the Moon
483                      Malice
11573              Factory Girl
Name: title, dtype: object


SVD

In [167]:
# Загрузка данных
movies = pd.read_csv('MoviesDataset/movies_metadata.csv', low_memory=False)
ratings = pd.read_csv('MoviesDataset/ratings_small.csv')

In [39]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45463 non-null  float64
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [20]:
ratings = ratings.loc[ratings['movieId']<20000]

In [21]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [168]:
# Преобразование столбца 'id' в числовой формат
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')

In [25]:
# Создание матрицы пользователь-предмет
user_ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
user_ratings = user_ratings.fillna(0)

In [26]:
user_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,8989,8998,8999,9000,9001,9004,9005,9010,9012,9018
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Центрирование рейтингов
user_ratings_mean = user_ratings.mean(axis=1)
user_ratings_demeaned = user_ratings - user_ratings_mean.values.reshape(-1, 1)

In [35]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import numpy as np
# Применение SVD
U, sigma, Vt = svds(user_ratings_demeaned.values, k=50)

# Восстановление прогнозируемой матрицы
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_ratings.columns)

In [52]:
# Функция для рекомендаций
def recommend_movies(user_id, num_recommendations=5):
    user_row = user_id - 1  # userId начинается с 1, не с 0
    sorted_user_predictions = predicted_ratings_df.iloc[user_row].sort_values(ascending=False)
    
    # Получить данные пользователя и объединить с данными о фильмах
    user_data = ratings[ratings.userId == user_id]
    user_full = (user_data.merge(movies, how='left', left_on='movieId', right_on='id').sort_values(['rating'], ascending=False))

    # Рекомендации для пользователя
    recommendations = (movies[~movies['id'].isin(user_full['movieId'])]
                       .merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left', left_on='id', right_on='movieId')
                       .rename(columns={user_row: 'Predictions'})
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

# Тестирование функции
user_full, recommendations = recommend_movies(1, 5)
print(recommendations[['title', 'id']])

                      title      id
534    Sleepless in Seattle   858.0
2297               Rocky IV  1374.0
7097   The Butterfly Effect  1954.0
33862          The Pawnshop  2078.0
40441           Dirty Hands  3479.0


In [58]:
import json
def extract_main_genre(genres_str):
    try:
        genres_list = json.loads(genres_str.replace("'", "\""))
        # Ensuring "Science Fiction" remains intact
        genres_names = [genre['name'] for genre in genres_list]
        for name in genres_names:
            if 'Science Fiction' in name:
                return 'Science Fiction'
        return genres_names[0]
    except:
        return 'Unknown'

movies['main_genre'] = movies['genres'].apply(extract_main_genre)

In [57]:
movies.genres

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
45461    [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
45462                        [{'id': 18, 'name': 'Drama'}]
45463    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464                                                   []
45465                                                   []
Name: genres, Length: 45466, dtype: object

In [59]:
movies['main_genre']

0        Animation
1        Adventure
2          Romance
3           Comedy
4           Comedy
           ...    
45461        Drama
45462        Drama
45463       Action
45464      Unknown
45465      Unknown
Name: main_genre, Length: 45466, dtype: object

In [60]:
for i in ratings:
    ratings['genre'] = movies['main_genre'].isin()

In [91]:
ratings['genre'] = ratings['movieId'].map(movies.set_index('id')['main_genre'])


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [169]:
ratings = ratings.dropna(subset=['movieId'])
movies = movies.dropna(subset=['id'])
#movies['id'] = movies['id'].astype (int)
#movies['id'] = pd.to_numeric(movies['id'], dtype='int')
movies = movies.sort_values(by='id', ascending=False)
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
20189,False,,0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,469172.0,tt0089018,pt,Manoel dans l'île des merveilles,Manuel is a young boy who travels from long ag...,...,1984-08-02,0.0,130.0,"[{'iso_639_1': 'pt', 'name': 'Português'}, {'i...",Released,,Manuel on the Island of Wonders,False,0.0,0.0
45398,False,,1254040,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",http://lmtr.fi/,468707.0,tt5742932,fi,Lauri Mäntyvaaran tuuheet ripset,,...,2017-07-28,0.0,90.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,,Thick Lashes of Lauri Mäntyvaara,False,8.0,1.0
21891,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,468343.0,tt0133202,fi,Silja - nuorena nukkunut,"In the 1910s, beautiful young Silja loses both...",...,1956-01-01,0.0,87.0,[],Released,,Silja - nuorena nukkunut,False,0.0,0.0
45273,False,,0,"[{'id': 18, 'name': 'Drama'}]",,467731.0,tt0507700,en,Tragedy in a Temporary Town,Fifteen-year-old girl Dotty Fisher is assaulte...,...,1956-02-19,0.0,60.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Tragedy in a Temporary Town,False,0.0,0.0
45078,False,,0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,465044.0,tt5943940,en,Abduction,A horror comedy spoofing conspiracy theory mov...,...,2017-06-28,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Horrifically Funny,Abduction,False,0.0,0.0


In [183]:
for i in range(len(ratings)):
    for j in range(len(movies)):
        if ratings['movieId'][i]==int(movies['id'][j]):
            ratings.loc[i, 'genre']=movies.loc[j, 'main_genre']

KeyError: 19730

In [198]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genre,pca
0,1,31,2.5,1260759144,2,295.903489
1,1,1029,3.0,1260759179,2,-295.903489
2,1,1061,3.0,1260759182,2,
3,1,1129,2.0,1260759185,2,
4,1,1172,4.0,1260759205,2,


In [235]:
x = np.array([ratings['rating'], ratings['genre']]).T

from sklearn.decomposition import PCA
pca = PCA(n_components=1).fit_transform(x)




In [236]:
pca.shape

(100004, 1)

In [237]:
ratings['pca'] = pd.DataFrame(pca)

In [238]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genre,pca
0,1,31,2.5,1260759144,2,1.043608
1,1,1029,3.0,1260759179,2,0.543608
2,1,1061,3.0,1260759182,2,0.543608
3,1,1129,2.0,1260759185,2,1.543608
4,1,1172,4.0,1260759205,2,-0.456392


In [239]:
# Создание матрицы пользователь-предмет
user_ratings = ratings.pivot(index='userId', columns='movieId', values='pca')
user_ratings = user_ratings.fillna(0)

In [240]:
user_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,-0.456392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,-0.456392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,-0.456392,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.000000,0.0,0.000000,0.0,0.0,-0.456392,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,-0.456392,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
# Центрирование рейтингов
user_ratings_mean = user_ratings.mean(axis=1)
user_ratings_demeaned = user_ratings - user_ratings_mean.values.reshape(-1, 1)

In [242]:
# Применение SVD
U, sigma, Vt = svds(user_ratings_demeaned.values, k=50)

# Восстановление прогнозируемой матрицы
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_ratings.columns)

In [243]:
# Функция для рекомендаций
def recommend_movies(user_id, num_recommendations=5):
    user_row = user_id - 1  # userId начинается с 1, не с 0
    sorted_user_predictions = predicted_ratings_df.iloc[user_row].sort_values(ascending=False)
    
    # Получить данные пользователя и объединить с данными о фильмах
    user_data = ratings[ratings.userId == user_id]
    user_full = (user_data.merge(movies, how='left', left_on='movieId', right_on='id').sort_values(['rating'], ascending=False))

    # Рекомендации для пользователя
    recommendations = (movies[~movies['id'].isin(user_full['movieId'])]
                       .merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left', left_on='id', right_on='movieId')
                       .rename(columns={user_row: 'Predictions'})
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

# Тестирование функции
user_full, recommendations = recommend_movies(1, 5)
print(recommendations[['title', 'id']])

                                          title      id
44153                           Say Anything...  2028.0
45060                        Once Were Warriors   527.0
45008                               Silent Hill   588.0
44928  Harry Potter and the Prisoner of Azkaban   673.0
44484                            28 Weeks Later  1562.0
