In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
links = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/links.csv')
movies = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/movies.csv')
ratings = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/ratings.csv')
tags = pd.read_csv('C:/Users/User/Desktop/Data Scientist/AML/Recsys/data/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(9742, 3)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratings.shape

(100836, 4)

In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
tags.shape

(3683, 4)

In [9]:
# Объеденим датасеты рейтингов и тегов
ratings_tags = pd.merge(ratings, tags, how='left', left_on=['userId', 'movieId'], right_on=['userId', 'movieId'])
ratings_tags.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
0,1,1,4.0,964982703,,
1,1,3,4.0,964981247,,
2,1,6,4.0,964982224,,
3,1,47,5.0,964983815,,
4,1,50,5.0,964982931,,


In [10]:
ratings_tags.shape

(102677, 6)

In [11]:
# Объеденим датасет с тегами и рейтингами с датасетом жанров
ratings_tags_genres = pd.merge(ratings_tags, movies, how='left', on='movieId')
ratings_tags_genres.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres
0,1,1,4.0,964982703,,,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,,,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [12]:
ratings_tags_genres.shape

(102677, 8)

In [13]:
# В жанрах есть неопределённые значения жанров
ratings_tags_genres.loc[ratings_tags_genres.genres == '(no genres listed)'].shape[0]

49

In [14]:
# Удалим данные с неопределёнными значениями жанров
ratings_tags_genres = ratings_tags_genres.loc[ratings_tags_genres.genres != '(no genres listed)'].reset_index(drop=True)

In [15]:
ratings_tags_genres.shape

(102628, 8)

In [16]:
def change_genre(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [17]:
# Изменим фич genres для последующей обработки
ratings_tags_genres.loc[:, 'genres'] = ratings_tags_genres.loc[:, 'genres'].apply(change_genre)

In [18]:
# Объединим теги по фильмам
movie_tags = {}
for movie, group in tqdm(tags.groupby('movieId')):
    movie_tags[movie] = ' '.join([str(s).title().replace(' ', '').replace('-', '').replace('.', '')
                                  for s in group.tag.values])

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [19]:
# Создадим новый фич all_tags - общие теги фильма
ratings_tags_genres = pd.merge(ratings_tags_genres, pd.Series(movie_tags).rename('all_tags'),
                               how='left', left_on='movieId', right_index=True)

In [20]:
ratings_tags_genres.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags
0,1,1,4.0,964982703,,,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar Pixar Fun
1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy Romance,Moldy Old
2,1,6,4.0,964982224,,,Heat (1995),Action Crime Thriller,
3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Mystery TwistEnding SerialKiller
4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...


In [21]:
# Максимальное и минимальное количество оценок по всем пользователям
ratings_tags_genres.groupby('userId').count().rating.agg(['min', 'max'])

min      20
max    2792
Name: rating, dtype: int64

Создадим новые фичи из метрик:  
 **r_mean_user**: средняя оценка фильма каждого пользователя  
 **r_value_user**: размах умноженный на медиану по оценкам фильма каждого пользователя  
 **r_mean_movie**: средняя оценка фильма по всем пользователям  
 **r_value_movie**: размах умноженный на медиану по всем пользователям для каждого фильма

In [22]:
user_ratings = ratings.groupby('userId').agg(['min', 'max', 'mean', 'median']).rating

In [23]:
user_ratings.rename(columns={'mean': 'r_mean_user'}, inplace=True)

In [24]:
user_ratings['r_value_user'] = user_ratings.apply(lambda row: (row['max'] - row['min']) * row['median']
                                                  if row['max'] != row['min'] else row['median'], axis=1)

In [25]:
user_ratings.head()

Unnamed: 0_level_0,min,max,r_mean_user,median,r_value_user
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,5.0,4.366379,5.0,20.0
2,2.0,5.0,3.948276,4.0,12.0
3,0.5,5.0,2.435897,0.5,2.25
4,1.0,5.0,3.555556,4.0,16.0
5,1.0,5.0,3.636364,4.0,16.0


In [26]:
movie_ratings = ratings.groupby('movieId').agg(['min', 'max', 'mean', 'median']).rating

In [27]:
movie_ratings.rename(columns={'mean': 'r_mean_movie'}, inplace=True)

In [28]:
movie_ratings['r_value_movie'] = movie_ratings.apply(lambda row: (row['max'] - row['min']) * row['median']
                                                    if row['max'] != row['min'] else row['median'], axis=1)

In [29]:
movie_ratings.head()

Unnamed: 0_level_0,min,max,r_mean_movie,median,r_value_movie
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.5,5.0,3.92093,4.0,18.0
2,0.5,5.0,3.431818,3.5,15.75
3,0.5,5.0,3.259615,3.0,13.5
4,1.0,3.0,2.357143,3.0,6.0
5,0.5,5.0,3.071429,3.0,13.5


In [30]:
ratings_tags_genres = pd.merge(ratings_tags_genres, user_ratings[['r_mean_user', 'r_value_user']],
                               how='left', left_on='userId', right_index=True)

In [31]:
ratings_tags_genres = pd.merge(ratings_tags_genres, movie_ratings[['r_mean_movie', 'r_value_movie']],
                               how='left', left_on='movieId', right_index=True)

In [32]:
ratings_tags_genres.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags,r_mean_user,r_value_user,r_mean_movie,r_value_movie
0,1,1,4.0,964982703,,,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar Pixar Fun,4.366379,20.0,3.92093,18.0
1,1,3,4.0,964981247,,,Grumpier Old Men (1995),Comedy Romance,Moldy Old,4.366379,20.0,3.259615,13.5
2,1,6,4.0,964982224,,,Heat (1995),Action Crime Thriller,,4.366379,20.0,3.946078,16.0
3,1,47,5.0,964983815,,,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Mystery TwistEnding SerialKiller,4.366379,20.0,3.975369,18.0
4,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...,4.366379,20.0,4.237745,18.0


In [33]:
# Количество фильмов без тегов
ratings_tags_genres.all_tags.isna().sum()

52503

In [34]:
# Удалим фильмы без тегов
ratings_tags_genres = ratings_tags_genres.loc[ratings_tags_genres.all_tags.notna()].reset_index(drop=True)

In [35]:
ratings_tags_genres.shape

(50125, 13)

In [36]:
# Применим методы CountVectorizer и Tfidf к фичу genres
count_vect_gen = CountVectorizer()
tfidf_transformer_gen = TfidfTransformer()

genres_count_vect = count_vect_gen.fit_transform(ratings_tags_genres.genres)
genres_tfidf = tfidf_transformer_gen.fit_transform(genres_count_vect)

In [37]:
# Создадим датафрейм для фича genres
gen_feat_df = pd.DataFrame(genres_tfidf.toarray(), columns=count_vect_gen.get_feature_names())

In [38]:
# Применим методы CountVectorizer и Tfidf к фичу all_tags
count_vect_tag = CountVectorizer()
tfidf_transformer_tag = TfidfTransformer()

tag_count_vect = count_vect_tag.fit_transform(ratings_tags_genres.all_tags)
tag_tfidf = tfidf_transformer_tag.fit_transform(tag_count_vect)

In [39]:
# Создадим датафрейм для фича all_tags
tag_feat_df = pd.DataFrame(tag_tfidf.toarray(), columns=count_vect_tag.get_feature_names())

In [40]:
# Создадим датафрейм из уникальных фильмов для фича genres
gen_movie_uniq = gen_feat_df.iloc[ratings_tags_genres.drop_duplicates(subset='movieId').index]

In [41]:
neigh_gen = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='euclidean')
neigh_gen.fit(gen_movie_uniq)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=10)

### Рекомендация фильмов по жанру

In [42]:
s = ratings_tags_genres.loc[ratings_tags_genres.movieId == 50].genres.iloc[0]

In [43]:
s

'Crime Mystery Thriller'

In [44]:
pred_count_vect = count_vect_gen.transform([s])

In [45]:
pred_tfidf = tfidf_transformer_gen.transform(pred_count_vect)

In [46]:
predict_gen = neigh_gen.kneighbors(pred_tfidf.todense(), return_distance=True)

In [47]:
gen_neighbors = gen_movie_uniq.index[predict_gen[1][0]]

In [48]:
# Выведем рекомендованные фильмы по жанру
ratings_tags_genres.iloc[gen_neighbors]

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags,r_mean_user,r_value_user,r_mean_movie,r_value_movie
1202,18,102903,4.0,1513369638,,,Now You See Me (2013),Crime Mystery Thriller,Illusions Overcomplicated Predictable StupidEn...,3.732072,18.0,3.409091,18.0
1846,23,2579,4.0,1107163393,,,Following (1998),Crime Mystery Thriller,BlackAndWhite ChristopherNolan DirectorialDebu...,3.64876,10.5,3.833333,2.0
3,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...,4.366379,20.0,4.237745,18.0
41,1,1089,5.0,964982951,,,Reservoir Dogs (1992),Crime Mystery Thriller,EnsembleCast Nonlinear QuentinTarantino Styliz...,4.366379,20.0,4.20229,16.0
1544,20,5630,3.5,1054037295,,,Red Dragon (2002),Crime Mystery Thriller,HannibalLecter,3.590909,16.875,3.435484,15.75
3614,50,8879,3.0,1527542396,,,Murder on the Orient Express (1974),Crime Mystery Thriller,AgathaChristie Oscar(BestSupportingActress) Train,2.780645,12.0,3.25,6.5
228,4,1834,5.0,945174134,,,"Spanish Prisoner, The (1997)",Crime Drama Mystery Thriller,TwistEnding,3.555556,16.0,4.0,8.0
415,6,628,3.0,845555300,,,Primal Fear (1996),Crime Drama Mystery Thriller,EdwardNorton Psychology Suspense ThoughtProvok...,3.493631,12.0,3.642857,14.0
242,4,2467,4.0,945079858,,,"Name of the Rose, The (Name der Rose, Der) (1986)",Crime Drama Mystery Thriller,Religion,3.555556,16.0,3.75,12.0
1165,18,52604,4.5,1457650649,Anthony Hopkins,1457651000.0,Fracture (2007),Crime Drama Mystery Thriller,AnthonyHopkins CourtroomDrama TwistEnding,3.732072,18.0,3.666667,9.375


In [49]:
# Создадим датафрейм из уникальных фильмов для фича all_tags
tag_movie_uniq = tag_feat_df.iloc[ratings_tags_genres.drop_duplicates(subset='movieId').index]

In [50]:
neigh_tag = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='euclidean')
neigh_tag = neigh_tag.fit(tag_movie_uniq)

### Рекомендация фильмов по тегам

In [51]:
s2 = ratings_tags_genres.loc[ratings_tags_genres.movieId == 50, 'all_tags'].iloc[0]

In [52]:
s2

'Mindfuck Suspense Thriller Tricky TwistEnding Heist'

In [53]:
pred_tag_cv = count_vect_tag.transform([s2])

In [54]:
pred_tag_tfidf = tfidf_transformer_tag.transform(pred_tag_cv)

In [55]:
predict_tag = neigh_tag.kneighbors(pred_tag_tfidf.todense(), return_distance=True)

In [56]:
predict_tag

(array([[0.        , 1.07763445, 1.07763445, 1.07763445, 1.07763445,
         1.07763445, 1.07763445, 1.07763445, 1.14030858, 1.1471726 ]]),
 array([[  3, 720, 821, 912, 513, 329, 569, 728,  71,  88]], dtype=int64))

In [57]:
tag_neighbors = tag_movie_uniq.index[predict_tag[1][0]][:4]

In [58]:
# Выведем рекомендованные фильмы по тегам
ratings_tags_genres.loc[ratings_tags_genres.index.isin(tag_neighbors)]

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags,r_mean_user,r_value_user,r_mean_movie,r_value_movie
3,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...,4.366379,20.0,4.237745,18.0
1847,23,2726,4.0,1108703148,,,"Killing, The (1956)",Crime FilmNoir,Heist,3.64876,10.5,4.055556,8.0
2551,33,2391,2.0,939646982,,,"Simple Plan, A (1998)",Crime Drama Thriller,Heist,3.788462,16.0,3.795455,12.0
4091,57,1912,2.0,969754290,,,Out of Sight (1998),Comedy Crime Drama Romance Thriller,Heist,3.392857,16.0,3.859375,16.0


In [59]:
# Создадим датафрейм из уникальных фильмов для рекомендации по жанрам и тегам
gen_tag_movie_uniq = pd.merge(gen_movie_uniq, tag_movie_uniq, left_index=True, right_index=True, suffixes=('_x', ''))

In [60]:
gen_tag_movie_uniq.shape

(1553, 1484)

In [61]:
neigh_gen_tag = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean')
neigh_gen_tag.fit(gen_tag_movie_uniq)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

### Рекомендация фильмов по жанрам и тегам

In [62]:
def rec_movies(user_id, movie_id):
    """
    Returns indexes of recomended movies for User with user_id on Movie with movie_id    
    """
    s1 = ratings_tags_genres.loc[((ratings_tags_genres.userId == user_id) &
                                  (ratings_tags_genres.movieId == movie_id), 'genres')].values[0]
    s2 = ratings_tags_genres.loc[((ratings_tags_genres.userId == user_id) &
                                  (ratings_tags_genres.movieId == movie_id), 'all_tags')].values[0]
    
    pred_cv_gen = count_vect_gen.transform([s1])
    pred_cv_tag = count_vect_tag.transform([s2])
    
    pred_tfidf_gen = tfidf_transformer_gen.transform(pred_cv_gen).todense()
    pred_tfidf_tag = tfidf_transformer_tag.transform(pred_cv_tag).todense()
    
    pred_gen_tag = np.hstack((pred_tfidf_gen, pred_tfidf_tag))
    
    predict_gen_tag = neigh_gen_tag.kneighbors(pred_gen_tag, return_distance=True)
    
    return gen_tag_movie_uniq.index[predict_gen_tag[1][0]]

In [63]:
# Рекомендованные фильмы для пользователя с userId=1 по фильму с movieId=50
ratings_tags_genres.iloc[rec_movies(1, 50)]

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y,title,genres,all_tags,r_mean_user,r_value_user,r_mean_movie,r_value_movie
3,1,50,5.0,964982931,,,"Usual Suspects, The (1995)",Crime Mystery Thriller,Mindfuck Suspense Thriller Tricky TwistEnding ...,4.366379,20.0,4.237745,18.0
228,4,1834,5.0,945174134,,,"Spanish Prisoner, The (1997)",Crime Drama Mystery Thriller,TwistEnding,3.555556,16.0,4.0,8.0
1286,19,911,3.0,965705725,,,Charade (1963),Comedy Crime Mystery Romance Thriller,Heist,2.607397,12.0,3.807692,10.0
415,6,628,3.0,845555300,,,Primal Fear (1996),Crime Drama Mystery Thriller,EdwardNorton Psychology Suspense ThoughtProvok...,3.493631,12.0,3.642857,14.0
1159,18,44665,4.5,1455049870,twist ending,1456948000.0,Lucky Number Slevin (2006),Crime Drama Mystery,TwistEnding,3.732072,18.0,3.855263,12.0
483,7,4963,4.0,1106636702,,,Ocean's Eleven (2001),Crime Thriller,Heist,3.230263,15.75,3.844538,18.0
1846,23,2579,4.0,1107163393,,,Following (1998),Crime Mystery Thriller,BlackAndWhite ChristopherNolan DirectorialDebu...,3.64876,10.5,3.833333,2.0


In [64]:
# Создадим датафрейм с фичами, полученными из тегов и жанров, для всех пользователей и фильмов(с оценками, жанрами, тегами)
gen_tag_df = pd.merge(gen_feat_df, tag_feat_df, left_index=True, right_index=True, suffixes=('_x', ''))

In [65]:
# Объединим все фичи
X = pd.merge(ratings_tags_genres, gen_tag_df, left_index=True, right_index=True)

In [66]:
# Нормализуем некоторые данные
mm_scaler = MinMaxScaler()
X.iloc[:, 9:13] = mm_scaler.fit_transform(X.iloc[:, 9:13])

### Предсказание оценки пользователей

In [67]:
model_pred = {}
model_rmse = {}

for user in ratings_tags_genres.userId.unique():
#     print(user)
    
    X_user = X.iloc[X.loc[X.userId == user].index, 9:]
    y_user = ratings_tags_genres.iloc[X.loc[X.userId == user].index, 2].astype(str)
    
    if len(y_user.unique()) > 1:
        X_train, X_test, y_train, y_test = train_test_split(X_user, y_user, test_size=.25, random_state=13)

        log_reg = LogisticRegression(dual=True, solver='liblinear', max_iter=300)
        log_reg.fit(X_train, y_train)
        rmse = mean_squared_error(y_test, log_reg.predict(X_test), squared=False)

        model_pred[user] = log_reg
        model_rmse[user] = rmse

In [68]:
# Выведем RMSE минимальную, максимальную, среднюю и медиану для всех пользователей
(min([x[1] for x in model_rmse.items()]), max([x[1] for x in model_rmse.items()]),
 np.mean([x[1] for x in model_rmse.items()]), np.median([x[1] for x in model_rmse.items()]))

(0.0, 3.5, 0.9745273149739284, 0.9014726649675061)

### Рекомендация фильмов по предсказанным оценкам

In [69]:
def rec_on_rating(user_id, n):
    """
    Returns dataframe of n first movies with highest predicted ratings for User with user_id
    """
    if user_id not in model_pred.keys():
        return print(f"User's {user_id} the only rating is ",
                     np.mean(ratings_tags_genres.loc[ratings_tags_genres.userId == user_id, 'rating'].values))
    
    else:
        
        feat_rec = X.iloc[ratings_tags_genres.drop_duplicates(subset='movieId').\
                          loc[ratings_tags_genres.userId != user_id].index, 9:]
        title_rec = X.iloc[ratings_tags_genres.drop_duplicates(subset='movieId').\
                           loc[ratings_tags_genres.userId != user_id].index, 6]

        predict_rating = model_pred[user_id].predict(feat_rec)
    
        return pd.DataFrame({'title': title_rec,
                             'predicted_rating': predict_rating.astype(float)}).sort_values(by='predicted_rating', 
                                                                                            ascending=False).head(n)

In [70]:
# Выведем первый 10 фильмов с наивысшей предсказанной оценкой для пользователя с userId=2
rec_on_rating(2, 10)

Unnamed: 0,title,predicted_rating
0,Toy Story (1995),5.0
6358,"Passage to India, A (1984)",5.0
6611,Man Bites Dog (C'est arrivé près de chez vous)...,5.0
6584,"Thin Blue Line, The (1988)",5.0
6380,Deliver Us from Evil (2006),5.0
6373,"Good Night, and Good Luck. (2005)",5.0
6365,Alice Doesn't Live Here Anymore (1974),5.0
6364,Oliver Twist (1948),5.0
6362,Educating Rita (1983),5.0
6360,"Strada, La (1954)",5.0
