# Recommender Systems in Python

In [1]:
import pandas as pd
import numpy as np

In [2]:
# df_ratings = pd.read_csv('drive/MyDrive/dataset/movies_rating.csv')
# df_ratings['date'] = pd.to_datetime(df_ratings['timestamp'],unit='s')
# df_ratings = df_ratings[['date','userId','movieId','rating']]
# df_ratings.columns = ['date','user_id','movie_id','rating']
# print(df_ratings.head(5))
# print('---------------')
# print(df_ratings.shape)

In [3]:
# Load only the columns needed by the recommenders below.
usecols = ['id','title','vote_average','vote_count','overview']

# Build the cleaned movie table in one pass:
#   * rename `id` to `movie_id`;
#   * coerce ids that are not numeric to missing values and store them as nullable Int32;
#   * drop every row that has a missing value in any of the loaded columns;
#   * renumber the rows 0..n-1.
df_movies = (
    pd.read_csv('drive/MyDrive/dataset/movies.csv', usecols=usecols)
    .rename(columns={'id': 'movie_id'})
    .assign(
        movie_id=lambda frame: pd.to_numeric(frame['movie_id'], errors='coerce').astype('Int32')
    )
    .dropna()
    .reset_index(drop=True)
)

print(df_movies.head(5))
print('---------------')
print(df_movies.shape)

   movie_id  ... vote_count
0       862  ...     5415.0
1      8844  ...     2413.0
2     15602  ...       92.0
3     31357  ...       34.0
4     11862  ...      173.0

[5 rows x 5 columns]
---------------
(44506, 5)


### Simple Recommenders

In [4]:
# WeightedRating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
# v - number of votes for the movie
# m - minimum number of votes required to be listed in the chart
# R - average rating of the movie
# C - mean rating (vote average) across the whole dataset

In [5]:
# C: mean of the `vote_average` column over all movies in df_movies.
C = df_movies['vote_average'].mean()
print(C)

5.638880150990592


In [6]:
# m: 0.90 quantile of `vote_count`; used below as the minimum-votes cutoff.
m = df_movies['vote_count'].quantile(0.90)
print(m)

165.0


In [7]:
# Keep only the movies with at least `m` votes. Filter first and copy afterwards,
# so that only the retained rows are duplicated and the result is an independent
# frame that can safely receive new columns.
df_movies_ = df_movies.loc[df_movies['vote_count'] >= m].copy()
df_movies_.shape

(4462, 5)

In [8]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with the overall mean rating.

    Computes ``v / (v + m) * R + m / (v + m) * C`` where ``v`` and ``R`` are
    read from ``x``.

    Parameters
    ----------
    x : object indexable by ``'vote_count'`` and ``'vote_average'``
        The movie record (e.g. a DataFrame row).
    m : number
        Vote-count weight given to ``C``; defaults to the module-level ``m``
        at definition time.
    C : number
        Rating the score is pulled towards; defaults to the module-level ``C``
        at definition time.

    Returns
    -------
    The weighted rating.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    return (votes / total * rating) + (m / total * C)

In [9]:
# weighted_rating only performs column lookups and arithmetic, so it can be
# evaluated on the whole frame at once instead of once per row via apply(axis=1).
df_movies_['score'] = weighted_rating(df_movies_)

In [10]:
# Order the qualifying movies from highest to lowest weighted score.
df_movies_ = df_movies_.sort_values('score', ascending=False)

In [11]:
# Display the first rows of the score-sorted table with the columns relevant to the ranking.
df_movies_[['title', 'vote_count', 'vote_average', 'score']].head()

Unnamed: 0,title,vote_count,vote_average,score
312,The Shawshank Redemption,8358.0,8.5,8.44461
823,The Godfather,6024.0,8.5,8.423722
10279,Dilwale Dulhania Le Jayenge,661.0,9.1,8.408614
12447,The Dark Knight,12269.0,8.3,8.264687
2825,Fight Club,9678.0,8.3,8.255391


### Content-Based Recommender

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace any missing overview with an empty string so the vectorizer only sees text.
df_movies['overview'] = df_movies['overview'].fillna('')

# Fit a TF-IDF model on the overviews, ignoring English stop words,
# and transform them into the document-term matrix in one step.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['overview'])

tfidf_matrix.shape

(44506, 75827)

In [13]:
# Cast the TF-IDF values to float32 to reduce memory use in the similarity computation.
tfidf_matrix = tfidf_matrix.astype(np.float32)

In [14]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out`, falling back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Convert to plain `str` so the sample displays as a simple list of words
# whether the names came back as a list or as a NumPy array.
[str(name) for name in feature_names[5000:5010]]

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all overview vectors. With a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


In [16]:
# Sanity check: the similarity matrix should be square, with one row and one column per movie.
cosine_sim.shape

(44506, 44506)

In [17]:
# Similarities between the movie at row 1 and every movie (entry 1 is the movie compared with itself).
cosine_sim[1]

array([0.01502121, 1.        , 0.0467977 , ..., 0.        , 0.02195746,
       0.00924177], dtype=float32)

In [18]:
# Reverse lookup: movie title -> row position in df_movies.
# Duplicates must be removed on the *index* (the titles). Series.drop_duplicates()
# compares the values, which are the already-unique row positions, so it would
# leave repeated titles in place and make `indices[title]` return several rows.
# The first occurrence of each title is kept.
indices = pd.Series(df_movies.index, index=df_movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [19]:
# Preview the first ten title -> row-position entries.
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [20]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``df_movies``; defaults to the module-level ``cosine_sim`` at
        definition time.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first,
        indexed by their row label in ``df_movies``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # use the first occurrence so that a single similarity row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, ties kept in
    # ascending row order (the same order a stable reverse sort would give).
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie explicitly instead of assuming it sorts first;
    # with tied scores (e.g. an all-zero overview vector) it may not.
    ranked = ranked[ranked != idx][:top_n]
    return df_movies['title'].iloc[ranked]

In [21]:
# Example query: movies whose overviews are most similar to that of "The Godfather".
get_recommendations('The Godfather')

1164               The Godfather: Part II
43135    The Godfather Trilogy: 1972-1990
1898              The Godfather: Part III
22933                          Blood Ties
11265                    Household Saints
34196                   Start Liquidation
10789                            Election
37341            A Mother Should Be Loved
17628                   Short Sharp Shock
26035                  Beck 28 - Familjen
Name: title, dtype: object