In [79]:
import pandas as pd
import numpy as np
from math import sqrt

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from tqdm.notebook import tqdm

from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [108]:
DATA_FOLDER = "../../MovieLens dataset/"
df_genome_scores = pd.read_csv(DATA_FOLDER + "genome-scores.csv")
df_genome_tags = pd.read_csv(DATA_FOLDER + "genome-tags.csv")
df_links = pd.read_csv(DATA_FOLDER + "links.csv")
df_movies = pd.read_csv(DATA_FOLDER + "movies.csv")
df_ratings = pd.read_csv(DATA_FOLDER + "ratings.csv")
df_tags = pd.read_csv(DATA_FOLDER + "tags.csv")

In [109]:
df_ratings_sampled = df_ratings[df_ratings["userId"] < 1000]
print(df_ratings_sampled.shape)

(96412, 4)


In [5]:
# создание объекта класса Reader
reader = Reader(rating_scale=(1, 5))

# создание объекта класса Dataset
dataset = Dataset.load_from_df(
    df_ratings_sampled[["userId", "movieId", "rating"]], reader
)

# # разбиение данных на обучающую и тестовую выборки
trainset, testset = train_test_split(dataset, test_size=0.1)

In [7]:
# # создание объекта класса Reader
# reader = Reader(rating_scale=(1, 5))

# # создание объекта класса Dataset
# dataset = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

# # # разбиение данных на обучающую и тестовую выборки
# trainset, testset = train_test_split(dataset, test_size = 0.1)

# создание экземпляра класса SVD
model = SVD(verbose=True)

# обучение модели на обучающей выборке
model.fit(trainset)

# предсказание рейтингов на тестовой выборке
predictions = model.test(testset)

# оценка качества модели
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))

# R_pred_surprise = np.zeros((n_users, m_movies))
# for u in range(n_users):
#     for m in range(m_movies):
#         R_pred_surprise[u][m] = model.predict(u, m).est

# pd.DataFrame(np.round(R_pred_surprise, 2), users, movies)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 0.8925
RMSE: 0.8925112845253899
MAE:  0.6785
MAE: 0.6785266684960225


In [5]:
df_ratings[df_ratings["userId"] == 1001]

Unnamed: 0,userId,movieId,rating,timestamp
96487,1001,1,5.0,847560392
96488,1001,2,5.0,847560908
96489,1001,9,5.0,847560908
96490,1001,10,4.0,847560269
96491,1001,15,1.0,847561055
...,...,...,...,...
96557,1001,1035,5.0,847560908
96558,1001,1036,3.0,847560647
96559,1001,1037,5.0,847560949
96560,1001,1073,5.0,847560968


In [6]:
df_movies = df_movies.set_index("movieId")

In [18]:
favourite_movies_id = [2571]

In [8]:
df_movies.loc[favourite_movies_id]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller


In [37]:
df_movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
288967,State of Siege: Temple Attack (2021),Action|Drama
288971,Ouija Japan (2021),Action|Horror
288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
288977,Skinford: Death Sentence (2023),Crime|Thriller


In [65]:
df_movies[df_movies.title.str.startswith("Harry Potter")]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX
69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX
81834,Harry Potter and the Deathly Hallows: Part 1 (...,Action|Adventure|Fantasy|IMAX
88125,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery|IMAX
247038,Harry Potter: A History Of Magic (2017),Documentary
267654,Harry Potter 20th Anniversary: Return to Hogwa...,Documentary


In [28]:
favourite_movies_id = [
    2571,
]

In [8]:
data_matrix = csr_matrix(
    (
        df_ratings.rating.values.astype("f8"),
        (df_ratings.userId.values, df_ratings.movieId.values),
    )
)

In [73]:
_, S, Vt = svds(data_matrix, k=70, return_singular_vectors="vh")

In [74]:
a0 = np.zeros(df_movies.index.max() + 1)

In [77]:
a0[2116] = 5
a0[98809] = 5
a0[260] = 1
a0[1196] = 0
a0[4896] = 2
a0[5816] = 2

In [78]:
df_movies.loc[np.argsort(-Vt.T @ (Vt @ a0))[:15]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX
81834,Harry Potter and the Deathly Hallows: Part 1 (...,Action|Adventure|Fantasy|IMAX
54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX
88125,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery|IMAX
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi


## Content-Based RecSys

### Предобработка данных

ref https://github.com/paulmanoj1/Data-Science/blob/master/ML0101EN-RecSys-Content-Based-movies-py-v1.ipynb

In [110]:
# Using regular expressions to find a year stored between parentheses
# We specify the parantheses so we don't conflict with movies that have years in their titles
df_movies["year"] = df_movies.title.str.extract("(\(\d\d\d\d\))", expand=False)
# Removing the parentheses
df_movies["year"] = df_movies.year.str.extract("(\d\d\d\d)", expand=False)
# Removing the years from the 'title' column
df_movies["title"] = df_movies.title.str[:-7]
# Applying the strip function to get rid of any ending whitespace characters that may have appeared
df_movies["title"] = df_movies["title"].apply(lambda x: x.strip())
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [104]:
df_movies.title.str[:-7]

movieId
1                                           Toy Story
2                                             Jumanji
3                                    Grumpier Old Men
4                                   Waiting to Exhale
5                         Father of the Bride Part II
                             ...                     
288967                  State of Siege: Temple Attack
288971                                    Ouija Japan
288975      The Men Who Made the Movies: Howard Hawks
288977                       Skinford: Death Sentence
288983    UNZIPPED: An Autopsy of American Inequality
Name: title, Length: 86537, dtype: object

In [111]:
# Every genre is separated by a | so we simply have to call the split function on |
df_movies["genres"] = df_movies.genres.str.split("|")
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [112]:
# Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = df_movies.copy()

# For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in df_movies.iterrows():
    for genre in row["genres"]:
        moviesWithGenres_df.at[index, genre] = 1
# Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [114]:
# Drop removes a specified row or column from a dataframe
df_ratings = df_ratings.drop(columns="timestamp")
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0


### Реализация Content-Based RecSys

In [115]:
userInput = [
    {"title": "Breakfast Club, The", "rating": 5},
    {"title": "Toy Story", "rating": 3.5},
    {"title": "Jumanji", "rating": 2},
    {"title": "Pulp Fiction", "rating": 5},
    {"title": "Akira", "rating": 4.5},
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [116]:
df_movies[df_movies.title.str.startswith("Breakfast Club")]

Unnamed: 0,movieId,title,genres,year
1879,1968,"Breakfast Club, The","[Comedy, Drama]",1985


In [118]:
df_movies[df_movies["title"].isin(inputMovies["title"].tolist())]

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
292,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
1241,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988
1879,1968,"Breakfast Club, The","[Comedy, Drama]",1985
43281,164600,Akira,"[Action, Crime, Thriller]",2016


In [119]:
# Filtering out the movies by title
inputId = df_movies[df_movies["title"].isin(inputMovies["title"].tolist())]
# Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
# Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop(columns="genres").drop(columns="year")
# Final input dataframe
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,164600,Akira,4.5
5,1968,"Breakfast Club, The",5.0


In [120]:
# Filtering out the movies from the input
userMovies = moviesWithGenres_df[
    moviesWithGenres_df["movieId"].isin(inputMovies["movieId"].tolist())
]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1241,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1879,1968,"Breakfast Club, The","[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43281,164600,Akira,"[Action, Crime, Thriller]",2016,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)
# Dropping unnecessary columns due to save memory and to avoid issues
userGenreTable = (
    userMovies.drop(columns="movieId")
    .drop(columns="title")
    .drop(columns="genres")
    .drop(columns="year")
)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
inputMovies["rating"]

0    3.5
1    2.0
2    5.0
3    4.5
4    4.5
5    5.0
Name: rating, dtype: float64

In [124]:
# Dot produt to get weights
userProfile = userGenreTable.transpose().dot(inputMovies["rating"])
# The user profile
userProfile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.0
Fantasy                5.5
Romance                0.0
Drama                  9.5
Action                 9.5
Crime                 10.0
Thriller              10.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [125]:
# Now let's get the genres of every movie in our original dataframe
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df["movieId"])
# And drop the unnecessary information
genreTable = (
    genreTable.drop(columns="movieId")
    .drop(columns="title")
    .drop(columns="genres")
    .drop(columns="year")
)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
genreTable.shape

(86537, 20)

In [127]:
# Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((genreTable * userProfile).sum(axis=1)) / (userProfile.sum())
recommendationTable_df.head()

movieId
1    0.491228
2    0.245614
3    0.152047
4    0.263158
5    0.152047
dtype: float64

In [128]:
# Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
# Just a peek at the values
recommendationTable_df.head()

movieId
5018      0.730994
64645     0.725146
122787    0.725146
144324    0.725146
81132     0.725146
dtype: float64

In [130]:
# The final recommendation table
df_movies.loc[df_movies["movieId"].isin(recommendationTable_df.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
4615,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
4913,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991
9179,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
9544,31367,"Chase, The","[Action, Adventure, Comedy, Crime, Romance, Th...",1994
12880,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
15392,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
22573,115479,"Whip Hand, The","[Action, Adventure, Crime, Drama, Sci-Fi, Thri...",1951
23333,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000
23663,118782,Fat Pizza,"[Action, Adventure, Comedy, Crime, Thriller]",2003
24995,122655,The Karate Killers,"[Action, Adventure, Comedy, Crime, Thriller]",1967


## Wide and Deep RecSys Model

In [141]:
df_ratings.loc[df_ratings.userId == 1]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0
...,...,...,...
57,1,40629,4.5
58,1,49647,5.0
59,1,52458,5.0
60,1,53996,5.0
