In [1]:
# Notebook-wide display and warning configuration.
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
# Echo the value of every expression statement in a cell, not only the last one;
# several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format='retina'

import warnings
# NOTE(review): this silences *all* warnings for the whole session, including
# deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
from surprise import Dataset
from surprise import accuracy
import pandas as pd
from pathlib import Path

from surprise.model_selection import train_test_split
import numpy as np
from tqdm import tqdm,tqdm_notebook

# Read the three tab-separated tables from the working directory.
users, movies, ratings = (
    pd.read_csv(file_name, sep="\t", engine="python")
    for file_name in ("users.csv", "movies.csv", "ratings.csv")
)

# Ids are used as string keys throughout the notebook, so cast every id column
# to str (same columns, same order as before).
for frame, id_columns in (
    (users, ("user_id",)),
    (movies, ("movie_id",)),
    (ratings, ("movie_id", "user_id")),
):
    for id_column in id_columns:
        frame[id_column] = frame[id_column].astype(str)

print(users.shape, movies.shape, ratings.shape)


# Import the local hwer package and immediately reload it so that edits made to
# its source since the kernel first imported it are picked up.
from importlib import reload
import hwer
reload(hwer)




(6040, 5) (3883, 12) (1000209, 4)


<module 'hwer' from '/Users/ahemf/mygit/Hybrid-Weighted-Embedding-Recommender/hwer/__init__.py'>

In [3]:
from ast import literal_eval

# genres / keywords are parsed with literal_eval; a missing value is parsed
# from the literal "[]", i.e. becomes an empty list.
movies["genres"] = movies["genres"].fillna("[]").apply(literal_eval)
movies["keywords"] = movies["keywords"].fillna("[]").apply(literal_eval)
# Collapse each keyword list into one space-separated string.
movies["keywords"] = movies["keywords"].apply(" ".join)

# Missing release years are encoded as -1 so the column can be integer-typed.
movies["year"] = movies["year"].fillna(-1).astype(int)

# Blank out missing text fields (this covers "tagline" as well).
text_columns = ["title", "keywords", "overview", "tagline", "original_title"]
movies[text_columns] = movies[text_columns].fillna("")

# Build one free-text field by joining the text columns, in list order,
# with single spaces.
combined_text = movies[text_columns[0]]
for text_column in text_columns[1:]:
    combined_text = combined_text + " " + movies[text_column]
movies["text"] = combined_text

# Simple numeric side features.
movies["title_length"] = movies["title"].map(len)
movies["overview_length"] = movies["overview"].map(len)
movies["runtime"] = movies["runtime"].fillna(0.0)


In [4]:
# Preview the first few rating rows as a raw array.
ratings.head().values
# Keep only the first three fields of every rating row as a
# [user, item, affinity] triple; any remaining fields are dropped.
user_item_affinities = [
    [user, item, affinity] for user, item, affinity, *_ in ratings.values
]


array([['1', '1193', 5, 978300760],
       ['1', '661', 3, 978302109],
       ['1', '914', 3, 978301968],
       ['1', '3408', 4, 978300275],
       ['1', '2355', 5, 978824291]], dtype=object)

In [5]:

from hwer import MultiCategoricalEmbedding, FlairGlove100AndBytePairEmbedding, CategoricalEmbedding, NumericEmbedding
from hwer import Feature, FeatureSet, ContentRecommendation, FeatureType

# One embedder per feature, keyed by the feature_name given to the matching
# Feature object further down.
embedding_mapper = {
    'gender': CategoricalEmbedding(n_dims=1),
    'age': CategoricalEmbedding(n_dims=1),
    'occupation': CategoricalEmbedding(n_dims=2),
    'zip': CategoricalEmbedding(n_dims=8),
    'text': FlairGlove100AndBytePairEmbedding(),
    'numeric': NumericEmbedding(4),
    'genres': MultiCategoricalEmbedding(n_dims=16),
}

recsys = ContentRecommendation(
    embedding_mapper=embedding_mapper,
    knn_params=None,
    n_output_dims=128,
    rating_scale=(1, 5),
)

# User-side features: gender is passed as stored, the other three columns are
# passed as strings.
u1 = Feature(feature_name="gender", feature_type=FeatureType.CATEGORICAL, values=users["gender"].values)
u2, u3, u4 = (
    Feature(feature_name=column, feature_type=FeatureType.CATEGORICAL, values=users[column].astype(str).values)
    for column in ("age", "occupation", "zip")
)
user_data = FeatureSet([u1, u2, u3, u4])

# Item-side features: combined free text, genre lists and three numeric columns.
numeric_columns = ["title_length", "overview_length", "runtime"]
i1 = Feature(feature_name="text", feature_type=FeatureType.STR, values=movies["text"].values)
i2 = Feature(feature_name="genres", feature_type=FeatureType.MULTI_CATEGORICAL, values=movies["genres"].values)
i3 = Feature(feature_name="numeric", feature_type=FeatureType.NUMERIC, values=movies[numeric_columns].values)
item_data = FeatureSet([i1, i2, i3])

kwargs = {'user_data': user_data, 'item_data': item_data}

# Assign the return value to `_` so it is not echoed as cell output.
_ = recsys.fit(
    users["user_id"].values,
    movies["movie_id"].values,
    user_item_affinities,
    **kwargs,
)




100%|██████████| 3883/3883 [00:21<00:00, 184.67it/s]


In [6]:
# Ask the recommender for items for user '1' and split the returned pairs
# into parallel tuples.
recommendations = recsys.find_items_for_user(user='1', positive=[], negative=[])
res, dist = zip(*recommendations)
# Keep only the first 100 results.
res = res[:100]

# Compare by title: titles of the kept results vs. titles user '1' has rated.
is_recommended = movies["movie_id"].isin(res)
preds = set(movies.loc[is_recommended, "title"])
ratings_user_1 = ratings[ratings["user_id"] == '1']
actuals = set(movies.merge(ratings_user_1, on='movie_id')["title"])

# Number of titles present in both sets.
len(preds & actuals)


1

In [None]:
# TODO: normalize affinities/ratings per user.
# TODO: look at Surprise's SVD++ and NMF.
# TODO: try without the movie title.
# TODO: work on the ranking part.


In [3]:
from hwer.utils import normalize_affinity_scores_by_user_item

In [11]:
# Show the first raw affinity triple in tabular form.
pd.DataFrame(user_item_affinities, columns=["user", "item", "rating"]).head(1)
# Unpack the five values returned by the normalizer; `uid` ends up bound to
# the last of them.
(mean, bu, bi, spread, uid) = normalize_affinity_scores_by_user_item(
    user_item_affinities
)


Unnamed: 0,user,item,rating
0,1,1193,5.0


In [12]:
# NOTE(review): this cell rebinds `uid` to a DataFrame further down, so it
# expects `uid` to still be the indexable sequence produced by the previous
# cell; re-run that cell before re-running this one.
first_user, first_item = uid[0][0], uid[0][1]
first_user_bias, first_item_bias = bu[first_user], bi[first_item]
mean, first_user_bias, first_item_bias, uid[0]

# mean + both bias terms for the first triple, then the same value plus spread.
mean + first_user_bias + first_item_bias
mean + first_user_bias + first_item_bias + spread

spread
uid = pd.DataFrame(uid, columns=["user", "item", "rating"])
rating_max, rating_min = uid["rating"].max(), uid["rating"].min()
rating_max, rating_min
# Recompute spread as the largest absolute value in the "rating" column.
spread = max(rating_max, np.abs(rating_min))
spread

(3.581564453029317,
 0.42412658236574274,
 0.749009453633146,
 ('1', '1193', 0.24529951097179392))

4.754700489028206

9.644394911604012

4.889694422575806

(4.889694422575806, -4.8504526245318145)

4.889694422575806

In [173]:
# Predict every affinity as mean + user term + item term and compare it with
# the observed value.
preds = np.array([mean + bu[user] + bi[item] for user, item, _rating in user_item_affinities])
actuals = np.array([rating for _user, _item, rating in user_item_affinities])
residuals = actuals - preds
# Root-mean-square error of these predictions.
np.sqrt(np.mean(np.square(residuals)))

# Position and size of the largest absolute error.
abs_errors = np.abs(residuals)
np.argmax(abs_errors), np.max(abs_errors)


0.8991955379743222

(763427, 4.889694422575806)