In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

In [6]:
from surprise import Dataset
from surprise import accuracy
import pandas as pd
from pathlib import Path
from surprise.model_selection import train_test_split
# Fixed seed so the train/test partition is identical on every "Restart & Run All".
RANDOM_SEED = 42

# Load Surprise's built-in 'ml-1m' ratings dataset.
data = Dataset.load_builtin('ml-1m')

# Hold out 25% of the ratings; random_state pins the otherwise random shuffle.
# NOTE(review): trainset/testset are not referenced again in the visible cells --
# the recommender below is fitted on all of data.raw_ratings. Confirm that is intended.
trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_SEED)

print(data.ratings_file)

# Collect every .dat file found from two directory levels above ratings.dat.
path = Path(data.ratings_file)
ml_1m_dir = path.resolve().parents[1]
files = list(ml_1m_dir.glob('**/*.dat'))


def _locate_dat(file_name):
    """Return the first path in ``files`` whose file name is exactly ``file_name``.

    Matching on the exact name (rather than a substring of the whole path)
    avoids picking up look-alike files such as ``old_users.dat``.

    Raises:
        FileNotFoundError: if no file with that name was found under ``ml_1m_dir``.
    """
    matches = [f for f in files if f.name == file_name]
    if not matches:
        raise FileNotFoundError("{} not found under {}".format(file_name, ml_1m_dir))
    return matches[0]


# Paths get their own names so `users`/`movies`/`ratings` only ever mean DataFrames.
users_path = _locate_dat("users.dat")
movies_path = _locate_dat("movies.dat")
ratings_path = _locate_dat("ratings.dat")

# The files are "::"-delimited with no header row; a multi-character separator
# requires the python engine. MovieLens-1M's movies.dat stores non-ASCII titles
# as Latin-1 (ISO-8859-1), so the encoding is passed explicitly: with the UTF-8
# default, recent pandas versions raise UnicodeDecodeError on that file.
read_options = dict(sep="::", header=None, engine='python', encoding='latin-1')

users = pd.read_csv(str(users_path), names=["user_id", "gender", "age", "occupation", "zip"], **read_options)
movies = pd.read_csv(str(movies_path), names=["movie_id", "title", "genres"], **read_options)
ratings = pd.read_csv(str(ratings_path), names=["user_id", "movie_id", "rating", "timestamp"], **read_options)

# Genres arrive as one "a|b|c" string per movie; store them as lists of lower-case tokens.
movies['genres'] = movies['genres'].apply(lambda x: x.lower().split('|'))

# Ids are kept as strings: later cells look users and movies up by string id (e.g. '1').
users['user_id'] = users['user_id'].astype(str)
movies['movie_id'] = movies['movie_id'].astype(str)
ratings['movie_id'] = ratings['movie_id'].astype(str)
ratings['user_id'] = ratings['user_id'].astype(str)
# TODO: CountVectorizer and make 1 column for each genre

print(users.shape, movies.shape, ratings.shape)

# Keep the first three fields of every raw rating as a tuple
# (presumably user id, item id, rating -- confirm against Surprise's raw_ratings layout).
user_item_affinities = [(rating[0], rating[1], rating[2]) for rating in data.raw_ratings]

# Import the hwer package and reload it, so the module object is refreshed in this kernel.
from importlib import reload
import hwer
reload(hwer)

from hwer import MultiCategoricalEmbedding, FlairGlove100AndBytePairEmbedding, CategoricalEmbedding
from hwer import Feature, FeatureSet, ContentRecommendation, FeatureType

# One embedding object per feature name. Entries are created in the same order
# as before: user-side columns first, then the item-side ones.
embedding_mapper = {
    'gender': CategoricalEmbedding(n_dims=2),
    'age': CategoricalEmbedding(n_dims=2),
    'occupation': CategoricalEmbedding(n_dims=2),
    'zip': CategoricalEmbedding(n_dims=8),
    'title': FlairGlove100AndBytePairEmbedding(),
    'genres': MultiCategoricalEmbedding(n_dims=16),
}

# Content-based recommender configured with the mapper above, no knn_params
# and n_output_dims=24.
recsys = ContentRecommendation(
    embedding_mapper=embedding_mapper,
    knn_params=None,
    n_output_dims=24,
)

def _categorical(name, values):
    """Build a CATEGORICAL Feature called ``name`` from ``values``."""
    return Feature(feature_name=name, feature_type=FeatureType.CATEGORICAL, values=values)


# User-side features: gender is passed as loaded, the other columns are cast to str first.
u1 = _categorical("gender", users.gender.values)
u2 = _categorical("age", users.age.astype(str).values)
u3 = _categorical("occupation", users.occupation.astype(str).values)
u4 = _categorical("zip", users.zip.astype(str).values)
user_data = FeatureSet([u1, u2, u3, u4])

# Item-side features. i1 (title) is constructed, but only i2 (genres) is put in the FeatureSet.
i1 = Feature(feature_name="title", feature_type=FeatureType.STR, values=movies.title.values)
i2 = Feature(feature_name="genres", feature_type=FeatureType.MULTI_CATEGORICAL, values=movies.genres.values)
item_data = FeatureSet([i2])

# Keyword arguments forwarded to fit(), in the same key order as before.
kwargs = {
    'user_item_affinities': user_item_affinities,
    'user_data': user_data,
    'item_data': item_data,
}

# The return value is bound to `_` so the cell does not echo it.
_ = recsys.fit(
    user_ids=users.user_id.values,
    item_ids=movies.movie_id.values,
    **kwargs
)

# Query recommendations for user '1' with no extra positive/negative examples.
# The results are transposed into two parallel tuples (presumably item ids and
# distances -- confirm against hwer), and only the first 20 entries of `res` are kept.
recommendations = recsys.find_items_for_user(user='1', positive=[], negative=[])
res, dist = zip(*recommendations)
res = res[:20]




/Users/ahemf/.surprise_data/ml-1m/ml-1m/ratings.dat
(6040, 5) (3883, 3) (1000209, 4)


<module 'hwer' from '/Users/ahemf/mygit/Hybrid-Weighted-Embedding-Recommender/hwer/__init__.py'>

In [7]:
# Catalogue rows for the recommended ids; rows come back in `movies` order, not in rank order.
movies.loc[movies['movie_id'].isin(res)]

Unnamed: 0,movie_id,title,genres
37,38,It Takes Two (1995),[comedy]
53,54,"Big Green, The (1995)","[children's, comedy]"
118,120,Race the Sun (1996),[drama]
144,146,"Amazing Panda Adventure, The (1995)","[adventure, children's]"
307,310,Rent-a-Kid (1995),[comedy]
629,634,Theodore Rex (1995),[comedy]
744,754,Gold Diggers: The Secret of Bear Mountain (1995),"[adventure, children's]"
823,834,Phat Beach (1996),[comedy]
1091,1107,Loser (1991),[comedy]
1123,1139,Everything Relative (1996),[drama]


In [4]:
# Ratings given by user '1', joined to the matching catalogue rows on movie_id.
user_1_ratings = ratings.loc[ratings['user_id'] == '1']
movies.merge(user_1_ratings, on='movie_id')

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),"[animation, children's, comedy]",1,5,978824268
1,48,Pocahontas (1995),"[animation, children's, musical, romance]",1,5,978824351
2,150,Apollo 13 (1995),[drama],1,5,978301777
3,260,Star Wars: Episode IV - A New Hope (1977),"[action, adventure, fantasy, sci-fi]",1,4,978300760
4,527,Schindler's List (1993),"[drama, war]",1,5,978824195
5,531,"Secret Garden, The (1993)","[children's, drama]",1,4,978302149
6,588,Aladdin (1992),"[animation, children's, comedy, musical]",1,4,978824268
7,594,Snow White and the Seven Dwarfs (1937),"[animation, children's, musical]",1,4,978302268
8,595,Beauty and the Beast (1991),"[animation, children's, musical]",1,5,978824268
9,608,Fargo (1996),"[crime, drama, thriller]",1,4,978301398


In [None]:
# TODO:
# - Normalize affinities/ratings per user.
# - Look at Surprise's SVD++ and NMF.
# - Try without the movie title.
# - Ranking part.
