In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import pandas as pd
import numpy as np
import torch
import csv
from torch.utils.data import Subset
from transformers import AutoTokenizer, AutoModel

## Import Data

In [2]:
# Read the movie catalogue and the tag table from the ./ml-latest-small folder.
movies_df, tags_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./ml-latest-small/movies.csv", "./ml-latest-small/tags.csv")
)

In [3]:
# Preview the tags table (columns shown below: userId, movieId, tag, timestamp).
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [4]:
# Convert the "::"-delimited ratings file into a regular CSV and load it.
#
# Every data line carries four fields, so the header names all four; a header
# with fewer names than data fields only parses correctly by accident.
# NOTE(review): the fourth field is assumed to be the rating timestamp (it is
# not loaded below) — confirm against the dataset README.
ratings_header = ["userId", "movieId", "rating", "timestamp"]

# Both files are closed by the context manager even if conversion fails.
# newline="" is required by the csv module so no blank rows are emitted on
# platforms that translate line endings. Rows are streamed rather than
# buffered in a list, and the header is written even for an empty input.
with open("./ml-1m/ratings.dat") as ratings_dat, \
        open("./ratings.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(ratings_header)
    writer.writerows(line.strip().split("::") for line in ratings_dat)

# Only the three columns used by the rest of the notebook are loaded.
columns_to_keep = ['userId', 'movieId', "rating"]
ratings_df = pd.read_csv("./ratings.csv", usecols=columns_to_keep)
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


## Get a user's previously rated movies

In [5]:
# Pick the user to build recommendations for and keep only that user's ratings.
user_id = 1
is_target_user = ratings_df["userId"].eq(user_id)
user_watched = ratings_df.loc[is_target_user]
user_watched

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
6,1,1287,5
7,1,2804,5
8,1,594,4
9,1,919,4


## Group the tags for each movieId into bucket_of_tags

In [6]:
# Collapse every movie's tags into one space-separated string, one row per movieId.
bucket_of_tags = (
    tags_df.groupby("movieId")["tag"]
    .agg(lambda tags: " ".join(map(str, tags)))
    .reset_index()
)

# Attach titles with a join on movieId rather than slicing Series.to_string()
# output: the string rendering truncates long titles with "...", assumes a
# fixed-width index prefix, and yields garbage text for ids missing from
# movies_df. A left join keeps the row order and leaves a missing title as NaN.
title_lookup = movies_df[["movieId", "title"]].drop_duplicates(subset="movieId")
bucket_of_tags = bucket_of_tags.merge(title_lookup, on="movieId", how="left")

# Titles are stored without surrounding whitespace; non-string values (NaN for
# an unknown movieId) are passed through untouched.
bucket_of_tags["title"] = bucket_of_tags["title"].map(
    lambda title: title.strip() if isinstance(title, str) else title
)
bucket_of_tags


Unnamed: 0,movieId,tag,title
0,1,pixar pixar fun,Toy Story (1995)
1,2,fantasy magic board game Robin Williams game,Jumanji (1995)
2,3,moldy old,Grumpier Old Men (1995)
3,5,pregnancy remake,Father of the Bride Part II (1995)
4,7,remake,Sabrina (1995)
...,...,...,...
1567,183611,Comedy funny Rachel McAdams,Game Night (2018)
1568,184471,adventure Alicia Vikander video game adaptation,Tomb Raider (2018)
1569,187593,Josh Brolin Ryan Reynolds sarcasm,Deadpool 2 (2018)
1570,187595,Emilia Clarke star wars,Solo: A Star Wars Story (2018)


## Get movies that the user has not yet watched

In [7]:
# Candidate movies: tagged movies whose movieId does not appear in the user's ratings.
already_rated = bucket_of_tags["movieId"].isin(user_watched["movieId"])
not_watched_movies = bucket_of_tags.loc[~already_rated].dropna()
# reindex() is called without arguments, so the original row labels of
# bucket_of_tags are kept (the output below starts at label 1, not 0).
not_watched_movies = not_watched_movies.reindex()
not_watched_movies

Unnamed: 0,movieId,tag,title
1,2,fantasy magic board game Robin Williams game,Jumanji (1995)
2,3,moldy old,Grumpier Old Men (1995)
3,5,pregnancy remake,Father of the Bride Part II (1995)
4,7,remake,Sabrina (1995)
5,11,politics president,"American President, The (1995)"
...,...,...,...
1567,183611,Comedy funny Rachel McAdams,Game Night (2018)
1568,184471,adventure Alicia Vikander video game adaptation,Tomb Raider (2018)
1569,187593,Josh Brolin Ryan Reynolds sarcasm,Deadpool 2 (2018)
1570,187595,Emilia Clarke star wars,Solo: A Star Wars Story (2018)


## Add the corresponding tags to the user's watched movies

In [8]:
# Join the user's ratings with the per-movie tag strings, then discard every
# row that lacks a value on either side of the join (unrated or untagged movies).
required_columns = ['rating', "tag", "userId", "title"]
watched_movies_tags = (
    user_watched
    .merge(bucket_of_tags, how="outer", on="movieId")
    .dropna(how='any', subset=required_columns)
)
watched_movies_tags.shape

(40, 5)

## Construct a "user character" to represent the types of movies the user likes

In [9]:
# Build one pseudo-movie for the user: every column of the watched/tagged
# movies is joined into a single space-separated string, then the first
# (and, for a single user, only) group is taken as a Series without the
# rating and userId entries.
user_custom_movie = (
    watched_movies_tags
    .groupby('userId', as_index=False)
    .agg(lambda values: ' '.join(str(value) for value in values))
    .loc[0]
    .drop(labels=["rating", "userId"])
)
user_custom_movie

movieId    1193 914 3408 2355 1197 2804 594 919 595 938 2...
tag        emotional jack nicholson mental illness George...
title      One Flew Over the Cuckoo's Nest (1975) My Fair...
Name: 0, dtype: object

## Shorten the not_watched_movies dataframe and append the "user character" to the bottom

In [10]:
# Row label under which the user profile is stored. Candidate movies are the
# rows whose original label is below it, which caps how many tag strings are
# sent through the model.
PROFILE_ROW_LABEL = 99

# Select the candidates first instead of assigning into .loc[PROFILE_ROW_LABEL]:
# an in-place assignment silently overwrites a real movie stored under that
# label and mutates the frame built by the previous step.
candidate_movies = not_watched_movies[not_watched_movies.index < PROFILE_ROW_LABEL]

# Turn the profile Series into a one-row frame carrying the profile label.
profile_row = user_custom_movie.to_frame().T
profile_row.index = [PROFILE_ROW_LABEL]

# The profile is appended last; the similarity step reads it as the final row.
# reset_index() keeps the original labels in an "index" column. The result is a
# new frame, so adding columns to it later does not write into a slice.
not_watched_movies = pd.concat([candidate_movies, profile_row]).reset_index()
not_watched_movies

Unnamed: 0,index,movieId,tag,title
0,1,2,fantasy magic board game Robin Williams game,Jumanji (1995)
1,2,3,moldy old,Grumpier Old Men (1995)
2,3,5,pregnancy remake,Father of the Bride Part II (1995)
3,4,7,remake,Sabrina (1995)
4,5,11,politics president,"American President, The (1995)"
...,...,...,...,...
92,95,357,wedding,Four Weddings and a Funeral (1994)
93,96,361,gambling,It Could Happen to You (1994)
94,97,363,Holocaust In Netflix queue,"Wonderful, Horrible Life of Leni Riefenstahl, ..."
95,98,364,Disney Disney animated feature Oscar (Best Mus...,"Lion King, The (1994)"


## Load the BERT sentence-transformer model

In [11]:
# Checkpoint identifier passed to both from_pretrained() calls so the tokenizer
# and the model come from the same checkpoint.
# NOTE(review): the "mean-tokens" suffix suggests the checkpoint is meant to be
# used with mean pooling, which the cells below implement — confirm on the model card.
model_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [12]:
# Accumulators for the per-row token tensors; the lists are filled by the
# tokenization step and later stacked into 2-D tensors.
tokens = {"input_ids": [], "attention_mask": []}

## Populate the tokens with the encoded tags

In [13]:
# Tokenize all tag strings in one batched call. Calling the tokenizer directly
# replaces the deprecated encode_plus() method and avoids one Python-level
# call per row. Every row is padded/truncated to exactly 128 tokens.
encoded = tokenizer(not_watched_movies["tag"].to_list(),
                    max_length=128,
                    truncation=True, padding="max_length",
                    return_tensors='pt')

# Iterating a 2-D tensor yields its rows, so extend() stores one 1-D tensor per
# tag string, the same layout the stacking step expects.
tokens['input_ids'].extend(encoded['input_ids'])
tokens['attention_mask'].extend(encoded['attention_mask'])
                      

In [14]:
# Turn each list of per-row tensors into a single tensor with one row per tag string.
for field in ('input_ids', 'attention_mask'):
    tokens[field] = torch.stack(tokens[field])

In [15]:
# Sanity check: one row per tag string, one column per token position.
tokens['input_ids'].shape

torch.Size([97, 128])

## Run model

In [16]:
# Inference only: run the forward pass with autograd disabled so no computation
# graph is recorded for the whole batch (the embeddings are never differentiated).
with torch.no_grad():
    outputs = model(**tokens)

## Get the final embedding of the tags

In [17]:
# Per-token hidden states from the last layer, and the padding mask.
embedding = outputs.last_hidden_state
attention = tokens["attention_mask"]
# Add a trailing axis to the mask and broadcast it to the embedding's shape so
# padded positions are zeroed out in the element-wise product.
mask = attention[..., None].expand_as(embedding).float()
mask_embeddings = torch.mul(embedding, mask)
mask_embeddings.shape

torch.Size([97, 128, 768])

In [18]:
# Add up the masked token vectors along the token axis (dim 1).
summed = mask_embeddings.sum(dim=1)
summed.shape

torch.Size([97, 768])

In [19]:
# Number of unmasked tokens per row; the lower bound keeps the division in the
# next step away from zero.
counts = mask.sum(dim=1).clamp(min=1e-9)
counts.shape

torch.Size([97, 768])

In [20]:
# Mean pooling: masked sum divided by the number of unmasked tokens.
mean_pooled = torch.div(summed, counts)
mean_pooled

tensor([[-0.4440, -0.1235,  0.6534,  ..., -0.9780, -0.1211,  0.2655],
        [ 0.7722,  0.5865,  1.4739,  ...,  0.0662,  0.5222, -0.1200],
        [ 0.2125, -0.5044,  0.8827,  ..., -0.4449,  0.3734, -0.2870],
        ...,
        [ 0.6375,  0.4288,  0.7802,  ...,  0.2792,  0.1875, -0.2632],
        [-0.6468,  0.7421,  0.7190,  ..., -0.3217,  0.3200,  0.2668],
        [-0.4130,  1.4897,  0.1053,  ..., -0.1647,  0.6516,  0.5693]],
       grad_fn=<DivBackward0>)

## Calculate cosine similarity

In [21]:
# Move the pooled embeddings to NumPy for scikit-learn.
mean_pooled = mean_pooled.detach().numpy()

# The last row is the user profile; keep it 2-D (one row) and compare it with
# every row, itself included.
user_profile_vector = mean_pooled[-1:]
similarity_matrix = cosine_similarity(user_profile_vector, mean_pooled)

In [22]:
# One row of scores: the user profile's similarity to every row of mean_pooled.
similarity_matrix

array([[ 0.490672  ,  0.14442557,  0.22645575,  0.06465515,  0.26340485,
         0.26340485,  0.21265271,  0.16320299,  0.17395446,  0.30621052,
         0.14463018,  0.09718065,  0.40699   ,  0.21526912,  0.12390544,
         0.623006  ,  0.5559078 ,  0.24456112,  0.10972166,  0.4556393 ,
         0.29680282,  0.09718065,  0.04067534, -0.03809804, -0.01745151,
         0.4337945 ,  0.4616012 ,  0.29603124, -0.10706255, -0.06109754,
         0.31414112,  0.11561764,  0.3419889 ,  0.36872932,  0.22089075,
         0.61101747,  0.21034004,  0.26096293,  0.00737906, -0.03809804,
         0.42186385,  0.21326049, -0.03820926,  0.03726874,  0.26123327,
         0.39036408,  0.01258864,  0.21080358,  0.07043365,  0.41620487,
         0.0439428 ,  0.48458248,  0.21037632,  0.08392439,  0.26123327,
         0.23064673,  0.4003504 ,  0.03404082,  0.06333817,  0.2183402 ,
         0.22903591,  0.16850649, -0.0800937 ,  0.13521227,  0.28879324,
         0.27471295,  0.13526624,  0.17637928,  0.2

In [23]:
# Store each row's similarity to the user profile next to its tags and title.
not_watched_movies = not_watched_movies.assign(cosine_similarity=similarity_matrix[0])
not_watched_movies

Unnamed: 0,index,movieId,tag,title,cosine_similarity
0,1,2,fantasy magic board game Robin Williams game,Jumanji (1995),0.490672
1,2,3,moldy old,Grumpier Old Men (1995),0.144426
2,3,5,pregnancy remake,Father of the Bride Part II (1995),0.226456
3,4,7,remake,Sabrina (1995),0.064655
4,5,11,politics president,"American President, The (1995)",0.263405
...,...,...,...,...,...
92,95,357,wedding,Four Weddings and a Funeral (1994),-0.017452
93,96,361,gambling,It Could Happen to You (1994),0.092346
94,97,363,Holocaust In Netflix queue,"Wonderful, Horrible Life of Leni Riefenstahl, ...",0.458528
95,98,364,Disney Disney animated feature Oscar (Best Mus...,"Lion King, The (1994)",0.552451


In [24]:
# Order the rows from most to least similar to the user profile.
not_watched_movies = not_watched_movies.sort_values("cosine_similarity", ascending=False)
not_watched_movies

Unnamed: 0,index,movieId,tag,title,cosine_similarity
96,99,1193 914 3408 2355 1197 2804 594 919 595 938 2...,emotional jack nicholson mental illness George...,One Flew Over the Cuckoo's Nest (1975) My Fair...,1.000000
74,77,296,good dialogue great soundtrack non-linear cult...,Pulp Fiction (1994),0.704745
73,76,293,assassin Jean Reno hit men Action assassin ass...,Léon: The Professional (a.k.a. The Professiona...,0.640627
15,16,32,time travel time travel Brad Pitt Bruce Willis...,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0.623006
35,36,110,beautiful scenery epic historical inspirationa...,Braveheart (1995),0.611017
...,...,...,...,...,...
42,44,160,Michael Crichton,Congo (1995),-0.038209
29,30,62,music,Mr. Holland's Opus (1995),-0.061098
82,85,329,Enterprise,Star Trek: Generations (1994),-0.065977
62,64,257,court,Just Cause (1995),-0.080094


## Get top k recommendations

In [25]:
# Number of titles to recommend.
k = 5

# The user profile row was stored under label 99 and carries that value in the
# "index" column. Excluding it by label, rather than assuming it is the first
# row after sorting, keeps the result correct when another row ties with it.
is_profile_row = not_watched_movies["index"] == 99

# head(k) returns fewer titles instead of raising when there are not enough
# candidates, and the title list is built once rather than once per pick.
recommendations = not_watched_movies.loc[~is_profile_row, "title"].head(k).to_list()
recommendations

['Pulp Fiction (1994)',
 'Léon: The Professional (a.k.a. The Professiona...',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Braveheart (1995)',
 'Babe (1995)']