In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import pandas as pd
import csv
import torch
from transformers import AutoTokenizer, AutoModel

## Import Data

In [2]:
movies_df = pd.read_csv("./ml-latest-small/movies.csv")
tags_df = pd.read_csv("./ml-latest-small/tags.csv")

In [3]:
user_dat = open("./ml-1m/users.dat")

In [4]:
columns_to_keep = ['userId', 'gender', "age", "occupation"]

user_dat = open("./ml-1m/users.dat")
datContent=[]
for idx,val in enumerate(user_dat.readlines()):
    if idx==0:
        datContent.append(['userId', 'gender', "age", "occupation"])
    datContent.append(val.strip().split("::"))

with open("./users.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(datContent)

columns_to_keep = ['userId', 'gender', "age", "occupation"]

user_df = pd.read_csv("./users.csv", usecols=columns_to_keep).set_index("userId")
user_df

Unnamed: 0_level_0,gender,age,occupation
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,1,10
2,M,56,16
3,M,25,15
4,M,45,7
5,M,25,20
...,...,...,...
6036,F,25,15
6037,F,45,1
6038,F,56,1
6039,F,45,0


In [5]:
ratings_dat = open("./ml-1m/ratings.dat")

datContent=[]
for idx,val in enumerate(ratings_dat.readlines()):
    if idx==0:
        datContent.append(['userId', 'movieId', "rating"])
    datContent.append(val.strip().split("::") )

with open("./ratings.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(datContent)


columns_to_keep = ['userId', 'movieId', "rating"]
ratings_df = pd.read_csv("./ratings.csv", usecols=columns_to_keep)
ratings_df


Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [6]:
user_id = 1

## Get users gender, age, and occupation

In [7]:
user = user_df[user_df.index==user_id][["gender","age","occupation"]]
u_gender = user["gender"].to_list()[0]
u_age = user["age"].to_list()[0]
u_occ = user["occupation"].to_list()[0]

## Get other users with the same gender and age range and users occupation

In [8]:
same_gen_age_range = (user_df["gender"]==u_gender)&(user_df["age"]<u_age+5)&(user_df["age"]>u_age-5)
same_occ = (user_df["occupation"]==u_occ)
friends = user_df[same_gen_age_range | same_occ]
# The following line determines how many friends we use to predict rating
friends = friends.iloc[:1]
friends

Unnamed: 0_level_0,gender,age,occupation
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,1,10


## Get the user and their friends previously rated movies

In [10]:
user_data = pd.merge(friends, ratings_df, how="inner", on='userId')

## Group the tags for each movieId to bucket_of_tags

In [11]:
bucket_of_tags = tags_df[["movieId","tag"]]
bucket_of_tags = bucket_of_tags.groupby('movieId',as_index=False).agg(lambda x: ' '.join(map(str,x)))

movie_tags = pd.merge(bucket_of_tags, movies_df, how="outer", on='movieId')

del movie_tags["genres"]
bucket_of_tags = movie_tags

bucket_of_tags

Unnamed: 0,movieId,tag,title
0,1,pixar pixar fun,Toy Story (1995)
1,2,fantasy magic board game Robin Williams game,Jumanji (1995)
2,3,moldy old,Grumpier Old Men (1995)
3,5,pregnancy remake,Father of the Bride Part II (1995)
4,7,remake,Sabrina (1995)
...,...,...,...
9737,193581,,Black Butler: Book of the Atlantic (2017)
9738,193583,,No Game No Life: Zero (2017)
9739,193585,,Flint (2017)
9740,193587,,Bungo Stray Dogs: Dead Apple (2018)


## Add the tags to the user (and their firends) previously rated movies

In [12]:
user_rated_tags = pd.merge(user_data, bucket_of_tags, how="outer", on='movieId')[["movieId","rating","tag","title"]]
user_rated_tags = user_rated_tags.dropna(how='any',subset=['rating',"tag"])

user_rated_tags.shape

(40, 4)

In [13]:
movie_id = 914
candidate = bucket_of_tags[bucket_of_tags["movieId"]==movie_id]
_data = [candidate["movieId"].to_string(index=False), 0, candidate["tag"
].to_string(index=False),candidate["title"].to_string(index=False)]
_val = pd.DataFrame([_data], columns= ["movieId","rating", "tag","title"
])


if movie_id in user_rated_tags["movieId"].to_list():
    user_rated_tags = user_rated_tags.reset_index()
    index  = user_rated_tags[user_rated_tags["movieId"]==movie_id].index.to_list()[0]
    indices = user_rated_tags.index.to_list()
    indices[index], indices[-1] = indices[-1], indices[index]
    user_rated_tags = user_rated_tags.reindex(indices)
else:
    user_rated_tags.loc[-1] =_data

user_rated_tags

Unnamed: 0,index,movieId,rating,tag,title
0,0,1193,5.0,emotional jack nicholson mental illness,One Flew Over the Cuckoo's Nest (1975)
39,52,1246,4.0,highschool High School,Dead Poets Society (1989)
2,3,3408,4.0,scandal true story,Erin Brockovich (2000)
3,4,2355,5.0,Pixar,"Bug's Life, A (1998)"
4,5,1197,3.0,Inigo Montoya six-fingered man,"Princess Bride, The (1987)"
5,7,2804,5.0,Christmas,"Christmas Story, A (1983)"
6,8,594,4.0,Disney,Snow White and the Seven Dwarfs (1937)
7,9,919,4.0,Dorothy Toto,"Wizard of Oz, The (1939)"
8,10,595,5.0,Disney,Beauty and the Beast (1991)
9,11,938,4.0,prostitution,Gigi (1958)


## Import bert model

In [14]:
model_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [15]:
tokens = {"input_ids": [], "attention_mask": []}

## Populate tokens with the embedded tags

In [27]:
for tag in user_rated_tags["tag"].to_list():
    new_tokens = tokenizer.encode_plus(tag, max_length=128, truncation=True, padding="max_length",return_tensors='pt')

    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

AttributeError: 'Tensor' object has no attribute 'append'

## Run the model

In [17]:
output = model(**tokens)

## Get latest embeddings 

In [18]:
embedding = output.last_hidden_state
attention = tokens["attention_mask"]
mask = attention.unsqueeze(-1).expand(embedding.shape).float()
mask_embeddings = embedding * mask

In [19]:
summed = torch.sum(mask_embeddings, 1)

In [20]:
counts = torch.clamp(mask.sum(1), min=1e-9)

In [21]:
mean_pooled = summed / counts
mean_pooled

tensor([[-0.0889, -0.0111,  0.6810,  ..., -0.9847, -0.4998, -0.8268],
        [-0.6878,  0.5000,  0.7699,  ..., -0.1668,  0.1444, -0.3699],
        [-0.1046, -0.3171,  1.0704,  ...,  0.1529,  0.0250,  0.0140],
        ...,
        [-0.2367,  0.7916,  0.8158,  ..., -0.2780,  0.7770,  0.3085],
        [-0.2544,  0.4656,  0.4933,  ..., -0.3380,  0.3432,  0.1238],
        [ 0.4517,  0.6184,  0.9609,  ..., -0.4463,  0.0583,  1.0208]],
       grad_fn=<DivBackward0>)

## Calculate cosine_similarity

In [22]:
mean_pooled = mean_pooled.detach().numpy()
sim = cosine_similarity([mean_pooled[-1]],mean_pooled)

In [23]:
sim

array([[0.44363657, 0.36449295, 0.39045584, 0.49386793, 0.45102543,
        0.47906128, 0.42879874, 0.57736313, 0.42879874, 0.24651174,
        0.4420464 , 0.65056694, 0.3344221 , 0.33420146, 0.63834953,
        0.39429706, 0.3369038 , 0.28744066, 0.42245883, 0.4744112 ,
        0.63834953, 0.2914983 , 0.42879874, 0.09540707, 0.42879874,
        0.23277798, 0.12350664, 0.35783312, 0.3340716 , 0.658965  ,
        0.47443762, 0.12841475, 0.31717908, 0.42879874, 0.47248077,
        0.522052  , 0.30776656, 0.28666508, 0.25673655, 1.0000001 ]],
      dtype=float32)

In [24]:
user_rated_tags["cosine_similarity"] = sim[0]
user_rated_tags

Unnamed: 0,index,movieId,rating,tag,title,cosine_similarity
0,0,1193,5.0,emotional jack nicholson mental illness,One Flew Over the Cuckoo's Nest (1975),0.443637
39,52,1246,4.0,highschool High School,Dead Poets Society (1989),0.364493
2,3,3408,4.0,scandal true story,Erin Brockovich (2000),0.390456
3,4,2355,5.0,Pixar,"Bug's Life, A (1998)",0.493868
4,5,1197,3.0,Inigo Montoya six-fingered man,"Princess Bride, The (1987)",0.451025
5,7,2804,5.0,Christmas,"Christmas Story, A (1983)",0.479061
6,8,594,4.0,Disney,Snow White and the Seven Dwarfs (1937),0.428799
7,9,919,4.0,Dorothy Toto,"Wizard of Oz, The (1939)",0.577363
8,10,595,5.0,Disney,Beauty and the Beast (1991),0.428799
9,11,938,4.0,prostitution,Gigi (1958),0.246512


## Get Weighted Mean

In [25]:
weighted_mean = sum(user_rated_tags["cosine_similarity"] * user_rated_tags["rating"])/(sum(user_rated_tags["cosine_similarity"]))

In [26]:
weighted_mean

4.189412417861589