# Content-based recommender system with BERT embeddings

## **Set up**

In [1]:
import numpy as np
import pandas as pd

### BERT

In [2]:
from transformers import BertTokenizer, BertModel
# Smoke-test the pretrained BERT checkpoint on a single example sentence.
# Tokenizer and model must come from the same checkpoint, so name it once.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

text = "Replace me by any text you'd like."
# return_tensors='pt' asks the tokenizer for PyTorch tensors, which the model
# accepts as keyword arguments.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [3]:
# Display the object returned by the model call in the previous cell.
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9377, -0.5043, -0.9799,  0.9030,  0.9329, -0.2438,  0.8926,  0.2288,
         -0.9531, -1.0000, -0.8862,  0.9906,  0.9855,  0.7155,  0.9455, -0.8645,
         -0.6035, -0.6666,  0.3020, -0.1587,  0.7455,  1.0000, -0.4022,  0.4261,
          0.6151,  0.9996, -0.8773,  0.9594,  0.9585,  0.6950, -0.6718,  0.3325,
         -0.9954, -0.2268, -0.9658, -0.9951,  0.6127, -0.7670,  0.0873,  0.0824,
         -0.9518,  0.4713,  1.00

### Sentence embedding 

In [4]:
# Load the anime catalogue, keep only the columns used for recommendations,
# and give them shorter names.
anime_csv_path = '/home/kgamegah/Documents/academic/UPC/RecommenderSystems/RecSys/amime/anime.csv'
data = (
    pd.read_csv(anime_csv_path)
    [['MAL_ID', 'Name', 'Genres', 'Type', 'Episodes']]
    .rename(columns={'MAL_ID': 'ID', 'Name': 'Title', 'Genres': 'Genre'})
)
# Number of titles loaded.
len(data)

17562

In [5]:
# Preview the first rows of the selected, renamed columns.
data.head()

Unnamed: 0,ID,Title,Genre,Type,Episodes
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,52


In [6]:
# Report per-column missing-value counts, drop incomplete rows, then report
# the counts again together with the number of rows that remain.
print(data.isna().sum())
data = data.dropna()
print(data.isna().sum())
print(data.shape[0])

ID          0
Title       0
Genre       0
Type        0
Episodes    0
dtype: int64
ID          0
Title       0
Genre       0
Type        0
Episodes    0
dtype: int64
17562


### Data cleaning

In [7]:
import re
import string

# Substitution patterns applied, in this order, by clean(). They are written
# as raw strings so regex escapes (\[ \S \w \d ...) are not parsed as invalid
# Python string escapes, and compiled once instead of on every call.
_CLEAN_PATTERNS = (
    re.compile(r'\[.*?\]'),                              # [bracketed] spans
    re.compile(r'https?://\S+|www\.\S+'),                # URLs
    re.compile(r'<.*?>+'),                               # HTML-like tags
    re.compile('[%s]' % re.escape(string.punctuation)),  # ASCII punctuation
    re.compile(r'\n'),                                   # newlines
    re.compile(r'\w*\d\w*'),                             # tokens containing a digit
)


def clean(text):
    """Normalise a piece of text for embedding.

    The value is converted to ``str`` and lower-cased, then every match of
    each pattern in ``_CLEAN_PATTERNS`` is deleted (replaced by the empty
    string), one pattern after another.

    Matches are removed rather than replaced by a space, so hyphenated words
    are joined ("Sci-Fi" -> "scifi") and the whitespace that surrounded a
    removed token is left in place.

    Parameters
    ----------
    text : any
        Value to clean; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()
    # Order matters: e.g. newlines are removed before digit-bearing tokens,
    # so text joined across a removed newline is treated as a single token.
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub('', text)
    return text

# Normalise the title and genre text, then join both into the single string
# per row that is embedded later on.
cleaned_title = data["Title"].map(clean)
cleaned_genre = data["Genre"].map(clean)
data["cleaned_Title"] = cleaned_title
data["cleaned_Genre"] = cleaned_genre
data["cleaned_TitleGenre"] = cleaned_title + " " + cleaned_genre

In [8]:
# Preview the frame again, now including the cleaned text columns.
data.head()

Unnamed: 0,ID,Title,Genre,Type,Episodes,cleaned_Title,cleaned_Genre,cleaned_TitleGenre
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,cowboy bebop,action adventure comedy drama scifi space,cowboy bebop action adventure comedy drama sci...
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,cowboy bebop tengoku no tobira,action drama mystery scifi space,cowboy bebop tengoku no tobira action drama my...
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,trigun,action scifi adventure comedy drama shounen,trigun action scifi adventure comedy drama sho...
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,witch hunter robin,action mystery police supernatural drama magic,witch hunter robin action mystery police super...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,52,bouken ou beet,adventure fantasy shounen supernatural,bouken ou beet adventure fantasy shounen super...


In [9]:
#indices = pd.Series(data.index,index=data['cleaned_TitleGenre']).drop_duplicates()

In [10]:
#indices

In [11]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained Sentence-BERT encoder.
SBERT_CHECKPOINT = 'all-MiniLM-L6-v2'
sbert_model = SentenceTransformer(SBERT_CHECKPOINT)

In [12]:
# Create one sentence embedding per anime from its cleaned "title + genre" text.
# A plain list is passed instead of the pandas Series: encode() is documented
# for str / list-of-str input and looks sentences up by integer position, which
# on a Series would be a label lookup and would break (or misalign rows) as
# soon as the index is no longer 0..n-1, e.g. after dropna() removes a row.
sentence_embeddings = sbert_model.encode(data["cleaned_TitleGenre"].tolist())

In [13]:
# Peek at the first five embedding vectors.
sentence_embeddings[:5]

array([[-0.06439349, -0.05691916,  0.00203442, ..., -0.03982692,
         0.00280895, -0.05453705],
       [-0.10773472, -0.01723839, -0.01103228, ..., -0.04913368,
        -0.03100055, -0.04195781],
       [-0.08343405, -0.00814126, -0.05074589, ..., -0.04679877,
        -0.02803142,  0.06145451],
       [-0.01730554, -0.02679987, -0.06414767, ..., -0.05192742,
        -0.00642172,  0.02770149],
       [-0.02698451,  0.02438733,  0.02073102, ..., -0.02894992,
        -0.07004624,  0.00393724]], dtype=float32)

In [14]:
def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product

In [15]:
# Free-text search query and its embedding. Encoding a single string yields
# the 1-D embedding vector directly.
query = 'naruto shipuden'
query_vec = sbert_model.encode(query)

In [16]:
# Score every anime embedding against the query embedding.
ncl_sim = [cosine(query_vec, embedding) for embedding in sentence_embeddings]

In [17]:
# Store the per-row similarity scores as a new column of the frame.
data['similarity'] = ncl_sim

In [18]:
# Rank all titles by similarity to the query and show the 20 best matches.
# Styled alternative, kept for reference:
#data.sort_values(by=['similarity'], ascending=False).head(20).style.set_properties(subset=['cleaned_TitleGenre'], **{'width-min': '50px'})
ranked_by_similarity = data.sort_values(by='similarity', ascending=False)
ranked_by_similarity.iloc[:20]

Unnamed: 0,ID,Title,Genre,Type,Episodes,cleaned_Title,cleaned_Genre,cleaned_TitleGenre,similarity
2267,2472,Naruto: Shippuuden Movie 1,"Action, Adventure, Comedy, Fantasy, Shounen",Movie,1,naruto shippuuden movie,action adventure comedy fantasy shounen,naruto shippuuden movie action adventure come...,0.753687
3516,4134,"Naruto: Shippuuden - Shippuu! ""Konoha Gakuen"" Den",Comedy,Special,1,naruto shippuuden shippuu konoha gakuen den,comedy,naruto shippuuden shippuu konoha gakuen den c...,0.722772
8088,19511,Naruto: Shippuuden - Sunny Side Battle,"Action, Comedy, Super Power, Martial Arts, Sho...",Special,1,naruto shippuuden sunny side battle,action comedy super power martial arts shounen,naruto shippuuden sunny side battle action co...,0.704871
1574,1735,Naruto: Shippuuden,"Action, Adventure, Comedy, Super Power, Martia...",TV,500,naruto shippuuden,action adventure comedy super power martial ar...,naruto shippuuden action adventure comedy supe...,0.701417
4553,6325,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",Movie,1,naruto shippuuden movie hi no ishi wo tsugu ...,action comedy martial arts shounen super power,naruto shippuuden movie hi no ishi wo tsugu ...,0.697634
7021,13667,Naruto: Shippuuden Movie 6 - Road to Ninja,"Action, Adventure, Super Power, Martial Arts, ...",Movie,1,naruto shippuuden movie road to ninja,action adventure super power martial arts shounen,naruto shippuuden movie road to ninja action...,0.671199
6285,10589,Naruto: Shippuuden Movie 5 - Blood Prison,"Action, Adventure, Martial Arts, Super Power, ...",Movie,1,naruto shippuuden movie blood prison,action adventure martial arts super power myst...,naruto shippuuden movie blood prison action ...,0.665568
3639,4437,Naruto: Shippuuden Movie 2 - Kizuna,"Action, Martial Arts, Shounen, Supernatural",Movie,1,naruto shippuuden movie kizuna,action martial arts shounen supernatural,naruto shippuuden movie kizuna action martia...,0.662636
4985,7367,Naruto: The Cross Roads,"Action, Adventure, Comedy, Shounen",Special,1,naruto the cross roads,action adventure comedy shounen,naruto the cross roads action adventure comedy...,0.652488
5304,8246,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",Movie,1,naruto shippuuden movie the lost tower,action comedy martial arts shounen super power,naruto shippuuden movie the lost tower actio...,0.639731
