# Content-based recommender system with BERT embeddings

## **Set up**

In [1]:
import numpy as np
import pandas as pd

In [32]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=544fbd4f5c346f335712469d86841c1e7b6348447564ad255e6b6e64a6511598
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

### BERT

In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Smoke test: push one sentence through pretrained BERT and keep the raw output.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
# return_tensors='pt' asks the tokenizer for PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: disable gradient tracking so no autograd graph is built
# and the returned tensors do not carry a grad_fn.
with torch.no_grad():
    output = model(**encoded_input)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Inspect the raw result of the BERT forward pass.
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9377, -0.5043, -0.9799,  0.9030,  0.9329, -0.2438,  0.8926,  0.2288,
         -0.9531, -1.0000, -0.8862,  0.9906,  0.9855,  0.7155,  0.9455, -0.8645,
         -0.6035, -0.6666,  0.3020, -0.1587,  0.7455,  1.0000, -0.4022,  0.4261,
          0.6151,  0.9996, -0.8773,  0.9594,  0.9585,  0.6950, -0.6718,  0.3325,
         -0.9954, -0.2268, -0.9658, -0.9951,  0.6127, -0.7670,  0.0873,  0.0824,
         -0.9518,  0.4713,  1.00

### Sentence embedding

In [None]:
# Read the synopsis CSV, keep only the five columns used in this notebook and
# give them shorter names. Note: 'sypnopsis' (sic) has to match the column
# name in the file exactly.
data = (
    pd.read_csv('anime_with_synopsis.csv')
    [['MAL_ID', 'Name', 'Score', 'Genres', 'sypnopsis']]
    .rename(columns={
        'MAL_ID': 'ID',
        'Name': 'Title',
        'Score': 'Rating',
        'Genres': 'Genre',
        'sypnopsis': 'Description',
    })
)
# Row count of the loaded table.
len(data)

17562

In [9]:
# Preview the first rows of the trimmed, renamed table.
data.head()

Unnamed: 0,ID,Title,Rating,Genre,Description
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [10]:
# Report missing values per column, drop every row that contains any,
# then report again (all zeros expected) together with the remaining row count.
print(data.isna().sum())
data = data.dropna()
print(data.isna().sum())
print(data.shape[0])

ID             0
Title          0
Rating         0
Genre          0
Description    8
dtype: int64
ID             0
Title          0
Rating         0
Genre          0
Description    0
dtype: int64
16206


### Data cleaning

In [37]:
import re
import string

def clean(text):
    """Normalize free text (a synopsis or a genre list) before embedding.

    Steps, in order: lowercase; blank out ``[...]`` spans, URLs and
    HTML-style ``<...>`` tags; replace punctuation, newlines, tokens that
    contain a digit and em dashes with a space; finally remove the
    "no synopsis" placeholder sentence.

    Removed pieces are replaced by a space rather than deleted so that
    neighbouring words never fuse together; runs of spaces are left as-is.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = str(text).lower()
    # Raw strings keep regex escapes such as \[ and \S from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # The previous pattern '<.*?>+_-$—' demanded extra characters after the
    # end-of-string anchor and therefore could never match, so tags were
    # never stripped. '_', '-' and '$' are covered by the punctuation step
    # below and the em dash by its own replacement.
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('\n', ' ')
    # Drop any token containing a digit (years, episode numbers, ...).
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = text.replace('—', ' ')
    # Placeholder synopsis text as it looks after the steps above (its full
    # stop has become a space, hence the double space after "title").
    text = text.replace('no synopsis information has been added to this title  help improve our database by adding a synopsis here', '')
    return text

In [38]:
# Clean the synopsis and the genre list separately ...
data["cleaned_Description"] = data["Description"].map(clean)
data["cleaned_Genre"] = data["Genre"].map(clean)
# ... then join them with a single space into the text that gets embedded.
data["cleaned_Descrip_Genre"] = data["cleaned_Description"].str.cat(
    data["cleaned_Genre"], sep=" "
)

In [39]:
# Preview the table again, now including the cleaned text columns.
data.head()

Unnamed: 0,ID,Title,Rating,Genre,Description,cleaned_Description,cleaned_Genre,cleaned_Descrip_Genre
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",in the year humanity has colonized several ...,action adventure comedy drama sci fi space,in the year humanity has colonized several ...
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",other day another bounty such is the life of ...,action drama mystery sci fi space,other day another bounty such is the life of ...
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",vash the stampede is the man with a ...,action sci fi adventure comedy drama shounen,vash the stampede is the man with a ...
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,ches are individuals with special powers like ...,action mystery police supernatural drama ...,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,it is the dark century and the people are suff...,adventure fantasy shounen supernatural,it is the dark century and the people are suff...


In [40]:
# Lookup Series: key = cleaned description+genre text, value = row label in `data`.
# NOTE(review): drop_duplicates() compares the Series *values* (the row labels),
# not the text keys, so repeated texts are still kept — confirm this is intended.
indices = (
    pd.Series(data.index, index=data['cleaned_Descrip_Genre'])
    .drop_duplicates()
)

In [41]:
# Inspect the lookup Series (keys are the cleaned description+genre strings).
indices

cleaned_Descrip_Genre
in the year    humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet earth behind  the inter solar system police attempts to keep peace in the galaxy  aided in part by outlaw bounty hunters  referred to as  cowboys   the ragtag team aboard the spaceship bebop are two such individuals  mellow and carefree spike spiegel is balanced by his boisterous  pragmatic partner jet black as the pair makes a living chasing bounties and collecting rewards  thrown off course by the addition of new members that they meet in their travels ein  a genetically engineered  highly intelligent welsh corgi  femme fatale faye valentine  an enigmatic trickster with memory loss  and the strange computer whiz kid edward wong the crew embarks on thrilling adventures that unravel each member s dark and mysterious past little by little  well balanced with high density action and light hearted comedy  cowboy bebop is a space w

In [33]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained Sentence-BERT checkpoint 'all-MiniLM-L6-v2'. The same
# model instance encodes both the catalogue texts and the search query.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [54]:
# Embed every row's cleaned description+genre text with SBERT (one vector per row).
sentence_embeddings = sbert_model.encode(data["cleaned_Descrip_Genre"].tolist())


In [55]:
# Peek at the first five embedding vectors.
sentence_embeddings[:5]

array([[-0.07416311, -0.04690303, -0.0353293 , ..., -0.02734719,
        -0.01693521, -0.04818274],
       [-0.05698591,  0.02150956,  0.00692327, ..., -0.14802378,
        -0.01423637,  0.00958055],
       [-0.01655654,  0.01858597, -0.08226501, ..., -0.0807069 ,
         0.04303873, -0.01159746],
       [-0.03628742, -0.01973657, -0.07931147, ..., -0.04993864,
         0.05229549, -0.02154196],
       [-0.0328002 ,  0.06347328, -0.10572511, ..., -0.09573739,
         0.03905011,  0.0189361 ]], dtype=float32)

In [52]:
#Define the Cosine Similarity function
def cosine(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)``.

    Cosine similarity is undefined when either vector has zero norm. In that
    case 0.0 is returned instead of dividing by zero, which previously
    produced NaN together with a RuntimeWarning.

    Parameters
    ----------
    u, v : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The similarity, or 0.0 if either input has zero norm.
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom

In [53]:
# Free-text search query, embedded with the same SBERT model as the catalogue.
query = 'naruto shipuden'
# encode() is given a one-element list, so element 0 is the single query vector.
query_vec = sbert_model.encode([query])[0]

In [56]:
# Score every catalogue embedding against the query vector with cosine similarity.
ncl_sim = [cosine(query_vec, ncl) for ncl in sentence_embeddings]

In [57]:
# Attach each row's similarity to the query as a new column
# (one score per embedding, in the same row order as `data`).
data['similarity'] = ncl_sim

In [58]:
# Display the top 20 matches, most similar first.
data.sort_values(by=['similarity'], ascending=False).head(20)

Unnamed: 0,ID,Title,Rating,Genre,Description,cleaned_Description,cleaned_Genre,cleaned_Descrip_Genre,similarity
3103,4134,"Naruto: Shippuuden - Shippuu! ""Konoha Gakuen"" Den",7.15,Comedy,Naruto school special. Naruto is a new cool st...,naruto school special naruto is a new cool st...,comedy,naruto school special naruto is a new cool st...,0.558621
5510,10659,Naruto Soyokazeden Movie: Naruto to Mashin to ...,6.97,"Action, Comedy, Martial Arts, Shounen, Super P...",Naruto discovers a genie's bottle while he and...,naruto discovers a genie s bottle while he and...,action comedy martial arts shounen super p...,naruto discovers a genie s bottle while he and...,0.556872
10,20,Naruto,7.91,"Action, Adventure, Comedy, Super Power, Martia...","oments prior to Naruto Uzumaki's birth, a huge...",oments prior to naruto uzumaki s birth a huge...,action adventure comedy super power martia...,oments prior to naruto uzumaki s birth a huge...,0.554329
10244,32365,Boruto: Naruto the Movie - Naruto ga Hokage ni...,7.4,"Action, Comedy, Super Power, Martial Arts, Sho...",Bundled with the limited edition of Blu-ray/DV...,bundled with the limited edition of blu ray dv...,action comedy super power martial arts sho...,bundled with the limited edition of blu ray dv...,0.540551
4300,7367,Naruto: The Cross Roads,6.82,"Action, Adventure, Comedy, Shounen",This year is the 10th anniversary of Naruto's ...,this year is the anniversary of naruto s ser...,action adventure comedy shounen,this year is the anniversary of naruto s ser...,0.531001
403,442,Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shin...,7.1,"Adventure, Comedy, Historical, Supernatural, D...","Naruto Uzumaki and his squadmates, Sasuke Uchi...",naruto uzumaki and his squadmates sasuke uchi...,adventure comedy historical supernatural d...,naruto uzumaki and his squadmates sasuke uchi...,0.5285
7957,23511,Choubakuretsu Ijigen Menko Battle: Gigant Shoo...,Unknown,"Action, Game, Comedy, Kids","The anime follows Tsukasa, whose dream is to b...",the anime follows tsukasa whose dream is to b...,action game comedy kids,the anime follows tsukasa whose dream is to b...,0.51219
5518,10686,Naruto: Honoo no Chuunin Shiken! Naruto vs. Ko...,7.16,"Action, Adventure, Martial Arts, Shounen, Supe...",Naruto faces off against his old pupil Konoham...,naruto faces off against his old pupil konoham...,action adventure martial arts shounen supe...,naruto faces off against his old pupil konoham...,0.510022
1508,1735,Naruto: Shippuuden,8.16,"Action, Adventure, Comedy, Super Power, Martia...",It has been two and a half years since Naruto ...,it has been two and a half years since naruto ...,action adventure comedy super power martia...,it has been two and a half years since naruto ...,0.509485
15161,40971,Newsong,6.44,"Sci-Fi, Music",usic video for the track Newsong by Japanese r...,usic video for the track newsong by japanese r...,sci fi music,usic video for the track newsong by japanese r...,0.508827
