In [1]:
!pip install -U sentence-transformers
!pip install billboard.py

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 24.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 48.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████

# Music Titles Similarity

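This notebook embeds the song titles of the Billboard Hot 100 with a multilingual sentence-transformer model and looks for pairs of titles that are semantically close.

Model card: https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2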

In [7]:
import billboard
import bokeh
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics.pairwise

from sentence_transformers import SentenceTransformer

## Model

In [4]:
# Hugging Face id of the sentence-embedding checkpoint (see the model-card link above).
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [5]:
# Build the encoder named by model_name. The "Downloading" progress bars in
# this cell's output appear to be the model files being fetched on first use.
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Input data

In [27]:
# Pin the chart week so every re-run embeds the same titles; the outputs saved
# in this notebook come from the 2021-09-25 chart. Set CHART_DATE to None to
# fetch the most recent chart instead.
CHART_DATE = "2021-09-25"

chart = billboard.ChartData("hot-100", date=CHART_DATE)
print(chart)

hot-100 chart from 2021-09-25
-----------------------------
1. 'Stay' by The Kid LAROI & Justin Bieber
2. 'Way 2 Sexy' by Drake Featuring Future & Young Thug
3. 'Bad Habits' by Ed Sheeran
4. 'Knife Talk' by Drake Featuring 21 Savage & Project Pat
5. 'Fancy Like' by Walker Hayes
6. 'Industry Baby' by Lil Nas X & Jack Harlow
7. 'Good 4 U' by Olivia Rodrigo
8. 'Kiss Me More' by Doja Cat Featuring SZA
9. 'Levitating' by Dua Lipa
10. 'Girls Want Girls' by Drake Featuring Lil Baby
11. 'Fair Trade' by Drake Featuring Travis Scott
12. 'Save Your Tears' by The Weeknd & Ariana Grande
13. 'Montero (Call Me By Your Name)' by Lil Nas X
14. 'Butter' by BTS
15. 'Heat Waves' by Glass Animals
16. 'Shivers' by Ed Sheeran
17. 'Essence' by Wizkid Featuring Justin Bieber & Tems
18. 'You Right' by Doja Cat & The Weeknd
19. 'Champagne Poetry' by Drake
20. 'Deja Vu' by Olivia Rodrigo
21. 'Need To Know' by Doja Cat
22. 'No Friends In The Industry' by Drake
23. 'Take My Breath' by The Weeknd
24. 'If I Didn't Lo

In [18]:
# One string per chart entry: only the song title is kept, not the artist.
sentences = [song.title for song in chart.entries]
print(len(sentences))

100


## Convert text into vectors

In [19]:
# Turn the titles into embedding vectors. The printed result is a 2-D float
# array (presumably one row per title, in the order of `sentences`).
embeddings = model.encode(sentences)
print(embeddings)

[[-1.3362177e-01 -2.0986642e-01 -1.2747041e-02 ...  1.4741980e-01
   1.6187796e-02 -7.8603350e-02]
 [ 8.9053191e-02 -1.4569113e-02 -1.5063114e-02 ... -6.5363102e-02
   8.5449338e-02 -1.2682423e-01]
 [-2.1607634e-04  7.0611864e-02 -7.9566119e-03 ... -3.0556429e-02
   1.7883635e-01 -4.3392334e-02]
 ...
 [-4.9834721e-02 -2.2736780e-01 -6.6417065e-03 ... -9.4529562e-02
  -4.9998999e-02 -4.5048218e-02]
 [-1.6364185e-02  2.2585472e-01 -1.5700134e-02 ... -1.3459103e-01
  -7.3594995e-02 -3.1004872e-02]
 [-9.3239821e-02  7.5833336e-02 -1.0350703e-02 ...  7.4028417e-02
  -1.1406121e-02  1.5013987e-02]]


## Quick test

In [20]:
def cosine(x, y):
  """Return the cosine of the angle between vectors x and y.

  This is the inner product of x and y divided by the product of their
  norms: 1 for vectors pointing the same way, 0 for orthogonal vectors.
  """
  dot_product = np.inner(x, y)
  norm_product = np.linalg.norm(x) * np.linalg.norm(y)
  return dot_product / norm_product

In [21]:
# Sanity check on the first two titles. cosine() returns a similarity
# (1 = same direction), not a distance, so the value is labelled accordingly.
print(sentences[0])
print(sentences[1])

print("Similarity: ", cosine(embeddings[0], embeddings[1]))

Stay
Way 2 Sexy
Distance:  0.21330348


## Define handy functions

In [23]:
def _filter_threshold(x, threshold=0):
  return ((x > threshold) & (x < 1)).any()

In [24]:
def similarity_filter(embeddings_1, embeddings_2, labels_1, labels_2, threshold=0):
  """Keep only the items that have a close-but-not-identical counterpart.

  Computes the cosine-similarity matrix between the two embedding sets and
  keeps an item only if it has at least one similarity strictly between
  `threshold` and 1.

  Args:
    embeddings_1: 2-D array-like, one embedding per row.
    embeddings_2: 2-D array-like, one embedding per row. It may hold the same
      data as `embeddings_1` (self-comparison) or a different set of any length.
    labels_1: sequence where `labels_1[i]` names row i of `embeddings_1`.
    labels_2: sequence where `labels_2[j]` names row j of `embeddings_2`.
    threshold: exclusive lower bound a similarity must exceed to count.

  Returns:
    A tuple `(df, labels_1, labels_2)`. `labels_1` / `labels_2` are lists of
    the labels that survived the filter, in their original order. `df` is a
    long-form DataFrame with one row per (kept label_1, kept label_2) pair and
    the columns "embeddings_1" and "embeddings_2" (which hold the labels) and
    "sim" (their cosine similarity).
  """
  sim = sklearn.metrics.pairwise.cosine_similarity(embeddings_1, embeddings_2)

  # Comparing a set with itself is a special case: every item trivially
  # matches itself, and the matrix must stay square after filtering.
  self_comparison = (embeddings_1 is embeddings_2
                     or np.array_equal(embeddings_1, embeddings_2))
  if self_comparison:
    # Pin self-matches to exactly 1 so floating-point rounding cannot make
    # them pass the "< 1" test below.
    np.fill_diagonal(sim, 1)

  in_band = (sim > threshold) & (sim < 1)
  keep_rows = in_band.any(axis=1)
  # For two different sets the columns need their own mask; reusing the row
  # mask would be wrong (and fails outright when the sets differ in length).
  keep_cols = keep_rows if self_comparison else in_band.any(axis=0)

  labels_1 = [labels_1[i] for i in np.flatnonzero(keep_rows)]
  labels_2 = [labels_2[j] for j in np.flatnonzero(keep_cols)]
  kept_sim = sim[keep_rows][:, keep_cols]

  # Row-major flattening pairs each kept label_1 with every kept label_2.
  df = pd.DataFrame({
      "embeddings_1": [label for label in labels_1 for _ in labels_2],
      "embeddings_2": labels_2 * len(labels_1),
      "sim": kept_sim.ravel().astype(np.float64),
  })

  return df, labels_1, labels_2

In [25]:
# From TF Hub

def visualize_similarity(embeddings_1,
                         embeddings_2,
                         labels_1,
                         labels_2,
                         plot_title,
                         threshold=0,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='8pt', yaxis_font_size='8pt'):
  """Show a Bokeh heat map of the similarities kept by similarity_filter.

  Args:
    embeddings_1: embeddings for the x axis, one per entry of `labels_1`.
    embeddings_2: embeddings for the y axis, one per entry of `labels_2`.
    labels_1: labels for `embeddings_1`; must have the same length.
    labels_2: labels for `embeddings_2`; must have the same length.
    plot_title: title drawn above the figure.
    threshold: passed to similarity_filter to decide which items are plotted.
    plot_width: figure width, forwarded to Bokeh.
    plot_height: figure height, forwarded to Bokeh.
    xaxis_font_size: font size of the x-axis tick labels.
    yaxis_font_size: font size of the y-axis tick labels.

  Returns:
    None. The figure is rendered inline in the notebook.

  Raises:
    AssertionError: if a label list and its embeddings differ in length.
  """
  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  df, labels_1, labels_2 = similarity_filter(embeddings_1, embeddings_2, labels_1, labels_2, threshold)

  # Reversed YlOrRd palette, scaled to the range of similarities actually shown.
  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  # figure() takes width/height: the plot_width/plot_height keywords were
  # removed in Bokeh 3, while width/height are accepted by Bokeh 2.x as well.
  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            width=plot_width, height=plot_height,
                            tools="save",toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  # One unit square per (label_1, label_2) row of df, coloured by similarity.
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)

## Compute similarity

In [29]:
# Compare the chart titles with themselves and keep only the titles that have
# at least one similarity above 0.6 (and below 1).
df, labels_1, labels_2 = similarity_filter(
    embeddings_1=embeddings,
    embeddings_2=embeddings,
    labels_1=sentences,
    labels_2=sentences,
    threshold=0.6,
)

In [30]:
# Long-form pair table: the two labels of each pair and their similarity.
df

Unnamed: 0,embeddings_1,embeddings_2,sim
0,Heat Waves,Heat Waves,1.000000
1,Heat Waves,Shivers,0.416397
2,Heat Waves,You Right,0.061505
3,Heat Waves,Need To Know,0.158134
4,Heat Waves,Love All,0.116888
...,...,...,...
319,Knowing You,Todo de Ti,0.659607
320,Knowing You,Thinking 'Bout You,0.568740
321,Knowing You,Come Through,0.398683
322,Knowing You,Ain't Shit,0.191331


## Visualize similarity

In [26]:
# Heat map of the titles that pass the 0.6 threshold, compared with themselves.
visualize_similarity(
    embeddings_1=embeddings,
    embeddings_2=embeddings,
    labels_1=sentences,
    labels_2=sentences,
    plot_title="Similarity",
    threshold=0.6,
)