Notebook template to calculate the similarities between descriptions/documents stored in a CSV file

In [None]:
# install sentence-transformers package
!pip install sentence-transformers
# !pip install numba==0.53.1 # if error, try installing this specific version

In [None]:
# Imports
# all these packages should be common to an anaconda python distro
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import numpy as np

Input/Output Parameters to change with your file paths

In [None]:
# --- Input / output settings ---
# Edit the four values below to match your data before running the rest of the notebook.

# Path of the CSV file to read.
input_file = "./SOME_FILE.csv"
# Path where the similarity scores are written.
output_file = "./NEW_SCORE_FILE.csv"
# Name of the column holding the description text.
desc_column = "Description"
# Name of the column holding the group name; used for the score table's index and column titles.
name_column = "Technologies"

In [None]:
# Load a pre-trained sentence-embedding model.
# The model is downloaded automatically from Hugging Face.
# Any other pre-trained model listed at https://huggingface.co/models can be tried
# by changing `model_name`.
model_name = "sentence-transformers/paraphrase-distilroberta-base-v2"
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Read the input file and convert it to a DataFrame.
df = pd.read_csv(input_file)

# Fail early, with a readable message, if either configured column is absent.
missing_columns = [col for col in (desc_column, name_column) if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in {input_file!r}; "
        f"available columns: {list(df.columns)}"
    )

# Blank cells are read as NaN (a float, not text) and cannot be embedded as a
# description, so report the offending rows here instead of failing later
# inside the encoder with a less helpful error.
blank_rows = df.index[df[desc_column].isna()].tolist()
if blank_rows:
    raise ValueError(
        f"Column {desc_column!r} has missing values at row index(es) {blank_rows}; "
        "fill in or remove these rows before computing similarities."
    )

# Convert the target columns to lists for easier processing.
desc_list = df[desc_column].tolist()
name_list = df[name_column].tolist()

In [None]:
# Number of descriptions loaded (one entry per row of the description column)
len(desc_list)

In [None]:
# Create one embedding per description.
# The whole list is passed to `encode` in a single call so the model can process
# the descriptions in batches instead of being invoked once per description.
# Wrapping the result in `list` keeps `all_desc_vecs` a list of per-description vectors.
all_desc_vecs = list(model.encode(desc_list))

In [None]:
# Calculate the cosine similarity between every pair of documents.
# Called with a single argument, `cosine_similarity` compares every row with every
# row in one vectorized pass, so the vectors are normalized once instead of once
# per document.
# `list(...)` keeps `all_scores` a list with one 1-D score row per document.
# The length guard makes an empty input yield an empty list rather than an error
# from scikit-learn.
all_scores = list(cosine_similarity(all_desc_vecs)) if len(all_desc_vecs) else []

In [None]:
# Assemble the score rows into a square table labeled by name on both axes.
final_df = pd.DataFrame(
    data=all_scores,
    index=name_list,
    columns=name_list,
)

In [None]:
# Display the pairwise similarity table (rows and columns are both labeled with name_list)
final_df

In [None]:
# Write the score table, row labels included, to the configured output path.
final_df.to_csv(path_or_buf=output_file, index=True)

In [None]:
# Correlation plot visualization
# Type 1: basic annotated color map.
# NOTE(review): `.corr()` correlates the columns of the similarity table with each
# other; the raw similarity scores themselves are not what is plotted — confirm
# this is intended.
import seaborn as sns

corr = final_df.corr()
sns.heatmap(data=corr, cmap="Blues", annot=True)

In [None]:
# Type 2: styled table whose text enlarges on hover.
# Column-to-column correlation of the score table, recomputed for this view.
corr = final_df.corr()
# Diverging color map used for the table's background gradient.
cmap = sns.diverging_palette(h_neg=5, h_pos=250, as_cmap=True)
def magnify():
    """Return the table-style rules used for the hover-to-enlarge effect.

    Returns:
        A new list of dicts, each with a ``selector`` (CSS selector string) and
        ``props`` (list of ``(property, value)`` tuples): small header text,
        zero cell padding, and larger text when a header or cell is hovered.
    """
    rules = (
        ("th", [("font-size", "7pt")]),
        ("td", [("padding", "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "200px"), ("font-size", "12pt")]),
    )
    return [
        {"selector": css_selector, "props": css_props}
        for css_selector, css_props in rules
    ]

# Render the correlation table with a row-wise color gradient and hover-to-enlarge cells.
# `Styler.set_precision` was removed in pandas 2.0; a float format string gives the
# same two-decimal display and works on both older and newer pandas versions.
# The parenthesized chain is the cell's last expression, so the styled table is displayed.
(
    corr.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '100px', 'font-size': '12pt'})
    .set_caption("Hover to magify")
    .format("{:.2f}")
    .set_table_styles(magnify())
)