<a href="https://colab.research.google.com/github/dylanhogg/jupyter-experiments/blob/master/colab/ukplab/sentence-transformers-v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ukplab/sentence-transformers example

https://github.com/ukplab/sentence-transformers

Sentence Transformers: Multilingual Sentence, Paragraph, and Image Embeddings using BERT & Co.

This framework provides an easy method to compute dense vector representations for sentences, paragraphs, and images. The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and achieve state-of-the-art performance in various tasks. Text is embedded in a vector space such that similar texts are close together and can be found efficiently using cosine similarity.

# Install required packages

In [None]:
%pip install sentence-transformers -q  # Install github.com/ukplab/sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# Imports

In [None]:
import math
import json
import numpy as np
import pandas as pd
from datetime import datetime
from functools import wraps
from IPython.display import display, HTML

# Load dataframe

In [None]:
# Fetch the awesomepython.org GitHub data over HTTP; the JSON payload is
# parsed using pandas' "table" orient.
df = pd.read_json("https://www.awesomepython.org/github_data.json", orient="table")

In [None]:
# Preview the repo path alongside the README-related columns.
df[["_repopath", "_readme_localurl", "_readme_giturl", "_readme_filename"]]

Unnamed: 0,_repopath,_readme_localurl,_readme_giturl,_readme_filename
89,tensorflow/tensorflow,tensorflow~tensorflow~README.md,https://raw.githubusercontent.com/tensorflow/t...,README.md
112,huggingface/transformers,huggingface~transformers~README.md,https://raw.githubusercontent.com/huggingface/...,README.md
76,pytorch/pytorch,pytorch~pytorch~README.md,https://raw.githubusercontent.com/pytorch/pyto...,README.md
189,home-assistant/core,home-assistant~core~README.rst,https://raw.githubusercontent.com/home-assista...,README.rst
957,charliermarsh/ruff,charliermarsh~ruff~README.md,https://raw.githubusercontent.com/charliermars...,README.md
...,...,...,...,...
994,zackeskin/pycausality,zackeskin~pycausality~README.md,https://raw.githubusercontent.com/zackeskin/py...,README.md
1150,geeogi/async-python-lambda-template,geeogi~async-python-lambda-template~README.md,https://raw.githubusercontent.com/geeogi/async...,README.md
520,lydorn/mapalignment,lydorn~mapalignment~README.md,https://raw.githubusercontent.com/lydorn/mapal...,README.md
180,artemyk/dynpy,artemyk~dynpy~README.md,https://raw.githubusercontent.com/artemyk/dynp...,README.md


# Calculate Similarity

In [None]:
# sentence-transformers is installed by the %pip cell near the top of the notebook.
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'all-MiniLM-L6-v2' model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# One plain dict per repo, restricted to the fields used to build the
# sentence that gets embedded.
record_columns = ["_repopath", "_reponame", "category", "_description", "_topics"]
records = df[record_columns].to_dict(orient="records")

In [None]:
def remove_generic_topics(topics):
    """Return a new list of topics with generic Python/library tags removed.

    The input list is left untouched, so the caller's data (for example the
    lists stored in a DataFrame column) is not modified as a side effect.
    Every occurrence of a generic tag is dropped, not just the first one.

    Args:
        topics: Iterable of topic strings, or None.

    Returns:
        A new list with the remaining topics in their original order; an
        empty list when ``topics`` is None.
    """
    generic_topics = ("python", "python2", "python-2", "python3", "python-3", "python-library", "library")
    if topics is None:
        return []
    return [topic for topic in topics if topic not in generic_topics]

In [None]:
def create_similarity_record(row):
    """Build the text that is embedded for one repo, plus its filtered topics.

    The sentence has the form "<description>. <category>, <topic>, <topic>...".
    The topic suffix is omitted when no topics remain after filtering, so the
    sentence never ends with a dangling ", ". A missing (non-string)
    description is treated as empty, in which case the sentence starts with
    the category.

    Args:
        row: Mapping with the keys "_repopath", "_reponame", "category",
            "_description" and "_topics".

    Returns:
        dict with keys "repopath", "sentence" and "topics".
    """
    topics = remove_generic_topics(row["_topics"])

    description = row['_description']
    if not isinstance(description, str):
        description = ""  # e.g. None/NaN when the repo has no description
    description = description.replace(row['_reponame'] + ': ', "")  # HACK: remove prefixed repo name from description
    description = description.strip().rstrip(".")

    sentence = description + ". " + row["category"] if description else row["category"]
    if topics:
        sentence += ", " + ", ".join(topics)

    return {"repopath": row["_repopath"], "sentence": sentence, "topics": topics}

In [None]:
# Turn every repo record into its embedding sentence + filtered topics.
similarity_records = list(map(create_similarity_record, records))

In [None]:
# Inspect the first generated record.
similarity_records[0:1]

[{'repopath': 'tensorflow/tensorflow',
  'sentence': 'An Open Source Machine Learning Framework for Everyone. ml-dl, tensorflow, machine-learning, deep-learning, deep-neural-networks, neural-network, ml, distributed',
  'topics': ['tensorflow',
   'machine-learning',
   'deep-learning',
   'deep-neural-networks',
   'neural-network',
   'ml',
   'distributed']}]

In [None]:
# Sentences in the same order as similarity_records, so row/column k of the
# similarity matrix corresponds to similarity_records[k].
sentences = [record["sentence"] for record in similarity_records]

In [None]:
# Embed every sentence, then score each sentence against every other one.
embeddings = model.encode(sentences, show_progress_bar=True)
# Passing the same embeddings twice yields the full pairwise matrix, where
# cos_sim[i][j] is the similarity between sentences i and j.
cos_sim = util.cos_sim(embeddings, embeddings)  # Returns torch.Tensor

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

In [None]:
# Peek at the first two rows of the pairwise similarity matrix.
cos_sim[0:2]

tensor([[1.0000, 0.6089, 0.5181,  ..., 0.0997, 0.1834, 0.1511],
        [0.6089, 1.0000, 0.4545,  ..., 0.0414, 0.2857, 0.2346]])

In [None]:
# Add all pairs to a list with their cosine similarity score.
# The matrix is converted to nested Python floats once, instead of indexing
# the tensor (and calling float()) separately for every element.
sim_matrix = cos_sim.tolist()
n_items = len(sim_matrix)
# Both (i, j) and (j, i) are kept on purpose: each entry is a directed
# "i is similar to j" pair.
pairs = [
    [sim_matrix[i][j], i, j]
    for i in range(n_items)
    for j in range(n_items)
    if i != j  # Exclude identity
]

In [None]:
# Sort list by the highest cosine similarity score
# Order the pairs from most to least similar (element 0 is the score).
pairs.sort(key=lambda pair: pair[0], reverse=True)

In [None]:
# Construct lookup_dict and pairwise_dataset
#   lookup_dict:      repo path -> [(similar repo path, similarity), ...]
#   pairwise_dataset: one dict per directed pair above the cutoff
pairwise_dataset = []
lookup_dict = {}
sim_cutoff = 0.50  # TODO: review; matches are mostly low quality below ~0.65
for idx, (score, i, j) in enumerate(pairs):
    # `score` was stored as float(cos_sim[i][j]) when `pairs` was built, so it
    # is reused here instead of indexing the tensor again. The cutoff is
    # checked first so topic sets are only built for pairs that are kept.
    if not score > sim_cutoff:
        continue

    record_i = similarity_records[i]
    record_j = similarity_records[j]
    topics_i = set(record_i["topics"])
    topics_j = set(record_j["topics"])
    common_topics = list(topics_i & topics_j)
    # NOTE(review): shared topics are counted once per repo here, so
    # common_topics_prop can never exceed 0.5 - confirm this is intended.
    total_topics = len(topics_i) + len(topics_j)

    repo1 = record_i["repopath"]
    repo2 = record_j["repopath"]
    lookup_dict.setdefault(repo1, []).append((repo2, score))
    pairwise_dataset.append({
        "idx": idx,
        "sim": score,
        "common_topics": ", ".join(common_topics),
        "common_topics_count": len(common_topics),
        "total_topics_count": total_topics,
        "common_topics_prop": len(common_topics) / total_topics if total_topics > 0 else 0,
        "repo1": repo1,
        "repo2": repo2,
        "sent1": record_i["sentence"],
        "sent2": record_j["sentence"],
    })
# Ensure sorted dict value by similarity (item[1])
lookup_dict = {k: sorted(v, key=lambda item: item[1], reverse=True) for k, v in lookup_dict.items()}

# Inspect lookup_dict

In [None]:
# Number of repos that have at least one match above sim_cutoff.
len(lookup_dict)

1271

In [None]:
# Spot-check: repos most similar to Flask, best match first.
lookup_dict["pallets/flask"]

[('bottlepy/bottle', 0.7904598712921143),
 ('pallets/werkzeug', 0.7790221571922302),
 ('pylons/pyramid', 0.7733316421508789),
 ('pallets/quart', 0.746427059173584),
 ('webpy/webpy', 0.7430812120437622),
 ('pynecone-io/pynecone', 0.7326019406318665),
 ('falconry/falcon', 0.705865740776062),
 ('klen/muffin', 0.7026605010032654),
 ('flet-dev/flet', 0.6750704050064087),
 ('neoteroi/blacksheep', 0.6668559312820435),
 ('r0x0r/pywebview', 0.6623570919036865),
 ('python-restx/flask-restx', 0.6471693515777588),
 ('cherrypy/cherrypy', 0.6420167088508606),
 ('encode/uvicorn', 0.6244233846664429),
 ('emmett-framework/emmett', 0.622439444065094),
 ('ets-labs/python-dependency-injector', 0.6025866270065308),
 ('django/django', 0.592947244644165),
 ('pylons/waitress', 0.5921154618263245),
 ('scrapy/scrapy', 0.5911469459533691),
 ('dylanhogg/python-project-template', 0.5901235342025757),
 ('huge-success/sanic', 0.5831112265586853),
 ('vinta/awesome-python', 0.5806543827056885),
 ('pyeve/eve', 0.580599

In [None]:
# Spot-check: repos most similar to PyTorch, best match first.
lookup_dict["pytorch/pytorch"]

[('tensorly/tensorly', 0.6504698991775513),
 ('xl0/lovely-tensors', 0.6335645914077759),
 ('google/tf-quant-finance', 0.6253831386566162),
 ('arogozhnikov/einops', 0.6087567806243896),
 ('google/jax', 0.59925776720047),
 ('intel/intel-extension-for-pytorch', 0.5928418636322021),
 ('huggingface/accelerate', 0.5912996530532837),
 ('cupy/cupy', 0.583601713180542),
 ('neuralmagic/deepsparse', 0.5795375108718872),
 ('facebookincubator/aitemplate', 0.5757184028625488),
 ('rafiqhasan/auto-tensorflow', 0.5696676969528198),
 ('nyandwi/modernconvnets', 0.5640577077865601),
 ('lightly-ai/lightly', 0.5578372478485107),
 ('ageron/handson-ml2', 0.5575507879257202),
 ('karpathy/micrograd', 0.5568565130233765),
 ('pytorch/glow', 0.5523102283477783),
 ('pytorch/ignite', 0.5516836643218994),
 ('plasma-umass/scalene', 0.5511540770530701),
 ('tensorflow/addons', 0.5481816530227661),
 ('ddbourgin/numpy-ml', 0.5463129281997681),
 ('facebookresearch/pytorch3d', 0.5412343740463257),
 ('nebuly-ai/nebullvm', 0.

# Add back to df

In [None]:
# Add back to df
def lookup_similarity_record(row, lookup_dict, sim_cutoff):
    """Return the repo paths similar to ``row``'s repo, in stored order.

    Args:
        row: Mapping (e.g. a DataFrame row) with a '_repopath' key.
        lookup_dict: Maps a repo path to a sequence of entries whose first
            item is a repo path and whose second item is a similarity score.
        sim_cutoff: Minimum score (inclusive) for an entry to be kept.

    Returns:
        List of repo paths scoring at least ``sim_cutoff``; an empty list
        when the repo has no entry in ``lookup_dict``.
    """
    matches = lookup_dict.get(row['_repopath'])
    if matches is None:
        return []
    return [entry[0] for entry in matches if entry[1] >= sim_cutoff]

# Attach each repo's list of similar repo paths as a new "sim" column.
# The extra arguments are forwarded to the function after the row itself.
df["sim"] = df.apply(
    lookup_similarity_record,
    axis=1,
    args=(lookup_dict, sim_cutoff),
)

In [None]:
# Preview the new "sim" column next to each repo's path and description.
df[["_repopath", "_description", "sim"]]

Unnamed: 0,_repopath,_description,sim
89,tensorflow/tensorflow,An Open Source Machine Learning Framework for ...,"[mlflow/mlflow, keras-team/keras, alpa-project..."
112,huggingface/transformers,🤗 Transformers: State-of-the-art Machine Learn...,"[arogozhnikov/einops, huggingface/datasets, ex..."
76,pytorch/pytorch,Tensors and Dynamic neural networks in Python ...,"[tensorly/tensorly, xl0/lovely-tensors, google..."
189,home-assistant/core,🏡 Open source home automation that puts local ...,"[blakeblackshear/frigate, home-assistant/super..."
957,charliermarsh/ruff,"An extremely fast Python linter, written in Rust.","[google/pytype, pola-rs/polars, pycqa/pylint, ..."
...,...,...,...
994,zackeskin/pycausality,Calculate predictive causality between time se...,[]
1150,geeogi/async-python-lambda-template,Build a high-performance Python function in AW...,"[jordaneremieff/mangum, nficano/python-lambda,..."
520,lydorn/mapalignment,Aligning and Updating Cadaster Maps with Remot...,[zorzi-s/maprepair]
180,artemyk/dynpy,Dynamical systems for Python,"[projectmesa/mesa, crflynn/stochastic, bilhim/..."
