In [None]:
!pip install sentence-transformers --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os

# Configure Hugging Face related environment variables in this cell, which runs
# before the cell that imports the libraries.
# NOTE(review): presumably these lower Transformers log verbosity and hide Hub
# download progress bars -- confirm against the library documentation.
os.environ.update(
    {
        "TRANSFORMERS_VERBOSITY": "error",
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
    }
)

In [2]:
from sentence_transformers import SentenceTransformer,util
from transformers import AutoTokenizer
import torch

In [3]:
def color_score(score, threshold=0.55):
    """Format a score to four decimals, highlighting weak scores in red.

    Args:
        score: Numeric score to format (anything supporting the ``.4f`` format
            spec and ``<`` comparison with ``threshold``).
        threshold: Scores strictly below this value are wrapped in ANSI red
            escape codes. Defaults to 0.55, the previously hard-coded cut-off.

    Returns:
        str: The score formatted with four decimals; wrapped in
        ``\\033[91m`` ... ``\\033[0m`` when ``score < threshold``.
    """
    text = f"{score:.4f}"
    if score < threshold:
        # ANSI "bright red" followed by a reset so later output is unaffected.
        return f"\033[91m{text}\033[0m"
    return text

In [14]:
# Load the sentence-embedding model and a tokenizer from the same checkpoint name.
# The tokenizer is used by chunk_by_tokens to split text into token windows.
model_name="sentence-transformers/all-mpnet-base-v2"
model=SentenceTransformer(model_name)
tokenizer=AutoTokenizer.from_pretrained(model_name)

# Example corpus: short passages on unrelated topics. Later cells chunk, embed and
# rank these against a free-text query; list positions serve as the summary ids.
# NOTE(review): the second passage reuses words such as "funding" and "startups"
# in a car-repair context -- presumably a lexical-overlap distractor; confirm intent.
summaries = [
    "Angel investing involves individuals—known as angel investors—providing capital to startups or early-stage businesses, typically in exchange for equity or convertible debt. These investments often occur during the seed or pre-seed funding stages, where traditional venture capital may not be available. Angel investors are usually high-net-worth individuals who seek to support innovation and entrepreneurship while aiming for potential financial returns.",
    "Starting up cars efficiently requires a careful inspection of the ignition system and battery health. Mechanics often consider the cost of replacement parts and potential funding for specialized tools. While tuning engines, one might allocate additional funding to diagnostics equipment to ensure smooth startups. Proper maintenance routines can reduce unnecessary expenses, though some funding may still be necessary for performance enhancements.",
    "World War I, also known as the Great War, was a global conflict that began on July 28, 1914, and ended on November 11, 1918. It involved two major alliances: the Allies (including France, the United Kingdom, Russia, and later the United States) and the Central Powers (primarily Germany, Austria-Hungary, and the Ottoman Empire). The war was triggered by the assassination of Archduke Franz Ferdinand of Austria-Hungary",
    "The film follows Mark Zuckerberg, a Harvard student who creates a website called “Facemash” that gains notoriety and leads to the development of Facebook. Zuckerberg teams up with Eduardo Saverin to launch the platform, navigating both rapid success and personal conflicts. As Facebook grows, legal battles arise, including lawsuits from the Winklevoss twins who claim he stole their idea. The movie explores themes of ambition, friendship, betrayal, and the complex origins of one of the world’s most influential social networks.",
    "Building successful startups begins with identifying a real problem and crafting a product or service that effectively addresses it. Strong teams with complementary skills and shared vision are critical to navigating challenges and executing ideas efficiently. Securing adequate funding and managing resources wisely ensures the startup can scale and survive early-stage uncertainties. Continuous learning, adaptability, and focusing on customer feedback help maintain growth and long-term success in a competitive market.",
    "Swiss cheese is a type of cheese originating from Switzerland, known for its distinctive pale yellow color and characteristic holes, or “eyes.” It is made from cow’s milk and is typically mild, slightly nutty, and sweet in flavor. The holes form naturally during the fermentation process when bacteria release carbon dioxide, creating bubbles in the cheese. Swiss cheese is widely used in sandwiches, fondues, and cooking due to its smooth texture and meltability.",
    "I tried to teach my cat to fetch, but apparently, she prefers to watch me run instead. Yesterday, I tripped over a shoe and now my cat thinks I’m a professional entertainer. I considered installing a treadmill just for her amusement, but she gave me the silent judgment stare. In the end, we agreed: I exercise, she naps, and the world remains perfectly balanced."
]

# NOTE(review): chunk_by_tokens declares its own max_tokens default and the visible
# call passes no argument, so this module-level value is not read in the cells shown.
max_tokens=250
def chunk_by_tokens(text, max_tokens=250, overlap=50, tok=None):
    """Split ``text`` into overlapping windows of at most ``max_tokens`` tokens.

    The text is tokenized without special tokens, sliced into windows that
    advance by ``max_tokens - overlap`` tokens, and each window is decoded
    back to a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens`` so the window always advances.
        tok: Optional tokenizer-like object exposing
            ``encode(text, add_special_tokens=False)`` and
            ``decode(ids, skip_special_tokens=True)``. When omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        list[str]: Decoded chunks in document order; empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is outside
            ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        # A stride of zero (or less) would previously never terminate.
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    if tok is None:
        tok = tokenizer  # fall back to the tokenizer loaded at notebook level

    tokens = tok.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        end = min(start + max_tokens, len(tokens))
        chunks.append(tok.decode(tokens[start:end], skip_special_tokens=True))
        if end == len(tokens):
            # This window already reaches the final token; another step would
            # only emit a tail made entirely of tokens that were just covered.
            break
    return chunks

In [29]:
# Embed every chunk of every summary, keeping the index of the summary each
# chunk came from so chunk scores can be mapped back to summaries.
chunk_data = []
for idx, summary in enumerate(summaries):
    chunks = chunk_by_tokens(summary)
    chunk_embs = model.encode(chunks, convert_to_tensor=True, normalize_embeddings=True)
    chunk_data.extend((idx, emb) for emb in chunk_embs)

# NOTE(review): "contradicition" is misspelled; the string is left unchanged here
# because editing it changes the query embedding and therefore the scores.
query = "satirical pieces indicating contradicition between human and animal worlds"
# The keyword is `normalize_embeddings` (plural), matching the chunk call above;
# the earlier spelling `normalize_embedding` is not the documented parameter.
query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

# One cosine similarity per chunk, in the same order as chunk_data.
cos_scores = [util.pytorch_cos_sim(query_emb, emb)[0][0] for _, emb in chunk_data]


In [30]:
import textwrap

# Only summaries whose best chunk reaches this cosine similarity are printed.
MIN_MATCH_SCORE = 0.5
# Number of best-scoring summaries considered for display.
TOP_K = 3

# Collapse chunk-level scores to one score per summary: the best chunk wins.
summary_scores = {}
for (summary_idx, _), score in zip(chunk_data, cos_scores):
    best_so_far = summary_scores.get(summary_idx)
    if best_so_far is None or score > best_so_far:
        summary_scores[summary_idx] = score

top_summary_idx = sorted(summary_scores, key=summary_scores.get, reverse=True)[:TOP_K]

print("Query:", query)
print("\nTop matches:")
if summary_scores:
    print("the max similarity score is:", float(max(summary_scores.values())))
else:
    # max() of an empty collection would raise; report the situation instead.
    print("No chunks were scored.")
print("=" * 100)

matches = [idx for idx in top_summary_idx if summary_scores[idx] >= MIN_MATCH_SCORE]
for idx in matches:
    print(textwrap.fill(f"  {summaries[idx]} (score: {float(summary_scores[idx]):.4f})", width=100))
    print("-" * 100)

if not matches:
    # Without this line a query with no sufficiently similar summary prints
    # only the header, which is indistinguishable from a truncated output.
    print(f"No summary reached the minimum similarity of {MIN_MATCH_SCORE}.")

Query: satirical pieces indicating contradicition between human and animal worlds

Top matches:
the max similarity score is: 0.27463793754577637
