In [1]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [2]:
import gzip
import json
import os
import time

import torch
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Report which device is available. Without the else branch a CPU-only
# runtime would pass through this cell without printing anything.
if torch.cuda.is_available():
  print("GPU available and ready to go")
else:
  print("No GPU detected - running on CPU")

GPU available and ready to go


In [4]:
# Retrieval configuration: which bi-encoder checkpoint to load and how many
# passages it should return per query.
model_name = 'nq-distilbert-base-v1'
top_k = 5  # number of passages to retrieve with the bi-encoder
bi_encoder = SentenceTransformer(model_name)

# Passage corpus (gzipped JSON lines); fetched only when no local copy exists.
wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'
if not os.path.exists(wikipedia_filepath):
  util.http_get(
      'http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz',
      wikipedia_filepath,
  )

Downloading (…)a2d19/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)17900a2d19/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)900a2d19/config.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)a2d19/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)17900a2d19/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)00a2d19/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  0%|          | 0.00/50.2M [00:00<?, ?B/s]

In [8]:
# Flatten the corpus into one list with a [title, text] entry per paragraph.
passages = []
with gzip.open(wikipedia_filepath, mode="rt", encoding="utf-8") as wiki_file:
  for raw_line in wiki_file:
    article = json.loads(raw_line.strip())
    # Every paragraph is paired with the title of the article it belongs to.
    passages.extend([article['title'], text] for text in article['paragraphs'])

print("Passages:", len(passages))

Passages: 509663


In [9]:
# Preview the first two entries; each one is a [title, paragraph] pair.
passages[:2]

[['Ted Cassidy',
  'Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on "The Addams Family".'],
 ['Aileen Wuornos',
  'Aileen Carol Wuornos Pralle (born Aileen Carol Pittman; February 29, 1956\xa0– October 9, 2002) was an American serial killer. She was born in Rochester, Michigan. She confessed to killing six men in Florida and was executed in Florida State Prison by lethal injection for the murders. Wuornos said that the men she killed had raped her or tried to rape her while she was working as a prostitute.']]

In [10]:
# To speed things up, pre-computed embeddings are downloaded.
# The provided file encoded the passages with the model 'nq-distilbert-base-v1',
# so it is only used while `model_name` is that model; for any other model the
# passages are encoded locally in the else branch.

if model_name == 'nq-distilbert-base-v1':
  embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'
  if not os.path.exists(embeddings_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt',
                  embeddings_filepath)

  # NOTE(review): torch.load unpickles the file, and the file is fetched over
  # plain HTTP. Only run this against a source you trust; consider an HTTPS
  # URL and/or torch.load(..., weights_only=True) if the installed torch
  # version supports it.
  # map_location='cpu' keeps the load working on CPU-only runtimes in case the
  # tensor was serialized from a GPU (not verifiable from this notebook).
  corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu')
  corpus_embeddings = corpus_embeddings.float() # convert to float
  if torch.cuda.is_available():
    corpus_embeddings = corpus_embeddings.to('cuda')
else:
  corpus_embeddings = bi_encoder.encode(passages,
                                        convert_to_tensor=True,
                                        show_progress_bar=True)

  0%|          | 0.00/783M [00:00<?, ?B/s]

In [11]:
def search(query):
  """Retrieve and print the best-matching passages for a query.

  The query is embedded with the global ``bi_encoder`` and compared against
  the global ``corpus_embeddings`` through ``util.semantic_search``; the
  global ``top_k`` caps the number of hits.

  Args:
    query: Text to search for; passed unchanged to ``bi_encoder.encode``.

  Returns:
    None. The query, the elapsed retrieval time and one tab-separated line
    per hit (score, then the matching ``passages`` entry) are printed.
  """
  # perf_counter() is a monotonic, high-resolution clock meant for measuring
  # intervals; time.time() can jump when the system clock is adjusted.
  start_time = time.perf_counter()
  query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
  hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
  hits = hits[0]  # keep the hit list of the first (only) query
  elapsed = time.perf_counter() - start_time

  # Output of top-k hits
  print("Input Question:", query)
  print("Results after {:.3f} seconds:".format(elapsed))
  for hit in hits:
    print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))

In [12]:
# First query of the session; the timing printed by search() covers only this one call.
search("What is the capital of the France?")

Input Question: What is the capital of the France?
Results after 3.162 seconds:
	0.826	['Capital of France', 'The capital of France is Paris. In the course of history, the national capital has been in many locations other than Paris.']
	0.753	['Arrondissement of Sarlat-la-Canéda', 'The arrondissement of Sarlat-la-Canéda is an arrondissement of France. It is part of the Dordogne "département" in the Nouvelle-Aquitaine region. Its capital is the city of Sarlat-la-Canéda.']
	0.752	['Arrondissement of Figeac', 'The arrondissement of Figeac is an arrondissement of France. It is part of the Lot "département" in the Occitanie region. Its capital is the city of Figeac.']
	0.746	["Arrondissement of Saint-Jean-d'Angély", "The arrondissement of Saint-Jean-d'Angély is an arrondissement of France, in the Charente-Maritime department, Nouvelle-Aquitaine region. Its capital is the city of Saint-Jean-d'Angély."]
	0.745	['Arrondissement of Confolens', 'The arrondissement of Confolens is an arrondisseme

In [13]:
# Factual "when" question; search() prints up to top_k scored passages.
search("When was USA founded?")

Input Question: When was USA founded?
Results after 0.042 seconds:
	0.677	['United States', 'The nation was founded by thirteen colonies of Great Britain along the Atlantic seaboard. On July 4, 1776, they issued the Declaration of Independence, which announced their independence from Great Britain and their creation of a cooperative union. The disobedient states defeated Great Britain in the American Revolutionary War, the first successful colonial war of independence. The Philadelphia Convention adopted the current United States Constitution on September 17, 1787; its approval the following year made the states part of a single republic with a strong central government. The Bill of Rights, making up ten constitutional amendments guaranteeing many basic civil rights and freedoms, was approved in 1791.']
	0.632	['United States', "After the British defeat by American forces helped by the French, Great Britain recognized the independence of the United States and the states' sovereignty ov

In [14]:
# "Who was ...?" question about a person.
search("Who was Ashoka the Great?")

Input Question: Who was Ashoka the Great?
Results after 0.040 seconds:
	0.708	['Ashoka', 'Ashoka (or Asoka) was India\'s great emperor of the Mauryan Dynasty of India who ruled from 304-232 BC. His name means ""He who is loved by the Gods and who is friendly to everyone"".']
	0.574	['Skandagupta', 'Skandagupta () (died 467) was a Gupta Emperor of northern India. People do not know who his ancestors were. He faced some of the greatest challenges in the annals of the empire having to contend with the Pushyamitras and the Hunas (a name by which the "White Huns" were known in India). He died in 467.']
	0.557	['Chandragupta Maurya', 'Chandragupta Maurya was the founder of the Maurya Empire in ancient India. He was born in a humble family, was picked up, taught and counselled by Chanakya (also known as Kautilya) – a Hindu Brahmin who wrote the Arthashastra. Together, Chandragupta and Chanakya built one of the largest empires on the Indian subcontinent.']
	0.553	['Ashoka', "Ashoka is often ci

In [15]:
# Open-ended question rather than a single-fact lookup.
search("What is the history of Ukraine?")

Input Question: What is the history of Ukraine?
Results after 0.047 seconds:
	0.675	['Sviatoslav I, Prince of Kiev', "Sviatoslav I was an early ruler of the Kievan Rus', a nation that evolved into modern Ukraine. He was the son of Igor I of Kiev and Olga of Kiev. He ruled from 962 until 972. During his reign he destroyed the Khazar Empire and for a short time conquered what is now Bulgaria."]
	0.604	['Ukraine', 'The capital of Ukraine is Kyiv (). It was a part of the Soviet Union from 1922 until 1991.']
	0.604	['Ukraine', "In 1917 an independent Ukrainian People's Republic was established. The Red Army freed it and made it into the Ukrainian Soviet Socialist Republic."]
	0.593	['Ukrainian Soviet Socialist Republic', 'The Ukrainian Soviet Socialist Republic or in short, the Ukrainian SSR or Soviet Ukraine was in the southwestern part of the Soviet Union. It had the second largest population of the fifteen republics of the Soviet Union. This lasted from 1922 to 1991.']
	0.588	['Ukraine',

In [16]:
# Title-only query (not phrased as a question).
search("Grey's Anatomy")

Input Question: Grey's Anatomy
Results after 0.039 seconds:
	0.563	["Grey's Anatomy", "Grey's Anatomy is an American television series. It is a story about surgeons in Grey-Sloan Memorial Hospital (previouslySeattle Grace-Mercy West Hospital) ( which was previously Seattle Grace Hospital). It was created by Shonda Rhimes. It has been on ABC since March 2005."]
	0.535	['Ellen Pompeo', 'Ellen Katherine Pompeo (November 10, 1969) is an American actress. She works mostly in television shows. She portrayed the character Meredith Grey in "Grey\'s Anatomy". She also does other television work. She was born in Massachusetts.']
	0.530	['Mary Kay Place', 'Mary Kay Place (born September 23, 1947) is an American actress, director, and screenwriter. She acted in the sitcom "Grey\'s Anatomy". She also acted in "The West Wing".']
	0.441	["Grey's Anatomy", '"Grey’s Anatomy" got its name from a textbook by Henry Gray, written in 1858, which reached its 40th edition in 2008. The title of the text is "An

In [17]:
# Keyword-style query (not phrased as a question).
search("Lost TV Series")

Input Question: Lost TV Series
Results after 0.040 seconds:
	0.603	['Lost (TV series)', "Lost was an American television series about airplane crash survivors on a tropical island in the South Pacific. Most episodes showed events in the present combined with stories from a character's past or future. The show was created by J. J. Abrams, Damon Lindelof, and Jeffrey Lieber. It was mostly filmed in Oahu, Hawaii. The first episode was shown on September 22, 2004. Since then, a total of six seasons have been shown on television. The series is made by ABC Studios, Bad Robot Productions and Grass Skirt Productions. It plays on the ABC Network in the United States. Other television networks show the series in other countries. Because of its large cast and the cost of filming in Hawaii, the series was one of the most expensive on television."]
	0.586	['Lost (TV series)', 'The episodes of Season 1 were shown on television in the United States starting on September 22, 2004. There were 24 episod

In [38]:
# Sanity check: compute the cosine similarity of one query/passage pair three
# ways (SciPy, scikit-learn, sentence-transformers util) and print each result.
# SentenceTransformer, util, cosine and cosine_similarity are imported in the
# notebook's import cell.

# NOTE(review): while model_name is 'nq-distilbert-base-v1' this loads a second
# copy of the checkpoint that `bi_encoder` already holds.
model = SentenceTransformer('nq-distilbert-base-v1')

query_embedding = model.encode('How many people live in London?',
                               convert_to_numpy=True)
# Same encoding without the explicit flag
# (convert_to_numpy=True is encode()'s documented default).
query_embed = model.encode('How many people live in London?')

# Encode passages as [title,text]
passage_embedding = model.encode([['London','London has 9,787,426 inhabitants at the 2011 census.']],
                                 convert_to_numpy=True)

# Same encoding without the explicit flag.
passage_embed = model.encode([['London','London has 9,787,426 inhabitants at the 2011 census.']])

# SciPy returns a cosine *distance* on 1-D vectors, hence the flattening and "1 -".
print('Similarity:',1 - cosine(query_embedding.reshape(-1),
                               passage_embedding.reshape(-1)))

# scikit-learn expects 2-D (n_samples, n_features) inputs; [0] takes the first row of the result.
print('Cosine Similarity:',cosine_similarity(query_embedding.reshape(1,-1),
                                             passage_embedding.reshape(1,-1))[0])

print('Pytorch similarity:',util.pytorch_cos_sim(query_embed,
                                                 passage_embed))

Similarity: 0.6502923369407654
Cosine Similarity: [0.6502924]
Pytorch similarity: tensor([[0.6503]])


In [39]:
# Score one question against two candidate passages, each given as a
# [title, text] pair: one about the show in the question, one about a song.
question = 'who turned out to be the mother on how i met your mother'
candidate_passages = [
    ['The Mother (How I Met Your Mother)', 'The Mother (How I Met Your Mother) Tracy McConnell (colloquial: "The Mother") is the title character from the CBS television sitcom "How I Met Your Mother". The show, narrated by Future Ted (Bob Saget), tells the story of how Ted Mosby (Josh Radnor) met The Mother. Tracy McConnell appears in eight episodes, from "Lucky Penny" to "The Time Travelers", as an unseen character; she was first seen fully in "Something New" and was promoted to a main character in season 9. The Mother is played by Cristin Milioti. The story of how Ted met The Mother is the framing device'],
    ['Make It Easy on Me', 'and Pete Waterman on her 1993 album "Good \'N\' Ready", on which a remixed version of the song is included. "Make It Easy On Me", a mid-tempo R&B jam, received good reviews (especially for signalling a different, more soulful and mature sound atypical of the producers\' Europop fare), but failed to make an impact on the charts, barely making the UK top 100 peaking at #99, and peaking at #52 on the "Billboard" R&B charts. The pop group Steps covered the song on their 1999 album "Steptacular". It was sung as a solo by Lisa Scott-Lee. Make It Easy on'],
]

query_embedding = model.encode(question)
passage_embedding = model.encode(candidate_passages)

print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))

Similarity: tensor([[ 0.7562, -0.0835]])


In [40]:
# Score one question against two candidate passages, each given as a
# [title, text] pair: one about the novel in the question, one about a film.
question = 'where does the story the great gatsby take place'
candidate_passages = [
    ['The Great Gatsby', 'The Great Gatsby The Great Gatsby is a 1925 novel written by American author F. Scott Fitzgerald that follows a cast of characters living in the fictional towns of West Egg and East Egg on prosperous Long Island in the summer of 1922. The story primarily concerns the young and mysterious millionaire Jay Gatsby and his quixotic passion and obsession with the beautiful former debutante Daisy Buchanan. Considered to be Fitzgerald\'s magnum opus, "The Great Gatsby" explores themes of decadence, idealism, resistance to change, social upheaval, and excess, creating a portrait of the Roaring Twenties that has been described as'],
    ['The Producers (1967 film)', '2005 (to coincide with the remake released that year). In 2011, MGM licensed the title to Shout! Factory to release a DVD and Blu-ray combo pack with new HD transfers and bonus materials. StudioCanal (worldwide rights holder to all of the Embassy Pictures library) released several R2 DVD editions and Blu-ray B releases using a transfer slightly different from the North Ameri can DVD and BDs. The Producers (1967 film) The Producers is a 1967 American satirical comedy film written and directed by Mel Brooks and starring Zero Mostel, Gene Wilder, Dick Shawn, and Kenneth Mars. The film was Brooks\'s directorial'],
]

query_embedding = model.encode(question)
passage_embedding = model.encode(candidate_passages)

print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))


Similarity: tensor([[ 0.8294, -0.2055]])
