In [48]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, util, InputExample, losses, evaluation
from transformers import pipeline

from random import sample, seed, shuffle
from torch.utils.data import DataLoader

In [49]:
PERSON = 'Heungmin Son'
# Fetch the Google results page for PERSON and keep the first 1024 characters of
# its visible text to use as a QA context.
# - The query goes through `params` so it is URL-encoded for us (names containing
#   characters such as '&' or '#' would otherwise corrupt the query string).
# - A timeout keeps the cell from hanging forever on a stalled connection.
# - The parser is named explicitly so the extracted text does not depend on
#   which optional parser libraries happen to be installed.
google_response = requests.get(
    'https://www.google.com/search',
    params={'q': PERSON, 'hl': 'en'},
    timeout=10,
)
google_html = BeautifulSoup(google_response.text, 'html.parser').get_text()[:1024]

In [50]:
google_html

'Heungmin Son - Google SearchGoogle×Please click here if you are not redirected within a few seconds.    AllNewsImagesVideos Maps Shopping Books Search tools    Any timeAny timePast hourPast 24 hoursPast weekPast monthPast yearAll resultsAll resultsVerbatimDid you mean: Heung Min SonSon Heung-minSouth Korean football playerView allSon Heung-min is a South Korean professional footballer who plays as a forward for Premier League club Tottenham Hotspur and captains the South Korea national team. WikipediaBorn: July 8, 1992 (age 30\xa0years), Chuncheon-si, South KoreaCurrent teams: Tottenham Hotspur F.C. (#7 / Forward), South Korea national football team (#7 / Forward), and Republic of KoreaHeight: 183\xa0cmSalary: 9.984\xa0million GBP (2023)Parents: Son Woong-jung and Eun Ja KilMovies: SonsationalSiblings: Heung-yun SonSon Heung-min - Wikipediaen.wikipedia.org › wiki › Son_Heung-minSon Heung-min is a South Korean professional footballer who plays as a forward for Premier League club Totte

In [51]:
# Extractive question-answering pipeline; model and tokenizer both come from the
# 'deepset/roberta-base-squad2' checkpoint.
# NOTE(review): `max_length=10` is passed as an extra pipeline keyword; confirm it
# has the intended effect — the question-answering pipeline documents
# `max_answer_len` / `max_seq_len` for limiting answer and sequence length.
nlp = pipeline(
    'question-answering',
    model='deepset/roberta-base-squad2',
    tokenizer='deepset/roberta-base-squad2',
    max_length=10
)
# Ask who PERSON is, using the scraped search-result text as the context passage.
nlp(
    f'Who is {PERSON}?',
    google_html
)

{'score': 0.1024279072880745,
 'start': 351,
 'end': 387,
 'answer': 'South Korean professional footballer'}

In [52]:
# Download the plain-text book. The context manager closes the HTTP response even
# if reading fails, and 'utf-8-sig' decodes UTF-8 while dropping the leading
# byte-order mark that would otherwise show up as '\ufeff' at the start of `text`.
with urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt') as response:
    text = response.read().decode('utf-8-sig')
# Preview only the beginning instead of dumping the whole book into the output.
text[:1000]

'\ufeffThe Project Gutenberg eBook, The History of Insects, by Unknown\r\n\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org\r\n\r\n\r\n\r\n\r\n\r\nTitle: The History of Insects\r\n\r\nAuthor: Unknown\r\n\r\nRelease Date: January 25, 2004  [eBook #10834]\r\n\r\nLanguage: English\r\n\r\n\r\n***START OF THE PROJECT GUTENBERG EBOOK THE HISTORY OF INSECTS***\r\n\r\n\r\nE-text prepared by Internet Archive Children\'s Library, Garrett Alley, and\r\nthe Project Gutenberg Online Distributed Proofreading Team\r\n\r\n\r\n\r\nNote: Project Gutenberg also has an HTML version of this\r\n      file which includes the original illustrations.\r\n      See 10834-h.htm or 10834-h.zip:\r\n      (http://www.ibiblio.org/gutenberg/1/0/8/3/10834/10834-h/10834-h.htm)\r\n      or\r\n      (http://ww

In [53]:
# Split the book on blank lines (CRLF line endings) and keep only the chunks
# longer than 100 characters; each surviving chunk is one retrievable document.
paragraphs = text.split('\r\n\r\n')
documents = np.array([paragraph for paragraph in paragraphs if len(paragraph) > 100])
print(f'There are {len(documents)} documents/paragraphs')

There are 79 documents/paragraphs


In [54]:
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256 # Truncate long documents to 256 tokens
bi_encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [55]:
# Embed every paragraph with the bi-encoder, returning a single tensor
# (one row per document) and showing a progress bar while encoding.
document_embeddings = bi_encoder.encode(documents, convert_to_tensor=True, show_progress_bar=True)
document_embeddings.shape

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

torch.Size([79, 768])

In [56]:
QUESTION = 'How many horns does a flea have?'

In [57]:
question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
question_embedding.shape

torch.Size([768])

In [100]:
# NOTE(review): this cell's execution count (100) is out of sequence, and the
# scores shown in its output match the fine-tuned encoder's results printed
# further down, not the base-model search in the next cell. Presumably it was
# re-run after `question_embedding` / `document_embeddings` had been overwritten;
# confirm, then re-run in order or remove the cell.
util.semantic_search(question_embedding, document_embeddings, top_k=3)

[[{'corpus_id': 14, 'score': 0.6277425289154053},
  {'corpus_id': 19, 'score': 0.43602240085601807},
  {'corpus_id': 16, 'score': 0.40866073966026306}]]

In [58]:
# Search the document embeddings for the 3 best matches to the question.
# semantic_search returns one result list per query; [0] selects the list for
# our single question. Each hit is a dict with 'corpus_id' and 'score'.
hits = util.semantic_search(question_embedding, document_embeddings, top_k=3)[0]
hits

[{'corpus_id': 14, 'score': 0.4899492859840393},
 {'corpus_id': 19, 'score': 0.24793769419193268},
 {'corpus_id': 21, 'score': 0.1847882866859436}]

In [59]:
print(f'Question: {QUESTION}\n')
for i, hit in enumerate(hits):
    print(f'Document {i+1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}\n')

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.490:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.

Document 2 Cos_Sim 0.248:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.

Document 3 Cos_Sim 0.185:


This is one of the largest of the insect tribe. It is met with in
different countries, and of various sizes, from two or three inches to
nearly a fo

In [60]:
# Run the QA pipeline on the top-ranked paragraph only. `documents` is a NumPy
# array, so str() turns the selected element into a plain Python string.
nlp(QUESTION, str(documents[hits[0]['corpus_id']]))

{'score': 0.8524739742279053, 'start': 259, 'end': 262, 'answer': 'two'}

In [61]:
# Load the 'train' split of the 'adversarialQA' configuration of the
# 'adversarial_qa' dataset; its question/context fields are used below to build
# fine-tuning pairs for the bi-encoder.
training_qa = load_dataset('adversarial_qa', 'adversarialQA', split='train')
training_qa

W0503 11:30:49.483593 140619849451328 builder.py:798] Found cached dataset adversarial_qa (/root/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
    num_rows: 30000
})

In [62]:
training_qa[0]

{'id': '7ba1e8f4261d3170fcf42e84a81dd749116fae95',
 'title': 'Brain',
 'context': 'Another approach to brain function is to examine the consequences of damage to specific brain areas. Even though it is protected by the skull and meninges, surrounded by cerebrospinal fluid, and isolated from the bloodstream by the blood–brain barrier, the delicate nature of the brain makes it vulnerable to numerous diseases and several types of damage. In humans, the effects of strokes and other types of brain damage have been a key source of information about brain function. Because there is no ability to experimentally control the nature of the damage, however, this information is often difficult to interpret. In animal studies, most commonly involving rats, it is possible to use electrodes or locally injected chemicals to produce precise patterns of damage and then examine the consequences for behavior.',
 'question': 'What sare the benifts of the blood brain barrir?',
 'answers': {'text': ['isolated

In [63]:
# Build (question, context, label) triples:
#   * label 1.0 — a question paired with its own context (one per example);
#   * label 0.0 — a question paired with the *previous* example's context,
#     emitted only when the context changes from one example to the next.
good_training_data, bad_training_data = [], []
last_example = None
for example in training_qa:
    question, context = example['question'], example['context']
    context_changed = last_example is not None and context != last_example['context']
    if context_changed:
        bad_training_data.append((question, last_example['context'], 0.0))
    good_training_data.append((question, context, 1.0))
    last_example = example

In [64]:
len(good_training_data), len(bad_training_data)

(30000, 2647)

In [65]:
good_training_data[-1]

('What letter designates what Ektachrome is designed for?',
 'Some high-speed black-and-white films, such as Ilford Delta 3200 and Kodak T-MAX P3200, are marketed with film speeds in excess of their true ISO speed as determined using the ISO testing method. For example, the Ilford product is actually an ISO 1000 film, according to its data sheet. The manufacturers do not indicate that the 3200 number is an ISO rating on their packaging. Kodak and Fuji also marketed E6 films designed for pushing (hence the "P" prefix), such as Ektachrome P800/1600 and Fujichrome P1600, both with a base speed of ISO 400.',
 1.0)

In [66]:
bad_training_data[-1]

('What film beside Ektachrome and Fujichorme is designed for pushing?',
 'The Weston Cadet (model 852 introduced in 1949), Direct Reading (model 853 introduced 1954) and Master III (models 737 and S141.3 introduced in 1956) were the first in their line of exposure meters to switch and utilize the meanwhile established ASA scale instead. Other models used the original Weston scale up until ca. 1955. The company continued to publish Weston film ratings after 1955, but while their recommended values often differed slightly from the ASA film speeds found on film boxes, these newer Weston values were based on the ASA system and had to be converted for use with older Weston meters by subtracting 1/3 exposure stop as per Weston\'s recommendation. Vice versa, "old" Weston film speed ratings could be converted into "new" Westons and the ASA scale by adding the same amount, that is, a film rating of 100 Weston (up to 1955) corresponded with 125 ASA (as per ASA PH2.5-1954 and before). This conver

In [67]:
seed(42)

In [68]:
# Seed inside the sampling cell so that re-running this cell on its own always
# reproduces the same sample and shuffle (previously it relied on the seed set
# in a separate cell, so a second run silently produced a different split).
seed(42)
# Balanced sample: 500 matching pairs + 500 mismatched pairs, shuffled together.
sampled_training_data = sample(good_training_data, 500) + sample(bad_training_data, 500)
shuffle(sampled_training_data)
# First 80% is used for training; the remainder is held out for evaluation.
training_index = int(.8*len(sampled_training_data))

In [69]:
# Wrap the training portion of the sampled triples as InputExample objects:
# the (question, context) pair becomes `texts`, the 0.0/1.0 score the `label`.
train_examples = [
    InputExample(texts=(question, context), label=label)
    for question, context, label in sampled_training_data[:training_index]
]
len(train_examples)

800

In [70]:
train_examples[0].__dict__

{'guid': '',
 'texts': ('What changed after the eigth century?',
  'There is disagreement about the origin of the term, but general consensus that "cardinalis" from the word cardo (meaning \'pivot\' or \'hinge\') was first used in late antiquity to designate a bishop or priest who was incorporated into a church for which he had not originally been ordained. In Rome the first persons to be called cardinals were the deacons of the seven regions of the city at the beginning of the 6th century, when the word began to mean “principal,” “eminent,” or "superior." The name was also given to the senior priest in each of the "title" churches (the parish churches) of Rome and to the bishops of the seven sees surrounding the city. By the 8th century the Roman cardinals constituted a privileged class among the Roman clergy. They took part in the administration of the church of Rome and in the papal liturgy. By decree of a synod of 769, only a cardinal was eligible to become pope. In 1059, during th

In [71]:
# Serve the training examples in shuffled batches of 32.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
# Loss attached to the bi-encoder being fine-tuned. Labels are 1.0 for matching
# question/context pairs and 0.0 for mismatched ones (see how the triples are built).
# Presumably the loss pushes the pair's cosine similarity towards that label —
# confirm against the sentence-transformers documentation.
train_loss = losses.CosineSimilarityLoss(bi_encoder)

In [None]:
# Peek at a single training batch to inspect tensor shapes.
# The DataLoader holds InputExample objects, which PyTorch's default collate
# function cannot batch, so iterating it as-is raises a TypeError (this cell had
# never been executed). Use the model's own collate function — the one that
# tokenizes each text column and stacks the labels — before pulling a batch.
train_dataloader.collate_fn = bi_encoder.smart_batching_collate
(question_batch, context_batch), labels = next(iter(train_dataloader))
question_batch['input_ids'].shape, context_batch['input_ids'].shape, labels.shape

In [84]:
# Held-out portion (everything from training_index onward), unzipped into three
# parallel tuples: questions, contexts, and their 0.0/1.0 labels.
sentences1, sentences2, scores = zip(*sampled_training_data[training_index:])

In [86]:
# Evaluator over the held-out (question, context, label) triples; it is used to
# score the bi-encoder before and after fine-tuning and is passed to fit().
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
evaluator

<sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator at 0x7fe25e619af0>

In [87]:
bi_encoder.evaluate(evaluator)

0.5044913287672261

In [90]:
# Fine-tune the bi-encoder on the question/context pairs for 20 epochs, passing
# the held-out evaluator. 'ir/results' is the output path from which the
# fine-tuned model is loaded in a later cell.
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    output_path='ir/results',
    epochs=20,
    evaluator=evaluator
)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

Iteration:   0%|          | 0/25 [00:00<?, ?it/s]

In [91]:
bi_encoder.evaluate(evaluator)

0.52163970354764

In [92]:
finetuned_bi_encoder = SentenceTransformer('ir/results')

In [93]:
# Repeat the retrieval for QUESTION with the fine-tuned encoder.
# NOTE(review): this overwrites `document_embeddings`, `question_embedding` and
# `hits` from the base-model run, so re-running earlier cells afterwards will
# silently use the fine-tuned values — consider distinct names if both are needed.
document_embeddings = finetuned_bi_encoder.encode(documents, convert_to_tensor=True, show_progress_bar=True)
question_embedding = finetuned_bi_encoder.encode(QUESTION, convert_to_tensor=True)
hits = util.semantic_search(question_embedding, document_embeddings, top_k=3)[0]
print(f'Question: {QUESTION}\n')
# Print each hit with its 1-based rank, similarity score and paragraph text.
for i, hit in enumerate(hits):
    print(f'Document {i+1} Cos_Sim {hit["score"]:.3f}:\n\n{documents[hit["corpus_id"]]}\n')

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.628:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.

Document 2 Cos_Sim 0.436:

The Chego is a very small animal, about one fourth the size of a common
flea: it is very troublesome, in warm climates, to the poor blacks, such
as go barefoot, and the slovenly: it penetrates the skin, under which it
lays a bunch of eggs, which swell to the bigness of a small pea.

Document 3 Cos_Sim 0.409:

In examining the louse with a microscope, its external deformity strikes
us with disgust. It has six feet, two eyes, and a sort of sting,
proboscis, 

In [95]:
def gutenberg_to_documents(gutenberg_url, bi_encoder):
    """Download a plain-text book and embed its paragraphs.

    Args:
        gutenberg_url: URL of the text file; opened with ``urlopen``.
        bi_encoder: Object whose ``encode(documents)`` method produces the
            embeddings for the array of paragraphs.

    Returns:
        A ``(documents, embeddings)`` pair: a NumPy array holding every
        blank-line-separated paragraph longer than 100 characters, and
        whatever ``bi_encoder.encode`` returns for that array.
    """
    # The context manager closes the response even if reading/decoding fails.
    with urlopen(gutenberg_url) as response:
        # 'utf-8-sig' decodes UTF-8 and strips a leading byte-order mark, so
        # the first paragraph does not start with a stray '\ufeff'.
        text = response.read().decode('utf-8-sig')
    paragraphs = [paragraph for paragraph in text.split('\r\n\r\n') if len(paragraph) > 100]
    documents = np.array(paragraphs)
    print(f'There are {len(documents)} documents/paragraphs')
    return documents, bi_encoder.encode(documents)

In [96]:
def retrieve_relevant_documents(bi_encoder, query, documents, document_embeddings, hits=3):
    """Print the documents most similar to ``query`` and a QA answer from the best one.

    Args:
        bi_encoder: Model used to embed ``query`` (``encode`` is called with
            ``convert_to_tensor=True``).
        query: The question / search string.
        documents: Indexable collection of document texts, aligned with
            ``document_embeddings``.
        document_embeddings: Embeddings of ``documents`` to search over.
        hits: Maximum number of documents to retrieve (passed as ``top_k``).

    Returns:
        None. Results are printed; the answer comes from the notebook-level
        ``nlp`` question-answering pipeline applied to the top document.
    """
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Keep the result list under its own name instead of rebinding the `hits`
    # count parameter to a different kind of object.
    search_results = util.semantic_search(query_embedding, document_embeddings, top_k=hits)[0]
    if not search_results:
        # Nothing retrieved (e.g. empty corpus or hits=0): there is no top
        # document to answer from, so report it instead of raising IndexError.
        print('No documents retrieved')
        return
    for rank, result in enumerate(search_results, start=1):
        print(f'Document {rank} Cos_Sim {result["score"]:.3f}:\n\n{documents[result["corpus_id"]]}\n')
    top_document = str(documents[search_results[0]["corpus_id"]])
    print(f'Answer from Top Document: {nlp(query, top_document)}')

In [97]:
banks_to_bassoon_documents, banks_to_bassoon_embeddings = gutenberg_to_documents(
    'https://www.gutenberg.org/cache/epub/27480/pg27480.txt',
    finetuned_bi_encoder
)

There are 1402 documents/paragraphs


In [98]:
retrieve_relevant_documents(
    finetuned_bi_encoder,
    'What is banshee?',
    banks_to_bassoon_documents,
    banks_to_bassoon_embeddings,
    2
)

Document 1 Cos_Sim 0.765:

BANSHEE (Irish _bean sidhe_; Gaelic _ban sith_, "woman of the fairies"), a
supernatural being in Irish and general Celtic folklore, whose mournful
screaming, or "keening," at night is held to foretell the death of some
member of the household visited. In Ireland legends of the banshee belong
more particularly to certain families in whose records periodic visits from
the spirit are chronicled. A like ghostly informer figures in Brittany
folklore. The Irish banshee is held to be the distinction only of families
of pure Milesian descent. The Welsh have the banshee under the name _gwrach
y Rhibyn_ (witch of Rhibyn). Sir Walter Scott mentions a belief in the
banshee as existing in the highlands of Scotland (_Demonology and
Witchcraft_, p. 351). A Welsh death-portent often confused with the gwrach
y Rhibyn and banshee is the _cyhyraeth_, the groaning spirit.

Document 2 Cos_Sim 0.424:

BANYAN, or BANIAN (an Arab corruption, borrowed by the Portuguese from the
Sansk