# Setup

In [1]:
!pip install transformers



In [2]:
!pip install -Uq sentence-transformers # for embedding

In [3]:
!pip install sentencepiece #T5 need this.



In [4]:
import sentencepiece # for T5
from sentence_transformers import SentenceTransformer
import torch

import torch.nn.functional as F # Needed to calculate cosine
from nltk.translate.bleu_score import SmoothingFunction # needed for BLEU score

embmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') # Needed for Creating Sentence Embedding

# How BLEU score works

In [5]:
from nltk.translate.bleu_score import sentence_bleu
reference = [
    'it is dog'.split(),
    'this is a dog'.split(),
    'dog it is'.split(),
    'a dog, it is'.split()
]

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

candidate = 'it is a dog'.split() # differance is the char a of a dog
print('BLEU score -> {}'.format(sentence_bleu(reference, candidate, smoothing_function=chencherry.method1, weights=weights)))

candidate = 'it is dog'.split()
print('BLEU score -> {}'.format(sentence_bleu(reference, candidate,smoothing_function=chencherry.method1, weights=weights )))

BLEU score -> [1.0, 0.7937005259840998, 0.4728708045015879, 0.34657242157757323]
BLEU score -> [1.0, 1.0, 0.5623413251903491, 0.39810717055349726]


# How ROUGE Works

In [6]:
!pip install rouge-score

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer



In [7]:
# scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rouge3', 'rougeL'], use_stemmer=True)
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')
scores

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}

# T5ForConditionalGeneration

## Default

In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

question = "What is the capital of France?"
context = "Paris is the capital of France."
input_text = f"question: {question} context: {context}"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output_ids = model.generate(input_ids, max_length=50)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Question: What is the capital of France?
Answer: Paris


In [33]:
question = "What predicament does Yudhishthira face, and how does he seek guidance to resolve it?"
ref_ans = "Yudhishthira faces the predicament of being unable to support the Brahmanas who are following him as he departs for the forest. \
He seeks guidance to resolve this dilemma by approaching his priest, Dhaumya, and inquiring about the appropriate course of action."
context = """Section III
"Vaisampayana said, 'Yudhishthira the son of Kunti, thus addressed by Saunaka, approached his priest and in the midst of his brothers said,
'The Brahmanas versed in the Vedas are following me who am departing for the forest. Afflicted with many calamities I am unable to support them.
I cannot abandon them, nor have I the power to offer them sustenance: Tell me, O holy one, what should be done by me in such a pass.'
"Vaisampayana said, 'After reflecting for a moment seeking to find out the (proper) course by his yoga powers, Dhaumya, that foremost of all virtuous men,
addressed Yudhishthira, in these words, 'In days of old, all living beings that had been created were sorely afflicted with hunger.
And like a father (unto all of them), Savita (the sun) took compassion upon them. And going first into the northern declension,
the sun drew up water by his rays, and coming back to the southern declension, stayed over the earth, with his heat centered in himself.
And while the sun so stayed over the earth, the lord of the vegetable world (the moon), converting the effects of the solar heat (vapours) into clouds
and pouring them down in the shape of water, caused plants to spring up. Thus it is the sun himself, who, drenched by the lunar influence, is transformed,
upon the sprouting of seeds, into holy vegetable furnished with the six tastes. And it is these which constitute the food of all creatures upon the earth.
Thus the food that supporteth the lives of creatures is instinct with solar energy, and the sun is, therefore, the father of all creatures.
Do thou, hence, O Yudhishthira, take refuge even in him. All illustrious monarchs of pure descent and deeds are known to have delivered their people by practising high asceticism.
The great Karttavirya, and Vainya and Nahusha, had all, by virtue of ascetic meditation preceded by vows, delivered their people from heavy afflictions.
Therefore, O virtuous one, as thou art purified by the acts do thou likewise, entering upon a file of austerities. O Bharata, virtuously support the regenerate ones.'
"Janamejaya said, 'How did that bull among the Kurus, king Yudhishthira, for the sake of the Brahmanas adore the sun of wonderful appearance?"
"Vaisampayana said, 'Listen attentively, O king, purifying thyself and withdrawing thy mind from every other thing. And, O king of kings, appoint thou a time.
I will tell thee everything in detail, And, O illustrious one, listen to the one hundred and eight names (of the sun)
as they were disclosed of old by Dhaumya to the high-souled son of Pritha. Dhaumya said, 'Surya, Aryaman, Bhaga, Twastri, Pusha, Arka, Savitri.
Ravi, Gabhastimat, Aja, Kala, Mrityu, Dhatri, Prabhakara, Prithibi, Apa, Teja, Kha, Vayu, the sole stay, Soma, Vrihaspati, Sukra, Budha, Angaraka,
Indra, Vivaswat, Diptanshu, Suchi, Sauri, Sanaichara, Brahma, Vishnu, Rudra, Skanda, Vaisravana, Yama, Vaidyutagni, Jatharagni, Aindhna, Tejasampati,
Dharmadhwaja, Veda-karttri, Vedanga, Vedavahana, Krita, Treta, Dwapara, Kali, full of every impurity, Kala, Kastha, Muhurtta, Kshapa, Yama, and Kshana;
Samvatsara-kara, Aswattha, Kalachakra, Bibhavasu, Purusha, Saswata, Yogin, Vyaktavyakta, Sanatana, Kaladhyaksha, Prajadhyaksha, Viswakarma, Tamounda,
Varuna, Sagara, Ansu, Jimuta, Jivana, Arihan, Bhutasraya, Bhutapati, Srastri, Samvartaka, Vanhi, Sarvadi, Alolupa, Ananta, Kapila, Bhanu, Kamada,
Sarvatomukha, Jaya, Visala, Varada, Manas, Suparna, Bhutadi, Sighraga, Prandharana, Dhanwantari, Dhumaketu, Adideva, Aditisuta, Dwadasatman, Aravindaksha,
Pitri, Matri, Pitamaha, Swarga-dwara, Prajadwara, Mokshadwara, Tripistapa, Dehakarti, Prasantatman, Viswatman, Viswatomukha, Characharatman, Sukhsmatman,
the merciful Maitreya. These are the hundred and eight names of Surya of immeasurable energy, as told by the self-create (Brahma). For the acquisition of prosperity,
I bow down to thee, O Bhaskara, blazing like unto gold or fire, who is worshipped of the gods and the Pitris and the Yakshas, and who is adored by Asuras, Nisacharas,
and Siddhas. He that with fixed attention reciteth this hymn at sunrise, obtaineth wife and offspring and riches and the memory of his former existence,
and by reciting this hymn a person attaineth patience and memory. Let a man concentrating his mind, recite this hymn. By doing so,
he shall be proof against grief and forest-fire and ocean and every object of desire shall be his.'
"Vaisampayana continued, 'Having heard from Dhaumya these words suitable to the occasion, Yudhishthira the just, with heart concentrated within itself and purifying it duly,
became engaged in austere meditation, moved by the desire of supporting the Brahmanas. And worshipping the maker of day with offerings of flowers and other articles,
the king performed his ablutions. And standing in the stream, he turned his face towards the god of day. And touching the water of the Ganges
the virtuous Yudhishthira with senses under complete control and depending upon air alone for his sustenance, stood there with rapt soul engaged in pranayama.
And having purified himself and restrained his speech, he began to sing the hymn of praise (to the sun).'
'Yudhishthira said, "Thou art, O sun, the eye of the universe. Thou art the soul of all corporeal existences. Thou art the origin of all things.
Thou art the embodiment of the acts of all religious men. Thou art the refuge of those versed in the Sankhya philosophy (the mysteries of the 1."""

In [13]:
ref_ans_emb = embmodel.encode(ref_ans)
ref_ans_Vector= torch.tensor(ref_ans_emb, dtype=torch.float) # 384 dim vector

In [14]:
input_text = f"question: {question} context: {context}"
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1500, truncation=True)

output_ids = model.generate(input_ids, max_length=150,)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: 'In days of old, all living beings that had been created were sorely afflicted with hunger. And like a father (unto all of them), Savita (the sun) took compassion upon them.


In [15]:
ans = "'In days of old, all living beings that had been created were sorely afflicted with hunger. And like a father (unto all of them), Savita (the sun) took compassion upon them."

pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.2162)
BLEU score -> 0.00231862282437237
rouge1 Score(precision=0.16129032258064516, recall=0.12195121951219512, fmeasure=0.1388888888888889)
rougeL Score(precision=0.12903225806451613, recall=0.0975609756097561, fmeasure=0.1111111111111111)


## Trick to over come tokenizer's max_length for token.

### Config 1

In [16]:
import torch
max_length = 256*2  # Maximum token limit of the model
stride = 256      # Adjust the stride based on your preference

# Split the input text into overlapping windows
windows = [input_text[i:i+max_length] for i in range(0, len(input_text), stride)]

# Encode each window and concatenate the results
input_ids_list = [tokenizer.encode(window, max_length=max_length, return_tensors="pt", truncation=True) for window in windows]
input_ids = torch.cat(input_ids_list, dim=1)

output_ids = model.generate(input_ids, max_length=150,)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: 'I am unable to support them,' said the king of the king of the kings. 'I can not abandon them, nor have I the power to offer them sustenance'


In [17]:
ans = "'I am unable to support them,' said the king of the king of the kings. 'I can not abandon them, nor have I the power to offer them sustenance'"

pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.3893)
BLEU score -> 0.0028014974866929204
rouge1 Score(precision=0.3448275862068966, recall=0.24390243902439024, fmeasure=0.2857142857142857)
rougeL Score(precision=0.20689655172413793, recall=0.14634146341463414, fmeasure=0.1714285714285714)


In [18]:
[[i,i+max_length] for i in range(0, len(input_text), stride)]

[[0, 512],
 [256, 768],
 [512, 1024],
 [768, 1280],
 [1024, 1536],
 [1280, 1792],
 [1536, 2048],
 [1792, 2304],
 [2048, 2560],
 [2304, 2816],
 [2560, 3072],
 [2816, 3328],
 [3072, 3584],
 [3328, 3840],
 [3584, 4096],
 [3840, 4352],
 [4096, 4608],
 [4352, 4864],
 [4608, 5120],
 [4864, 5376],
 [5120, 5632],
 [5376, 5888]]

### Config 2
This takes lot of time

In [19]:
import torch
max_length = 256*4  # Maximum token limit of the model
stride = 256      # Adjust the stride based on your preference

# Split the input text into overlapping windows
windows = [input_text[i:i+max_length] for i in range(0, len(input_text), stride)]

# Encode each window and concatenate the results
input_ids_list = [tokenizer.encode(window, max_length=max_length, return_tensors="pt", truncation=True) for window in windows]
input_ids = torch.cat(input_ids_list, dim=1)

output_ids = model.generate(input_ids, max_length=150,)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: 'Vaisampayana continued, 'after hearing from Dhaumya these words suitable to the occasion,' he said. 'I will tell thee everything in detail, 'how did that bull among the king, ''o king, ''o king, purifying thyself and withdrawing thy mind from every other thing' 'I will tell thee everything in detail,' he said. 'I will tell thee everything in detail


Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?   
Answer: 'Vaisampayana continued, 'after hearing from Dhaumya these words suitable to the occasion,' he said. 'I will tell thee everything in detail, 'how did that bull among the king, ''o king, ''o king, purifying thyself and withdrawing thy mind' 'i will tell thee everything in detail,' he said.

In [20]:
ans = "'Vaisampayana continued, 'after hearing from Dhaumya these words suitable to the occasion,' he said. 'I will tell thee everything in detail, 'how did that bull among the king, ''o king, ''o king, purifying thyself and withdrawing thy mind from every other thing' 'I will tell thee everything in detail,' he said. 'I will tell thee everything in detail"

pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.3525)
BLEU score -> 0.001148942723655602
rouge1 Score(precision=0.1206896551724138, recall=0.17073170731707318, fmeasure=0.14141414141414144)
rougeL Score(precision=0.08620689655172414, recall=0.12195121951219512, fmeasure=0.10101010101010101)


In [21]:
[[i,i+max_length] for i in range(0, len(input_text), stride)]

[[0, 1024],
 [256, 1280],
 [512, 1536],
 [768, 1792],
 [1024, 2048],
 [1280, 2304],
 [1536, 2560],
 [1792, 2816],
 [2048, 3072],
 [2304, 3328],
 [2560, 3584],
 [2816, 3840],
 [3072, 4096],
 [3328, 4352],
 [3584, 4608],
 [3840, 4864],
 [4096, 5120],
 [4352, 5376],
 [4608, 5632],
 [4864, 5888],
 [5120, 6144],
 [5376, 6400]]

### Config 3

In [22]:
import torch
max_length = 256*4  # Maximum token limit of the model
stride = 256*2      # Adjust the stride based on your preference

# Split the input text into overlapping windows
windows = [input_text[i:i+max_length] for i in range(0, len(input_text), stride)]

# Encode each window and concatenate the results
input_ids_list = [tokenizer.encode(window, max_length=max_length, return_tensors="pt", truncation=True) for window in windows]
input_ids = torch.cat(input_ids_list, dim=1)

output_ids = model.generate(input_ids, max_length=150,)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: 'Vaisampayana continued, 'after hearing from Dhaumya these words suitable to the occasion, he said, 'how did that bull among the Kurus, king Yudhishthira, for the sake of the Brahmanas adore the sun of wonderful appearance'


In [23]:
ans = "'Vaisampayana continued, 'after hearing from Dhaumya these words suitable to the occasion, he said, 'how did that bull among the Kurus, king Yudhishthira, for the sake of the Brahmanas adore the sun of wonderful appearance'"

pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.5413)
BLEU score -> 0.001812905385024148
rouge1 Score(precision=0.34285714285714286, recall=0.2926829268292683, fmeasure=0.31578947368421056)
rougeL Score(precision=0.2, recall=0.17073170731707318, fmeasure=0.1842105263157895)


In [24]:
[[i,i+max_length] for i in range(0, len(input_text), stride)]

[[0, 1024],
 [512, 1536],
 [1024, 2048],
 [1536, 2560],
 [2048, 3072],
 [2560, 3584],
 [3072, 4096],
 [3584, 4608],
 [4096, 5120],
 [4608, 5632],
 [5120, 6144]]

### Config 4

In [25]:
import torch
max_length = 256*4  # Maximum token limit of the model
stride = 256*3      # Adjust the stride based on your preference

# Split the input text into overlapping windows
windows = [input_text[i:i+max_length] for i in range(0, len(input_text), stride)]

# Encode each window and concatenate the results
input_ids_list = [tokenizer.encode(window, max_length=max_length, return_tensors="pt", truncation=True) for window in windows]
input_ids = torch.cat(input_ids_list, dim=1)

output_ids = model.generate(input_ids, max_length=150,)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: 'I will tell thee everything in detail, And, O illustriou, appoint thou a time.'


In [26]:
ans = "'I will tell thee everything in detail, And, O illustriou, appoint thou a time.'"
pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.1788)
BLEU score -> 0.004849860224029752
rouge1 Score(precision=0.07142857142857142, recall=0.024390243902439025, fmeasure=0.03636363636363637)
rougeL Score(precision=0.07142857142857142, recall=0.024390243902439025, fmeasure=0.03636363636363637)


In [27]:
[[i,i+max_length] for i in range(0, len(input_text), stride)]

[[0, 1024],
 [768, 1792],
 [1536, 2560],
 [2304, 3328],
 [3072, 4096],
 [3840, 4864],
 [4608, 5632],
 [5376, 6400]]

# AutoModelForQuestionAnswering

In [28]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

inputs = tokenizer(question, context, return_tensors="pt", padding=True, max_length=512, truncation=True)

with torch.no_grad():
    outputs = model(**inputs)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end+1])

print("Question:", question)
print("Answer:", answer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: What predicament does Yudhishthira face, and how does he seek guidance to resolve it?
Answer: unable to support them


In [29]:
ans = "unable to support them"
pred_ans_emb = embmodel.encode(ans)
pred_ans_Vector= torch.tensor(pred_ans_emb, dtype=torch.float)

print("Cosine Similarity", F.cosine_similarity(pred_ans_Vector, ref_ans_Vector,dim=-1))

chencherry = SmoothingFunction()
weights = [(1./2., 1./2.),
 (1./3., 1./3., 1./3.),
 (1./4., 1./4., 1./4., 1./4.),
   (1./5., 1./5., 1./5., 1./5., 1./5.)
]

print('BLEU score -> {}'.format(sentence_bleu(ref_ans, ans, smoothing_function=chencherry.method1), weights=weights))
scores = scorer.score(ref_ans, ans)
for k,v in scores.items():
  print(k,v)

Cosine Similarity tensor(0.1697)
BLEU score -> 0.01680450080670213
rouge1 Score(precision=0.75, recall=0.07317073170731707, fmeasure=0.13333333333333333)
rougeL Score(precision=0.75, recall=0.07317073170731707, fmeasure=0.13333333333333333)


# DistilBERT

In [36]:
from transformers import pipeline

question_answerer = pipeline("question-answering") #default model: distilbert-base-cased-distilled-squad, 261 MB

# context = "My name is Sylvain and I work at Hugging Face in Brooklyn"
answer = question_answerer(question=question, context=context,)
answer

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.15807096660137177,
 'start': 4880,
 'end': 4890,
 'answer': 'god of day'}

# roberta

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering", "deepset/roberta-base-squad2") #default model: distilbert-base-cased-distilled-squad, 496 MB

# context = "My name is Sylvain and I work at Hugging Face in Brooklyn"
answer = question_answerer(question=question, context=context,)
answer

# text2text generation pipeline

In [None]:
text2text_generator = pipeline("text2text-generation", model = "t5-small")

In [51]:
text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")

Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.33.3"
}



[{'generated_text': 'the answer to life, the universe and everything'}]

In [59]:
context = """"In former days, having placed the four Vedas on one side and the Bharata on the other,
these were weighed in the balance by the celestials assembled for that purpose. And as the latter weighed heavier than the four Vedas with their mysteries,
from that period it hath been called in the world Mahabharata (the great Bharata). Being esteemed superior both in substance and gravity of import
it is denominated Mahabharata on account of such substance and gravity of import. He that knoweth its meaning is saved from all his sins.
'Tapa is innocent, study is harmless, the ordinance of the Vedas prescribed for all the tribes are harmless,
the acquisition of wealth by exertion is harmless; but when they are abused in their practices it is then that they become sources of evil.'"
"""
question = "What is saved from sins?"
question = "What is evil?"
text2text_generator("question: "+ question+ "context: " +context, max_length=100)

Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.33.3"
}



[{'generated_text': 'sources of evil'}]

# GPT2

# MCQ with

In [6]:
from transformers import  AutoTokenizer, OPTForCausalLM
import torch
import numpy as np

class QAModel():

  def __init__(self, model_name="facebook/opt-1.3b", device='cuda'):  # 2.63GB Model
    self.model = OPTForCausalLM.from_pretrained(model_name).to(device)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.device = device

  def get_answer(self, q, options):
    scores = []
    for o in options:
      input = self.tokenizer(q+' '+o, return_tensors="pt").input_ids.to(self.device)
      o_input = self.tokenizer(o, return_tensors="pt").to(self.device)
      o_len = o_input.input_ids.size(1)
      target_ids = input.clone()
      target_ids[:, :-o_len] = -100
      with torch.no_grad():
          outputs = self.model(input, labels=target_ids)
          neg_log_likelihood = outputs[0]

      scores.append((-1*neg_log_likelihood.cpu()))
    args = np.argsort(scores)
    return options[args[-1]]


In [7]:
data = [{"question":"Where is capital of France?", "options":["London","Berlin","Paris","Lyon"]},
        {"question":"who is best known for developing the theory of relativity?","options":["Albert Einstein","Isaac Newton","Stephen Hawking","Max Planck"]},
        {"question":"Who is CEO of Tesla?","options":["Bill Gates","Elon Musk","Steve Jobs","Tim cook"]}]

qa_model = QAModel()
for d in data:
  answer = qa_model.get_answer(d['question'],d['options'])
  print('Question:',d['question'])
  print('Answer:',answer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Question: Where is capital of France?
Answer: Paris
Question: who is best known for developing the theory of relativity?
Answer: Albert Einstein
Question: Who is CEO of Tesla?
Answer: Elon Musk
