In [98]:
from langchain_community.llms import Ollama  # llm
# LLM handle used by every chain below; it targets the Ollama model tag "llama2:13b-chat".
llm = Ollama(model="llama2:13b-chat")

In [217]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for question/answer generation. It is wrapped in <<SYS>>/<</SYS>>
# and [INST]/[/INST] markers and has a single placeholder, {context}.
# NOTE(review): the prompt contains typos ("Alwasy", "Each questions",
# "30 token"). They are left untouched here because editing the literal changes
# the text sent to the model.
template = """
<<SYS>>
You are a helpful, respectful and honest assistant. Alwasy answer without explanation. You are given a context. Read all the context then respond to the user query. 
<</SYS>>

<context>
{context}
</context>

[INST]
User: Write 5 questions about the context and provide the answers. Each questions must be of maximum 30 token, also the answer. 
Always create the question thinking the context is in a database of various texts and the question must be recognizable fitting the provided context.
Always write Q for question and A for answers. Always write in one line.
[/INST]
"""

    

# Turn the template string into a prompt object, then pipe it into the LLM so
# the resulting `chain` can be called with a dict supplying `context`.
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm

In [218]:
# One-element list holding a sample passage about Napoleon.
# The leading spaces on the continuation lines are inside the triple-quoted
# string, so they are part of the text itself.
# NOTE(review): the passage still carries citation markers ("[1][b]");
# presumably copied from an encyclopedia article - confirm the source.
doc = ["""Napoleon Bonaparte (born Napoleone di Buonaparte;[1][b] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon I, was a French emperor and military commander who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars. 
        He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 until 1814, and briefly again in 1815. 
        His political and cultural legacy endures as a celebrated and controversial leader. 
        He initiated many enduring reforms, but has been criticized for his authoritarian rule. 
        He is considered one of the greatest military commanders in history and his wars and campaigns are still studied at military schools worldwide. 
        However, historians still debate the degree to which he was responsible for the Napoleonic Wars, in which between three and six million people died."""]

In [219]:
# Run the chain on the sample passage; `answer` holds the model's raw text reply.
answer = chain.invoke({"context": doc[0]})

In [223]:
answer

"\nQ: When was Napoleon Bonaparte born?\nA: Napoleone di Buonaparte; 15 August 1769\n\nQ: What was Napoleon's role during the French Revolution?\nA: First Consul from 1799 to 1804\n\nQ: What was Napoleon's regnal name?\nA: Emperor of the French\n\nQ: How many people died in the Napoleonic Wars?\nA: between three and six million people died.\n\nQ: Is Napoleon considered a controversial leader?\nA: Yes, he is celebrated but criticized for his authoritarian rule."

In [224]:
import re

def extract_QA(answer):
    """Parse ``Q: ...`` / ``A: ...`` pairs out of a model response.

    A pair is a ``Q: `` segment, a newline, then an ``A: `` segment. The
    answer segment ends at the first blank line, at the next line that starts
    with ``Q: ``, or at the end of the text, whichever comes first. Pairs may
    therefore be separated by a blank line or by a single newline.

    Parameters
    ----------
    answer : str
        Raw text returned by the model.

    Returns
    -------
    list[tuple[str, str]]
        ``(question, answer)`` tuples in order of appearance. Parenthesised
        ``(...)`` spans are removed from both fields; surrounding whitespace
        is left as-is. The list is empty when no pair is found.
    """
    # Match on the real text. Rewriting newlines to a literal backslash-n
    # before matching would leak those two characters into multi-line fields.
    # DOTALL lets a field span several lines; the lazy quantifiers stop at the
    # first valid terminator.
    pair_pattern = re.compile(r"Q: (.+?)\nA: (.+?)(?:\n\n|\n(?=Q: ))", re.DOTALL)

    # A trailing blank line lets the last pair terminate like all the others.
    text = answer + "\n\n"

    return [
        tuple(re.sub(r'\([^)]*\)', '', field) for field in match.groups())
        for match in pair_pattern.finditer(text)
    ]

In [225]:
extract_QA(answer)

[('When was Napoleon Bonaparte born?',
  'Napoleone di Buonaparte; 15 August 1769'),
 ("What was Napoleon's role during the French Revolution?",
  'First Consul from 1799 to 1804'),
 ("What was Napoleon's regnal name?", 'Emperor of the French'),
 ('How many people died in the Napoleonic Wars?',
  'between three and six million people died.'),
 ('Is Napoleon considered a controversial leader?',
  'Yes, he is celebrated but criticized for his authoritarian rule.')]

## SQuAD

In [61]:
from datasets import load_dataset
# Load the training split of the "squad" dataset (downloaded on first use).
train_ds = load_dataset("squad", split="train")

Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [62]:
train_ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

Up to 5 questions per passage.

In [243]:
train_ds[:10]

{'id': ['5733be284776f41900661182',
  '5733be284776f4190066117f',
  '5733be284776f41900661180',
  '5733be284776f41900661181',
  '5733be284776f4190066117e',
  '5733bf84d058e614000b61be',
  '5733bf84d058e614000b61bf',
  '5733bf84d058e614000b61c0',
  '5733bf84d058e614000b61bd',
  '5733bf84d058e614000b61c1'],
 'title': ['University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame'],
 'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is

In [237]:
# Second version of the QA-generation prompt, worded for a "dataset annotator"
# and asking for questions that avoid generic subjects. It keeps the same
# <<SYS>>/[INST] layout and the single {context} placeholder.
# NOTE(review): the prompt contains typos ("Alwasy", "Each questions",
# "30 token"). They are left untouched here because editing the literal changes
# the text sent to the model.
template = """
<<SYS>>
You are a helpful, respectful and honest dataset annotator. Alwasy answer without explanation. You are given a context. 
<</SYS>>

<context>
{context}
</context>

[INST]
User: Write 5 questions about the context and provide the answers. Each questions and answer must be of maximum 30 token.
Always create the question as you have other passages of different topic in a database, for example avoid generic subject.
Always write Q for question and A for answers. Always write in one line.
[/INST]
"""

# Rebinds `prompt` and `chain`: any later `chain.invoke(...)` uses this template.
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm

In [238]:
# Try the chain on one SQuAD row: prefix the passage with its title, ask the
# model for Q/A pairs, and show the parsed result as the cell output.
sample = train_ds[1]
answer = chain.invoke(
    {
        "context": "".join(
            ("This passage is about: ", sample["title"], ".", sample["context"])
        )
    }
)
extract_QA(answer)

[('Which university is known for its Catholic character and has a golden statue of the Virgin Mary on top of its Main Building?',
  'The University of Notre Dame.'),
 ('What is the name of the basilica located next to the Main Building of the University of Notre Dame?',
  'The Basilica of the Sacred Heart.'),
 ('What is the replica of the grotto at Lourdes, France, located behind the Basilica of the Sacred Heart?',
  'The Grotto.'),
 ('In which direction does the simple, modern stone statue of Mary face?',
  'The main drive.'),
 ('What is the legend inscribed on the statue of Christ with arms upraised in front of the Main Building?',
  '"Venite Ad Me Omnes" .')]

In [None]:
# Generate question/answer pairs for every row of the SQuAD training split.
# tqdm is imported in this cell so it runs on a fresh kernel without relying on
# a later cell having been executed first.
from tqdm.auto import tqdm

# new_qa[i] is the list of (question, answer) tuples generated for train_ds[i].
new_qa = []

for sample in tqdm(train_ds):
    # Build the context from the CURRENT row. Previously the chain was always
    # invoked on train_ds[0] and its reply was thrown away.
    # NOTE(review): the single-sample cell uses the prefix
    # 'This passage is about: ' - confirm which wording is intended.
    context = 'passage about: ' + sample['title'] + '.' + sample['context']
    new_qa.append(extract_QA(chain.invoke({"context": context})))

## Replicate Efficient Prompt Caching via Embedding Similarity


In [334]:
from tqdm.auto import tqdm