In [None]:
import warnings
# Hide DeprecationWarning messages so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.43-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloa

## Load dataset

In [None]:
import pandas as pd

In [None]:
# Location of the training set (JSON), relative to the notebook's working directory.
URL = "./data/dataset/math_train.json"

In [None]:
# Read the JSON file at URL with pandas.
data = pd.read_json(URL)

# Wrap the loaded data in a DataFrame; later cells read its "data" column.
df = pd.DataFrame(data)

# Preview the first ten rows.
df.head(10)

In [None]:
# Materialise the "data" column as a plain Python list of records.
arr_data = list(df["data"].values)

# Show the first record to see its structure.
print(arr_data[0])

## Translate from Vietnamese to English

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


model_name = "VietAI/envit5-translation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# envit5 expects every input to start with its source-language tag
# ("vi: " for Vietnamese, "en: " for English); the dataset is Vietnamese.
inputs = [
    "vi: " + arr_data[0]["question"],
    "vi: " + arr_data[0]["explanation"],
]

# The two inputs differ in length, so the batch is padded; pass the attention
# mask along so the padding positions are ignored during generation.
encoded_inputs = tokenizer(inputs, return_tensors="pt", padding=True)
outputs = model.generate(
    encoded_inputs.input_ids,
    attention_mask=encoded_inputs.attention_mask,
    max_length=512,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['en: VietAI is a non-profit organization with the mission of nurturing artificial intelligence talents and building an international - class community of artificial intelligence experts in Vietnam.', 'en: According to the latest LinkedIn report on the 2020 list of attractive and promising jobs, AI - related job titles such as AI Specialist, ML Engineer and ML Engineer all rank high.']


## Processing

In [None]:
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter


# The splitter measures chunk length with the tokenizer of this model.
llm = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(llm)

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=12,
    chunk_overlap=2,
    separators=["\n\n", "\n", ". "],
)


# Small sample text used to demonstrate the splitter.
section_text = (
    "Hello. This is some text to split. With a few "
    "uncharacteristic words to chunk, expecting 2 chunks."
)

texts = text_splitter.split_text(section_text)
print(texts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

['Hello. This is some text to split', '. With a few uncharacteristic words to chunk, expecting 2 chunks.']


In [None]:
# Print every chunk on its own line so the chunk boundaries are easy to see.
# NOTE: the loop variable `text` stays defined at notebook scope after this cell runs.
for text in texts:
  print(text)

Hello. This is some text to split
. With a few uncharacteristic words to chunk, expecting 2 chunks.


## Chunks are different from tokens

In [None]:
# Same sample text as in the splitter demo, this time broken into raw tokens.
section_text = (
    "Hello. This is some text to split. With a few "
    "uncharacteristic words to chunk, expecting 2 chunks."
)
encoded_text = tokenizer(section_text)
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
print(tokens)

['▁Hello', '.', '▁This', '▁is', '▁some', '▁text', '▁to', '▁split', '.', '▁With', '▁', 'a', '▁few', '▁un', 'character', 'istic', '▁words', '▁to', '▁chunk', ',', '▁expecting', '▁2', '▁chunk', 's', '.', '</s>']


## Using RAG for multiple-choice questions

### Embedding Vectors

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Hugging Face Hub id of the sentence-embedding model.
embedding_model_path = "BAAI/bge-small-en-v1.5"

# Load the model; later cells call embedding_model.encode() on the question and on the document chunks.
embedding_model = SentenceTransformer(embedding_model_path)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Embedding the question

In [None]:
# Example question; it is reused for retrieval, re-ranking and answer generation below.
question = "What is google bard?"

# Encode the question with the embedding model.
q_embeddings = embedding_model.encode(question)

In [None]:
# Inspect the shape of the question embedding.
q_embeddings.shape

(384,)

`q_embeddings` is a single vector of length 384. Other models may use longer vectors to capture relationships more accurately; see, for example, https://huggingface.co/spaces/mteb/leaderboard.

In [None]:
# Peek at the first ten components of the question embedding.
q_embeddings[:10]

array([-0.03703816, -0.07685136,  0.04402603, -0.05147361, -0.01734494,
       -0.05069026,  0.00479165,  0.03354503, -0.01825725,  0.01302368],
      dtype=float32)

## Collect and embed the Wiki documents

In [None]:
!pip install mwclient
!pip install mwparserfromhell

Collecting mwclient
  Downloading mwclient-0.10.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: mwclient
Successfully installed mwclient-0.10.1
Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mwparserfromhell
Successfully installed mwparserfromhell-0.6.6


In [None]:
import mwclient # for downloading example Wikipedia articles
import mwparserfromhell # for splitting Wikipedia articles into sections
import os
import pandas as pd
import re

### Collect documents

In [None]:
# Wikipedia category whose member pages are collected below.
CATEGORY_TITLE = "Category:2023 software"
# Host name of the wiki that mwclient connects to.
WIKI_SITE = "en.wikipedia.org"


def titles_from_category(
    category: mwclient.listing.Category, max_depth: int
) -> set[str]:
    """Collect the titles of all pages in a category, descending into subcategories.

    Args:
        category: the category whose members are enumerated.
        max_depth: how many levels of subcategories may still be followed;
            with 0 only the direct page members are collected.

    Returns:
        The set of page names found.
    """
    found = set()
    for member in category.members():
        # Exact type comparison on purpose: only objects whose type is exactly
        # Page count as pages here; subclasses are handled (or skipped) below.
        if type(member) == mwclient.page.Page:
            found.add(member.name)
            continue
        if max_depth > 0 and isinstance(member, mwclient.listing.Category):
            found |= titles_from_category(member, max_depth=max_depth - 1)
    return found


# Connect to the wiki and look up the category page by its title.
site = mwclient.Site(WIKI_SITE)
category_page = site.pages[CATEGORY_TITLE]
# Gather page titles from the category and from its direct subcategories.
titles = titles_from_category(category_page, max_depth=1)
# ^note: max_depth=1 means we go one level deep in the category tree
print(f"Found {len(titles)} article titles in {CATEGORY_TITLE}.")

  and should_run_async(code)


Found 420 article titles in Category:2023 software.


### Chunk documents

In [None]:
# define functions to split Wikipedia pages into sections

# Headings of sections that are dropped when a page is split into sections.
# Entries are compared against the heading text after its "=" and spaces are stripped.
SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]


def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list[str],
    sections_to_ignore: set[str],
) -> list[tuple[list[str], str]]:
    """
    From a Wikipedia section, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)

    Args:
        section: parsed wikicode of one section; its first heading is taken as the section title.
        parent_titles: titles of the enclosing page/sections, outermost first.
        sections_to_ignore: heading texts (without "=" wrapping) to drop;
            only membership tests are performed on it.

    Returns:
        An empty list when the section's heading is ignored, otherwise one
        (titles, text) tuple for this section followed by those of its subsections.
    """
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        # ^wiki headings are wrapped like "== Heading =="
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    # Split on the FIRST occurrence only: the heading string can appear again
    # further down (e.g. "== X ==" is a substring of "=== X ==="), and an
    # unlimited split would cut the body off at that second occurrence.
    section_text = full_text.split(title, 1)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        # Keep only the text that precedes the first nested heading.
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle, 1)[0]
        results = [(titles, section_text)]
        # Recurse into the sections whose heading level is len(titles) + 1.
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results


def all_subsections_from_title(
    title: str,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
    site_name: str = WIKI_SITE,
) -> list[tuple[list[str], str]]:
    """Download a wiki page by title and flatten it into (titles, text) tuples.

    The first tuple holds the page summary, i.e. the text before the first
    heading (or the whole page when it has no headings). It is followed by the
    tuples produced by all_subsections_from_section for every level-2 section.
    In each tuple:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    wiki = mwclient.Site(site_name)
    wikicode = mwparserfromhell.parse(wiki.pages[title].text())
    heading_strings = [str(h) for h in wikicode.filter_headings()]
    page_source = str(wikicode)
    summary_text = (
        page_source.split(heading_strings[0])[0] if heading_strings else page_source
    )
    flattened = [([title], summary_text)]
    for top_level_section in wikicode.get_sections(levels=[2]):
        flattened.extend(
            all_subsections_from_section(top_level_section, [title], sections_to_ignore)
        )
    return flattened

  and should_run_async(code)


In [None]:
# split pages into sections
# may take ~1 minute per 100 articles
# Accumulate the flattened sections of every collected page.
wikipedia_sections = []
for title in titles:
    wikipedia_sections += all_subsections_from_title(title)
print(f"Found {len(wikipedia_sections)} sections in {len(titles)} pages.")

  and should_run_async(code)


Found 2203 sections in 420 pages.


In [None]:
# clean text
def clean_section(section: tuple[list[str], str]) -> tuple[list[str], str]:
    """
    Return a cleaned up section with:
        - <ref>xyz</ref> patterns removed (including ones spanning several lines)
        - self-closing <ref ... /> tags removed
        - leading/trailing whitespace removed

    Args:
        section: a (titles, text) tuple.

    Returns:
        A (titles, text) tuple with the same titles object and the cleaned text.
    """
    titles, text = section
    # Remove self-closing references first. Otherwise the paired pattern below
    # would start at a self-closing tag and swallow all the prose up to the
    # next "</ref>".
    text = re.sub(r"<ref\b[^>]*/>", "", text)
    # DOTALL lets a reference body span multiple lines.
    text = re.sub(r"<ref\b[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
    text = text.strip()
    return (titles, text)


# Clean every collected section.
wikipedia_sections = list(map(clean_section, wikipedia_sections))

# filter out short/blank sections
def keep_section(section: tuple[list[str], str]) -> bool:
    """Tell whether a section has enough text (at least 16 characters) to be kept."""
    _titles, body = section
    return len(body) >= 16


# Remember the count before filtering so the number of dropped sections can be reported.
original_num_sections = len(wikipedia_sections)
wikipedia_sections = list(filter(keep_section, wikipedia_sections))
print(f"Filtered out {original_num_sections-len(wikipedia_sections)} sections, leaving {len(wikipedia_sections)} sections.")

Filtered out 87 sections, leaving 2116 sections.


  and should_run_async(code)


In [None]:
# print example data
# Show the title path and the first 77 characters of the first five sections.
for ws in wikipedia_sections[:5]:
    section_titles, section_body = ws
    print(section_titles)
    display(section_body[:77] + "...")
    print()

['BharOS']


  and should_run_async(code)


'{{Short description|Mobile operating system}}\n{{use mdy dates|date=January 20...'


['BharOS', '==History==']


'[[Google]] is facing a crackdown from the [[Competition Commission of India]]...'


['BharOS', '== Features ==']


'BharOS targets security-conscious groups. BharOS does not come with any prein...'


['BharOS', '== Criticism ==']


'Divya Bhati writing for [[India Today]] noted that instructions on downloadin...'


['IPadOS 17']


'{{Short description|2023 tablet operating system by Apple Inc.}}\n{{More citat...'




### Split long sections into smaller ones

In [None]:
def num_tokens(text: str) -> int:
    """Count the token ids the notebook-level `tokenizer` produces for `text`."""
    # One token string exists per input id, so counting the ids gives the same number.
    return len(tokenizer(text)["input_ids"])


def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str, str]:
    """Cut a string into two parts at a delimiter, aiming for similar token counts.

    Returns [string, ""] when the delimiter does not occur, the two pieces
    when it occurs exactly once, and otherwise [left, right] split at the
    piece boundary chosen by the search below.
    """
    pieces = string.split(delimiter)
    if len(pieces) == 1:
        # delimiter absent
        return [string, ""]
    if len(pieces) == 2:
        # only one possible cut
        return pieces

    target = num_tokens(string) // 2
    smallest_gap = target
    cut = 0
    # Grow the left part one piece at a time; stop as soon as the distance to
    # the halfway token count no longer shrinks.
    for cut in range(len(pieces)):
        prefix_tokens = num_tokens(delimiter.join(pieces[: cut + 1]))
        gap = abs(target - prefix_tokens)
        if gap >= smallest_gap:
            break
        smallest_gap = gap
    return [delimiter.join(pieces[:cut]), delimiter.join(pieces[cut:])]


def truncated_string(
    string: str,
    max_tokens: int,
    print_warning: bool = True,
) -> str:
    """Truncate a string to a maximum number of tokens.

    Args:
        string: the text to truncate.
        max_tokens: number of leading tokens to keep.
        print_warning: when True, print a message if tokens were dropped.

    Returns:
        The string rebuilt from the first `max_tokens` tokens that the
        notebook-level `tokenizer` produces for `string`.
    """
    # Tokenize the `string` argument; reading an unrelated notebook-level
    # variable here would truncate the wrong text.
    encoded_text = tokenizer(string)
    tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'])
    shortened = tokenizer.convert_tokens_to_string(tokens[:max_tokens])
    if print_warning and len(tokens) > max_tokens:
        print(f"Warning: Truncated string from {len(tokens)} tokens to {max_tokens} tokens.")
    return shortened


def split_strings_from_subsection(
    subsection: tuple[list[str], str],
    max_tokens: int = 1000,
    max_recursion: int = 5,
) -> list[str]:
    """
    Turn one subsection into a list of strings of at most max_tokens tokens each.

    A subsection is a tuple of parent titles [H1, H2, ...] and text (str).
    Every returned string consists of the titles and (part of) the text,
    joined by blank lines.
    """
    titles, text = subsection
    string = "\n\n".join(titles + [text])

    # Short enough already: nothing to split.
    if num_tokens(string) <= max_tokens:
        return [string]

    # Recursion budget used up: fall back to truncation.
    if max_recursion == 0:
        return [truncated_string(string, max_tokens=max_tokens)]

    # Try coarse delimiters first, then progressively finer ones.
    for delimiter in ("\n\n", "\n", ". "):
        left, right = halved_by_delimiter(text, delimiter=delimiter)
        if left == "" or right == "":
            # an empty half means this delimiter gave no usable split
            continue
        pieces = []
        for half in (left, right):
            pieces.extend(
                split_strings_from_subsection(
                    (titles, half),
                    max_tokens=max_tokens,
                    max_recursion=max_recursion - 1,
                )
            )
        return pieces

    # No delimiter produced two non-empty halves (should be very rare): truncate.
    return [truncated_string(string, max_tokens=max_tokens)]

  and should_run_async(code)


In [None]:
# split sections into chunks
# Upper bound on tokens per chunk, as counted by num_tokens().
# NOTE(review): the count comes from the notebook-level `tokenizer`, while the
# chunks are later embedded by `embedding_model` - confirm this limit suits
# the embedding model's maximum input length.
MAX_TOKENS = 1600
wikipedia_strings = []
for section in wikipedia_sections:
    wikipedia_strings += split_strings_from_subsection(section, max_tokens=MAX_TOKENS)

print(f"{len(wikipedia_sections)} Wikipedia sections split into {len(wikipedia_strings)} strings.")

  and should_run_async(code)
Token indices sequence length is longer than the specified maximum sequence length for this model (889 > 512). Running this sequence through the model will result in indexing errors


2116 Wikipedia sections split into 2171 strings.


In [None]:
# Show one chunk (titles followed by text) as an example.
print(wikipedia_strings[2])

NameError: name 'wikipedia_strings' is not defined

### Embed document chunks

In [None]:
BATCH_SIZE = 1000  # number of strings passed to embedding_model.encode() per call

embeddings = []
for batch_start in range(0, len(wikipedia_strings), BATCH_SIZE):
    batch = wikipedia_strings[batch_start : batch_start + BATCH_SIZE]
    # Derive the end from the real batch length so the progress message does
    # not claim more items than exist in the final, shorter batch.
    batch_end = batch_start + len(batch)
    print(f"Batch {batch_start} to {batch_end-1}")
    # Iterating over the encode() result yields one embedding per input string.
    embeddings.extend(embedding_model.encode(batch))

# One row per chunk: the chunk text and its embedding.
df = pd.DataFrame({"text": wikipedia_strings, "embedding": embeddings})

Batch 0 to 999


  and should_run_async(code)


Batch 1000 to 1999
Batch 2000 to 2999


In [None]:
# Display the DataFrame of chunks and embeddings.
df

NameError: name 'df' is not defined

In [None]:
# Persist the chunk table next to the notebook, without the index column.
# NOTE(review): the "embedding" column holds array objects; CSV presumably stores
# them as their text representation - confirm that any reader parses them back.
SAVE_PATH = "./wiki_data.csv"
df.to_csv(SAVE_PATH, index=False)

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as the dot product divided by the product of the two norms.
    """
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator

  and should_run_async(code)


In [None]:
# Cosine similarity between the question embedding and every chunk embedding.
sim_scores = [
    cos_sim(q_embeddings, chunk_embedding)
    for chunk_embedding in df["embedding"].values
]

  and should_run_async(code)


In [None]:
# Store the similarity scores as a new column, aligned row by row with the chunks.
df["sim_score"] = sim_scores

  and should_run_async(code)


In [None]:
# Display the table, now including the sim_score column.
df

  and should_run_async(code)


Unnamed: 0,text,embedding,sim_score
0,BharOS\n\n{{Short description|Mobile operating...,"[-0.032071378, -0.0075786696, -0.023488378, -0...",0.528278
1,BharOS\n\n==History==\n\n[[Google]] is facing ...,"[-0.02651798, 0.019908803, -0.01595337, -0.053...",0.562429
2,BharOS\n\n== Features ==\n\nBharOS targets sec...,"[-0.01931546, -0.04405502, -0.023263965, -0.07...",0.506736
3,BharOS\n\n== Criticism ==\n\nDivya Bhati writi...,"[-0.030464966, 0.034087025, -0.077004135, -0.0...",0.522346
4,IPadOS 17\n\n{{Short description|2023 tablet o...,"[-0.027212387, 0.0035852494, 0.033611406, -0.0...",0.504371
...,...,...,...
2166,Mittens (chess)\n\n== Release ==\n\nMittens wa...,"[-0.038147595, -0.015718762, 0.010082621, 0.00...",0.548787
2167,Mittens (chess)\n\n== Design ==\n\n[[File:Anat...,"[-0.07589061, 0.05388109, -0.056542512, -0.015...",0.450230
2168,Mittens (chess)\n\n== Rating ==\n\nOn Chess.co...,"[-0.049838375, 0.0046913642, -0.012603863, -0....",0.481857
2169,Mittens (chess)\n\n== Games ==\n\n[[File:Nakam...,"[-0.026671927, 0.01886133, -0.03659197, -0.004...",0.487241


In [None]:
# Ten chunks with the highest similarity to the question, best first.
df.sort_values(by=["sim_score"], ascending=False).head(10)

  and should_run_async(code)


Unnamed: 0,text,embedding,sim_score
217,Gemini (chatbot)\n\n== History ==\n\n=== Annou...,"[-0.025718477, -0.006157542, -0.020902231, 0.0...",0.738711
220,Gemini (chatbot)\n\n== History ==\n\n=== Updat...,"[-0.07428802, -0.047069218, -0.023829449, 0.00...",0.735163
219,Gemini (chatbot)\n\n== History ==\n\n=== Launc...,"[-0.049282406, -0.026964901, -0.023209117, -0....",0.689138
218,Gemini (chatbot)\n\n== History ==\n\n=== Annou...,"[-0.03237752, -0.041373145, -0.056740936, 0.01...",0.669357
221,Gemini (chatbot)\n\n== History ==\n\n=== Relau...,"[-0.05726068, -0.03039344, -0.008372254, -0.01...",0.658067
222,Gemini (chatbot)\n\n== Reception ==\n\n=== Cri...,"[-0.044062547, -0.031390358, -0.057875026, 0.0...",0.653904
1059,Microsoft Copilot\n\n== Background ==\n\nIn 20...,"[-0.0076226885, -0.007827519, -0.034073673, -0...",0.624126
1951,The Lord of the Rings: Gollum\n\n==Development...,"[-0.06063247, -0.02548961, 0.007274697, -0.018...",0.622412
119,Artifact (app)\n\n== Features ==\n\nFrequently...,"[-0.018608943, 0.0018425666, -0.009776174, -0....",0.617938
290,GPT-4\n\n== Usage ==\n\n===Other usage===\n\n*...,"[-0.036121324, -0.012729045, -0.031961937, -0....",0.607891


## Re-ranking

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face Hub id of the re-ranking model.
rerank_model_path = "BAAI/bge-reranker-base"

rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_path)
rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_model_path)
# Put the model into evaluation mode.
rerank_model.eval()

def calculate_rerank_scores(pairs, batch_size=32):
    """Score (question, passage) pairs with the re-ranking model.

    The pairs are processed in slices of `batch_size`, so peak memory does not
    grow with the total number of pairs.

    Args:
        pairs: sequence of (query, passage) pairs.
        batch_size: number of pairs tokenized and scored per forward pass.

    Returns:
        A 1-D float tensor with one score (the model's logit) per pair, in
        input order; an empty tensor when `pairs` is empty.
    """
    pairs = list(pairs)
    if not pairs:
        return torch.empty(0)
    batch_scores = []
    with torch.no_grad():
        for start in range(0, len(pairs), batch_size):
            inputs = rerank_tokenizer(pairs[start:start + batch_size], padding=True,
                                      truncation=True,
                                      return_tensors='pt', max_length=512)
            logits = rerank_model(**inputs, return_dict=True).logits
            batch_scores.append(logits.view(-1, ).float())
    return torch.cat(batch_scores)

# Pair the question with every chunk and score each pair with the re-ranker.
pairs = [(question, chunk) for chunk in df["text"].values]
rerank_scores = calculate_rerank_scores(pairs)
# Convert the tensor to a NumPy array explicitly so the column holds plain
# floats regardless of how the installed pandas version treats tensor objects.
df["rerank_score"] = rerank_scores.cpu().numpy()

  and should_run_async(code)


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

## Building the context

In [None]:
# Texts of the ten chunks most similar to the question, best first.
# NOTE(review): the ranking uses sim_score; the rerank_score column is not
# consulted here - confirm that this is intended.
top_chunks = df.sort_values(by=["sim_score"], ascending=False).head(10)["text"].values
context_arr = list(top_chunks)

In [None]:
# Separate the chunks with a blank line; joining them with nothing would fuse
# the last word of one chunk with the title that starts the next one.
context = "\n\n".join(context_arr)

context

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

llm_answer_path = "HuggingFaceH4/zephyr-7b-beta"
torch_device = "cuda"
# NOTE: this rebinds the notebook-level name `tokenizer` to the answer model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(llm_answer_path)
llm_answer = AutoModelForCausalLM.from_pretrained(llm_answer_path,
             device_map=torch_device,
             torch_dtype=torch.float16)

# assuming here that "context" contains the pre-built context
query = (
    "answer the following question, "
    "based on your knowledge and the provided context. "
    "Keep the answer concise.\n\nquestion:" + question + "\n\ncontext:" + context
)

input_ids = tokenizer.encode(query+"\n\nANSWER:",
   return_tensors='pt', return_attention_mask=False).to(torch_device)
# Greedy decoding (no sampling), as the variable name says; this also makes
# the answer reproducible from run to run.
greedy_output = llm_answer.generate(input_ids,
                                 max_new_tokens=1024, do_sample=False)
# Full decoded text: the prompt followed by the generated continuation.
answer = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
# Print only the newly generated tokens. Slicing the decoded text by
# len(query) is fragile: decoding need not reproduce the prompt character
# for character, and it would leave the "ANSWER:" marker in the output.
generated_ids = greedy_output[0][input_ids.shape[-1]:]
print(tokenizer.decode(generated_ids, skip_special_tokens=True))