In [None]:
#pip install transformers trl accelerate torch bitsandbytes peft datasets -qU
#pip install scipy
#torch==2.1.2
#transformers==4.36.0
#trl==0.7.4
#accelerate==0.25.0
#bitsandbytes==0.41.3.post2
#peft==0.7.0
#datasets==2.15.0
#scipy==1.11.4
#sentencepiece==0.1.99

In [16]:
from py_standard.langchain_lit import load_all_documents

# Load the documents found under ./data-user/. The second argument (7000) is
# presumably a chunk-size limit — TODO confirm against load_all_documents.
docs = load_all_documents('./data-user/', 1000 * 7)

def read_page_contents_from_docs(docs):
   """Lazily yield the ``page_content`` attribute of every item in ``docs``."""
   yield from (entry.page_content for entry in docs)

In [None]:
#pip install -U spacy
#python -m spacy download en_core_web_lg==3.7.1
import spacy

# Load the 'en_core_web_lg' spaCy pipeline once; read_file_segments relies on
# this module-level object to split text into sentences.
nlp = spacy.load('en_core_web_lg')

def read_file_segments(file_path):
   """Yield the text of each sentence found in a UTF-8 encoded text file.

   The whole file is read into memory, parsed with the module-level spaCy
   pipeline ``nlp``, and the text of every span in ``doc.sents`` is yielded
   in order.
   """
   with open(file_path, 'r', encoding='utf-8') as source:
      contents = source.read()
   yield from (span.text for span in nlp(contents).sents)

# Smoke test: split one sample document into sentences and display the full list.
a = list(read_file_segments('./documents/live-baccarat-doc.md'))
a

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from IPython.display import display, Markdown, Latex

# bitsandbytes quantization settings passed to from_pretrained below:
# 4-bit weights using the "nf4" quantization type, double quantization
# enabled, and bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Local checkpoint directory (plain string: there is nothing to interpolate).
MODEL_PATH = "../models/Mistral-7B-Instruct-v0.2"

# Load the causal LM with the 4-bit config defined in `nf4_config`.
# `local_files_only=True` means only files already on disk are used, and
# `device_map='auto'` leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=nf4_config,
    device_map='auto',
    local_files_only=True,
    #trust_remote_code=False,
    use_cache=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Load the tokenizer from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse the end-of-sequence token for padding (presumably because this
# tokenizer defines no dedicated pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

In [29]:
def ask1():
   """Run a fixed three-turn demo conversation through the model.

   Uses the module-level ``model`` and ``tokenizer``. Returns the decoded
   text of the first generated sequence; the text is decoded without
   stripping, so it still contains the prompt and special tokens.
   """
   conversation = [
      {"role": "user", "content": "What is your favourite condiment?"},
      {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
      {"role": "user", "content": "Do you have mayonnaise recipes?"}
   ]
   prompt_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
   output_ids = model.generate(
      prompt_ids,
      max_new_tokens=1000,
      do_sample=True,
      pad_token_id=tokenizer.pad_token_id,
   )
   return tokenizer.batch_decode(output_ids)[0]

# Run the demo conversation and display the raw decoded output.
answer = ask1()
answer
   

"<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> [INST] Do you have mayonnaise recipes? [/INST]While I don't have the ability to create or prepare recipes myself, I can certainly help you find one for mayonnaise! Here's a classic and simple Homemade Mayonnaise recipe you can try:\n\nIngredients:\n- 1 cup (240 ml) light tasting oil (like canola, safflower, or vegetable oil)\n- 1 large egg yolk\n- 1 tablespoon (15 ml) white wine vinegar or other mild vinegar\n- 1 teaspoon (5 g) Dijon mustard\n- 1/2 teaspoon (3 g) Kosher salt\n- 1/2 teaspoon (3 g) Freshly ground black pepper\n\nInstructions:\n1. Set up a large bowl with an immersion blender. This will make the process simpler. However, you can also use a regular blender or a whisk.\n2. Add the egg yolk, vinegar, Dijon mustard, salt, and pepper to the bowl.\n3. With the immersion 

In [17]:
def lstrip_inst(text):
   """Return what follows the last ``[/INST]`` marker, left-stripped.

   When the marker does not occur, ``text`` is returned unchanged.
   """
   _, marker, tail = text.rpartition("[/INST]")
   # rpartition leaves the separator slot empty when nothing matched.
   if not marker:
      return text
   return tail.lstrip()

def rstrip_s(text):
   """Return everything before the last ``</s>`` marker.

   When the marker does not occur, ``text`` is returned unchanged.
   """
   head, marker, _ = text.rpartition("</s>")
   # An empty separator slot means the marker was not found.
   return head if marker else text


def ask(user_input, max_new_tokens=1000):
   """Send one user message to the chat model and return only its reply text.

   Uses the module-level ``model`` and ``tokenizer``.

   Args:
      user_input: Text of the single user turn.
      max_new_tokens: Upper bound on the number of generated tokens
         (defaults to 1000, the value previously hard-coded).

   Returns:
      The generated reply with special tokens removed and leading
      whitespace stripped.
   """
   messages = [
      {"role": "user", "content": user_input}
   ]
   prompt_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
   generated_ids = model.generate(prompt_ids,
                                  max_new_tokens=max_new_tokens,
                                  do_sample=True,
                                  pad_token_id=tokenizer.pad_token_id)
   # The generated sequence starts with the prompt tokens. Drop them by
   # position rather than searching the decoded text for "[/INST]" / "</s>",
   # which gives wrong results if the reply itself contains those markers.
   reply_ids = generated_ids[:, prompt_ids.shape[-1]:]
   reply = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
   return reply.lstrip()

"I don't have a name. I'm just a computer program designed to help answer questions. You can call me AI or my friend, the assistant. How may I help you today?"

In [None]:
# Quick check of ask(): send a single question and display the reply.
answer = ask("what is your name?")
answer

In [22]:
def extract_markdown_tables_from_content(page_content):
   """Ask the model to describe the Markdown tables found in ``page_content``.

   The content is placed above a separator line, followed by the extraction
   instruction; the model's reply from ``ask`` is returned as-is.
   """
   instructions = """{content}
   ----------
   Extract the Markdown table data and its related content from the above content,
   and directly describe the data and related content in English.
   """
   return ask(instructions.format(content=page_content))

def generate_qa_from_markdown_tables(markdown_content):
   """Ask the model for question/answer pairs about the tables in ``markdown_content``.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{markdown_content}
   ----------
   Based on the above content, extract the Markdown table data and its related content,
   and generate corresponding questions and answers directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [23]:
def generate_qa_from_markdown_table_rows(markdown_content):
   """Ask the model for question/answer pairs for each table row in ``markdown_content``.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{markdown_content}
   ----------
   Based on the above content,
   extract each row of data from the Markdown table data,
   and generate corresponding questions and answers for each individual row directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [24]:
def generate_qa_from_extract_terms(page_content):
   """Ask the model for question/answer pairs about the common terms in ``page_content``.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{page_content}
   ----------
   Extract all the common terms from the above content and generate corresponding questions and answers for each one directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [25]:
def generate_qa_from_extract_summary(page_content):
   """Ask the model to summarize ``page_content`` and produce question/answer pairs.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{page_content}
   ----------
   Summarize the above content and then generate corresponding questions and answers directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)

In [26]:
def generate_qa_from_extract_segments(page_content):
   """Ask the model for question/answer pairs per paragraph-level segment of ``page_content``.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{page_content}
   ----------
   Segment the above content into appropriate paragraphs,
   summarize each segment, Do not output summary, and then generate corresponding questions and answers for each segment directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [27]:
def generate_qa_from_extract_values(page_content):
   """Ask the model for question/answer pairs about number-related sentences in ``page_content``.

   Returns the reply from ``ask``; the prompt requests the
   ``Question:`` / ``Answer:`` response format.
   """
   return ask(f"""{page_content}
   ----------
   Extract each sentence related to numbers from the above content and generate corresponding question-answer pairs directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)

In [18]:
# Page content of the first loaded document, used as sample input for the
# summary experiment that follows.
docs1 = docs[0].page_content

In [None]:
# Try the summary-based Q&A prompt on the sample content and display the reply.
answer = generate_qa_from_extract_summary(docs1)
answer

In [55]:
def write_qa_file(file_path, text, mode="a"):
   """Write ``text`` to ``file_path`` as UTF-8.

   ``mode`` is handed straight to ``open``; the default ``"a"`` appends,
   while ``"w"`` truncates the file first.
   """
   with open(file_path, mode, encoding='utf-8') as out:
      out.write(text)
      out.flush()
      
def append_delimit(content):
   """Return ``content`` followed by a ``##########`` delimiter line.

   A line break is inserted between the content and the delimiter when the
   content does not already end with a newline, so the delimiter always
   starts on its own line. (Previously the line break was put in front of
   the content, leaving the delimiter glued to the last line of text.)
   """
   separator = "" if content.endswith('\n') else "\r\n"
   return f"{content}{separator}##########\r\n"
   
def append_qa_content(sub_qa, content):
   """Return ``content`` with the delimited form of ``sub_qa`` appended to it."""
   return content + append_delimit(sub_qa)
      
def generate_qa_from_page_content(page_content, write_qa_file_fn=None):
   """Run every Q&A prompt over ``page_content`` and return the combined text.

   Six Q&A sections are generated in order: summary, segments, terms,
   Markdown tables, Markdown table rows, and numeric values. The two table
   prompts work on the model's table description of the page rather than
   on the raw page content.

   Each section is passed through ``append_delimit`` and appended to the
   returned string. When ``write_qa_file_fn`` is given, it is also called
   with each delimited section as soon as that section is produced.
   """
   sections = []

   def record(sub_qa):
      delimited = append_delimit(sub_qa)
      if write_qa_file_fn is not None:
         write_qa_file_fn(delimited)
      sections.append(delimited)

   page_level_prompts = (
      generate_qa_from_extract_summary,
      generate_qa_from_extract_segments,
      generate_qa_from_extract_terms,
   )
   for make_qa in page_level_prompts:
      record(make_qa(page_content))

   # The table prompts share one table description extracted from the page.
   markdown_content = extract_markdown_tables_from_content(page_content)
   for make_qa in (generate_qa_from_markdown_tables, generate_qa_from_markdown_table_rows):
      record(make_qa(markdown_content))

   record(generate_qa_from_extract_values(page_content))
   return "".join(sections)


In [56]:
qa_file = "./results/llm-qa1.md"

# Start from an empty results file once, before the loop, so a rerun never
# appends to stale output (this also holds when `docs` is empty).
write_qa_file(qa_file, "", mode="w")

# Each generated Q&A section is appended to the file as soon as it is
# produced, so partial results survive an interrupted run.
for doc in docs:
   generate_qa_from_page_content(doc.page_content,
                                 lambda content: write_qa_file(qa_file, content))




