In [None]:
#pip install transformers trl accelerate torch bitsandbytes peft datasets -qU
#pip install scipy
#torch==2.1.2
#transformers==4.36.0
#trl==0.7.4
#accelerate==0.25.0
#bitsandbytes==0.41.3.post2
#peft==0.7.0
#datasets==2.15.0
#scipy==1.11.4
#sentencepiece==0.1.99

In [98]:
from py_standard.langchain_lit import load_all_documents

# Load every document found under ./data-user/.
# NOTE(review): the numeric arguments are presumably chunk size (2000) and
# chunk overlap (500) — confirm against py_standard.langchain_lit.load_all_documents.
docs = load_all_documents('./data-user/', 1000 * 2, 500)

def read_page_contents_from_docs(docs):
   """Lazily yield the ``page_content`` attribute of each item in ``docs``, in order."""
   yield from (entry.page_content for entry in docs)

In [None]:
#pip install -U spacy
#python -m spacy download en_core_web_lg==3.7.1
import spacy

# Module-level spaCy pipeline; read_file_segments relies on it for sentence splitting.
nlp = spacy.load('en_core_web_lg')

def read_file_segments(file_path):
   """Yield the text of every sentence found in a UTF-8 text file.

   The whole file is read at once and parsed with the module-level ``nlp``
   pipeline; sentences come from the parsed document's ``sents``.
   """
   with open(file_path, 'r', encoding='utf-8') as handle:
      contents = handle.read()
   parsed = nlp(contents)
   yield from (span.text for span in parsed.sents)

# Materialize the sentence generator for one sample document and display the list.
a = list(read_file_segments('./documents/live-baccarat-doc.md'))
a

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from IPython.display import display, Markdown, Latex

# bitsandbytes settings passed to from_pretrained when the model is loaded:
# 4-bit weights using the "nf4" quant type with double quantization enabled,
# and bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
# Checkpoint directory, relative to the notebook's working directory.
MODEL_PATH = f"../models/Mistral-7B-Instruct-v0.2"
# Load the causal LM with nf4_config as its quantization config.
# local_files_only=True restricts loading to files already on disk;
# device_map='auto' leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=nf4_config,
    device_map='auto',
    local_files_only=True,
    #trust_remote_code=False,
    use_cache=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Tokenizer from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse the EOS token as the padding token; ask()/ask1() pass its id to generate().
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of sequences.
tokenizer.padding_side = "right"

In [29]:
def ask1():
   """Run a fixed three-turn demo conversation through the model.

   Uses the module-level ``model`` and ``tokenizer``. Sampling is enabled, so
   the reply varies between calls.

   Returns:
      The first decoded sequence exactly as ``tokenizer.batch_decode`` yields
      it; nothing is stripped from it.
   """
   conversation = [
      {"role": "user", "content": "What is your favourite condiment?"},
      {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
      {"role": "user", "content": "Do you have mayonnaise recipes?"},
   ]
   prompt_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
   output_ids = model.generate(
      prompt_ids,
      max_new_tokens=1000,
      do_sample=True,
      pad_token_id=tokenizer.pad_token_id,
   )
   return tokenizer.batch_decode(output_ids)[0]

answer = ask1()
answer
   

"<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> [INST] Do you have mayonnaise recipes? [/INST]While I don't have the ability to create or prepare recipes myself, I can certainly help you find one for mayonnaise! Here's a classic and simple Homemade Mayonnaise recipe you can try:\n\nIngredients:\n- 1 cup (240 ml) light tasting oil (like canola, safflower, or vegetable oil)\n- 1 large egg yolk\n- 1 tablespoon (15 ml) white wine vinegar or other mild vinegar\n- 1 teaspoon (5 g) Dijon mustard\n- 1/2 teaspoon (3 g) Kosher salt\n- 1/2 teaspoon (3 g) Freshly ground black pepper\n\nInstructions:\n1. Set up a large bowl with an immersion blender. This will make the process simpler. However, you can also use a regular blender or a whisk.\n2. Add the egg yolk, vinegar, Dijon mustard, salt, and pepper to the bowl.\n3. With the immersion 

In [121]:
def lstrip_inst(text):
   """Return the text after the last ``[/INST]`` marker with leading whitespace removed.

   When the marker does not occur, ``text`` is returned unchanged.
   """
   _, marker, tail = text.rpartition("[/INST]")
   return tail.lstrip() if marker else text

def rstrip_s(text):
   """Return the text before the last ``</s>`` marker, or ``text`` unchanged if the marker is absent."""
   head, marker, _ = text.rpartition("</s>")
   return head if marker else text


def ask(user_input, max_new_tokens=2000):
   """Send one user message to the model and return the cleaned-up reply.

   Uses the module-level ``model`` and ``tokenizer``. Sampling is enabled, so
   repeated calls with the same input can return different text.

   Args:
      user_input: Content of the single user turn.
      max_new_tokens: Upper bound on generated tokens, forwarded to ``generate``.

   Returns:
      The first decoded sequence after ``lstrip_inst`` (keep what follows the
      last ``[/INST]``) and ``rstrip_s`` (drop the last ``</s>`` and anything
      after it) have been applied, in that order.
   """
   chat = [{"role": "user", "content": user_input}]
   prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
   output_ids = model.generate(
      prompt_ids,
      max_new_tokens=max_new_tokens,
      do_sample=True,
      pad_token_id=tokenizer.pad_token_id,
   )
   decoded = tokenizer.batch_decode(output_ids)[0]
   return rstrip_s(lstrip_inst(decoded))

In [None]:
answer = ask("what is your name?")
answer

In [86]:
def modify_qa_content(content):
   """Ask the model to put ``content`` into 'Question: Answer:' form.

   The prompt tells the model to keep content that already has that format
   and to reformat Q&A content otherwise. Returns the reply from ``ask``.
   """
   # Written line by line so the trailing spaces that are part of the prompt stay visible.
   prompt = (
      f"{content}\n"
      "   ----------\n"
      "   If the content above matches the format 'Question: Answer:', \n"
      "   maintain the original format. \n"
      "   Otherwise, modify the content related to Q&A to the format 'Question: Answer:'.\n"
      "   "
   )
   return ask(prompt)

In [22]:
def extract_markdown_tables_from_content(page_content):
   """Ask the model to extract the Markdown tables (and related text) from ``page_content`` and describe them in English.

   Returns the reply from ``ask``.
   """
   template = """{page_content}
   ----------
   Extract the Markdown table data and its related content from the above content,
   and directly describe the data and related content in English.
   """
   return ask(template.format(page_content=page_content))

def generate_qa_from_markdown_tables(markdown_content):
   """Ask the model for question/answer pairs covering the Markdown tables in ``markdown_content``.

   Returns the reply from ``ask``.
   """
   return ask(f"""{markdown_content}
   ----------
   Based on the above content, extract the Markdown table data and its related content,
   and generate corresponding questions and answers directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [23]:
def generate_qa_from_markdown_table_rows(markdown_content):
   """Ask the model for one question/answer pair per Markdown table row in ``markdown_content``.

   Returns the reply from ``ask``.
   """
   return ask(f"""{markdown_content}
   ----------
   Based on the above content,
   extract each row of data from the Markdown table data,
   and generate corresponding questions and answers for each individual row directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [24]:
def generate_qa_from_extract_terms(page_content):
   """Ask the model to pull the common terms out of ``page_content`` and write a question/answer pair for each.

   Returns the reply from ``ask``.
   """
   return ask(f"""{page_content}
   ----------
   Extract all the common terms from the above content and generate corresponding questions and answers for each one directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)


In [25]:
def generate_qa_from_extract_summary(page_content):
   """Generate summary-based Q&A for ``page_content`` in 'Question: Answer:' form.

   Two model calls are made: the first summarizes the content and produces
   questions and answers, the second (``modify_qa_content``) normalizes that
   reply to the 'Question: Answer:' layout.

   Args:
      page_content: Source text to summarize.

   Returns:
      The normalized Q&A text returned by ``modify_qa_content``.
   """
   prompt_template = """{content}
   ----------
   Summarize the above content and then generate corresponding questions and answers directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """
   prompt = prompt_template.format(content=page_content)
   answer = ask(prompt)
   # Normalize the generated Q&A. Passing page_content here would throw away
   # the reply produced above and reformat the raw page instead.
   return modify_qa_content(answer)

In [84]:
def keep_qa_content(content):
   """Ask the model to strip everything except the Q&A material from ``content``.

   Returns the reply from ``ask``.
   """
   return ask(f"""{content}
   ----------
   Keep only the QA content from the above. Remove all other content.
   """)

def generate_qa_from_extract_segments(page_content):
   """Generate per-segment Q&A for ``page_content``.

   The first model call segments the content and writes questions and answers
   for each segment; its reply is then passed through ``keep_qa_content`` so
   only the Q&A part remains.
   """
   segment_prompt = f"""{page_content}
   ----------
   Segment the above content into appropriate paragraphs,
   summarize each segment, Do not response summary and segment, and then generate corresponding questions and answers for each segment directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """
   return keep_qa_content(ask(segment_prompt))


In [85]:
page_content = """Summary:
The text outlines rules and procedures in case of electronic and physical malfunctions, as well as complaints during Live Baccarat games on SBOBET Casino. Electronic malfunctions may cause game results to be displayed incorrectly, but Player equipment problems do not affect game results. Operator system malfunctions and hardware failures will void the game and the table will be closed. Physical malfunctions, such as revealing cards or dealing extra cards, will result in the current round being aborted and the particular shoe being replaced. For complaints or disputes, Players must provide specific details, including the date and time of playing, Table ID, dealer's name, and round number, in order for the complaint to be considered.

Questions:
1. What happens if a card fails to scan during a Live Baccarat game on SBOBET Casino?
   Answer: The dealer will re-scan the card to display the result to the Players.

2. What is the consequence of an Operator system malfunction during Live Baccarat on SBOBET Casino?
   Answer: The game will be voided, and the particular table will be closed.

3. What should be done if a card falls off the table during a Live Baccarat game on SBOBET Casino?
   Answer: The current round will be aborted, and the particular shoe will be replaced with a new shoe.

4. What information should Players provide when making a complaint or dispute regarding a Live Baccarat game on SBOBET Casino?
   Answer: Players must provide the date and time of playing, Table ID, dealer's name, and round number to have their complaint considered.

5. What is the time limit for Players to submit complaints or disputes during Live Baccarat games on SBOBET Casino?
   Answer: Players must address their complaint within 24 hours of the dispute occurring."""
   
# Try modify_qa_content on input that has a summary preamble and numbered questions.
answer = modify_qa_content(page_content)
answer

"Question: What happens if a card fails to scan during a Live Baccarat game on SBOBET Casino?\nAnswer: The dealer will re- scan the card to display the result to the Players.\n\nQuestion: What is the consequence of an Operator system malfunction during Live Baccarat on SBOBET Casino?\nAnswer: The game will be voided, and the particular table will be closed.\n\nQuestion: What should be done if a card falls off the table during a Live Baccarat game on SBOBET Casino?\nAnswer: The current round will be aborted, and the particular shoe will be replaced with a new shoe.\n\nQuestion: What information should Players provide when making a complaint or dispute regarding a Live Baccarat game on SBOBET Casino?\nAnswer: Players must provide the date and time of playing, Table ID, dealer's name, and round number to have their complaint considered.\n\nQuestion: What is the time limit for Players to submit complaints or disputes during Live Baccarat games on SBOBET Casino?\nAnswer: Players must addres

In [88]:
page_content = """Question: What should a player do if a card fails to scan during Live Baccarat?
Answer: The dealer will re-scan the card and the game result will not be voided even if there are problems with the Players' equipment.

Question: What information is required to make a complaint or dispute a game result in Live Baccarat?
Answer: The player must provide the date and time of playing, Table ID, dealer's name, and round number.

Question: What happens if a player fails to provide the required information to make a complaint or dispute a game result in Live Baccarat?
Answer: The complaint will be rejected without further explanation.

Question: What happens if there's an Operator system malfunction or hardware failure during Live Baccarat?
Answer: The game will be voided and the particular table will be closed.

Question: What happens if cards are dealt accidentally in more than one place, or if cards are revealed during shuffling or if they fall off the table in Live Baccarat?
Answer: The round will be aborted and a new shoe will be used.

Question: What happens if the dealer deals an extra card after the game is finished in Live Baccarat?
Answer: The result will be considered valid."""
# Try modify_qa_content on input that is already in 'Question: / Answer:' form.
answer = modify_qa_content(page_content)
answer

"Question: What should a player do if a card fails to scan during Live Baccarat?\nAnswer: The player should wait while the dealer re-scans the card. The game result will not be voided even if there are problems with the Player's equipment.\n\nQuestion: What information is required to make a complaint or dispute a game result in Live Baccarat?\nAnswer: The player needs to provide the following information: date and time of playing, Table ID, dealer's name, and round number.\n\nQuestion: What happens if a player fails to provide the required information to make a complaint or dispute a game result in Live Baccarat?\nAnswer: The complaint will not be processed as we cannot proceed with insufficient information.\n\nQuestion: What happens if there's an Operator system malfunction or hardware failure during Live Baccarat?\nAnswer: If there's a technical issue with the Live Baccarat software or hardware, the game will be voided, and the particular table will be closed until the problem is res

In [76]:
# Run the segment-based Q&A generation on the first loaded document and display the reply.
answer = generate_qa_from_extract_segments(docs[0].page_content)
answer

"Question: What is Live Dealer Baccarat?\nAnswer: Live Dealer Baccarat is a casino table game presented by a live dealer where players bet on the hand with a point value closest to 9 (Player or Banker).\n\nQuestion: What is the objective of Live Dealer Baccarat?\nAnswer: The objective is to bet on the hand with a point value closest to 9 (Player or Banker).\n\nQuestion: What is the theoretical return to player of Live Deacer Baccarat?\nAnswer: The theoretical return to player is 98.41%.\n\nQuestion: How do I place a bet in Live Dealer Baccarat?\nAnswer: You place a bet in Live Dealer Baccarat by selecting chip values and placing them in the Betting Area on the table layout.\n\nQuestion: What is the Betting Timer, and why is it important?\nAnswer: The Betting Timer is a countdown that indicates the time players have to place their bets. Betting on the table is not allowed after the Betting Timer has expired.\n\nQuestion: What is the 'DOUBLE' bet button used for?\nAnswer: The 'DOUBLE' be

In [27]:
def generate_qa_from_extract_values(page_content):
   """Ask the model for question/answer pairs built from the number-related sentences in ``page_content``.

   Returns the reply from ``ask``.
   """
   return ask(f"""{page_content}
   ----------
   Extract each sentence related to numbers from the above content and generate corresponding question-answer pairs directly.
   Response Format:
   Question: What is your name?
   Answer: My name is Astro.
   """)

In [96]:
def generate_qa_from_extract_classify_terms(page_content):
   """Ask the model to categorize the terms in ``page_content`` and write Q&A per category.

   Returns the reply from ``ask``.
   """
   # Written line by line so the trailing spaces that are part of the prompt stay visible.
   prompt = (
      f"{page_content}\n"
      "   ----------\n"
      "   Identify all terms within the above content and categorize them. \n"
      "   Use all relevant content related to terms within the same category as the answer. \n"
      "   Generate questions and answers based on this content. \n"
      "   The answers must be derived from the content above.\n"
      "   Only respond with Question and Answer content.\n"
      "   Response Format:\n"
      "   Question: What is your name?\n"
      "   Answer: My name is Astro."
   )
   return ask(prompt)

In [217]:
import re
from py_standard.data_utils import combinations_fn

def terms_to_list(content):
   """Turn a model reply listing terms into a list with one term per entry.

   Each line is stripped. Blank lines, lines containing a comma, and lines
   ending in a period are dropped. A leading ``*``, ``-`` or ``N.`` marker is
   removed from the lines that remain.
   """
   bullet = re.compile(r'([\*\-]|\d+\.)(.*)')
   terms = []
   for raw_line in content.split('\n'):
      candidate = raw_line.strip()
      if not candidate or "," in candidate or candidate.endswith("."):
         continue
      found = bullet.match(candidate)
      terms.append(found.group(2).lstrip() if found else candidate)
   return terms

def extract_terms(page_content):
   """Ask the model for 10 terms that occur in ``page_content`` and return them as a list.

   The reply is capped at 200 new tokens and parsed with ``terms_to_list``.
   """
   reply = ask(f"""{page_content}
   ----------
   List 10 terms from the above content, and the term must appear in the above content.
   one term without description per line.
   """, 200)
   return terms_to_list(reply)

def extract_adj(token):
   """Ask the model to extract the adjectives from ``token``.

   The reply is capped at 80 new tokens and returned as received from ``ask``.
   """
   return ask(f"""{token}
   ----------
   Extract adjectives. Return an empty string if none are found. No additional explanation needed.""", 80)

def is_opposing_terms1(term1, term2):
   """Ask the model whether two terms oppose each other.

   Args:
      term1: First term, inserted into the prompt verbatim.
      term2: Second term, inserted into the prompt verbatim.

   Returns:
      ``(is_opposing, answer)``: ``is_opposing`` is True when the reply
      contains the substring ``'Yes'``; ``answer`` is the raw reply (capped at
      80 new tokens).
   """
   # The f-string already interpolates both terms. Calling str.format() on the
   # result would re-parse them and raise on any term containing '{' or '}'.
   prompt = (
      "Based on your understanding,\n"
      f"   are '{term1}' and '{term2}' opposing each other? \n"
      "   If yes, respond with 'Yes' directly.\n"
      "   If you are unable to determine, please respond with 'No'.\n"
      "   "
   )
   answer = ask(prompt, 80)
   return 'Yes' in answer, answer

def is_opposing_terms(term1, term2):
   """Decide whether two terms oppose each other, checking their adjectives first.

   Adjectives are extracted from both terms. When both extractions are
   non-blank and the model judges them opposing, that verdict is returned.
   Otherwise the full terms are compared.

   Returns:
      The ``(is_opposing, answer)`` tuple produced by ``is_opposing_terms1``.
   """
   # Strip first: a reply made only of whitespace/newlines means "no adjective"
   # and must not trigger the adjective comparison.
   adj1 = extract_adj(term1).strip()
   adj2 = extract_adj(term2).strip()
   if adj1 and adj2:
      is_opposing, adj_reason = is_opposing_terms1(adj1, adj2)
      if is_opposing:
         return True, adj_reason
   return is_opposing_terms1(term1, term2)

def generate_qa_from_two_terms(page_content, term1, term2):
   """Build one Q&A entry about the differences between two terms, answered from ``page_content``.

   The model reply is capped at 500 new tokens.

   Returns:
      An empty string when the reply is blank; otherwise a
      ``Question: ... / Answer: ...`` entry with CRLF line endings.
   """
   question = f"""What are the differences between "{term1}" and "{term2}"?"""
   answer = ask(f"""{page_content}
   ----------
   Answer based on the above content.
   The answer must be present in the above content, otherwise reply with an empty string.
   {question}
   """, 500)
   if not answer.strip():
      return ""
   return f"Question: {question}\r\nAnswer: {answer}\r\n"

def generate_qa_from_extract_terms_diff(page_content):
   """Generate "differences between X and Y" Q&A for every pair of extracted terms.

   Terms come from ``extract_terms``; pairs come from ``combinations_fn(terms, 2)``.
   The per-pair entries are concatenated in pair order.
   """
   term_pairs = combinations_fn(extract_terms(page_content), 2)
   return "".join(
      generate_qa_from_two_terms(page_content, first, second)
      for first, second in term_pairs
   )

In [94]:
docs1 = docs[1].page_content

In [220]:
# Manual spot checks of is_opposing_terms; earlier term pairs are kept commented out.
#answer = is_opposing_terms("Small Tiger", "Tiger Tie")
#print(answer)
#answer = is_opposing_terms("Small Tiger", "Big Tiger")
#print(answer)
#answer = is_opposing_terms("Player", "Banker")
#print(answer)
answer = is_opposing_terms("commision", "non-commision")
print(answer)

(True, "Yes, 'commissioned' and 'non-commissioned' are opposing adjectives when used in the context of the military rank. However, in the context of a fee or charge paid for facilitating a transaction or providing a service, 'commission' and 'non-commission' are not adjectives but rather nouns, and they don't have")


In [82]:
def write_qa_file(file_path, text, mode="a"):
   """Write ``text`` to ``file_path`` as UTF-8, appending by default.

   Args:
      file_path: Destination file.
      text: String to write.
      mode: ``open`` mode; ``"a"`` appends, ``"w"`` replaces the file contents.
   """
   with open(file_path, mode, encoding='utf-8') as handle:
      # Leaving the with-block closes the file, which flushes buffered data.
      handle.write(text)
      
def append_delimit(content):
   """Return ``content`` followed by a CRLF, a row of ten ``#`` characters, and another CRLF."""
   return "{}\r\n##########\r\n".format(content)
   
def append_qa_content(sub_qa, content):
   """Return ``content`` with ``sub_qa`` plus its section delimiter appended."""
   return content + append_delimit(sub_qa)
      
def generate_qa_from_page_content(page_content, write_qa_file_fn=None):
   """Run every Q&A generator over ``page_content`` and collect the results.

   Sections are produced in this order: summary, segments, terms,
   classify terms, tables, rows, amounts. The two table sections share one
   table extraction made from the page.

   Args:
      page_content: Text to generate questions and answers from.
      write_qa_file_fn: Optional callback taking one string. When given, it is
         called twice per section, first with the ``# <title>`` heading and
         then with the section text, each followed by the delimiter.

   Returns:
      All section texts concatenated, each followed by the delimiter. Headings
      are only sent to the callback and are not part of the return value.
   """
   sections = []

   def emit(text):
      # Streaming to the callback is optional.
      if write_qa_file_fn is not None:
         write_qa_file_fn(append_delimit(text))

   def record(title, sub_qa):
      emit(f"# {title}\r\n")
      emit(sub_qa)
      sections.append(append_delimit(sub_qa))

   record("summary", generate_qa_from_extract_summary(page_content))
   record("segments", generate_qa_from_extract_segments(page_content))
   record("terms", generate_qa_from_extract_terms(page_content))
   record("classify terms", generate_qa_from_extract_classify_terms(page_content))

   # One table extraction feeds both table-based generators.
   markdown_content = extract_markdown_tables_from_content(page_content)
   record("tables", generate_qa_from_markdown_tables(markdown_content))
   record("rows", generate_qa_from_markdown_table_rows(markdown_content))

   record("amounts", generate_qa_from_extract_values(page_content))
   return "".join(sections)


In [83]:
# Generate Q&A for every loaded document and stream the results into one Markdown file.
qa_file = "./results/llm-qa.md"
docs_len = len(docs)
for idx, doc in enumerate(docs):
   if idx == 0:
      # First document: writing "" with mode="w" empties any previous results file.
      write_qa_file(qa_file, "", mode="w")
   # Progress heading, appended before this document's Q&A sections.
   write_qa_file(qa_file, f"# {idx+1}/{docs_len} documents\r\n")
   # The returned Q&A string is discarded; sections reach qa_file through the callback.
   generate_qa_from_page_content(doc.page_content, 
                                 lambda content: write_qa_file(qa_file, content))