In [87]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

# model_url = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_K_M.gguf"

# Build the llama.cpp-backed LLM object (`llm`) from a local GGUF file.
llm = LlamaCPP(
    # Path to a pre-downloaded model file; used here instead of passing model_url.
    model_path="./llm/llama-2-7b-chat.Q4_K_M.gguf",
    # Low sampling temperature, i.e. little randomness in the generated answers.
    temperature=0.1,
    # Cap on tokens generated per call (presumably why long answers can stop
    # mid-sentence -- confirm).
    max_new_tokens=512,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__(); none are overridden here
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers: set to at least 1 to use GPU; -1 presumably offloads every
    # layer -- confirm against the llama-cpp-python docs.
    # NOTE(review): the system-info line printed on load reports BLAS = 0, so
    # verify that this llama.cpp build actually has GPU support.
    model_kwargs={"n_gpu_layers": -1},
    # Prompt formatters imported from llama_index.llms.llama_utils
    # (assumed to produce the Llama-2 chat format -- TODO confirm).
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Verbose logging from the backend.
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [12]:
import json

# Load the records to evaluate (`data`) and the example records (`samples`,
# presumably the few-shot examples -- confirm). JSON is UTF-8 text, so the
# encoding is pinned instead of relying on the platform default.
with open("./input.json", "r", encoding="utf-8") as f:
    data = json.load(f)

with open("./sample.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

In [37]:
def generate_input_str(data):
    """Render one record as a plain-text "Dialogue ... Summary ..." block.

    Args:
        data: Mapping with a "dialogue" entry (an iterable of mappings that
            each hold "speaker" and "utterance") and a "summary" entry.

    Returns:
        A newline-joined string: the "Dialogue:" header, one
        "speaker: utterance" line per turn, then "Summary:" with the summary
        text on the following line.
    """
    turns = [f"{turn['speaker']}: {turn['utterance']}" for turn in data["dialogue"]]
    pieces = ["Dialogue:", *turns, f"Summary:\n{data['summary']}"]
    return "\n".join(pieces)

def generate_sample_str(data):
    """Render a worked example: the formatted record followed by its answer.

    Args:
        data: Record accepted by generate_input_str that also has an
            "answer" entry.

    Returns:
        The formatted record, then "Answer:" on its own line, then the
        answer text, ending with a newline.
    """
    record_text = generate_input_str(data)
    answer_text = f"{data['answer']}"
    # The trailing empty item yields the final newline.
    return "\n".join([record_text, "Answer:", answer_text, ""])

def generate_zeroshot_prompt(input):
    """Build the zero-shot prompt: instruction, the record, then an open answer slot.

    Args:
        input: Record accepted by generate_input_str.

    Returns:
        The prompt string, ending with "Answer:" and a newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether "
        "the summary accurately reflects the dialogue or not. "
        'Answer with "Yes" or "No".'
    )
    # The separator line after the instruction is four spaces, not empty;
    # it is part of the prompt text and kept verbatim.
    return "\n".join([instruction, "    ", generate_input_str(input), "Answer:", ""])

def generate_fewshot_prompt(samples, input):
    """Build a two-shot prompt with fixed "Yes" / "No" labels on the examples.

    Args:
        samples: Sequence of records; samples[0] is labelled "Yes" and
            samples[1] is labelled "No" in the prompt.
        input: The record to be judged, accepted by generate_input_str.

    Returns:
        The prompt string: instruction, the two labelled examples, the
        query record, and a trailing "Answer:" line.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether "
        "the summary accurately reflects the dialogue or not. "
        'Answer with "Yes" or "No".'
    )
    yes_example = generate_input_str(samples[0]) + "\nAnswer: Yes\n"
    no_example = generate_input_str(samples[1]) + "\nAnswer: No\n"
    query = generate_input_str(input) + "\nAnswer:\n"
    # Sections are separated by one blank line; the instruction carries its
    # own trailing newline so the join produces that blank line after it too.
    return "\n".join([instruction + "\n", yes_example, no_example, query])

def generate_zeroshot_cot_prompt(input):
    """Build the zero-shot chain-of-thought prompt.

    Args:
        input: Record accepted by generate_input_str.

    Returns:
        The prompt string; it ends with the "Let's think step by step"
        trigger and has no trailing newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether "
        "the summary accurately reflects the dialogue or not. "
        'Answer with "Yes" or "No".'
    )
    sections = [
        instruction,
        "    ",  # separator line is four spaces in the prompt text; kept verbatim
        generate_input_str(input),
        "",
        "Answer:",
        "Let's think step by step",
    ]
    return "\n".join(sections)

def generate_fewshot_cot_prompt(samples, input):
    """Build a two-shot prompt whose examples carry their own recorded answers.

    The examples are rendered with generate_sample_str, so each one shows the
    "answer" entry stored in the sample record.

    Args:
        samples: Sequence of records; samples[0] and samples[1] are used and
            must each provide an "answer" entry.
        input: The record to be judged, accepted by generate_input_str.

    Returns:
        The prompt string: instruction, the two worked examples, the query
        record, and a trailing "Answer:" line.
    """
    return f"""Given the data consists of dialogue and summary below, determine whether the summary accurately reflects the dialogue or not. Answer with "Yes" or "No".

{generate_sample_str(samples[0])}
{generate_sample_str(samples[1])}
{generate_input_str(input)}
Answer:
"""

def generate_analogical_prompt(input):
    """Build the analogical prompt: the model recalls related examples, then answers.

    The prompt presents the record as the problem, asks the model to write
    two relevant example problems with solutions, and then to solve the
    original problem with a 'Yes' or 'No' answer.

    Args:
        input: Record accepted by generate_input_str.

    Returns:
        The prompt string.
    """
    return f"""You will be given a dialogue and a summary. Your task is to determine whether the summary accurately reflects the dialogue or not. When presented with the dialogue and summary, recall relevant problems as examples. Afterward, proceed with the answer.
# Problem:
{generate_input_str(input)}

# Instructions:
## Relevant Problems:
Recall two examples of dialogues and summaries that are relevant to the initial problem. The examples should be distinct from each other and from the initial problem. For each sample:
- After "Q: ", describe the dialogue and summary
- After "A: ", explain the solution and provide your answer with either 'Yes' or 'No'.
## Solve the initial problem:
Q: Copy and paste the initial problem here.
A: Explain the solution and provide your answer with either 'Yes' or 'No'.
"""

def generate_ps_prompt(input):
    """Build the plan-and-solve prompt.

    Args:
        input: Record accepted by generate_input_str.

    Returns:
        The prompt string; it ends with the plan-and-solve trigger sentence
        and has no trailing newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether "
        "the summary accurately reflects the dialogue or not. "
        'Answer with "Yes" or "No".'
    )
    trigger = (
        "Let's first understand the problem and devise a plan to solve the problem. "
        "Then let's carry out the plan and solve the problem step by step"
    )
    sections = [
        instruction,
        "    ",  # separator line is four spaces in the prompt text; kept verbatim
        generate_input_str(input),
        "",
        "Answer:",
        trigger,
    ]
    return "\n".join(sections)

def generate_psplus_prompt(input):
    """Build the extended plan-and-solve prompt with task-specific guidance.

    Args:
        input: Record accepted by generate_input_str (the raw record, not an
            already formatted string).

    Returns:
        The prompt string; it ends with the trigger paragraph and has no
        trailing newline.
    """
    instruction = (
        "Given the data consists of dialogue and summary below, determine whether "
        "the summary accurately reflects the dialogue or not. "
        'Answer with "Yes" or "No".'
    )
    trigger = (
        "Let's first read and understand the dialogue carefully, "
        "extract facts from each of the dialogue utterances, read the summary, "
        "and last compare your understandings of the summary with facts from the dialogue. "
        "Devise a plan to answer the instruction, "
        "then carry out the plan to solve the problem step by step."
    )
    sections = [
        instruction,
        "    ",  # separator line is four spaces in the prompt text; kept verbatim
        generate_input_str(input),
        "",
        "Answer: ",  # the trailing space is part of the prompt text
        trigger,
    ]
    return "\n".join(sections)

In [98]:
# generate_psplus_prompt formats the record itself via generate_input_str, so
# it must receive the raw record. Passing an already formatted string makes
# the inner data["dialogue"] lookup raise a TypeError on a fresh kernel.
prompt = generate_psplus_prompt(data[1])

# Stream the completion and echo each chunk as soon as it arrives.
response_iter = llm.stream_complete(prompt)
for response in response_iter:
    print(response.delta, end="", flush=True)

Llama.generate: prefix-match hit


  Yes, I can help you with that! Here's my plan to determine whether the summary accurately reflects the dialogue:
Step 1: Read and understand the dialogue carefully
I have read the dialogue provided and understood each utterance made by Grace and Henry.
Step 2: Extract facts from each of the dialogue utterances
Grace: How are we allocating the budget for this project?
Henry: We should allocate more funds to the advertising segment.
Facts extracted from the dialogue:
* Grace asked how the budget for the project is being allocated.
* Henry suggested that more funds should be allocated to the advertising segment.
Step 3: Read the summary and compare it with the facts extracted from the dialogue
The summary states, "Henry suggested the team find more funds for the project."
Comparison of the summary with the facts extracted from the dialogue:
* The summary does not accurately reflect the dialogue. Henry did not suggest finding more funds for the project as a whole, but rather allocated mo

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer of the Zephyr checkpoint to inspect its chat template.
checkpoint = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# A single-turn conversation in role/content message form.
messages = [
    dict(role="user", content="How many helicopters can a human eat in one sitting?"),
]

# Render the conversation as text (no token ids), without the generation prompt.
tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=False,
)

'<|user|>\nHow many helicopters can a human eat in one sitting?</s>\n'

In [8]:
# Same messages, now with add_generation_prompt=True: the rendered text shown
# below additionally ends with the '<|assistant|>\n' header.
tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

'<|user|>\nHow many helicopters can a human eat in one sitting?</s>\n<|assistant|>\n'