# Llama 2

## Setup and Installation

Select the model, config and inject the system prompt.

Run the cells in order: the configuration cell **below**, then the cells in the **Install and Stuff**, **Import**, **Load Model, Tokenizer & Pipe**, and **Set Up The UI** sections, and finally the cell in the **Chat with Llama 2** section at the end.

In [None]:
### Select the language model
# Model identifier passed to the `from_pretrained` calls in the loading cells.
model_id = "TheBloke/Llama-2-7B-chat-GPTQ"

# Other available models are:
# model_id = "TheBloke/Llama-2-13B-chat-GPTQ"
# model_id = "TheBloke/Llama-2-70B-chat-GPTQ"

# Configuration
# NOTE(review): runtimeFlag, cache_dir and scaling_factor are not referenced by
# any of the visible cells (the model is loaded with device_map="auto" and the
# rope_scaling argument is commented out) - confirm whether they are still needed.
runtimeFlag = "cuda:0"  # Intended target device: first CUDA GPU (original note: you can't run GPTQ on cpu)
cache_dir = None  # None = no explicit cache directory is configured
scaling_factor = 1.0  # Intended RoPE scaling factor; 1.0 means no scaling. The earlier note about 16384*6 = 98304 tokens (needing a V100/A100) describes a factor of 6, not this value.

# Default system prompt; copied into SYSTEM_PROMPT by the UI set-up cell.
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

## Install and Stuff

In [None]:
# If you are running this notebook for the first time, uncomment and run the line below.
# (Package names are separated by spaces; pip rejects comma-separated names.)

# %pip install optimum auto-gptq pdfminer.six

In [None]:
import warnings

# Silence every Python warning for the rest of the session to keep cell output
# clean. NOTE(review): this is a blanket filter, so deprecation and runtime
# warnings raised by the libraries loaded below are hidden as well.
warnings.filterwarnings("ignore")

## Import

In [None]:
import transformers
import torch
import json
import os
from transformers import AutoTokenizer, TextStreamer
from transformers import AutoTokenizer, AutoModelForCausalLM

## Load Model, Tokenizer & Pipe

If the model files are already saved locally, these cells run quickly; otherwise the first run has to fetch them and will take longer.

In [None]:
%%time
# Fetch the configuration object for `model_id`.
# NOTE(review): `model_config` is not used by the visible cells - the
# `config=model_config` argument in the model-loading cell is commented out.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    # device_map="auto",
    # torch_dtype=torch.float16,
    # rope_scaling = {"type": "dynamic", "factor": scaling_factor}
)

In [None]:
%%time
# Load the causal language model for `model_id`; device_map="auto" leaves
# device placement to the library.
# NOTE(review): trust_remote_code=True permits running custom code shipped
# with the model repository - confirm this checkpoint actually requires it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    # config=model_config,
    # torch_dtype=torch.float16,
    # quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# Tokenizer for the same `model_id`; handed to the text-generation pipeline below.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

In [None]:
# Text-generation pipeline used by the chat UI. The generation settings given
# here apply to every reply, because generate_response() calls `pipe(prompt)`
# without passing any overrides.
pipe = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # cap on newly generated tokens per call
    do_sample=True,  # sample rather than decode greedily
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
)

## Set Up The UI

In [None]:
# Imports for the UI
from IPython.display import display, HTML, clear_output, Markdown
import textwrap, json
import ipywidgets as widgets
import re, time
from pdfminer.high_level import extract_text
import io

In [None]:
# Instruction delimiters of the chat prompt format.
# NOTE(review): these four delimiter constants are not referenced by the other
# visible cells; generate_response() spells the markers out itself.
B_INST, E_INST = "[INST]", "[/INST]"  # B_INST, E_INST = "Question: ", "Answer: "

# System-prompt delimiters (note: these contain real newline characters).
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# System prompt seeded into the dialog history and injected by
# generate_response() when a dialog does not start with a system message.
SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT

# Context budget, expressed in whole tokens (all values are ints).
# max_doc_length = 50
# NOTE(review): max_context is hard-coded rather than derived from the model
# config (see the commented expression) - confirm it matches the loaded model.
max_context = 16384  # int(model.config.max_position_embeddings*scaling_factor)
max_doc_length = int(0.75 * max_context)  # max doc length is 75% of the context length
max_doc_words = max_doc_length
# The original divided these by 1, which silently turned them into floats;
# lengths are kept integral so they can be used as counts or slice bounds.
max_prompt_len = int(0.85 * max_context)  # Not sure if this is needed
max_gen_len = int(0.10 * max_prompt_len)  # Not sure if this is needed

In [None]:
def generate_response(dialogs, temperature=0.01, top_p=0.9, logprobs=False):
    """Generate the next assistant reply for the conversation so far.

    Args:
        dialogs: List of dialogs. Each dialog is a list of
            ``{"role": ..., "content": ...}`` dicts ordered as an optional
            system message followed by alternating user / assistant turns.
            A dialog without a leading system message gets ``SYSTEM_PROMPT``
            prepended. The prompts of all dialogs are concatenated.
        temperature: Accepted for interface compatibility; currently unused
            (generation settings come from the module-level ``pipe``).
        top_p: Accepted for interface compatibility; currently unused.
        logprobs: Accepted for interface compatibility; currently unused.

    Returns:
        The text following the last ``[/INST]`` marker in the pipeline
        output, with surrounding whitespace stripped.
    """
    # torch.cuda.empty_cache()

    # Llama 2 chat markers. The system block must be framed by real newline
    # characters; the previous f-string used escaped backslashes and therefore
    # put a literal backslash + "n" into the prompt.
    b_inst, e_inst = "[INST]", "[/INST]"
    b_sys, e_sys = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = ""  # prompt_tokens = []
    for dialog in dialogs:
        if dialog[0]["role"] != "system":
            # Build a new list so the caller's dialog is not mutated.
            dialog = [{"role": "system", "content": SYSTEM_PROMPT}] + dialog

        system_text = dialog[0]["content"].strip()
        first_user_text = dialog[1]["content"].strip()

        # First turn carries the system prompt inside the instruction block.
        prompt += f"{b_inst} {b_sys}{system_text}{e_sys}{first_user_text} {e_inst} "
        if len(dialog) > 2:
            # Close the answered turn with </s>, exactly like later turns.
            prompt += f"{dialog[2]['content']} </s>"

        # Remaining turns: user messages sit at odd indices, replies follow.
        for i in range(3, len(dialog), 2):
            prompt += f"<s>{b_inst} {dialog[i]['content']} {e_inst} "
            if i + 1 < len(dialog):
                prompt += f"{dialog[i + 1]['content']} </s>"

    res = pipe(prompt)
    generated_text = res[0]["generated_text"]

    # The pipeline output is split on the closing instruction marker and only
    # the final segment (the new reply) is kept.
    return generated_text.split(e_inst)[-1].strip()

In [None]:
def print_wrapped(text):
    """Render ``text`` as Markdown, one display call per prose/code segment.

    Text without any triple-backtick fenced block is displayed once, as is.
    Otherwise the text is cut at the fenced blocks: each non-empty stretch of
    prose (whitespace-stripped) and each fenced block is displayed separately,
    in document order.
    """
    fence_re = re.compile(r"```(.+?)```", re.DOTALL)

    pieces = []
    cursor = 0
    saw_fence = False
    for fence in fence_re.finditer(text):
        saw_fence = True

        # Prose between the previous fence (or the start) and this fence.
        leading = text[cursor : fence.start()].strip()
        if leading:
            pieces.append(leading)

        # The fenced block itself, backticks included.
        pieces.append(fence.group(0).strip())
        cursor = fence.end()

    if saw_fence:
        # Whatever follows the final fenced block.
        trailing = text[cursor:].strip()
        if trailing:
            pieces.append(trailing)
    else:
        # No fenced blocks at all: show the text untouched.
        pieces.append(text)

    for piece in pieces:
        display(Markdown(piece))


# Conversation state shared by the UI callbacks; it starts with the system prompt.
dialog_history = [{"role": "system", "content": SYSTEM_PROMPT}]

# Input controls for the chat UI.
button = widgets.Button(description="Send", style={"button_color": "#ff6e00"})
# NOTE(review): no click handler is attached to upload_button and it is not
# placed in the layout in the visible cells - confirm whether it is still needed.
upload_button = widgets.Button(description="Upload .txt or .pdf")
text = widgets.Textarea(layout=widgets.Layout(width="800px"))

# Output area into which the conversation transcript is rendered.
output_log = widgets.Output()


def _render_dialog():
    """Redraw the whole conversation transcript inside ``output_log``."""
    with output_log:
        clear_output()
        for message in dialog_history:
            print_wrapped(f'**{message["role"].capitalize()}**: {message["content"]}\n')


def on_button_clicked(b):
    """Send the text-area content to the model and show the reply.

    The user message is appended to ``dialog_history``, the transcript is
    redrawn, a reply is generated and appended, and the transcript is redrawn
    again. Blank input is ignored. While the model is working the Send button
    is disabled; it is always restored afterwards, even when generation fails.

    Args:
        b: The button instance passed in by ipywidgets (unused).
    """
    user_input = text.value
    if not user_input.strip():
        # Nothing to send; avoid recording an empty user turn.
        return

    dialog_history.append({"role": "user", "content": user_input})
    text.value = ""

    # Signal that work is in progress and block further clicks.
    button.description = "Processing..."
    button.style.button_color = "#cc2200"
    button.disabled = True

    try:
        _render_dialog()
        assistant_response = generate_response([dialog_history])
    except Exception:
        # Roll back the unanswered user turn so the history keeps its strict
        # user/assistant alternation, and hand the text back for a retry.
        dialog_history.pop()
        text.value = user_input
        raise
    finally:
        # Restore the button whether or not generation succeeded; otherwise a
        # single failure would leave the UI permanently disabled.
        button.description = "Send"
        button.style.button_color = "#ff6e00"
        button.disabled = False

    dialog_history.append({"role": "assistant", "content": assistant_response})
    _render_dialog()


# Run on_button_clicked whenever the Send button is pressed.
button.on_click(on_button_clicked)

# Create an output widget for alerts
alert_out = widgets.Output()

# Button that resets the conversation. The text area is intentionally not
# re-created here: `text` is already defined alongside the Send button, and a
# second definition would only replace that widget with an identical one.
clear_button = widgets.Button(description="Clear Chat")


def on_clear_button_clicked(b):
    """Reset the chat to its initial state.

    Empties the shared ``dialog_history`` in place, re-seeds it with the
    system prompt, and wipes the rendered transcript.

    Args:
        b: The button instance passed in by ipywidgets (unused).
    """
    # Slice assignment keeps the same list object, so every reference to the
    # history sees the reset.
    dialog_history[:] = [{"role": "system", "content": SYSTEM_PROMPT}]

    with output_log:
        clear_output()


# Run on_clear_button_clicked whenever the "Clear Chat" button is pressed.
clear_button.on_click(on_clear_button_clicked)

In [None]:
from IPython.display import display, HTML
from ipywidgets import HBox, VBox

# Button bar: Send and Clear Chat side by side.
first_row = HBox(children=[button, clear_button])

# Page body, top to bottom: transcript, alert area, input box, button bar.
layout = VBox(children=[output_log, alert_out, text, first_row])

# HTML header shown above the chat widgets.
# NOTE(review): the two placeholders are quoted string literals, so the header
# shows the words "max_context" and "max_doc_words" rather than numbers -
# confirm this is the intended "(TBD)" placeholder.
title = f"<h1 style='color: #ff6e00;'>Llama 2 Base Model 🦙</h1> <p>( Max context of: {'max_context'} or {'max_doc_words'} (TBD) )</p>"

## Chat with Llama 2

The cell below renders the chat interface inside the notebook. Type a message in the text box and click **Send** to chat with Llama 2; click **Clear Chat** to start a new conversation.

In [None]:
# Render the HTML header first, then the widget layout beneath it.
display(HTML(title), layout)