# Inference with custom GPT
## Install dependencies

In [None]:
# Install the pinned gradio release used for the chat interface later in this
# notebook. The openvino-genai install below is currently disabled.
#%pip install -U openvino-genai
%pip install gradio==4.44.1

In [None]:
# keras-hub supplies the tokenizer, packer and sampler used below.
# NOTE(review): the version is not pinned -- consider pinning for reproducibility.
%pip install keras-hub 

Now **restart the kernel** so that the newly installed packages are picked up.

In [1]:
import os

os.environ["KERAS_BACKEND"] = "jax"
import numpy as np
import keras
import keras_hub


## Read the vocabulary

In [2]:
# Sequence length shared by the tokenizer and the start packer defined below.
SEQ_LEN = 128

In [5]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this vocabulary file from a trusted source.
with open('/kaggle/input/c4-vocabulary/c4vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# Display a small slice of the vocabulary as a sanity check.
vocab[500:520]



['よ',
 'ら',
 'り',
 'る',
 'れ',
 'を',
 'ア',
 'ィ',
 'イ',
 'ウ',
 'エ',
 'オ',
 'カ',
 'ガ',
 'キ',
 'ク',
 'グ',
 'コ',
 'サ',
 'シ']

## Instantiate tokenizer

In [6]:
# WordPiece tokenizer built on the loaded vocabulary, lowercasing its input
# and configured with sequence_length=SEQ_LEN.
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    lowercase=True,
    sequence_length=SEQ_LEN,
    vocabulary=vocab,
)

## Load saved model

In [7]:
# Restore the saved model from the Kaggle input directory.
gpt = keras.saving.load_model(
    "/kaggle/input/gpt2-tpu-c4-it/keras/default/1/gpt2_gpt2-tpu-rope-c4-it.keras",
    custom_objects=None,
    compile=True,
    safe_mode=True,
)

## Define preprocessing and generation helpers

In [8]:
def next(prompt, cache, index):
    """Sampler callback returning the logits used to pick the token at `index`.

    Runs `gpt` over the full `prompt` and slices out the scores at position
    ``index - 1``. `cache` is handed back untouched, and the hidden-states
    slot is always ``None``.

    Note: this name shadows the builtin ``next``; it is kept because
    `generate_text` passes this function to the sampler by that name.
    """
    all_logits = gpt(prompt)
    step_logits = all_logits[:, index - 1, :]
    # Hidden states only matter for contrastive search, so none are returned.
    return step_logits, None, cache

In [9]:
# Packer configured with the tokenizer's "[BOS]" id as its start value and
# with sequence_length=SEQ_LEN.
start_packer = keras_hub.layers.StartEndPacker(
    start_value=tokenizer.token_to_id("[BOS]"),
    sequence_length=SEQ_LEN,
)

In [10]:
# Sampler used by generate_text; the seed is fixed at 42.
sampler = keras_hub.samplers.TopPSampler(
    k=5,
    p=0.8,
    seed=42,
    temperature=0.8,
)


In [11]:
def format_prompt(prompt):
  """Wrap `prompt` in the instruction template used by this notebook.

  The template has a header, an ``### Instruction:`` section holding
  `prompt`, an empty ``### Input:`` section and a trailing
  ``### Response:`` line. The result starts and ends with a newline.
  """
  return f"""
### Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
### Instruction:
{prompt}
### Input:

### Response:
"""

In [12]:
def generate_text(prompt):
  """Generate a continuation for an already-formatted prompt.

  Args:
    prompt: Prompt text, used as-is (callers apply `format_prompt` first).

  Returns:
    Whatever `tokenizer.detokenize` returns for the sampled tokens. In this
    notebook's recorded output that is a one-element list of strings, which
    callers index with ``[0]``.

  Errors raised by the sampler or the model propagate to the caller instead
  of being swallowed and retried.
  """
  formatted_prompt = prompt
  print ("formatted prompt:\n",formatted_prompt,"\n")
  prompt_tokens = start_packer(tokenizer([formatted_prompt]))

  # Sampling starts at the first padding slot, i.e. right after the prompt.
  # NOTE(review): assumes the packer pads with token id 0 -- confirm.
  pad_positions = np.flatnonzero(np.asarray(prompt_tokens).ravel() == 0)
  if pad_positions.size:
    start_index = pad_positions[0]
  else:
    # The prompt fills the whole sequence, so there is no free slot. Keep the
    # original fallback of sampling from index 1, but say so.
    print ("warning: no padding slot found in prompt; sampling from index 1")
    start_index = 1

  output_tokens = sampler(
      next=next,
      prompt=prompt_tokens,
      index=start_index,
  )
  txt = tokenizer.detokenize(output_tokens)
  print ("raw response", txt)
  return txt


In [13]:
# Smoke test: run one formatted instruction through the model.
response = generate_text(
    format_prompt("write a list of 3 ingredients for a sandwich")
)


formatted prompt:
 
### Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
### Instruction:
write a list of 3 ingredients for a sandwich
### Input:

### Response:
 

raw response ['[BOS] # # # below is an instruction that describes a task , paired with an input that provides further context . write a response that appropriately completes the request . # # # instruction : write a list of 3 ingredients for a sandwich # # # input : # # # response : 1 . bread 2 . cheese 3 . lettuce [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']


In [14]:
# Show the first (and here only) decoded sequence, including prompt and padding.
print(response[0])

[BOS] # # # below is an instruction that describes a task , paired with an input that provides further context . write a response that appropriately completes the request . # # # instruction : write a list of 3 ingredients for a sandwich # # # input : # # # response : 1 . bread 2 . cheese 3 . lettuce [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [15]:
# Keep only the text after the last response marker and before the first pad.
print(response[0].rpartition("# # # response : ")[2].partition("[PAD]")[0])

1 . bread 2 . cheese 3 . lettuce 


## Setup interface

In [16]:
import gradio as gr

In [17]:
def predict(message, history):
    """Chat callback: build the context from past turns and generate a reply.

    Args:
        message: The new user message.
        history: Past turns; each item is indexed as ``item[0]`` (user text)
            and ``item[1]`` (model reply).

    Returns:
        The generated text after the last ``# # # response : `` marker and
        before the first ``[PAD]`` token, as a ``str``.
    """
    print (message)
    past_turns = ""
    if history != []:
        print (history)
        print (len(history))
        # Every earlier turn is re-rendered through the same prompt template.
        for turn in history:
            past_turns += "".join([format_prompt(turn[0]), "\n", turn[1], "\n\n"])
    context = past_turns + format_prompt(message)
    print ("\nContext:\n", context, "\n")
    decoded = generate_text(context)[0]
    answer = decoded.rpartition("# # # response : ")[2].partition("[PAD]")[0]
    print ("formatted response: ",answer)
    return str(answer)

In [18]:
# Initial value shown in the "System Prompt" textbox of the interface.
SYS_PROMPT = "Temporary sys prompt"

In [19]:
def set_parameters(temperature, top_p, max_output_tokens, sys_prompt):
    """Placeholder handler for the "Set" button.

    All four arguments are currently ignored; the function only prints a
    stub message and returns ``None``.
    """
    print ("Temporary parameter stub")
    return None

In [20]:
# Text-only interface: a settings area (system prompt, sliders, "Set" button)
# followed by the chat panel. The "Set" button calls `set_parameters`; the
# chat panel calls `predict`.
with gr.Blocks(fill_height=True, elem_id="textonly_block") as demo:
    sys_prompt = gr.Textbox(
        label="System Prompt",
        value=SYS_PROMPT,
        lines=3,
        interactive=True,
        show_copy_button=True,
    )
    with gr.Row():
        temperature = gr.Slider(
            label="Temperature", minimum=0.1, maximum=2.0, value=0.5
        )
        top_p = gr.Slider(
            label="Top-P", minimum=0.1, maximum=1.0, value=0.8
        )
        max_output_tokens = gr.Slider(
            label="Max Output Tokens", minimum=128, maximum=2048, value=512
        )
        set_button = gr.Button("Set", size="sm")
    set_button.click(
        set_parameters,
        [temperature, top_p, max_output_tokens, sys_prompt],
    )

    gr.ChatInterface(predict, multimodal=False, title="Homemade GPT")

In [21]:
# share=True also publishes the app on a public gradio.live URL.
demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://e2e62cc97e23e05fc8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


tell me a story

Context:
 
### Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
### Instruction:
tell me a story
### Input:

### Response:
 

formatted prompt:
 
### Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
### Instruction:
tell me a story
### Input:

### Response:
 

raw response ['[BOS] # # # below is an instruction that describes a task , paired with an input that provides further context . write a response that appropriately completes the request . # # # instruction : tell me a story # # # input : # # # response : once upon a time , in a small village nestled in the heart of the lush green forest , there lived a young girl named emma . emma lived in a small cottage that was nestled among the trees . she was known throughout the village for her kindness and he



In [22]:
# Shut down the Gradio server started by demo.launch().
demo.close()

Closing server running on port: 7860
