In [48]:
from openai import OpenAI

from datasets import load_dataset, concatenate_datasets
from transformers import  AutoTokenizer

In [31]:
class bcolors:
    """ANSI escape sequences for colouring and styling printed text.

    Prefix a string with one of these constants and append ``ENDC`` to
    reset the formatting afterwards.
    """

    # Reset and text attributes.
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Bright foreground colours, named after their intended use.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

    # Standard foreground colours (codes 30-37).
    CBLACK = '\033[30m'
    CRED = '\033[31m'
    CGREEN = '\033[32m'
    CYELLOW = '\033[33m'
    CBLUE = '\033[34m'
    CVIOLET = '\033[35m'
    CBEIGE = '\033[36m'  # code 36 is cyan in the ANSI palette; name kept for compatibility
    CWHITE = '\033[37m'

    # Standard background colours (codes 40-47).
    CBLACKBG = '\033[40m'
    CREDBG = '\033[41m'
    CGREENBG = '\033[42m'
    CYELLOWBG = '\033[43m'
    CBLUEBG = '\033[44m'
    CVIOLETBG = '\033[45m'
    CBEIGEBG = '\033[46m'  # code 46 is a cyan background; name kept for compatibility
    CWHITEBG = '\033[47m'

In [2]:
%%sh 
# Print the current GPU status. The recorded output shows 0MiB in use on GPU 0,
# so this was presumably captured before the vLLM server was started.
nvidia-smi

Mon Jun 17 12:04:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      On  | 00000000:38:00.0 Off |                    0 |
| N/A   32C    P8              16W /  72W |      0MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA L4                      On  | 00000000:3A:00.0 Off |  

In [13]:
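# Run the command below in a separate shell to start the vLLM OpenAI-compatible API server (kept commented out so this notebook does not launch it).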
#%%sh
#python -m vllm.entrypoints.openai.api_server --model dwetzel/Llama-3-8B-Instruct_Orce_plus --tensor-parallel-size 4 --enable-chunked-prefill --dtype bfloat16

In [3]:
%%sh 
# Print the GPU status again. The recorded output shows 20752MiB in use on GPU 0,
# presumably because the vLLM server has loaded the model by now.
nvidia-smi

Mon Jun 17 12:05:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      On  | 00000000:38:00.0 Off |                    0 |
| N/A   35C    P0              27W /  72W |  20752MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA L4                      On  | 00000000:3A:00.0 Off |  

In [7]:
# Use the standard OpenAI client against vLLM's API server on localhost:8000
# instead of the hosted OpenAI endpoint. The API key is only a placeholder value.
model_name = 'dwetzel/Llama-3-8B-Instruct_Orce_plus'
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [19]:
# Smoke test: send a single user greeting and print the raw message object returned.
prompt = [dict(role="user", content="Hello!")]
completion = client.chat.completions.create(messages=prompt, model=model_name)

print(completion.choices[0].message)

ChatCompletionMessage(content="Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?", role='assistant', function_call=None, tool_calls=[])


In [91]:
def print_prompt(prompt, completion, with_system=False):
    """Pretty-print a chat prompt followed by the model's reply.

    Reads the notebook-level names ``model_name`` (shown in the banner) and
    ``bcolors`` (ANSI colour codes).

    Args:
        prompt: Sequence of message dicts, each with a 'role' and a 'content' key.
        completion: Response object whose ``choices[0].message`` exposes
            ``role`` and ``content``.
        with_system: When False (default) only messages with role 'user' are
            printed; when True every other role is printed as well.
    """
    bar = "=" * 30
    print(f"{bar} Chat with  --- {model_name} ---  LLM using vLLM {bar}\n")

    for message in prompt:
        from_user = message['role'] == 'user'
        # Non-user messages are only shown on request.
        if not from_user and not with_system:
            continue
        label_color = bcolors.CBLUE if from_user else bcolors.CVIOLET
        print(label_color + f"[ {message['role'].upper()} ]" + bcolors.ENDC)
        print(message['content'] + "\n")

    reply = completion.choices[0].message
    print(bcolors.OKGREEN + f"[ {reply.role.upper()} ]" + bcolors.ENDC)
    print(reply.content)

In [71]:
# Pretty-print the "Hello!" exchange; system messages stay hidden because with_system defaults to False.
print_prompt(prompt, completion)


[34m[ USER ][0m
Hello!

[92m[ ASSISTANT ][0m
Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


# Test the Model on Dataset Examples

In [49]:
# Fetch the tokenizer published alongside the served model and remember its
# end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
EOS_TOKEN = tokenizer.eos_token
# The beginning-of-sequence token doubles as the padding token.
# NOTE(review): padding with BOS is unusual — confirm this is intended.
tokenizer.pad_token = tokenizer.bos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [50]:
# Show the end-of-sequence token read from the tokenizer.
print(EOS_TOKEN)

<|eot_id|>


In [58]:
# Train splits of the two source corpora that the test questions are sampled from.
orca_math = load_dataset("microsoft/orca-math-word-problems-200k", split="train")
ultra_interact = load_dataset("openbmb/UltraInteract_sft", split="train")

In [59]:
# Filter a dataset by task category and draw a random sample from the matches.
def sample_category(dataset, category, num_samples, seed=None):
    """Return ``num_samples`` randomly chosen examples whose 'task' equals ``category``.

    Args:
        dataset: Object providing ``filter``, ``shuffle`` and ``select``
            (used here with Hugging Face datasets); rows need a 'task' field.
        category: Value the 'task' field must equal for a row to be kept.
        num_samples: Number of rows to take after shuffling. Asking for more
            rows than match the category makes ``select`` fail.
        seed: Optional shuffle seed. The default ``None`` keeps the previous
            unseeded behaviour; pass an integer to get the same sample on
            every run.

    Returns:
        The shuffled, truncated subset returned by ``select``.
    """
    filtered_dataset = dataset.filter(lambda example: example['task'] == category)
    shuffled_dataset = filtered_dataset.shuffle(seed=seed)
    return shuffled_dataset.select(range(num_samples))

In [60]:
# Draw a fixed number of random UltraInteract examples for each task category.
logic_examples, math_pot_examples, math_cot_examples, coding_examples = (
    sample_category(ultra_interact, task_name, sample_count)
    for task_name, sample_count in (
        ("Logic", 5000),
        ("Math_PoT", 2500),
        ("Math_CoT", 2500),
        ("Coding", 15000),
    )
)

# Strip every column except 'instruction' and 'response' from each subset.
logic_examples, math_pot_examples, math_cot_examples, coding_examples = (
    subset.remove_columns(
        [name for name in subset.column_names if name not in ['instruction', 'response']]
    )
    for subset in (logic_examples, math_pot_examples, math_cot_examples, coding_examples)
)

all_ultra_interact = concatenate_datasets(
    [logic_examples, math_pot_examples, math_cot_examples, coding_examples]
)

# Show the first example of every subset, one per line.
print(
    logic_examples[0],
    math_pot_examples[0],
    math_cot_examples[0],
    coding_examples[0],
    sep="\n",
)

{'instruction': "Solve the following problem step-by-step:\nGiven the context and corresponding question, choose the correct answer from the options.\n\nContext: \nLawyer: If you take something that you have good reason to think is someone else' s property, that is stealing, and stealing is wrong. However, Meyers had no good reason to think that the compost in the public garden was anyone else' s property, so it was not wrong for Meyers to take it.\n\nQuestion: \nThe reasoning in the lawyer's argument is flawed in that the argument\n\nOptions: \nA. takes a condition that by itself is enough to make an action wrong to also be necessary in order for the action to be wrong\nB. fails to consider the possibility that the compost was Meyers' property\nC. concludes that something is certainly someone else's property when there is merely good, but not conclusive, reason to think that it is someone else's property\nD. takes for granted that Meyers would not have taken the compost if he had good

In [61]:
# Map an instruction/response record (the UltraInteract column names) onto the
# question/answer keys that the Orca Math examples already use.
def normalize_to_qa(example):
    """Return a dict holding ``example``'s 'instruction' as 'question' and 'response' as 'answer'."""
    question_text = example['instruction']
    answer_text = example['response']
    return dict(question=question_text, answer=answer_text)

In [62]:
# Convert the UltraInteract examples to the question/answer schema and drop all
# other columns. The guard makes the cell safe to re-run: once 'instruction' is
# gone the conversion has already happened, and mapping again would raise a
# KeyError inside normalize_to_qa.
if 'instruction' in all_ultra_interact.column_names:
    all_ultra_interact = all_ultra_interact.map(normalize_to_qa)
    all_ultra_interact = all_ultra_interact.remove_columns(
        [col for col in all_ultra_interact.column_names if col not in ['question', 'answer']]
    )
print(all_ultra_interact[0])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

{'question': "Solve the following problem step-by-step:\nGiven the context and corresponding question, choose the correct answer from the options.\n\nContext: \nLawyer: If you take something that you have good reason to think is someone else' s property, that is stealing, and stealing is wrong. However, Meyers had no good reason to think that the compost in the public garden was anyone else' s property, so it was not wrong for Meyers to take it.\n\nQuestion: \nThe reasoning in the lawyer's argument is flawed in that the argument\n\nOptions: \nA. takes a condition that by itself is enough to make an action wrong to also be necessary in order for the action to be wrong\nB. fails to consider the possibility that the compost was Meyers' property\nC. concludes that something is certainly someone else's property when there is merely good, but not conclusive, reason to think that it is someone else's property\nD. takes for granted that Meyers would not have taken the compost if he had good re

In [63]:
# Shuffle the Orca Math problems (unseeded) and keep the first 30,000.
orca_math_examples = orca_math.shuffle()
orca_math_examples = orca_math_examples.select(range(30_000))

print(orca_math_examples[0])

{'question': "The time it took car P to travel 300 miles was 2 hours less than the time it took car R to travel the same distance. Car P's average speed was 10 miles per hour greater than that of car R. What was car R's average speed, in miles per hour?", 'answer': "Let's denote the average speed of car R as \\( v_R \\) miles per hour. Since car P's average speed is 10 miles per hour greater, car P's average speed will be \\( v_R + 10 \\) miles per hour.\n\nThe time it takes for each car to travel 300 miles can be found by dividing the distance by the speed. So, the time for car R to travel 300 miles is \\( \\frac{300}{v_R} \\) hours, and the time for car P to travel the same distance is \\( \\frac{300}{v_R + 10} \\) hours.\n\nAccording to the problem, the time it took car P to travel 300 miles was 2 hours less than the time it took car R. We can write this as an equation:\n\n\\[ \\frac{300}{v_R} - 2 = \\frac{300}{v_R + 10} \\]\n\nNow, we solve for \\( v_R \\):\n\n\\[ \\frac{300}{v_R} 

In [65]:
# Merge the Orca Math and UltraInteract samples into one pool and shuffle it.
all_examples = concatenate_datasets([orca_math_examples, all_ultra_interact]).shuffle()

# Report the pool size, then the first four examples (one per line).
print(len(all_examples))
print(*(all_examples[position] for position in range(4)), sep="\n")

55000
{'question': 'The sale price of sarees listed for some amount after successive discounts of 20% and 5% is Rs. 152. What was the original price of the sarees?', 'answer': "Let's denote the original price of the sarees as \\( P \\).\n\nAfter the first discount of 20%, the price becomes \\( P - 0.20P = 0.80P \\).\n\nAfter the second discount of 5% on the already discounted price, the price becomes \\( 0.80P - 0.05 \\times 0.80P = 0.80P \\times (1 - 0.05) = 0.80P \\times 0.95 \\).\n\nWe are given that the final sale price after both discounts is Rs. 152. So we can set up the equation:\n\n\\[ 0.80P \\times 0.95 = 152 \\]\n\nNow we can solve for \\( P \\):\n\n\\[ 0.76P = 152 \\]\n\n\\[ P = \\frac{152}{0.76} \\]\n\n\\[ P = 200 \\]\n\nSo the original price of the sarees was Rs. 200."}
{'question': 'Write Python code to solve the task.\nCan you determine the numerical value that correctly completes the equation, if we consider that 1 meter equals 1,000 millimeters: 48 m = [?] mm?\nA) 48,0

In [66]:
# System prompt placed in the 'system' message of every conversation built by
# apply_chat_template_inference. The string is not raw, so each literal `\n`
# below is an escape that adds one more newline to the prompt text.
system_prompt = """ 
You are an expert in the domains math, coding and logical thinking.
You are there to help users solve tasks in these domains efficiently and accurately. 
Follow these guidelines when responding to user instructions:
    \n
    1. **Understand the Task**: Carefully read and comprehend the given instruction or task description.
    2. **Generate Detailed Responses**: Provide clear, concise, and detailed responses to address the user's task. Ensure the response is complete and accurate.
    3. **Maintain Context**: Ensure that your response maintains the context of the task and addresses all aspects of the instruction.
    4. **Use Appropriate Formatting**: Structure your responses in a readable and logical format, using bullet points, lists, or code blocks as necessary.
    5. **Be Accurate and Concise**: Avoid unnecessary information. Be precise and to the point, ensuring the information is correct and useful.
\n
Now solve the following task based on the given instruction:
\n
### Task: 
\n
"""

In [81]:
# Build the chat messages (system + user) that are sent to the model for inference.
def apply_chat_template_inference(examples):
    """Turn every question in ``examples`` into a two-message chat conversation.

    Only the 'question' column is read. Reference answers are not part of an
    inference prompt, so an 'answer' column is no longer required.

    Args:
        examples: Mapping-like object whose 'question' entry is an iterable
            of question strings (for example a dataset or a plain dict).

    Returns:
        A list with one conversation per question. Each conversation is a list
        of a 'system' message carrying the notebook-level ``system_prompt``
        and a 'user' message wrapping the question.
    """
    return [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"###Question:\n{instruction} \n\n ###Answer:"},
        ]
        for instruction in examples['question']
    ]

In [93]:
# Take the first five examples of the shuffled pool and format them as chat conversations.
sampled_dataset = all_examples.select(range(5))
questions = apply_chat_template_inference(sampled_dataset)
# Inspect the first formatted conversation.
print(questions[0])

[{'role': 'system', 'content': " \nYou are an expert in the domains math, coding and logical thinking.\nYou are there to help users solve tasks in these domains efficiently and accurately. \nFollow these guidelines when responding to user instructions:\n    \n\n    1. **Understand the Task**: Carefully read and comprehend the given instruction or task description.\n    2. **Generate Detailed Responses**: Provide clear, concise, and detailed responses to address the user's task. Ensure the response is complete and accurate.\n    3. **Maintain Context**: Ensure that your response maintains the context of the task and addresses all aspects of the instruction.\n    4. **Use Appropriate Formatting**: Structure your responses in a readable and logical format, using bullet points, lists, or code blocks as necessary.\n    5. **Be Accurate and Concise**: Avoid unnecessary information. Be precise and to the point, ensuring the information is correct and useful.\n\n\nNow solve the following task 

In [95]:
# Ask the model to answer every prepared conversation and pretty-print each exchange.
for idx, question in enumerate(questions):
    completion = client.chat.completions.create(
        messages=question,
        model=model_name,
        # min_p = 0.05 allows only tokens at least 1/20th as probable as the top token.
        extra_body=dict(min_p=0.05),
    )
    print(f"{'=' * 62}  RUN {idx + 1}  {'=' * 62}\n\n")

    print_prompt(question, completion)




[34m[ USER ][0m
###Question:
The sale price of sarees listed for some amount after successive discounts of 20% and 5% is Rs. 152. What was the original price of the sarees? 

 ###Answer:

[92m[ ASSISTANT ][0m
Let's denote the original price of the sarees as x.

After the first discount of 20%, the price becomes 80% of the original price, which is 0.8x.

Then, after the second discount of 5%, the price becomes 95% of the price after the first discount, which is 0.95(0.8x).

We are given that the sale price is Rs. 152. Therefore, we can set up the equation:

0.95(0.8x) = 152

To solve for x, we can first multiply both sides of the equation by 100 to get rid of the decimals:

0.95(0.8x) * 100 = 152 * 100

This gives us:

95(8x) = 15200

Now, we can divide both sides of the equation by 95 to solve for x:

8x = 15200 / 95

8x = 160

Finally, we can divide both sides of the equation by 8 to find the original price x:

x = 160 / 8

x = 20

So, the original price of the sarees is Rs. 20

# Streaming Responses

In [99]:
def print_stream(prompt, stream, with_system=False):
    """Pretty-print a chat prompt, then echo a streamed reply as it arrives.

    Reads the notebook-level names ``model_name`` (shown in the banner) and
    ``bcolors`` (ANSI colour codes).

    Args:
        prompt: Sequence of message dicts, each with a 'role' and a 'content' key.
        stream: Iterable of streaming chunks; text is read from
            ``chunk.choices[0].delta.content``.
        with_system: When False (default) only messages with role 'user' are
            printed; when True every other role is printed as well.
    """
    print("=" * 30 + f" Chat with  --- {model_name} ---  LLM using vLLM " + "=" * 30 + "\n")
    for message in prompt:
        from_user = message['role'] == 'user'
        # Non-user messages are only shown on request.
        if not from_user and not with_system:
            continue
        color = bcolors.CBLUE if from_user else bcolors.CVIOLET
        print(color + f"[ {message['role'].upper()} ]" + bcolors.ENDC)
        print(message['content'] + "\n")

    print(bcolors.OKGREEN + "[ ASSISTANT ]" + bcolors.ENDC + "\n")
    for chunk in stream:
        # A chunk may carry no choices at all (e.g. a usage-only chunk);
        # indexing it would raise IndexError.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            # Flush so the partial text shows up immediately on buffered stdout.
            print(piece, end="", flush=True)
    # Terminate the streamed line so later output starts on a fresh line.
    print()

In [100]:
# Stream the answer for the last prepared conversation. It is taken from
# `questions` explicitly instead of relying on the `question` variable left
# over from the earlier for-loop, so this cell does not depend on that loop
# having been the most recent thing to assign `question`.
stream_messages = questions[-1]
stream = client.chat.completions.create(
    model=model_name,
    messages=stream_messages,
    stream=True,
    extra_body={
        "min_p": 0.05  # allow only tokens at least 1/20th as probable as the top token
    },
)
print_stream(stream_messages, stream)


[34m[ USER ][0m
###Question:
Jennie makes quilts using two types of material: Cotton and Silk. She can make 13 quilts with 37 yards of cotton material and 9 quilts with 27 yards of silk material. Jennie has limited time and can only make a total of 25 quilts in the given time. How many yards of each material would be required to make the desired number of quilts while using up the material most efficiently? 

 ###Answer:

[92m[ ASSISTANT ][0m

Let's analyze the situation and

 find the most efficient way for Jennie to make the desired number of quilts using the available time and material.

First, let's calculate the total number of yards of cotton material needed to make 25 quilts:

13 quilts/37 yards = 1 quilt/2.85 yards

To make 25 quilts, Jennie would need:

25 quilts x 2.85 yards/quilt = 71.25 yards of cotton material

Next, let's calculate the total number of yards of silk material needed to make 25 quilts:

9 quilts/27 yards = 1 quilt/3 yards

To make 25 quilts, Jennie would need:

25 quilts x 3 yards/quilt = 75 yards of silk material

However, Jennie only has a limited time to make quilts, so she needs to use up the material most efficiently. Let's analyze the situation further:

* If Jennie makes 25 quilts using the cotton material, she would have 37 - 71.25 = -34.25 yards of cotton material left over.
* If Jennie makes 25 quilts using the silk material, she would have 27 - 75 = -48 yards of silk material left over.

Since Jennie needs to use up th