# Inference Experimentation

## Preliminaries

In [3]:
# Run nvidia-smi in a shell to show the GPU, driver and CUDA version visible to this machine.
!nvidia-smi

Tue Dec 12 20:51:51 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.04              Driver Version: 546.17       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0  On |                  Off |
|  0%   37C    P8              26W / 450W |   2428MiB / 24564MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [12]:
!conda install -qy scikit-learn scipy matplotlib
!pip install -q -U python-dotenv
!pip install -q -U  datasets # The version in conda is broken
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U guidance


Retrieving notices: ...working... done
Channels:
 - defaults
 - nvidia
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - matplotlib
    - scikit-learn
    - scipy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    brotli-1.0.9               |       h5eee18b_7          18 KB
    brotli-bin-1.0.9           |       h5eee18b_7          19 KB
    contourpy-1.2.0            |  py311hdb19cb5_0         263 KB
    cycler-0.11.0              |     pyhd3eb1b0_0          12 KB
    cyrus-sasl-2.1.28          |       h52b45da_1         237 KB
    dbus-1.13.18               |       hb2f20db_0         504 KB
    expat-2.5.0                |       h6a678d5_0         172 KB
    fontconfig-2.14.1          |       h4c34cd2_2         281 KB

In [3]:
!pip install -q -U  datasets # The version in conda is broken

[0m

In [1]:
import random
import time
from guidance import models, gen, select

## Load the model and data

### Base Model

Quantized to 4 bits using bitsandbytes.

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the base model that the adapter is applied to.
base_model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; matrix multiplications are
# computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# trust_remote_code is intentionally not enabled: the Mistral architecture is
# implemented inside transformers itself, so there is no need to allow code
# shipped in the model repository to be executed.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate decide where the weights are placed
)

# add_bos_token=True makes the tokenizer prepend the BOS token to every prompt.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

### LoRA


In [4]:
from peft import PeftModel

# Checkpoint directory passed to PEFT as the adapter to load.
lora_checkpoint = "mistral-discern-finetune/checkpoint-11000"
# Wrap the already-loaded base model with the adapter from that checkpoint.
ft_model = PeftModel.from_pretrained(base_model, lora_checkpoint)

### Validation Dataset

In [5]:
from datasets import load_dataset

# Local JSON-lines file with the validation examples.
validation_file = './validation_data.jsonl'
# The "json" loader is asked for the 'train' split, which is where it places
# the single data file given here.
eval_dataset = load_dataset("json", data_files=validation_file, split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Inference Test

In [6]:
# Pick one random validation document and cut it at the "### Solution:" marker:
# the text before it is the prompt, the text after it is the reference answer.
# maxsplit=1 keeps the two-way unpacking valid even when the marker text shows
# up again inside the answer (an unbounded split would then raise ValueError).
prompt, origional_answer = random.choice(eval_dataset['document']).split("### Solution:", 1)
# Put the marker back so the prompt ends exactly where the answer should begin.
prompt += "### Solution:\r\n"


In [39]:
import time

# Wall-clock timer covering tokenization, generation and decoding.
start_time = time.time()

model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
# Number of prompt tokens; used below to strip the prompt from the output.
input_length = model_input['input_ids'].shape[1]

ft_model.eval()
print("Mistral Anaswer:")
with torch.no_grad():
    output_tokens = ft_model.generate(
        **model_input,
        max_new_tokens=1000,
        repetition_penalty=1.15,
    )

# Keep only the tokens produced after the prompt, then decode the one sequence.
completion_tokens = output_tokens[:, input_length:]
output_text = tokenizer.batch_decode(completion_tokens)[0]
print(output_text)

end_time = time.time()
total_time = end_time - start_time
print(f"Time taken: {total_time} seconds")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Mistral Anaswer:
{  "sports-and-athletics": "1.0",  "sports-and-athletics-confidence": "5.0",  "environmentalism-and-sustainability": "0.0",  "environmentalism-and-sustainability-confidence": "10.0",  "gaming-and-e-sports": "0.0",  "gaming-and-e-sports-confidence": "10.0",  "college-and-career": "0.0",  "college-and-career-confidence": "3.0",  "cooking-and-food": "0.0",  "cooking-and-food-confidence": "10.0",  "reading-and-literature": "0.0",  "reading-and-literature-confidence": "10.0",  "writing-and-creative-writing": "0.0",  "writing-and-creative-writing-confidence": "10.0",  "science-and-technology": "0.0",  "science-and-technology-confidence": "10.0",  "mathematics-and-statistics": "0.0",  "mathematics-and-statistics-confidence": "10.0",  "history-and-social-studies": "1.0",  "history-and-social-studies-confidence": "4.0",  "creative-arts": "0.0",  "creative-arts-confidence": "10.0",  "animals-and-nature": "0.0",  "animals-and-nature-confidence": "10.0",  "note": "The student show

# Batch Inference

In [72]:
# Number of validation documents sampled (with replacement) for one batch.
batch_size = 8
prompts = []
answers = []

# Read the column once; it does not change between iterations.
documents = eval_dataset['document']

for _sample_index in range(batch_size):
    # maxsplit=1 keeps the two-way unpacking valid even when the marker text
    # appears again inside the answer.
    p, a = random.choice(documents).split("### Solution:", 1)
    p += "### Solution:\r\n"

    prompts.append(p)
    answers.append(a)

# The tokenizer needs a pad token to batch prompts of different lengths; the
# EOS token is reused for that.
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position of each row, so padding
# must go on the left; otherwise generation would continue after pad tokens.
tokenizer.padding_side = "left"
model_input = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
# Padded prompt length shared by every row; used later to slice off the prompts.
input_length = model_input['input_ids'].shape[1]

len(prompts), input_length

(8, 1117)

In [71]:
import time 

# Wall-clock timer covering generation, decoding and printing of the batch.
start_time = time.time()

ft_model.eval()
with torch.no_grad():
    output_tokens = ft_model.generate(**model_input, max_new_tokens=1000, repetition_penalty=1.15)
    # Slice off the (padded) prompt so only generated tokens are decoded.
    # skip_special_tokens drops the EOS/pad tokens that fill up the rows whose
    # answer finished before the longest one in the batch.
    output_text = tokenizer.batch_decode(output_tokens[:, input_length:], skip_special_tokens=True)
    for index, value in enumerate(output_text):
        # The header previously contained "\:", an invalid escape sequence that
        # printed a stray backslash.
        print(f"\n\n**** MISTRAL ANSWER {index}********:\n\n")
        print(value)    

end_time = time.time()
total_time = end_time - start_time
print(f"Time taken: {total_time} seconds")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




**** MISTRAL ANSWER 0********\:


{  "sports-and-athletics": "0.0",  "sports-and-athletics-confidence": "10.0",  "environmentalism-and-sustainability": "0.0",  "environmentalism-and-sustainability-confidence": "10.0",  "gaming-and-e-sports": "0.0",  "gaming-and-e-sports-confidence": "10.0",  "college-and-career": "0.0",  "college-and-career-confidence": "10.0",  "cooking-and-food": "0.0",  "cooking-and-food-confidence": "10.0",  "reading-and-literature": "1.0",  "reading-and-literature-confidence": "9.0",  "writing-and-creative-writing": "0.0",  "writing-and-creative-writing-confidence": "10.0",  "science-and-technology": "1.0",  "science-and-technology-confidence": "8.0",  "mathematics-and-statistics": "1.0",  "mathematics-and-statistics-confidence": "7.0",  "history-and-social-studies": "1.0",  "history-and-social-studies-confidence": "8.0",  "creative-arts": "0.0",  "creative-arts-confidence": "10.0",  "animals-and-nature": "0.0",  "animals-and-nature-confidence": "10.0",  "note":

# Microsoft Guidance

Guidance is a system that manipulates the logprobs and token selection to make the output conform to a template.

In [73]:
# Wrap the fine-tuned model and its tokenizer in a guidance Transformers model
# so that templates can be appended to it with `+` / `+=`.
guidance_llm = models.Transformers(ft_model, tokenizer)

In [74]:
import time 

# Wall-clock timer covering the whole guided generation.
start_time = time.time()

def generate_number():
    """Return a guidance `gen` call for one numeric field.

    The output is restricted by regex to digits and dots, generated with
    temperature 0.0, and stops at the closing double quote of the JSON value.
    """
    # Raw string: "\." is not a valid Python string escape, so the non-raw
    # literal only worked by accident (and warns on newer Python versions).
    # The runtime value of the pattern is unchanged.
    return gen(regex=r'[0-9\.]+', temperature=0.0, stop='"')

prompted = guidance_llm + prompt
# The template fixes the JSON skeleton; the model only fills in the values.
# Keys are listed in the same order the fine-tuned model emits them in free
# generation (history-and-social-studies before creative-arts and
# animals-and-nature), so the forced text matches what the model was tuned on.
prompted += f"""{{
  "sports-and-athletics": "{generate_number()}",
  "sports-and-athletics-confidence": "{generate_number()}",
  "environmentalism-and-sustainability": "{generate_number()}",
  "environmentalism-and-sustainability-confidence": "{generate_number()}",
  "gaming-and-e-sports": "{generate_number()}",
  "gaming-and-e-sports-confidence": "{generate_number()}",
  "college-and-career": "{generate_number()}",
  "college-and-career-confidence": "{generate_number()}",
  "cooking-and-food": "{generate_number()}",
  "cooking-and-food-confidence": "{generate_number()}",
  "reading-and-literature": "{generate_number()}",
  "reading-and-literature-confidence": "{generate_number()}",
  "writing-and-creative-writing": "{generate_number()}",
  "writing-and-creative-writing-confidence": "{generate_number()}",
  "science-and-technology": "{generate_number()}",
  "science-and-technology-confidence": "{generate_number()}",
  "mathematics-and-statistics": "{generate_number()}",
  "mathematics-and-statistics-confidence": "{generate_number()}",
  "history-and-social-studies": "{generate_number()}",
  "history-and-social-studies-confidence": "{generate_number()}",
  "creative-arts": "{generate_number()}",
  "creative-arts-confidence": "{generate_number()}",
  "animals-and-nature": "{generate_number()}",
  "animals-and-nature-confidence": "{generate_number()}",
  "note": "{gen(temperature=0.0, stop='"')}"
}}"""

end_time = time.time()
total_time = end_time - start_time
print(f"Time taken: {total_time} seconds")

Time taken: 20.50229024887085 seconds
