In [1]:
import re
import json
import torch

from transformers import LlamaForCausalLM, LlamaTokenizer
from typing import List

from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

import os
# Expose only GPU index 0 to this process.
# NOTE(review): this runs after `import torch`; presumably fine as long as nothing
# has initialised CUDA before this line — confirm if the cell order ever changes.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

2023-09-21 05:12:23.067389: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-09-21 05:12:23.067425: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
# Directory (relative to the notebook) that holds the checkpoint and tokenizer files.
path_to_model = "../Llama-2-13b-chat-hf"

tokenizer = LlamaTokenizer.from_pretrained(path_to_model)
# Reuse EOS as the padding token so that `padding=True` works on batches of prompts.
tokenizer.pad_token = tokenizer.eos_token
# Batched generation with a decoder-only model must pad on the LEFT: with right
# padding the model would continue from pad tokens instead of from the end of the
# prompt. Set it explicitly rather than relying on the checkpoint's tokenizer config.
tokenizer.padding_side = "left"

# 8-bit weights with automatic device placement; `.eval()` switches the module to
# inference mode (the notebook only generates, it never trains).
model = LlamaForCausalLM.from_pretrained(
    path_to_model,
    load_in_8bit=True,
    device_map='auto',
    torch_dtype=torch.float16
).eval()


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/netcrk/cp39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/netcrk/cp39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Sanity check: display the device the loaded model reports.
model.device

device(type='cuda', index=0)

In [6]:
def get_input_prompt(schema: str, context: str, n_examples: int) -> str:
    """Build the instruction prompt asking the model for API test examples.

    The text is wrapped in ``[INST] <<SYS>> ... <</SYS>>[/INST]`` markers and asks
    for ``n_examples`` examples matching ``schema``, taking ``context`` into account.

    Args:
        schema: Description of the API schema, interpolated verbatim.
        context: Extra context for the generation, interpolated verbatim.
        n_examples: Number of examples to request, interpolated verbatim.

    Returns:
        The complete prompt as a single string.
    """
    # Every line after the first carries this fixed indentation; it is part of the
    # text sent to the model, so it is kept exactly as is.
    indent = " " * 16

    # NOTE: the wording (including the "qoutes" spelling and the trailing space after
    # the schema) is runtime text and is intentionally left untouched.
    prompt_lines = [
        "[INST] <<SYS>>",
        f"{indent}You are QA Engineer and you need to generate {n_examples} test examples with following schema to test API: {schema} ",
        f'{indent}Generate your answer in following format: "examples": []. example word has to be in double qoutes.',
        f"{indent}Take this context into account when generating: {context}. Answer without any description.",
        f"{indent}<</SYS>>[/INST]",
    ]
    return "\n".join(prompt_lines)


def generate_test_examples(inputs: List[dict],
                           n_examples: int = 5,
                           batch_size: int = 10,
                           max_new_tokens: int = 300) -> List[str]:
    """Generate raw model completions for a list of schema/context pairs.

    Args:
        inputs: Dicts that each provide a ``"schema"`` and a ``"context"`` entry.
        n_examples: Number of test examples requested in every prompt.
        batch_size: Number of prompts tokenized and generated together.
        max_new_tokens: Upper bound on newly generated tokens per prompt.

    Returns:
        One decoded string per input, in input order. Each string is the decoded
        full output sequence of ``model.generate`` with special tokens removed.

    Raises:
        KeyError: If an input dict lacks ``"schema"`` or ``"context"``.
    """
    input_prompts = [
        get_input_prompt(input_["schema"], input_["context"], n_examples)
        for input_ in inputs
    ]

    dataloader = DataLoader(input_prompts, batch_size=batch_size)
    model_outputs: List[str] = []

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Send the batch to wherever the model actually lives instead of a
            # hard-coded "cuda", so the function follows the model's placement.
            model_input = tokenizer(batch, return_tensors="pt", padding=True).to(model.device)

            generated = model.generate(**model_input, max_new_tokens=max_new_tokens)
            model_outputs.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))

    return model_outputs


def clear_response(responses: List[str]) -> List[dict]:
    """Extract the JSON payload from each raw model response.

    For every response the second ``"\\n\\n"``-separated segment is taken, wrapped in
    curly braces and parsed as JSON.

    Args:
        responses: Raw decoded model outputs.

    Returns:
        One dict per response, in order: the parsed JSON object on success, otherwise
        ``{"status": "Error while parsing.", "response": <text>}``. ``<text>`` is the
        extracted segment when it was found but is not valid JSON, and the whole
        response when it contains no ``"\\n\\n"`` separator.
    """
    outputs = []

    for raw in responses:
        # Holds the most specific piece of text available for the error payload.
        fragment = raw
        try:
            fragment = raw.split("\n\n")[1]
            parsed = json.loads("{" + fragment + "}")
        except (IndexError, ValueError):
            # IndexError: no blank-line separator in the response.
            # ValueError: covers json.JSONDecodeError (malformed JSON).
            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            outputs.append({"status": "Error while parsing.", "response": fragment})
        else:
            outputs.append(parsed)

    return outputs

In [7]:
from flask import Flask, request

# Flask application object; the route below is registered on it via @app.route.
app = Flask(__name__)


def _read_positive_int(params, name: str, default: int) -> int:
    """Read query parameter ``name`` as an integer >= 1, using ``default`` if absent or empty."""
    raw_value = params.get(name) or default
    try:
        value = int(raw_value)
    except (TypeError, ValueError):
        raise ValueError(f"Query parameter '{name}' must be an integer.") from None
    if value < 1:
        raise ValueError(f"Query parameter '{name}' must be >= 1.")
    return value


@app.route("/", methods=["GET", "POST"])
def get_test_examples():
    """Generate test examples for the schema/context pairs in the request body.

    Request:
        JSON body ``{"inputs": [{"schema": ..., "context": ...}, ...]}``.
        Optional query parameters ``n_examples`` (default 3), ``batch_size``
        (default 5) and ``max_new_tokens`` (default 300), all integers >= 1.
        GET is kept for existing clients; POST is accepted as well because many
        HTTP clients do not send a body with GET.

    Returns:
        A JSON list with one entry per input (see ``clear_response``), or a
        ``{"status": ...}`` object with HTTP 400 when the request is malformed.
    """
    payload = request.json
    inputs = payload.get("inputs") if isinstance(payload, dict) else None

    inputs_are_valid = isinstance(inputs, list) and all(
        isinstance(item, dict) and "schema" in item and "context" in item
        for item in inputs
    )
    if not inputs_are_valid:
        return {
            "status": "Body must be a JSON object with an 'inputs' list of "
                      "objects containing 'schema' and 'context'."
        }, 400

    params = request.args
    try:
        # All three values are converted the same way, so only validated integers
        # reach the prompt and the generation call.
        n_examples = _read_positive_int(params, "n_examples", 3)
        batch_size = _read_positive_int(params, "batch_size", 5)
        max_new_tokens = _read_positive_int(params, "max_new_tokens", 300)
    except ValueError as error:
        return {"status": str(error)}, 400

    try:
        response = generate_test_examples(inputs, n_examples, batch_size, max_new_tokens)
    finally:
        # Release cached GPU memory even when generation fails (e.g. out of memory).
        torch.cuda.empty_cache()

    return clear_response(response)

# Start Flask's built-in server on every network interface, port 5016. The call
# blocks this cell until the server is stopped (CTRL+C / kernel interrupt).
# NOTE(review): no authentication is visible in this notebook and the host is
# 0.0.0.0 — confirm that exposing the endpoint to the whole network is intended.
app.run(host="0.0.0.0", port=5016)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5016
 * Running on http://10.112.2.242:5016
[33mPress CTRL+C to quit[0m


  0%|          | 0/1 [00:00<?, ?it/s]

10.236.151.95 - - [21/Sep/2023 05:13:40] "GET /?n_examples=3&batch_size=10&max_new_tokens=300 HTTP/1.1" 200 -


  0%|          | 0/1 [00:00<?, ?it/s]

10.236.151.95 - - [21/Sep/2023 05:14:08] "GET /?n_examples=3&batch_size=10&max_new_tokens=300 HTTP/1.1" 200 -


  0%|          | 0/1 [00:00<?, ?it/s]

10.236.151.95 - - [21/Sep/2023 05:16:26] "GET /?n_examples=3&batch_size=10&max_new_tokens=300 HTTP/1.1" 200 -
