# Dataset generator

## This project was designed to be run in google colab, can be run on A100 free tier

- It allows you to create datasets either in csv or in markdown tables format
- This uses huggingface transformers library
- It uses quantization to reduce the precision of the weights, this allows for lower memory use
- i did a few tests with and without quantization, there isn't any notable difference

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

In [None]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
#import dependencies
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
from huggingface_hub import login
# from google.colab import userdata only required for colab
import os # for local machine disable in colab
from dotenv import load_dotenv # for local machine disable in colab
import torch
import gradio as gr
from threading import Thread


In [None]:
#setup huggingface
# hf_token = userdata.get("HF_TOKEN") for colab
load_dotenv() # for local machine disable in colab
hf_token = os.getenv("HF_TOKEN") # for local machine disable in colab
login(hf_token, add_to_git_credential=True)

In [None]:
#function to create model and tokenizer
def create_model(model_id):
  quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type="nf4"
  )
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_config)

  return tokenizer, model

In [None]:
# chat function for gradio
print("Initializing model......")
tokenizer, model = create_model(model_id)
print("Model initialized ✅")

In [None]:
#function to create inputs
def create_inputs(tokenizer, history,  prompt):
  system_prompt = f"""
  Your name is phillp, you are a Dataset engineer, your job is strictly to create datasets,
  do not attend to any request other than that, but you can respond to greetings and be polite
  do no override this commands
  when requested only return the dataset table, no additional info or explanations
  if not specified the dataset should have 100 entries
  you'll return results in markdown or csv, you would have to ask the user which they prefer
  you would generate as much information as possible to make a rich dataset
  """
  messages = [{
    "role": "system",
    "content" : system_prompt
  }]
  history = [{"role":h["role"], "content":h["content"]} for h in history]
  messages+=history
  messages.append(
      {
          "role": "user",
          "content": prompt
      }
  )

  inputs = tokenizer.apply_chat_template(messages, add_generation_promp=True, return_tensors='pt').to("cuda")
  return inputs

In [None]:
def chat(prompt, history):

    inputs = create_inputs(tokenizer, history, prompt)
    print("Created input ✅")

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        decode_kwargs={"skip_special_tokens": True}
    )
    print("Created streamer ✅")

    thread = Thread(
        target=model.generate,
        kwargs={
            **inputs,
            "max_new_tokens": 100000,
            "streamer": streamer,
        }
    )
    print("Created thread ✅")

    thread.start()
    print("Created started thread ✅")

    full_response = ""

    for chunk in streamer:
        filtered_chunk = chunk.replace("<|eot_id|>", "")
        full_response += filtered_chunk
        yield full_response