The model is only **2.24 GB**, yet it performs very well across NLP tasks!

Choose **Runtime -> Change runtime type -> T4 GPU**, then click **Save**.

Press **Ctrl + F9** to run all cells at once.

In [None]:
# Install (or upgrade to) the latest bitsandbytes release.
# NOTE(review): presumably required to load the "bnb-4bit" checkpoint used in this notebook — confirm.
!pip install -U bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import os
from threading import Thread


# Detect Google Colab: the import only succeeds where the google.colab
# package is installed.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Hugging Face repository used when no usable local copy is available.
MODEL_ID = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

# On Colab, keep the saved model under the mounted Google Drive; otherwise
# use a directory relative to the current working directory.
if IN_COLAB:
    drive.mount('/content/drive')
    save_directory = '/content/drive/MyDrive/llama_model'
else:
    save_directory = 'llama_model'  # Or any local path you prefer

# The directory merely existing does not mean a model was saved into it: it
# may be empty, or hold only tokenizer files if an earlier save was
# interrupted. Require the model's config.json before trusting the local
# copy, so such a directory triggers a fresh download instead of a load error.
# NOTE: this does not verify the weight files themselves.
has_saved_model = os.path.isfile(os.path.join(save_directory, "config.json"))

if has_saved_model:
    # Reuse the copy saved by a previous run.
    print("Model found in Drive. Loading...")
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = AutoModelForCausalLM.from_pretrained(save_directory)
else:
    # No usable local copy: download from Hugging Face, then save it so later
    # runs can load it from save_directory.
    print("Model not found in Drive. Downloading from Hugging Face...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

    os.makedirs(save_directory, exist_ok=True)
    tokenizer.save_pretrained(save_directory)
    model.save_pretrained(save_directory)

In [None]:
from IPython.display import HTML, display

def prompt(prompt):
    """Generate a reply to ``prompt`` and stream it to stdout as it is produced.

    Uses the module-level ``model`` and ``tokenizer``. The user text is sent
    together with a fixed system message ("Give concise answers!") through
    the tokenizer's chat template.

    Args:
        prompt: The user's message as a plain string.

    Returns:
        None. The generated text is printed incrementally.
    """
    # Inject CSS so that long output lines wrap in the notebook output area.
    display(HTML("""
    <style>
    pre {
        white-space: pre-wrap;       /* Wrap long lines */
        word-wrap: break-word;       /* Break words if necessary */
        width: 70% !important; /* Adjust the percentage as needed */
    }
    </style>
    """))

    model.generation_config.pad_token_id = tokenizer.pad_token_id

    # The prompt is passed through as-is. The previous `prompt = {prompt}`
    # wrapped it in a one-element set, so the chat template received a set
    # (rendered like "{'...'}") instead of the user's actual text.
    messages = [
        {"role": "system", "content": "Give concise answers!"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The streamer is iterated below to obtain text chunks while generation
    # is still running.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=5000,
    )

    # Run generate() in a background thread so this thread is free to consume
    # and print the streamed text.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Print each chunk as soon as it arrives, without adding newlines.
    for new_text in streamer:
        print(new_text, end="", flush=True)

    thread.join()

In [None]:
def prompt_value():
    """Ask the user for a prompt on stdin and stream the model's response."""
    user_text = input("Enter your prompt: ")

    # Header line printed before the streamed reply starts.
    print("\nResponse: ")

    prompt(user_text)

In [None]:
# Run one interactive round: read a prompt from the user and print the reply.
prompt_value()