# Running LLMs Locally

## Using Hugging Face Transformers Library

In [1]:
pip install transformers



In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # or any other model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

input_text = "To live a sustainable life we have to reduce the usage of"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(inputs["input_ids"])
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To live a sustainable life we have to reduce the usage of fossil fuels.

The world is


## Using PyTorch

In [5]:
pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [6]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

input_text = "To live a sustainable life we have to reduce the usage of"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(inputs["input_ids"])
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To live a sustainable life we have to reduce the usage of fossil fuels.

The world is


## Using TensorFlow

In [11]:
pip install tensorflow



In [12]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = TFGPT2LMHeadModel.from_pretrained(model_name)

input_text = "To live a sustainable life we have to reuse"
inputs = tokenizer(input_text, return_tensors="tf")
outputs = model.generate(inputs["input_ids"])
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To live a sustainable life we have to reuse our resources.

We need to be able to


##  ONNX Runtime
ONNX Runtime allows you to optimize and run models trained in frameworks like PyTorch and TensorFlow with improved performance.

In [13]:
pip install onnx onnxruntime transformers

Collecting onnx
  Downloading onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime
  Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx, humanfriendly

In [17]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import onnx
import onnxruntime as ort

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# Prepare input for the model
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt")

# Convert the model to ONNX
dummy_input = inputs["input_ids"]
torch.onnx.export(
    model,
    dummy_input,
    "gpt2.onnx",
    input_names=["input_ids"],
    output_names=["logits"],
    opset_version=11,
    dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence"}}
)

# Load the ONNX model
onnx_model = onnx.load("gpt2.onnx")
onnx.checker.check_model(onnx_model)

# Create an ONNX Runtime session
ort_session = ort.InferenceSession("gpt2.onnx")

# Define a function to convert PyTorch tensors to numpy arrays
def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# Convert inputs to numpy
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(inputs["input_ids"])}

# Run inference
ort_outs = ort_session.run(None, ort_inputs)

# Decode the output logits to tokens
generated_tokens = torch.tensor(ort_outs[0]).argmax(dim=-1)
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print("Generated text:", generated_text)

Generated text:  the a time,
