# Large Language Model Fundamentals & Practical Applications (Demo)
## KaggleX Speaker Workshop
### By: David Ramirez ([GitHub](https://github.com/dframirez-usmc))

In [1]:
# Gemma-2 information:

# Google Model Card
# https://ai.google.dev/gemma/docs/model_card_2

# HuggingFace
# https://huggingface.co/google/gemma-2-2b

# Kaggle
# https://www.kaggle.com/models/google/gemma-2/transformers/gemma-2-2b

# Best guess on memory requirements
# https://huggingface.co/google/gemma-2b/discussions/59

In [None]:
# You must agree to the terms of use of the Gemma model
# You must create your own Hugging Face Access Token
# Prefer supplying the token through the HF_TOKEN environment variable so the
# secret is never saved inside the notebook; the placeholder is only a fallback.
import os

access_token = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_ACCESS_TOKEN")

In [None]:
# You may download the model and code through git from Hugging Face
# NOTE: Replace with your Hugging Face USERNAME and ACCESSTOKEN
# NOTE(review): credentials typed into the clone URL will be stored in this cell
# (and presumably in the clone's git remote config) — do not share or commit the
# notebook with real values filled in.
!git clone https://USERNAME:ACCESSTOKEN@huggingface.co/google/gemma-2-2b

In [4]:
# Check for Nvidia GPU (System Management Interface)
# The table lists each visible GPU with the driver version, memory usage and utilization
!nvidia-smi

Thu Sep  5 15:29:31 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Check for Nvidia CUDA (Nvidia CUDA Compiler)
# NOTE(review): the release printed here (compiler toolkit) can differ from the
# "CUDA Version" shown by nvidia-smi, as it does in the saved outputs of this
# notebook — confirm which of the two matters for your install.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
# Keep your pip package up to date
!pip install pip --upgrade #--quiet

In [None]:
!pip install transformers accelerate --upgrade #--quiet

In [None]:
# This pip install assumes you have CUDA 12.4 installed
# CUDA setup is usually seperate from python or pip
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --upgrade #--quiet

In [9]:
# It may be necessary to uninstall in order to reinstall PyTorch
#!pip uninstall torch torchvision torchaudio --yes

In [None]:
# Check against your installed pip packages
#!pip list

In [None]:
# Check against your installed pip packages
#!pip list

In [12]:
import os
import torch
import numpy as np

In [13]:
# Confirm that PyTorch can see a CUDA device before loading the model onto the GPU
torch.cuda.is_available()

True

In [14]:
# Show the name of the CUDA device PyTorch will use
torch.cuda.get_device_name()

'Tesla T4'

In [39]:
# Load model and tokenizer directly
from transformers import AutoTokenizer, AutoModelForCausalLM, Gemma2ForCausalLM

model_id = "google/gemma-2-2b"

# The tokenizer and the weights are both fetched with the same access token
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Load the weights in bfloat16, then move the whole model onto the GPU
model = Gemma2ForCausalLM.from_pretrained(model_id, token=access_token, torch_dtype=torch.bfloat16)
model = model.to("cuda")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [40]:
# Number of entries in the tokenizer's vocabulary
tokenizer.vocab_size

256000

In [45]:
def generate_text(input_text, max_new_tokens=16):
    """Continue ``input_text`` without sampling and print every intermediate stage.

    Printed in order: the prompt token IDs, the length of the first-step logits
    vector, the raw first-step logits, a preview of the softmax percentages
    (first three and last three entries), the most likely next token ID with its
    probability, all token IDs of the final sequence, and the decoded text.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        input_text: Prompt string to continue.
        max_new_tokens: Maximum number of tokens to generate (default 16).

    Returns:
        None. Everything is reported through ``print``.
    """
    # Place the prompt on whatever device the model lives on instead of assuming "cuda"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    print("input_ids =")
    print(input_ids.cpu().numpy()[0])
    print("Input token IDs", end="\n\n")

    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        return_dict_in_generate=True,
        output_logits=True,
    )

    # First generation step, first sequence in the batch.
    # Upcast to float32 so the .numpy() calls below also work for bfloat16 logits.
    logits = outputs.logits[0][0].float()

    print("len(output) =")
    print(len(logits))
    print("Quantity of words in the dictonary", end="\n\n")

    print("output.logits =")
    print(logits.cpu().numpy())
    print("Output raw values", end="\n\n")

    probs = torch.nn.functional.softmax(logits, dim=-1)
    # Only format the entries that are shown; the tail is the true last three
    # entries (the previous [-4:-1] slice skipped the final one).
    first_three = [f"{p * 100:.1f}%" for p in probs[:3].tolist()]
    last_three = [f"{p * 100:.1f}%" for p in probs[-3:].tolist()]
    print("softmax(output.logits) =")
    print(first_three, end="")
    print(" ... ", end="")
    print(last_three)
    print("Very small percentages", end="\n\n")

    print("argmax(softmax) =")
    print(torch.argmax(probs).item())
    print("max(softmax) =")
    print("{:.2%}".format(torch.max(probs).item()))
    print("Best next token ID and percent chance", end="\n\n")

    # Use the named field consistently rather than positional indexing into the output
    sequence = outputs.sequences[0]
    print("output =")
    print(sequence.cpu().numpy())
    print("All output token IDs", end="\n\n")

    generated_text = tokenizer.decode(sequence)

    # Format the code output
    formatted_code = generated_text.strip() + "\n"
    print(formatted_code)

In [46]:
generate_text("Next token prediction will continue this statement to")

input_ids =
[    2  6353  8447 25166   877  5374   736  6218   577]
Input token IDs

len(output) =
256000
Quantity of words in the dictonary

output.logits =
[-15.9375    9.0625   -9.875   ...  -1.71875  -2.34375 -15.9375 ]
Output raw values

softmax(output.logits) =
['0.0%', '0.0%', '0.0%'] ... ['0.0%', '0.0%', '0.0%']
Very small percentages

argmax(softmax) =
573
max(softmax) =
14.08%
Best next token ID and percent chance

output =
[     2   6353   8447  25166    877   5374    736   6218    577    573
   1580    576    573   1162 235265    109    651   3712    576    573
   2351   8447    603   5043    577]
All output token IDs

<bos>Next token prediction will continue this statement to the end of the year.

The price of the next token is expected to



In [None]:
# Clean up the GPU usage to run a different way
import gc

# Guarded so this cell can be re-run (or run on a fresh kernel) without a NameError
if "model" in globals():
    del model

# Collect unreachable objects first so their GPU tensors are released,
# then ask PyTorch to hand its unused cached memory back to the driver
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Use a pipeline instead of model+tokenizer
from transformers import pipeline

model_id = "google/gemma-2-2b"

# Note: the assignment below rebinds the name `pipeline` from the imported
# factory function to the text-generation pipeline object it returns.
pipeline = pipeline(
    task="text-generation",
    model=model_id,
    token=access_token,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
    do_sample=False,
)

In [None]:
# Generate up to 16 new tokens with sampling disabled, then show the resulting text
outputs = pipeline("Next token prediction will continue this statement", do_sample=False, max_new_tokens=16)
print(outputs[0]["generated_text"])