<a href="https://colab.research.google.com/github/gokturkberke/LLM-Engineering-Portfolio/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [3]:
# Fetch the Hugging Face access token stored under 'HF_TOKEN' via google.colab.userdata
# (so it is never hard-coded in the notebook) and log in to the Hub with it.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [4]:
# Hugging Face Hub ids of the instruct (chat-tuned) models used in this notebook.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct"
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [5]:
# Chat-style prompt: a system instruction followed by the user's request.
# Roles must be the names chat templates recognise ("system", "user", "assistant");
# a misspelled role such as "users" is not treated as a user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

In [6]:
# Quantization config: lets us load the model into memory while using less of it.

quant_config = BitsAndBytesConfig(  # technique used to reduce memory consumption
    load_in_4bit=True,                      # keep the weights in 4-bit precision instead of 32-bit or 16-bit
    bnb_4bit_quant_type="nf4",              # method that determines how the quantization is carried out
    bnb_4bit_use_double_quant=True,         # quantize twice to compress further and save more memory
    bnb_4bit_compute_dtype=torch.bfloat16,  # data type used for computation
)

In [7]:
# Tokenizer

tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Technical setting: reuse the EOS (end-of-sequence) token, the special marker that
# signals the end of a sentence or reply, as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# add_generation_prompt=True makes the rendered prompt end with the tokens that open the
# assistant's turn, so generation starts directly with the reply.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",  # "pt" = PyTorch tensors
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# .from_pretrained is the method that actually downloads the model from the Hugging Face Hub and loads it.
# device_map="auto" automatically decides on which hardware (GPU, CPU, RAM) the model's layers are placed
# (with a single GPU everything goes there; with several, the layers are distributed automatically).
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    device_map="auto",
    quantization_config=quant_config,
)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Bare last expression: the notebook displays the loaded model object's repr as the cell output.
model

In [None]:
# Make sure the prompt tensor lives on the same device as the model before generating;
# .to() is a no-op when it is already there.
outputs = model.generate(inputs.to(model.device), max_new_tokens=80)  # the reply is capped at 80 new tokens
# outputs[0] is the first (and only) sequence of the batch; decode it back to text.
print(tokenizer.decode(outputs[0]))

In [None]:
# Drop the notebook's references to the model and its tensors, then ask PyTorch
# to release the GPU memory it is still caching so the next model has room.
del inputs
del outputs
del model
torch.cuda.empty_cache()

In [None]:
# Wrapping everything in a function and adding streaming

def generate(model, messages, max_new_tokens=80):
  """Load a chat model with the shared quantization config, stream its reply, then free it.

  Args:
    model: Hugging Face Hub id of the model to load (e.g. PHI3 or GEMMA2).
    messages: list of {"role": ..., "content": ...} chat messages.
    max_new_tokens: upper bound on the number of tokens to generate (default 80).

  Returns:
    None. The text is printed by the TextStreamer while it is being generated.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token
  streamer = TextStreamer(tokenizer)
  # Keep the loaded network under its own name instead of rebinding the `model`
  # argument, which holds the Hub id string.
  llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)
  # add_generation_prompt=True ends the prompt with the start of the assistant turn.
  # return_dict=True yields input_ids together with the attention mask, so generate()
  # does not have to guess the mask while pad_token == eos_token.
  inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
      return_dict=True,
  ).to(llm.device)  # follow the device the model was actually placed on
  outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)
  # Drop every reference created here and release cached GPU memory so the
  # next model can be loaded.
  del tokenizer, streamer, llm, inputs, outputs
  torch.cuda.empty_cache()

In [None]:
# Stream a reply from Phi-3 for the chat messages currently bound to `messages`.
generate(model=PHI3, messages=messages)

In [None]:
# Rebind `messages` to a conversation with a single user turn (no system message)
# NOTE(review): presumably because Gemma's chat template does not accept a system role; confirm.
messages = [
    dict(role="user", content="Tell a light hearted joke for a room of Data scientists"),
]

# Stream a reply from Gemma 2 for that prompt.
generate(model=GEMMA2, messages=messages)