In [None]:
!pip install openai wikipedia-api tiktoken transformers

In [None]:
# Read the OpenAI key: try the Colab secret store first, then fall back to
# environment variables, and export it as OPENAI_API_KEY.
import os

try:
    from google.colab import userdata
    OPENAI_KEY = userdata.get('OPENAI_KEY')
except Exception:
    # Best-effort fallback (not on Colab, or the secret could not be read).
    # `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    OPENAI_KEY = os.getenv('OPENAI_KEY')

# Also accept a key that is already exported under the target name.
OPENAI_KEY = OPENAI_KEY or os.getenv('OPENAI_API_KEY')

if not OPENAI_KEY:
    # Fail with a clear message instead of the TypeError that assigning None
    # to os.environ would raise.
    raise RuntimeError(
        "No OpenAI key found: set the Colab secret 'OPENAI_KEY' or the "
        "environment variable 'OPENAI_KEY' / 'OPENAI_API_KEY'."
    )

os.environ['OPENAI_API_KEY'] = OPENAI_KEY


# Beispiel-Dokumente von Wikipedia

In [None]:
import tiktoken
import wikipediaapi
from pathlib import Path

In [None]:
# Download the plain-text extract of one article from the German Wikipedia.
page_name = 'Elvis_Presley'

wiki = wikipediaapi.Wikipedia('LangChain RAG', 'de', extract_format=wikipediaapi.ExtractFormat.WIKI)
page = wiki.page(page_name)

# Stop here if the page is missing, so later cells do not run on empty text.
if not page.exists():
    raise ValueError(f"Wikipedia page {page_name!r} was not found")

text = page.text

In [None]:
# Length of the article text fetched above.
len(text)

In [None]:
from openai import OpenAI
# No api_key argument is passed here; presumably the client picks up the
# OPENAI_API_KEY environment variable set earlier in the notebook — verify.
client = OpenAI()

## Übersetzen

In [None]:
# Translation prompt: the article text first, then the instruction.
# The instruction follows the text, so it refers to the text "above"
# ("obigen") rather than the "following" text.
prompt = f'''{text}
Übersetze den obigen Text ins englische:'''

In [None]:
# Send the prompt as a single user message to gpt-3.5-turbo.
completion = client.chat.completions.create(
    messages=[dict(role="user", content=prompt)],
    model="gpt-3.5-turbo",
)

# Text of the first returned choice.
completion.choices[0].message.content

### Tokens zählen

Siehe auch den Tokenizer auf der OpenAI-Website: https://platform.openai.com/tokenizer

In [None]:
# Look up the tiktoken encoding for gpt-3.5-turbo and count how many
# token ids the current text encodes to.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
len(encoding.encode(text))

In [None]:
# Cut the text off after the first n tokens.
n = 15000
tokenized_text = encoding.encode(text)[:n]

# Show the first ten token ids next to the first 100 characters of the text.
(tokenized_text[:10], text[:100])

In [None]:
# Decode a one-element list of token ids back into a string.
# NOTE(review): 6719 is presumably one of the ids shown in the previous
# cell's output — confirm.
encoding.decode([6719])

In [None]:
# Rebind `text` to the string decoded from the truncated token list.
# From here on `text` no longer holds the full article.
text = encoding.decode(tokenized_text)

In [None]:
# Display the current prompt. When the cells run top to bottom, `prompt` was
# built before `text` was reassigned, so it still embeds the earlier text.
prompt

In [None]:
# Rebuild the prompt from the current value of `text`, followed by a blank
# line and the translation instruction.
prompt = f'''{text}

Übersetze den Text vollständig ins englische:'''

In [None]:
# Repeat the request with the rebuilt prompt; max_tokens is passed
# explicitly as None.
completion = client.chat.completions.create(
    max_tokens=None,
    messages=[dict(role="user", content=prompt)],
    model="gpt-3.5-turbo",
)

# Text of the first returned choice.
completion.choices[0].message.content

In [None]:
# Number of tokens in the returned message content, using the same encoding.
len(encoding.encode(completion.choices[0].message.content))

#### "finish_reason"

In [None]:
# Display the whole response object.
# NOTE(review): presumably shown to inspect the choice's finish_reason field,
# as the section heading suggests — confirm.
completion

## Model input / output (Huggingface / OpenSource Modelle)

![llm.png](images/llm.png)

### Wie kommen wir an den "neuen" Token?

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the phi-2 causal language model (float32 weights) and its tokenizer.
# NOTE(review): trust_remote_code=True presumably allows Python code shipped
# with the model repository to run locally — confirm it is still required and
# that the source is trusted.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float32, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

In [None]:
# Tokenize the start of a sentence into PyTorch tensors, without an
# attention mask.
inputs = tokenizer(
    '''Elvis Aaron Presley (* January 8, 1935 in''',
    return_attention_mask=False,
    return_tensors="pt",
)

# Shape of the token-id tensor.
inputs['input_ids'].shape

In [None]:
# Run the model once on the tokenized prompt. This is inference only, so
# autograd tracking is switched off to avoid keeping the computation graph
# in memory.
with torch.no_grad():
    outputs = model(**inputs)

# Shape of the logits returned by the model.
outputs.logits.shape

In [None]:
# Keep only index -1 along the second axis of the logits, i.e. the last
# position; the first and third axes are kept in full.
new_token_logits = outputs.logits[:, -1, :]
new_token_logits.shape

In [None]:
# Pick the index of the largest logit along dim 1.
new_token = torch.argmax(new_token_logits, dim=1)

# Turn the chosen id back into text.
tokenizer.decode(new_token)

### Text generieren mit Huggingface

In [None]:
# Let the library generate the continuation, with max_length=20.
outputs = model.generate(max_length=20, **inputs)

# Decode the first (and only) sequence of the batch and display it.
text = tokenizer.decode(outputs[0])
text

### Text generieren ist eine "for loop"

In [None]:
# Manual greedy generation: repeatedly run the model, pick the highest-scoring
# id at the last position, and append it to the input ids.
inputs = tokenizer('''Elvis Aaron Presley (* January 8, 1935 in''', return_tensors="pt", return_attention_mask=False)

# Number of tokens to append to the prompt.
max_new_tokens = 20

# Inference only: switch off autograd so the repeated forward passes do not
# each keep a computation graph in memory.
with torch.no_grad():
    for i in range(max_new_tokens):
        # Generate the new token
        outputs = model(**inputs)
        new_token_logits = outputs.logits[:, -1, :]
        new_token = new_token_logits.argmax(dim=1)

        # Append the new token to the inputs; unsqueeze(-1) adds the trailing
        # axis needed to concatenate along dim 1.
        inputs["input_ids"] = torch.cat((inputs["input_ids"], new_token.unsqueeze(-1)), dim=1)

In [None]:
# Decode the first row of the extended input ids back into text.
tokenizer.decode(inputs["input_ids"][0])