In [1]:
!pip install torch transformers sentencepiece datasets gcsfs accelerate
!pip install tensorflow




# Daten abrufen (Wikipedia & TinyStories)

In [3]:
import sentencepiece as spm
import gcsfs

fs = gcsfs.GCSFileSystem()

# Lade den Tokenizer aus Google Cloud Storage (GCS)
TOKENIZER_PATH = 'gs://transformer-ngrams/32768.model'

# Öffne den Tokenizer mit gcsfs
with fs.open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = spm.SentencePieceProcessor(model_proto=f.read())


In [4]:
import gcsfs
import pandas as pd

fs = gcsfs.GCSFileSystem()

# TinyStories Daten
TINYSTORIES_TRAINING_DATA_PATH = 'gs://transformer-ngrams/TinyStories/training_data/'

# Wikipedia Daten
WIKIPEDIA_TRAINING_DATA_PATH = 'gs://transformer-ngrams/Wikipedia/train_data/'

# Lade TinyStories (erste 2 Dateien, um Speicher zu sparen)
tiny_files = [f'gs://{file}' for file in fs.ls(TINYSTORIES_TRAINING_DATA_PATH)[:2]]
tiny_dfs = [pd.read_parquet(fs.open(file, 'rb')) for file in tiny_files]
df_tiny = pd.concat(tiny_dfs)

# Lade Wikipedia (erste 2 Dateien)
wiki_files = [f'gs://{file}' for file in fs.ls(WIKIPEDIA_TRAINING_DATA_PATH)[:2]]
wiki_dfs = [pd.read_parquet(fs.open(file, 'rb')) for file in wiki_files]
df_wiki = pd.concat(wiki_dfs)

# Wikipedia `observation` & `target` in `tokens`-Format umwandeln und Daten mit dem richtigen Tokenizer neu tokenisieren weil sonst Fehler
df_wiki["tokens"] = df_wiki.apply(lambda row: tokenizer.encode(" ".join(map(str, row["observation"]))) +
                                             tokenizer.encode(" ".join(map(str, row["target"]))), axis=1)


# Nur die `tokens`-Spalte behalten, um TinyStories-Format nachzuahmen
df_wiki = df_wiki[["tokens"]]

print("TinyStories Beispiel:")
print(df_tiny.head())

print("\nWikipedia Beispiel (angepasst auf TinyStories-Format):")
print(df_wiki.head())



TinyStories Beispiel:
                                              tokens
0  [2178, 769, 280, 4922, 32600, 3746, 3031, 351,...
1  [603, 275, 13556, 1071, 5199, 360, 16875, 305,...
2  [340, 280, 7094, 32599, 6027, 305, 26360, 1792...
3  [22565, 11111, 360, 280, 7320, 2888, 4550, 200...
4  [1220, 383, 529, 3031, 32642, 4, 4, 13666, 305...

Wikipedia Beispiel (angepasst auf TinyStories-Format):
                                              tokens
0  [11, 13, 9, 32578, 14, 10, 8, 32578, 9, 9, 17,...
1  [11, 10, 13, 15, 16, 32578, 12, 32578, 9, 17, ...
2  [11, 10, 14, 8, 8, 32578, 16, 12, 11, 8, 32578...
3  [11, 10, 14, 8, 8, 32578, 13, 12, 8, 32578, 14...
4  [11, 8, 12, 32578, 10, 16, 8, 32578, 11, 10, 1...


# Tokenizer laden

In [5]:
import sentencepiece as spm

TOKENIZER_PATH = 'gs://transformer-ngrams/32768.model'

with fs.open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = spm.SentencePieceProcessor(model_proto=f.read())

# Teste den Tokenizer
text = "This is a test"
tokens = tokenizer.encode_as_ids(text)
print(tokens)  

[674, 328, 275, 1272]


# mit dem kleinsten Modell 160M


In [6]:
from transformers import GPT2Config, GPT2LMHeadModel
import torch

# 160M-Modell für TinyStories & Wikipedia (wie aus Paper)
config = GPT2Config(
    vocab_size=32768,  # Tokenizer-Vokabular
    n_positions=1024,  # Maximale Kontextgröße 1024
    n_embd=1024,  # Größe der Einbettungen 1024
    n_layer=24,  # Anzahl der Transformer-Schichten 24
    n_head=16,  # Anzahl der Attention-Köpfe 16
)

model = GPT2LMHeadModel(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)



cuda


#  Trainingsdaten vorbereiten

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class WikipediaTinyDataset(Dataset):
    def __init__(self, dfs, context_size=512):
        self.data = []
        for df in dfs:
            self.data.extend(df['tokens'].tolist())  # Token-Sequenzen extrahieren
        self.context_size = context_size
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        tokens = self.data[idx]

        input_ids = torch.tensor(tokens[:-1], dtype=torch.long)
        labels = torch.tensor(tokens[1:], dtype=torch.long)
        
        return input_ids, labels

# Daten von TinyStories + Wikipedia kombinieren
dataset = WikipediaTinyDataset([df_tiny, df_wiki], context_size=512)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [8]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Falls der Tokenizer keinen `pad_token_id` hat, setzen wir ihn auf das EOS-Token oder 0
if not hasattr(tokenizer, "pad_token_id") or tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 0  # Alternativ EOS-Token nutzen, falls bekannt

# Padding-Funktion für Dataloader
def collate_fn(batch):
    input_ids = [b[0].clone().detach() for b in batch]
    labels = [b[1].clone().detach() for b in batch]

    # Kürze alle Sequenzen auf `n_positions=1024`
    max_length = 1024
    input_ids = [input_id[:max_length] for input_id in input_ids]  # Kürzen
    labels = [label[:max_length] for label in labels]  # Kürzen

    # Padding auf `n_positions=1024`
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return input_ids, labels


In [9]:
# Erstelle den DataLoader mit Padding
train_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)


# Modelltraining

In [10]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Debug-Modus für CUDA


In [11]:
vocab_size = tokenizer.vocab_size()  # Methode aufrufen, um die Zahl zu bekommen


In [12]:
print(f" tokenizer.vocab_size: {vocab_size}")
print(f" tokenizer.vocab_size Typ: {type(vocab_size)}")

 tokenizer.vocab_size: 32768
 tokenizer.vocab_size Typ: <class 'int'>


In [13]:
for batch in train_loader:
    input_ids, labels = batch
    
    print(f"Erste Batch geladen: input_ids Größe: {input_ids.shape}, labels Größe: {labels.shape}")
    print(f"Max Token-ID in input_ids: {input_ids.max()} (Sollte < {vocab_size} sein)")
    print(f"Max Token-ID in labels: {labels.max()} (Sollte < {vocab_size} oder -100 sein)")
    
    assert input_ids.max() < vocab_size, f" Token-Wert zu groß! Max-Wert: {input_ids.max()}, Vocab Size: {vocab_size}"
    assert labels.max() < vocab_size or labels.max() == -100, f"Label-Wert fehlerhaft! Max-Wert: {labels.max()}"
    
    break  # Nur erste Batch prüfen


Erste Batch geladen: input_ids Größe: torch.Size([8, 1024]), labels Größe: torch.Size([8, 1024])
Max Token-ID in input_ids: 32642 (Sollte < 32768 sein)
Max Token-ID in labels: 32642 (Sollte < 32768 oder -100 sein)


In [14]:
print("Prüfe TinyStories Daten:")
print(f"Max Token-ID in TinyStories: {df_tiny['tokens'].explode().max()} (Sollte < {vocab_size} sein)")

print("\nPrüfe Wikipedia Daten:")
print(f"Max Token-ID in Wikipedia: {df_wiki['tokens'].explode().max()} (Sollte < {vocab_size} sein)")


Prüfe TinyStories Daten:
Max Token-ID in TinyStories: 32760 (Sollte < 32768 sein)

Prüfe Wikipedia Daten:
Max Token-ID in Wikipedia: 32578 (Sollte < 32768 sein)


In [15]:
# Zeigt alle Token in Wikipedia, die größer als vocab_size sind
invalid_tokens = df_wiki['tokens'].explode()[df_wiki['tokens'].explode() >= vocab_size]
print("Fehlerhafte Wikipedia-Token (sollten nicht existieren):")
print(invalid_tokens.value_counts())  # Welche Token sind am häufigsten betroffen?


Fehlerhafte Wikipedia-Token (sollten nicht existieren):
Series([], Name: count, dtype: int64)


In [16]:
for batch in train_loader:
    input_ids, labels = batch
    
    # Verschiebe die Eingabedaten und Labels auf das gleiche Gerät wie das Modell
    input_ids, labels = input_ids.to(device), labels.to(device)
    
    print(f"✅ input_ids auf {input_ids.device}, labels auf {labels.device}")

    # Vorwärtspass im Modell
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    
    # Training fortsetzen
    optimizer.step()




✅ input_ids auf cuda:0, labels auf cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 22.30 GiB is allocated by PyTorch, and 84.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [20]:
from transformers import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-4)

# Trainingsloop
num_epochs = 4

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    
    for batch in loop:
        optimizer.zero_grad()
        
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        
        # Fortschritt anzeigen
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

print("Training abgeschlossen!")



  0%|                                                                                         | 0/1338 [00:00<?, ?it/s]


RuntimeError: CUDA error: out of memory
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


haben wir mehrere GPUs? Dann können wir nutzen:

In [None]:
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)


# Modell speichern

In [None]:
model.save_pretrained("wikipedia_tinystories_model")
hf_tokenizer.save_pretrained("wikipedia_tinystories_tokenizer")


# Laden des Modells


In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

model = GPT2LMHeadModel.from_pretrained("wikipedia_tinystories_model")
tokenizer = PreTrainedTokenizerFast.from_pretrained("wikipedia_tinystories_tokenizer")


In [4]:
import gcsfs
import pandas as pd

fs = gcsfs.GCSFileSystem()

# TinyStories Daten
TINYSTORIES_TRAINING_DATA_PATH = 'gs://transformer-ngrams/TinyStories/training_data/'
tiny_files = [f'gs://{file}' for file in fs.ls(TINYSTORIES_TRAINING_DATA_PATH)]

# Wikipedia Daten
WIKIPEDIA_TRAINING_DATA_PATH = 'gs://transformer-ngrams/Wikipedia/train_data/'
wiki_files = [f'gs://{file}' for file in fs.ls(WIKIPEDIA_TRAINING_DATA_PATH)]

# Lade TinyStories (z. B. die ersten 10 Dateien, um Speicher zu sparen)
tiny_dfs = [pd.read_parquet(fs.open(file, 'rb')) for file in tiny_files[:10]]
df_tiny = pd.concat(tiny_dfs)

# Lade Wikipedia (z. B. die ersten 10 Dateien)
wiki_dfs = [pd.read_parquet(fs.open(file, 'rb')) for file in wiki_files[:10]]
df_wiki = pd.concat(wiki_dfs)

print("TinyStories Beispiel:")
print(df_tiny.head())

print("\nWikipedia Beispiel:")
print(df_wiki.head())


TinyStories Beispiel:
                                              tokens
0  [2178, 769, 280, 4922, 32600, 3746, 3031, 351,...
1  [603, 275, 13556, 1071, 5199, 360, 16875, 305,...
2  [340, 280, 7094, 32599, 6027, 305, 26360, 1792...
3  [22565, 11111, 360, 280, 7320, 2888, 4550, 200...
4  [1220, 383, 529, 3031, 32642, 4, 4, 13666, 305...

Wikipedia Beispiel:
                                         observation  \
0  [351, 620, 11950, 884, 20420, 750, 537, 4797, ...   
1  [32578, 4, 19807, 2459, 14932, 6511, 433, 5483...   
2  [32600, 8430, 17510, 6834, 280, 7684, 14567, 1...   
3  [32600, 540, 6556, 23957, 304, 569, 6751, 3258...   
4  [304, 280, 32578, 10, 8, 8, 9, 305, 32578, 10,...   

                                              target  
0  [620, 11950, 884, 20420, 750, 537, 4797, 13977...  
1  [4, 19807, 2459, 14932, 6511, 433, 5483, 795, ...  
2  [8430, 17510, 6834, 280, 7684, 14567, 1216, 20...  
3  [540, 6556, 23957, 304, 569, 6751, 32584, 377,...  
4  [280, 32578, 10, 8, 8, 9