In [None]:
pip install torch transformers sentencepiece datasets gcsfs accelerate
pip install tensorflow


# Daten abrufen (Wikipedia & TinyStories)

In [None]:
import gcsfs
import pandas as pd

# Bucket prefixes of the TinyStories and Wikipedia training shards.
TINYSTORIES_TRAINING_DATA_PATH = 'gs://transformer-ngrams/TinyStories/training_data/'
WIKIPEDIA_TRAINING_DATA_PATH = 'gs://transformer-ngrams/Wikipedia/training_data/'

fs = gcsfs.GCSFileSystem()


def _read_shard(shard_path):
    """Open one parquet shard through ``fs`` and return it as a DataFrame."""
    with fs.open(shard_path, 'rb') as handle:
        return pd.read_parquet(handle)


# Load a single shard per corpus (there are 100 files per dataset).
df_tiny = _read_shard(TINYSTORIES_TRAINING_DATA_PATH + '001.parquet')
df_wiki = _read_shard(WIKIPEDIA_TRAINING_DATA_PATH + '001.parquet')

# Preview the token data: TinyStories first, then Wikipedia.
for preview in (df_tiny.head(), df_wiki.head()):
    print(preview)


# Tokenizer laden

In [None]:
import sentencepiece as spm

TOKENIZER_PATH = 'gs://transformer-ngrams/32768.model'

# Read the serialized SentencePiece model from the bucket, then build the
# processor from the raw bytes.
with fs.open(TOKENIZER_PATH, 'rb') as f:
    model_bytes = f.read()
tokenizer = spm.SentencePieceProcessor(model_proto=model_bytes)

# Smoke-test the tokenizer on a short sentence.
text = "This is a test"
tokens = tokenizer.encode_as_ids(text)
print(tokens)  # e.g. [2345, 23, 12, 543] (illustrative; actual ids depend on the model file)

# Modell definieren: das kleinste Modell (160M)


In [None]:
import torch
from transformers import GPT2Config, GPT2LMHeadModel

# 160M model for TinyStories & Wikipedia
config = GPT2Config(
    vocab_size=32768,  # tokenizer vocabulary size
    n_positions=1024,  # maximum context length
    n_embd=896,  # embedding width (896 / 16 heads = 56 dims per head)
    n_layer=12,  # number of transformer layers
    n_head=16,  # number of attention heads
)

model = GPT2LMHeadModel(config)

# Pick the GPU when one is available and fall back to the CPU otherwise,
# instead of calling model.cuda() unconditionally (which raises on
# CPU-only machines).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


#  Trainingsdaten vorbereiten

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class WikipediaTinyDataset(Dataset):
    """Causal-LM dataset over pre-tokenized sequences from several DataFrames.

    Every item is a pair ``(input_ids, labels)`` of ``torch.long`` tensors,
    each of length exactly ``context_size``:

    * sequences longer than ``context_size`` are truncated;
    * shorter sequences are right-padded, with ``pad_token_id`` in
      ``input_ids`` and ``IGNORE_INDEX`` in ``labels`` so that padded
      positions do not contribute to the loss.

    ``labels`` are NOT shifted relative to ``input_ids``: Hugging Face
    causal-LM heads such as ``GPT2LMHeadModel`` shift the labels internally,
    so shifting here as well would train the model to predict the token two
    positions ahead.

    Args:
        dfs: Iterable of DataFrames, each with a ``'tokens'`` column whose
            cells are sequences of token ids.
        context_size: Fixed length of every returned example (must be >= 1).
        pad_token_id: Id written into padded ``input_ids`` positions.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """

    # Label value ignored by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, dfs, context_size=512, pad_token_id=0):
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        self.data = []
        for df in dfs:
            self.data.extend(df['tokens'].tolist())  # collect the token sequences
        self.context_size = context_size
        self.pad_token_id = pad_token_id

    def __len__(self):
        """Return the number of token sequences across all DataFrames."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the fixed-length ``(input_ids, labels)`` pair for ``idx``."""
        # torch.tensor copies, so the stored sequence is never mutated.
        seq = torch.tensor(self.data[idx][: self.context_size], dtype=torch.long)
        n_tokens = seq.shape[0]

        # Start from fully padded buffers and write the real tokens in front;
        # a fixed length lets the default DataLoader collate stack the batch.
        input_ids = torch.full((self.context_size,), self.pad_token_id, dtype=torch.long)
        labels = torch.full((self.context_size,), self.IGNORE_INDEX, dtype=torch.long)
        input_ids[:n_tokens] = seq
        labels[:n_tokens] = seq

        return input_ids, labels

# Merge the TinyStories and Wikipedia shards into one dataset and wrap it
# in a shuffling loader that yields batches of 8 examples.
training_frames = [df_tiny, df_wiki]
dataset = WikipediaTinyDataset(training_frames, context_size=512)
train_loader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)


# Modelltraining

In [None]:
# transformers.AdamW is deprecated (and removed in recent releases);
# the PyTorch optimizer is the supported replacement.
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW so the update rule stays the same as before;
# torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=1e-2.
optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 4

for epoch in range(num_epochs):
    model.train()
    # The description is constant within an epoch, so set it once here
    # rather than on every step.
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)

    for batch in loop:
        optimizer.zero_grad()

        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        # Passing labels makes the model return the language-modelling loss.
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        # Show the current loss in the progress bar.
        loop.set_postfix(loss=loss.item())

print("Training abgeschlossen!")


Haben wir mehrere GPUs? Dann können wir `accelerate` nutzen:

In [None]:
from accelerate import Accelerator
# Hand the model, optimizer and data loader to Accelerate and rebind the
# names to the objects it returns.
# NOTE(review): in notebook order this cell runs after the training loop, so
# that loop used the unprepared objects. To train with Accelerate this
# would have to run before the loop, and presumably loss.backward() would
# become accelerator.backward(loss) -- confirm against the Accelerate docs.
accelerator = Accelerator()
prepared = accelerator.prepare(model, optimizer, train_loader)
model, optimizer, train_loader = prepared


# Modell speichern

In [None]:
# Persist the trained model to a local directory.
model.save_pretrained("wikipedia_tinystories_model")
# NOTE(review): `hf_tokenizer` is not defined in any cell visible in this
# notebook; the only tokenizer created earlier is the SentencePiece
# `tokenizer` object. Confirm where `hf_tokenizer` comes from -- on a fresh
# kernel this line raises NameError.
hf_tokenizer.save_pretrained("wikipedia_tinystories_tokenizer")


# Laden des Modells


In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# Directories the save cell writes to.
MODEL_DIR = "wikipedia_tinystories_model"
TOKENIZER_DIR = "wikipedia_tinystories_tokenizer"

# Reload both artifacts; this rebinds the notebook-level names `model`
# and `tokenizer`.
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_DIR)
