# In [None]:
# Wikipedia model as described in the paper

import os
import torch
import sentencepiece as spm
import gcsfs
import pandas as pd
from transformers import GPT2Config, GPT2LMHeadModel, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Local directory (under the current working directory) the trained model is saved to.
MODEL_SAVE_PATH = os.path.join(os.getcwd(), 'wikipedia_160M_model')

# Google Cloud Storage handle and the bucket location of the SentencePiece model.
TOKENIZER_PATH = 'gs://transformer-ngrams/32768.model'
fs = gcsfs.GCSFileSystem()

# Read the serialized tokenizer model from the bucket, then build the
# SentencePiece processor directly from those bytes.
with fs.open(TOKENIZER_PATH, 'rb') as f:
    tokenizer_proto = f.read()
tokenizer = spm.SentencePieceProcessor(model_proto=tokenizer_proto)

# Tokenizer smoke test: encode one sample sentence and print the result.
sample_ids = tokenizer.encode("Wikipedia is a free encyclopedia")
print("Test Tokenisierung:", sample_ids)

# Load the Wikipedia training data: every object listed under the training
# prefix is read as a parquet file and the frames are concatenated.
WIKIPEDIA_TRAINING_DATA_PATH = 'gs://transformer-ngrams/Wikipedia/train_data/'
# The listing is re-prefixed with the gs:// scheme before opening.
wiki_files = [f'gs://{file}' for file in fs.ls(WIKIPEDIA_TRAINING_DATA_PATH)]
wiki_dfs = []
for file in wiki_files:
    # Open each remote file in a context manager so the handle is closed as
    # soon as the frame is read, instead of leaking one handle per file.
    with fs.open(file, 'rb') as parquet_handle:
        wiki_dfs.append(pd.read_parquet(parquet_handle))
df_wiki = pd.concat(wiki_dfs)

# Model architecture (160M model; the values follow the author's notes on the paper).
architecture = {
    "vocab_size": 32768,   # same vocabulary size as the tokenizer model
    "n_positions": 2048,   # context length of 2048 tokens
    "n_embd": 896,         # hidden size noted for the 160M model
    "n_layer": 12,         # 12 transformer layers
    "n_head": 16,          # 16 attention heads
}
config = GPT2Config(**architecture)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Freshly initialised (untrained) GPT-2 language model, moved to the chosen device.
model = GPT2LMHeadModel(config)
model = model.to(device)

# Dataset class
class WikipediaDataset(Dataset):
    """Map-style dataset over pre-tokenized documents.

    Args:
        df: DataFrame with a ``"tokens"`` column; each entry is a sequence of
            token ids.
        context_size: Maximum number of tokens returned per example. Longer
            sequences are truncated so they never exceed the model's
            position-embedding table.
    """

    def __init__(self, df, context_size=2048):
        self.data = df["tokens"].tolist()
        self.context_size = context_size

    def __len__(self):
        """Return the number of documents."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` for document ``idx``.

        Both tensors hold the same (truncated) token ids. The labels are
        deliberately NOT shifted here: ``GPT2LMHeadModel`` shifts the labels
        internally when computing the loss, so shifting them in the dataset
        as well would train the model to predict the token two positions
        ahead.
        """
        tokens = self.data[idx][:self.context_size]
        input_ids = torch.tensor(tokens, dtype=torch.long)
        labels = input_ids.clone()
        return input_ids, labels

# Padding for the DataLoader
def collate_fn(batch):
    """Pad a list of ``(input_ids, labels)`` pairs into two batch tensors.

    Inputs are right-padded with token id 0 and labels with -100, giving
    tensors of shape ``(batch, longest_sequence)``. Returns the pair
    ``(padded_input_ids, padded_labels)``.
    """
    sequences = []
    targets = []
    for example in batch:
        # Work on detached copies so the dataset's tensors are never touched.
        sequences.append(example[0].detach().clone())
        targets.append(example[1].detach().clone())

    padded_inputs = pad_sequence(sequences, batch_first=True, padding_value=0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=-100)
    return padded_inputs, padded_targets

# Optimizer + scheduler (cosine decay with warmup)
batch_size = 8
gradient_accumulation_steps = 16  # effective batch size = 8 * 16 = 128
num_epochs = 1  # Wikipedia is trained for a single epoch only

# Build the dataset and loader first so the schedule length can be derived
# from the real number of batches rather than estimated from the row count.
dataset = WikipediaDataset(df_wiki, context_size=2048)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)

# One optimizer step is taken per accumulation window. Ceiling division
# counts the trailing partial window too, so the cosine schedule spans exactly
# the number of optimizer steps taken instead of ending one step early.
optimizer_steps_per_epoch = (
    len(train_loader) + gradient_accumulation_steps - 1
) // gradient_accumulation_steps
num_training_steps = optimizer_steps_per_epoch * num_epochs
# NOTE(review): if num_training_steps is below 1000 the run ends during warmup — confirm dataset size.
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=num_training_steps)

# Training loop with gradient accumulation: gradients from
# `gradient_accumulation_steps` consecutive micro-batches are summed before a
# single optimizer/scheduler step is taken.
print(f"Training gestartet für {num_epochs} Epoche...")
for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    num_batches = len(train_loader)

    # Start each epoch from clean gradients. Gradients are cleared only after
    # an optimizer step, never before a forward pass, otherwise accumulation
    # would be discarded.
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        # Scale so the summed gradient of a full window equals the gradient
        # of the mean loss over that window (a trailing partial window is
        # therefore weighted slightly lower).
        (loss / gradient_accumulation_steps).backward()

        # Step once a full window has been accumulated (counting from 1, so
        # the first step happens after a full window rather than on step 0),
        # and also on the last batch so leftover gradients are applied.
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        loop.set_description(f"Epoch {epoch+1}")
        # Report the unscaled micro-batch loss.
        loop.set_postfix(loss=loss.item())

# Save the trained model
model.save_pretrained(MODEL_SAVE_PATH)
print(f"Training abgeschlossen. Modell gespeichert in: {MODEL_SAVE_PATH}")
