![RadioGPT Banner](https://openfileserver.chloelavrat.com/workshops/RadioGPT/assets/radiogpt-banner.png)



> 💡 **Please connect to a GPU session for more compute power:**
>
> `Runtime > Change runtime type > T4 GPU > Save`


In [6]:
#@title Initialize the notebook
# Clone the RadioGPT repository into ./RadioGPT.
# NOTE: stdout and stderr are redirected to /dev/null on every command in this
# cell, so a failure (e.g. the directory already exists, no network) is silent.
!git clone https://github.com/chloelavrat/RadioGPT.git > /dev/null 2>&1
# Check out the `clavrat/first-version` ref inside the clone.
!cd RadioGPT && git checkout clavrat/first-version > /dev/null 2>&1
# Install the Python dependencies (unpinned versions).
!pip install torch datasets tqdm transformers > /dev/null 2>&1

# 📀 Dataset Overview

In [8]:
#@title Select your radio dataset
#@markdown Please select your favorite radio station! Then run the cell to load the dataset 🌱
import os
import subprocess
from RadioGPT.gptmodel.core.dataset import AlpacaDataset
from RadioGPT.gptmodel.core.utils import download_dataset

radio_station = 'France Inter' # @param ["France Inter", "Mouv’", "France Culture"]

base_url = "https://openfileserver.chloelavrat.com/workshops/RadioGPT/dataset/"

# Settings for each station that has a published dataset:
#   block_size  -> sequence length handed to AlpacaDataset
#   file        -> file name appended to `base_url`
#   destination -> local path the file is downloaded to
# TODO: add entries for "Mouv’" and "France Culture" once their datasets exist.
station_datasets = {
    'France Inter': {
        'block_size': 64,
        'file': "Acquiesce_data_110k_instructions.json",
        'destination': "dataset/inter.json",
    },
}

# Fail with an explicit message for stations without a dataset. Previously the
# cell fell through with `file`/`destination`/`block_size` undefined (NameError)
# or, worse, silently reused the values left over from an earlier run.
if radio_station not in station_datasets:
    available = ", ".join(station_datasets)
    raise NotImplementedError(
        f"No dataset is available yet for {radio_station!r}. "
        f"Please select one of: {available}"
    )

station_settings = station_datasets[radio_station]
block_size = station_settings['block_size']
file = station_settings['file']
destination = station_settings['destination']

download_dataset(base_url+file, destination)
dataset = AlpacaDataset(destination, block_size)
print("Dataset loaded !")

Downloading dataset...
Dataset downloaded!
Loaded 110368 conversations
Maximum sequence length: 64
Dataset loaded !


# 🧠 Model definition

In [9]:
#@title Loading ...
# Load RadioGPT's checkpoint :)
from RadioGPT.gptmodel.core.model import GPTlite
from RadioGPT.gptmodel.core.utils import load_model
import os, subprocess, torch
# Pick the compute device: Apple MPS if available, otherwise CUDA, otherwise CPU.
device = (
    torch.device("mps") if torch.backends.mps.is_available() else
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("cpu")
)

# Download the pretrained checkpoint into ./models
print("Downloading model...")
os.makedirs("models", exist_ok=True)
# check=True makes a failed download raise CalledProcessError here, instead of
# leaving a missing or truncated file that only fails later inside torch.load.
subprocess.run(
    [
        "wget",
        "https://openfileserver.chloelavrat.com/workshops/RadioGPT/models/model_gpt_chat_best.pth",
        "-O",
        "models/model_gpt_chat_best.pth",
    ],
    check=True,
)

def load_model(model_path, device, config):
    """Build a GPTlite from `config` and fill it with the weights stored at `model_path`.

    Args:
        model_path: Path of the checkpoint file; its content is passed directly
            to `load_state_dict`, so it is expected to be a state dict.
        device: Passed to `torch.load` as `map_location`.
        config: Configuration object handed to the `GPTlite` constructor.

    Returns:
        The GPTlite instance with the checkpoint weights loaded. The model
        itself is not moved to `device` here.

    Note:
        This definition shadows the `load_model` imported from
        `RadioGPT.gptmodel.core.utils` earlier in the same cell.
    """
    # NOTE(review): weights_only=False unpickles arbitrary Python objects from a
    # file that was just downloaded. Since the result is used as a plain state
    # dict, weights_only=True is presumably sufficient; confirm and switch.
    state_dict = torch.load(model_path, map_location=device, weights_only=False)

    network = GPTlite(config)
    network.load_state_dict(state_dict)
    return network

# Hyperparameters (must describe the architecture stored in the checkpoint)
block_size = 64   # context length
n_embd = 512      # embedding width
n_head = 8        # attention heads
n_layer = 10      # number of layers
dropout = 0.2

# Model configuration consumed by GPTlite via load_model
config = dict(
    context_size=block_size,
    vocab_size=dataset.vocab_size,
    embedding_dim=n_embd,
    num_heads=n_head,
    num_layers=n_layer,
    dropout=dropout,
)

# Instantiate the model from the checkpoint and move it to the selected device
print("Loading model...")
model = load_model("models/model_gpt_chat_best.pth", device, config).to(device)
print("Model loaded !")

# Report the device in use and the model size (parameter count in millions)
total_params = sum(param.numel() for param in model.parameters())
print(f"You will use your {device}")
print(f"Total number of parameters   {total_params / 1e6:.1f}M")

Downloading model...
Loading model...
Model loaded !
You will use your cpu
Total number of parameters   83.1M


# ⛳️ RadioGPT Training



In [10]:
# training parameters
# Learning rate given to AdamW and used as OneCycleLR's `max_lr` in the training cell.
learning_rate = 1e-3
# Number of iterations of the outer training loop (one full pass over train_loader each).
epochs = 1000
# Batch size of both the training and the validation DataLoader.
batch_size = 128
# Maximum gradient norm passed to clip_grad_norm_; set to None to skip clipping.
grad_clip = 0.5
# Validation, checkpointing and the early-stopping check run every `eval_every` epochs.
eval_every = 100

In [None]:
#@title Casual Training Loop
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm

# --- Setup --- #
print("Let's fry some eggs!! (your loss should be less than 5, restart cell if not...)")
print("-" * 5)

# Gradient scaling and pinned host memory only make sense on CUDA. Requesting
# them unconditionally on a CPU/MPS session just produces warnings before
# PyTorch disables them, so enable them explicitly only when training on CUDA.
cuda_training = device.type == "cuda"

# Gradient scaler for mixed precision training (a no-op pass-through when disabled)
scaler = torch.amp.GradScaler("cuda", enabled=cuda_training)

# Split dataset into training (90%) and validation (remaining 10%) sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders: training batches are reshuffled every epoch,
# validation batches keep a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=cuda_training
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=cuda_training
)

# Initialize optimizer and learning rate scheduler
optimizer = optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.1,
    betas=(0.9, 0.999)
)

# One-cycle schedule spanning every optimizer step of the run
# (epochs * batches per epoch); the training loop steps it once per batch.
lr_scheduler = OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    epochs=epochs,
    steps_per_epoch=len(train_loader),
    pct_start=0.1,
    div_factor=25,
    final_div_factor=1e4
)

# Evaluation function
def evaluate(model, dataloader, device=None):
    """Return the mean per-batch loss of `model` over `dataloader`.

    The model is switched to eval mode and is left in eval mode on return;
    the caller is responsible for calling `model.train()` afterwards.

    Args:
        model: Module called as `model(x, y)` and returning `(logits, loss)`.
        dataloader: Iterable yielding `(x, y)` tensor batches.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function no longer depends on a notebook-level global.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    device = torch.device(device)

    # Mixed precision is only enabled on CUDA, matching the previous
    # torch.cuda.amp.autocast() behaviour without using the deprecated alias.
    use_autocast = device.type == "cuda"
    autocast_device = "cuda" if use_autocast else "cpu"

    model.eval()
    total_loss = 0
    num_batches = 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            with torch.amp.autocast(device_type=autocast_device, enabled=use_autocast):
                _, loss = model(x, y)
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("evaluate() received a dataloader that yields no batches")
    return total_loss / num_batches

# --- Training Loop --- #
import os  # used below to create the checkpoint directory and build its path

# `config` in this notebook is the flat model-hyperparameter dict, so it has no
# "Training" section: indexing config['Training'] raised KeyError at the first
# evaluation. Read the section if present, otherwise fall back to defaults.
training_options = config.get("Training", {})
save_dir = training_options.get("save_dir", "models")
# Number of consecutive evaluations without improvement tolerated before stopping.
patience = training_options.get("patience", 3)
os.makedirs(save_dir, exist_ok=True)
best_model_path = os.path.join(save_dir, "best_model.pth")

# Mixed precision is only enabled on CUDA, matching the previous
# torch.cuda.amp.autocast() behaviour without using the deprecated alias.
use_autocast = device.type == "cuda"
autocast_device = "cuda" if use_autocast else "cpu"

best_validation_loss = float('inf')
no_improvement_count = 0

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    # Training phase
    for batch_idx, (x, y) in enumerate(train_progress_bar):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.amp.autocast(device_type=autocast_device, enabled=use_autocast):
            logits, loss = model(x, y)

        # Backward pass on the scaled loss
        scaler.scale(loss).backward()

        if grad_clip is not None:
            # Gradients must be unscaled before clipping so that the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        scaler.step(optimizer)
        scaler.update()

        # Update learning rate scheduler (stepped once per batch)
        lr_scheduler.step()

        total_train_loss += loss.item()

        # Update progress bar
        train_progress_bar.set_postfix({
            'loss': f'{loss.item():.3f}',
            'lr': f'{optimizer.param_groups[0]["lr"]:.2e}'
        })

    # Validation phase every `eval_every` epochs
    if (epoch + 1) % eval_every == 0:
        avg_train_loss = total_train_loss / len(train_loader)
        val_loss = evaluate(model, val_loader)

        print("-" * 30)
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Average Train Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.2e}")
        print("-" * 30)

        # Save the best model based on validation loss
        if val_loss < best_validation_loss:
            best_validation_loss = val_loss
            no_improvement_count = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved! Loss: {best_validation_loss:.4f}")
        else:
            no_improvement_count += 1

        # Early stopping condition
        if no_improvement_count >= patience:
            print(f"No improvement in {patience} evaluations. Early stopping.")
            break

        # evaluate() leaves the model in eval mode; switch back for training
        model.train()



# 🎨 Let's Play

In [None]:
#@title Selected prompt

# play with the model
def generate_response(model, dataset, prompt, device, max_new_tokens):
    model.eval()
    # Encode the prompt
    input_tensor = dataset.encode(prompt).unsqueeze(0).to(device)

    # Generate text using the model's generate method
    with torch.no_grad():
        generated_indices = model.generate(input_tensor, max_new_tokens)
        generated_text = dataset.decode(generated_indices[0].tolist())

    # Return only the newly generated part (after the prompt)
    return generated_text[len(prompt):]


prompt = 'Prépare une recette de pâtes à la carbonara.' # @param ["Prépare une recette de pâtes à la carbonara.", "Quel est l'élément chimique avec le numéro atomique 29 ?", "Rédige un court paragraphe sur le thème de l'amitié et de la confiance."]
max_new_tokens = 106 # @param {type:"slider", min:5, max:500, step:1}

# Wrap the selected question in the "Question/Answer" template, generate the
# continuation, then print the template and the answer together.
templated_prompt = f"Question: {prompt}\nAnswer:"
completion = generate_response(model, dataset, templated_prompt, device, max_new_tokens)
prompt = templated_prompt + completion
print(prompt)

In [None]:
#@title Let's prompt it!

# play with the model
# NOTE(review): this redefines generate_response with the same body as the
# definition in the previous cell; consider keeping a single definition.
def generate_response(model, dataset, prompt, device, max_new_tokens):
    """Generate a continuation of `prompt` and return only the text after it.

    Args:
        model: Object exposing `eval()` and `generate(input_tensor, max_new_tokens)`.
        dataset: Object exposing `encode(text)` (returning a tensor) and
            `decode(list_of_ids)` (returning a string).
        prompt: Text used to condition the generation.
        device: Device the encoded prompt is moved to.
        max_new_tokens: Forwarded unchanged to `model.generate`.

    Returns:
        The decoded output with its first `len(prompt)` characters removed.
    """
    model.eval()

    # Encode the prompt and add a leading batch dimension of size 1
    prompt_ids = dataset.encode(prompt)
    batch = prompt_ids.unsqueeze(0).to(device)

    # Sample without tracking gradients
    with torch.no_grad():
        output_ids = model.generate(batch, max_new_tokens)

    full_text = dataset.decode(output_ids[0].tolist())

    # Strip the prompt by character count.
    # NOTE(review): assumes the decoded text starts with `prompt` verbatim; confirm.
    return full_text[len(prompt):]


prompt = 'Décris les différences entre le modèle GPT-2 et le modèle GPT-3.' # @param {type:"string"}
max_new_tokens = 180 # @param {type:"slider", min:5, max:500, step:1}

# Wrap the typed question in the "Question/Answer" template, generate the
# continuation, then print the template and the answer together.
templated_prompt = f"Question: {prompt}\nAnswer:"
completion = generate_response(model, dataset, templated_prompt, device, max_new_tokens)
prompt = templated_prompt + completion
print(prompt)