In [2]:
import argparse
import json
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [3]:
import os
import math
import logging
import random
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import get_scheduler
from accelerate import Accelerator
from tqdm.notebook import tqdm
from itertools import chain
from datasets.utils.logging import set_verbosity_warning, set_verbosity_info

# Set up root logging: INFO level, records stamped as "MM/DD/YYYY HH:MM:SS".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Logger named after the current module, for use by the rest of the notebook.
logger = logging.getLogger(__name__)

In [6]:
# Run configuration for this notebook. Later cells read their settings from
# this dict, so tune values here rather than inline.
args = {
    'dataset_name': None,  # or a custom dataset path
    'train_file': None,
    'validation_file': None,
    'output_dir': './output',
    # NOTE(review): no visible cell loads a model/tokenizer from these two
    # names — confirm where `model` and `tokenizer` are created.
    'model_name_or_path': 'STLForCausalLM',
    'tokenizer_name': 'STLTokenizer',
    'block_size': 128,
    # NOTE(review): 'batch_size' is not read by any visible cell; the
    # DataLoaders use the per_device_* sizes below.
    'batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 3,
    'learning_rate': 5e-5,
    'weight_decay': 0.01,
    'num_warmup_steps': 0,
    'max_train_steps': None,  # None -> derived from epochs and steps per epoch
    'seed': 42,
    'with_tracking': False,
    'hub_model_id': None,
    'push_to_hub': False,
    'trust_remote_code': False,
    'overwrite_cache': False,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'checkpointing_steps': 'epoch',  # or 'steps' with an int value
    'resume_from_checkpoint': None,
}

# Initialize the accelerator.
# The accumulation step count is passed explicitly so that
# `accelerator.accumulate(...)` in the training loop honours
# args['gradient_accumulation_steps']; a bare Accelerator() falls back to 1
# and the configured value would be silently ignored.
accelerator = Accelerator(
    gradient_accumulation_steps=args['gradient_accumulation_steps'],
)

# Send telemetry for resource tracking (assuming you have this function)
# send_example_telemetry("run_clm_no_trainer", args)

# Set seed.
# `set_seed` (accelerate.utils) seeds Python's `random`, NumPy and PyTorch in
# one call, instead of seeding PyTorch alone.
if args['seed'] is not None:
    set_seed(args['seed'])

In [9]:
# Prepare the datasets.
# Either load a named dataset, or build a text dataset from the local files
# given in args['train_file'] / args['validation_file'].
if args['dataset_name']:
    raw_datasets = load_dataset(args['dataset_name'])
else:
    data_files = {}
    if args['train_file']:
        data_files["train"] = args['train_file']
    if args['validation_file']:
        data_files["validation"] = args['validation_file']
    # Actually hand the collected files to the loader (they were previously
    # built and then ignored). When no file is configured, pass None so the
    # call behaves exactly like the former bare `load_dataset('text')`.
    raw_datasets = load_dataset('text', data_files=data_files or None)

In [13]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook, so it is never saved or shared with the file.
# SECURITY: the write token that used to be hard-coded in this cell has been
# exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # No token in the environment: let huggingface_hub prompt for one.
    login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /u/dssc/scandu00/.cache/huggingface/token
Login successful


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Apply the notebook-level ``tokenizer`` to the ``'text'`` column of ``examples``.

    Returns whatever the tokenizer call returns, unchanged.
    """
    text_column = examples['text']
    return tokenizer(text_column)

# Tokenize in batches and drop the raw 'text' column from the result.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)

# Chunk length used by group_texts, which reads this notebook-level variable.
block_size = args['block_size']
def group_texts(examples):
    """Concatenate the sequences of a batch and cut them into fixed-size blocks.

    Every column of ``examples`` (a mapping of column name -> list of lists) is
    flattened into one long list and sliced into consecutive chunks of
    ``block_size`` items (``block_size`` is a notebook-level variable). Items
    beyond the last full block are dropped; the usable length is taken from the
    first column. A ``"labels"`` entry is added as a shallow copy of the
    ``"input_ids"`` chunks.
    """
    # Flatten each column: list of lists -> single list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # Round the first column's length down to a multiple of block_size.
    first_column = list(examples.keys())[0]
    full_length = len(flattened[first_column])
    usable_length = full_length - full_length % block_size

    # Slice every column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start:start + block_size] for start in block_starts]

    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized sequences into fixed-length blocks via group_texts.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

train_dataset = lm_datasets['train']
eval_dataset = lm_datasets['validation']

# DataLoader creation.
# `default_data_collator` stacks each feature of the examples into one tensor
# per batch. Without an explicit collate_fn, torch's default collate treats the
# per-example lists of token ids as sequences and returns a list of
# per-position tensors, which `model(**batch)` cannot consume.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=args['per_device_train_batch_size'],
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=args['per_device_eval_batch_size'],
)

# Optimizer setup.
# NOTE(review): `model` is not created in any visible cell — confirm it is
# defined before this cell runs.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=args['learning_rate'],
    weight_decay=args['weight_decay'],
)

# Scheduler setup: one optimizer update every `gradient_accumulation_steps`
# batches; the total step count is derived from the epoch count unless
# args['max_train_steps'] was set explicitly.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args['gradient_accumulation_steps'])
if args['max_train_steps'] is None:
    args['max_train_steps'] = args['num_train_epochs'] * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=args['num_warmup_steps'],
    num_training_steps=args['max_train_steps'],
)

# Prepare everything with accelerator
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

def _save_model_and_tokenizer(target_dir):
    """Write the trained model and the tokenizer to ``target_dir``.

    `accelerator.prepare` may hand back a wrapped model that does not expose
    `save_pretrained`, so the underlying model is unwrapped first. All
    processes are synchronised before saving and only the main process writes,
    so several processes never write the same files at once.
    """
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        accelerator.unwrap_model(model).save_pretrained(target_dir)
        # NOTE(review): `tokenizer` is not created in any visible cell — confirm
        # it is defined before this cell runs.
        tokenizer.save_pretrained(target_dir)


# Training Loop
for epoch in range(args['num_train_epochs']):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch}")
    total_loss = 0
    for step, batch in enumerate(progress_bar):
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Show the running mean of the training loss for this epoch.
        progress_bar.set_postfix(loss=total_loss / (step + 1))

    # Evaluation: mean of the per-batch losses over the validation loader.
    model.eval()
    eval_loss = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            eval_loss += outputs.loss.item()

    eval_loss /= len(eval_dataloader)
    # Perplexity is exp(loss); losses of 100 or more are reported as infinity.
    perplexity = math.exp(eval_loss) if eval_loss < 100 else float('inf')
    print(f"Epoch {epoch} evaluation loss: {eval_loss}, perplexity: {perplexity}")

    # Save a checkpoint at the end of every epoch when configured to do so.
    if args['checkpointing_steps'] == 'epoch':
        output_dir = os.path.join(args['output_dir'], f"epoch_{epoch}")
        _save_model_and_tokenizer(output_dir)

# Final model saving
_save_model_and_tokenizer(args['output_dir'])

print("Training completed!")
