In [1]:
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
on a text file or a dataset without using HuggingFace Trainer.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""

'\nFine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)\non a text file or a dataset without using HuggingFace Trainer.\n\nHere is the full list of checkpoints on the hub that can be fine-tuned by this script:\nhttps://huggingface.co/models?filter=text-generation\n'

In [2]:
import argparse
import json
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import load_dataset
from huggingface_hub import HfApi
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [3]:
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.48.0.dev0")

logger = get_logger(__name__)

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

ImportError: This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/docs/transformers/installation#install-from-source`), but the version found is 4.43.1.
Check out https://github.com/huggingface/transformers/tree/main/examples#important-note for the examples corresponding to other versions of HuggingFace Transformers.

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import logging
import os
import random
import math
from torch.optim import AdamW
from transformers import get_scheduler

# Configura il logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Funzione di tokenizzazione
def tokenize_function(examples, tokenizer, text_column_name="text"):
    return tokenizer(examples[text_column_name])

# Funzione di preprocessing (con blocchi di testo)
def group_texts(examples, tokenizer, block_size):
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Funzione di addestramento (senza trainer)
def train_model(model, tokenizer, train_dataset, eval_dataset, args):
    # Creazione dei DataLoader
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args["per_device_train_batch_size"]
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=args["per_device_eval_batch_size"]
    )

    # Ottimizzatore
    optimizer = AdamW(model.parameters(), lr=args["learning_rate"])

    # Scheduler
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args["gradient_accumulation_steps"])
    if args["max_train_steps"] is None:
        args["max_train_steps"] = args["num_train_epochs"] * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        name=args["lr_scheduler_type"],
        optimizer=optimizer,
        num_warmup_steps=args["num_warmup_steps"],
        num_training_steps=args["max_train_steps"]
    )

    # Ciclo di addestramento
    progress_bar = tqdm(range(args["max_train_steps"]))
    completed_steps = 0

    for epoch in range(args["num_train_epochs"]):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            # Backpropagation
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Aggiornamento della barra di progresso
            progress_bar.update(1)
            completed_steps += 1

            if completed_steps >= args["max_train_steps"]:
                break

        # Valutazione
        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            losses.append(loss.item())

        eval_loss = torch.mean(torch.tensor(losses))
        perplexity = math.exp(eval_loss)

        logger.info(f"Epoch {epoch}: Perplexity: {perplexity}, Eval Loss: {eval_loss.item()}")

        # Fine dell'addestramento
        if completed_steps >= args["max_train_steps"]:
            break

    # Salvataggio del modello finale
    model.save_pretrained(args["output_dir"])
    tokenizer.save_pretrained(args["output_dir"])

    return model, tokenizer

# Funzione principale che esegue la demo
def main_demo():
    # Parametri
    args = {
        "dataset_name": "wikitext",
        "model_name_or_path": "gpt2",
        "learning_rate": 5e-5,
        "num_train_epochs": 3,
        "max_train_steps": 1000,
        "block_size": 512,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 1,
        "num_warmup_steps": 0,
        "lr_scheduler_type": "linear",
        "output_dir": "./output",
    }

    # Caricamento del dataset
    raw_datasets = load_dataset(args["dataset_name"])

    # Tokenizzazione
    tokenizer = AutoTokenizer.from_pretrained(args["model_name_or_path"])
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]
    tokenized_datasets = raw_datasets.map(
        lambda x: tokenize_function(x, tokenizer, text_column_name),
        batched=True,
        remove_columns=column_names
    )

    # Preprocessing dei dati in blocchi
    block_size = args["block_size"]
    lm_datasets = tokenized_datasets.map(
        lambda x: group_texts(x, tokenizer, block_size),
        batched=True,
        desc=f"Grouping texts in chunks of {block_size}"
    )

    # Dataset di addestramento e di valutazione
    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]

    # Creazione e addestramento del modello
    model = AutoModelForCausalLM.from_pretrained(args["model_name_or_path"])
    model, tokenizer = train_model(model, tokenizer, train_dataset, eval_dataset, args)

    print(f"Training completed. Model saved to {args['output_dir']}")

# Esegui la demo
main_demo()


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

ValueError: Config name is missing.
Please pick one among the available configs: ['wikitext-103-raw-v1', 'wikitext-103-v1', 'wikitext-2-raw-v1', 'wikitext-2-v1']
Example of usage:
	`load_dataset('wikitext', 'wikitext-103-raw-v1')`