In [2]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


## Arricchimento del Dataset con Info Su KeyWords

In [4]:
import os
import torch
import pandas as pd
from tqdm import tqdm
import gc
from typing import List, Dict

# Check versions and import properly
print("Checking and importing dependencies...")
print(f"PyTorch version: {torch.__version__}")

# Import KeyBERT and SentenceTransformer with error handling
try:
    from keybert import KeyBERT
    from sentence_transformers import SentenceTransformer
except Exception as e:
    print(f"Error importing KeyBERT or SentenceTransformer: {e}")
    print("Please make sure you have the correct versions installed.")
    raise


def process_texts(texts: list, summaries: list, batch_size: int, top_n_keywords: int, model_path: str):
    """
    Optimized processing for single GPU with large memory
    """
    # Enable performance optimizations
    torch.backends.cudnn.benchmark = True
    if hasattr(torch.backends.cuda, 'matmul'):
        torch.backends.cuda.matmul.allow_tf32 = True

    # Setup GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize model with optimizations
    try:
        model = SentenceTransformer(model_path)
        if device.type == 'cuda':
            model.half()  #usa fp16
        model.to(device)
        kw_model = KeyBERT(model=model)
    except Exception as e:
        print(f"Error initializing models: {e}")
        raise

    results = []

    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[i:i + batch_size]
        batch_summaries = summaries[i:i + batch_size]

        try:
            # Process each text in the batch
            for idx, (text, summary) in enumerate(zip(batch_texts, batch_summaries)):
                try:
                    keywords = kw_model.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words='english',
                        top_n=top_n_keywords,
                        use_maxsum=False,
                        use_mmr=True,
                        diversity=0.5
                    )

                    keywords_section = " ".join([f"[{kw.upper()}:{score:.2f}]" for kw, score in keywords])
                    augmented_text = f"{text}\n\nKeywords: {keywords_section}"
                    results.append({"text": augmented_text, "summary": summary})

                except Exception as e:
                    print(f"Error processing text {i+idx}: {e}")
                    results.append({"text": text, "summary": summary})

            # Clean GPU memory periodically
            if i % (batch_size * 4) == 0 and i > 0 and device.type == 'cuda':
                torch.cuda.empty_cache()
                gc.collect()

        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")
            for text, summary in zip(batch_texts, batch_summaries):
                results.append({"text": text, "summary": summary})

    return results


def process_dataset(file_path: str, batch_size: int, top_n_keywords: int, model_path: str) -> List[Dict]:
    """
    Process dataset using single GPU
    """
    print("Loading dataset...")
    df = pd.read_csv(file_path)

    # Filter valid documents
    mask = df.iloc[:, 1].apply(lambda x: isinstance(x, str) and len(str(x).strip()) > 10)
    df = df[mask].reset_index(drop=True)

    texts = df.iloc[:, 1].tolist()
    summaries = df.iloc[:, 2].tolist()

    del df  # Free memory
    gc.collect()

    return process_texts(texts, summaries, batch_size, top_n_keywords, model_path)


def save_to_csv(processed_data: List[Dict], output_path: str):
    chunk_size = 5000
    for i in range(0, len(processed_data), chunk_size):
        chunk = processed_data[i:i + chunk_size]
        mode = 'w' if i == 0 else 'a'
        header = i == 0

        pd.DataFrame(chunk).to_csv(
            output_path,
            index=False,
            mode=mode,
            header=header
        )

        del chunk
        gc.collect()

    print(f"Saved processed data to {output_path}")


def main():
    # Print CUDA information
    print("\nCUDA Information:")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    file_path = "data.csv"

    # Process dataset with optimized parameters for single GPU
    processed_data = process_dataset(
        file_path=file_path,
        batch_size=64,  # Conservative batch size
        top_n_keywords=10,
        model_path="all-MiniLM-L6-v2"
    )

    # Save results
    output_path = "processed_dataset.csv"
    save_to_csv(processed_data, output_path)

    print(f"\nProcessing completed. Processed {len(processed_data)} documents")


if __name__ == '__main__':
    main()

Checking and importing dependencies...
PyTorch version: 2.5.1+cu121

CUDA Information:
CUDA available: True
GPU: Tesla T4
GPU Memory: 14.75 GB
Loading dataset...
Using device: cuda:0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing batches: 100%|██████████| 180/180 [34:40<00:00, 11.56s/it]


Saved processed data to processed_dataset.csv

Processing completed. Processed 11490 documents


## Finetuning su Dataset Arricchito

In [5]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [10]:
import torch
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

# 1. Carica il tokenizer e il modello
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# 2. Carica il dataset dal file CSV
file_path = "processed_dataset.csv"  # Specifica il percorso corretto
df = pd.read_csv(file_path)

# Converti il dataframe in un dataset Hugging Face
hf_dataset = Dataset.from_pandas(df)

# 3. Preprocessing del dataset
def preprocess_function(examples):
    inputs = examples["text"]  # Usa direttamente la colonna 'text'
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Applica il preprocessing
dataset = hf_dataset.map(preprocess_function, batched=False)

# Dividi il dataset in training e validation
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# 4. Collatore per i dati
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 5. Imposta i parametri di training
training_args = Seq2SeqTrainingArguments(
    output_dir="./distilbart-summarization",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Usa FP16 se CUDA è disponibile
    push_to_hub=False,  # Cambia a True se vuoi caricare il modello su Hugging Face Hub
)

# 6. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 7. Avvia il training
trainer.train()

# 8. Salva il modello fine-tunato
trainer.save_model("./distilbart-summarization")
tokenizer.save_pretrained("./distilbart-summarization")




Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss
500,1.8266,1.733834
1000,1.8188,1.733539
1500,1.3777,1.751115
2000,1.397,1.7173
2500,1.4113,1.707285
3000,1.0282,1.779222
3500,1.0477,1.768952




('./distilbart-summarization/tokenizer_config.json',
 './distilbart-summarization/special_tokens_map.json',
 './distilbart-summarization/vocab.json',
 './distilbart-summarization/merges.txt',
 './distilbart-summarization/added_tokens.json')

In [14]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

model_path = "./distilbart-summarization"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Testo da riassumere
text_to_summarize = """
Scientists have discovered a new species of frog in the remote rainforests of the Amazon. This brightly colored amphibian, named Dendrobates aurora, has a striking blue and yellow pattern, making it easily distinguishable from other species. Researchers believe the frog's unique coloration serves as a warning to predators about its toxic skin secretions. The discovery highlights the importance of preserving the Amazon rainforest, which is home to countless undiscovered species. Unfortunately, deforestation continues to threaten this delicate ecosystem, with thousands of hectares lost each year to logging and agriculture. Conservationists are calling for immediate action to protect these habitats, emphasizing that the loss of biodiversity could have devastating consequences for the planet.
<KEYWORDS> [NEW SPECIES:0.78] [FROG AMAZON:0.72] [TOXIC SKIN:0.67] [RAINFOREST CONSERVATION:0.65] [BIODIVERSITY LOSS:0.59] [BLUE YELLOW:0.52] [PREDATORS WARNING:0.49] [DEFENSE MECHANISM:0.45] [HABITAT THREAT:0.42] [ACTION REQUIRED:0.40] </KEYWORDS>"""

# Preprocessa il testo
inputs = tokenizer(text_to_summarize, return_tensors="pt", truncation=True, max_length=512).to(device)  # Porta i tensori sullo stesso dispositivo

# Genera il riassunto
model.eval()  # Imposta il modello in modalità valutazione
with torch.no_grad():
    summary_ids = model.generate(
        inputs.input_ids,  # Ora sul dispositivo corretto
        max_length=128,
        num_beams=4,  # Beam search per migliorare la qualità
        early_stopping=True
    )

# Decodifica il riassunto
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("\n*** Riassunto Generato ***\n")
print(summary)




*** Riassunto Generato ***

Scientists have discovered a new species of frog in the Amazon .
The brightly colored amphibian has a striking blue and yellow pattern .
The discovery highlights the importance of preserving the Amazon rainforest .


## Finetuning su Testo Standard

In [None]:
import torch
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

# 1. Carica il tokenizer e il modello
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# 2. Carica il dataset dal file CSV
file_path = "data.csv"  # Specifica il percorso corretto
df = pd.read_csv(file_path)

# Converti il dataframe in un dataset Hugging Face
hf_dataset = Dataset.from_pandas(df)

# 3. Preprocessing del dataset
def preprocess_function(examples):
    inputs = examples["article"]  # Usa direttamente la colonna 'text'
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(examples["highlights"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Applica il preprocessing
dataset = hf_dataset.map(preprocess_function, batched=False)

# Dividi il dataset in training e validation
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# 4. Collatore per i dati
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 5. Imposta i parametri di training
training_args = Seq2SeqTrainingArguments(
    output_dir="./distilbart-summarization",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Usa FP16 se CUDA è disponibile
    push_to_hub=False,  # Cambia a True se vuoi caricare il modello su Hugging Face Hub
)

# 6. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 7. Avvia il training
trainer.train()

# 8. Salva il modello fine-tunato
trainer.save_model("./distilbart-summarization_standard_text")
tokenizer.save_pretrained("./distilbart-summarization_standard_text")


## Seleziona e Scarica il modello

In [None]:
!zip -r fine_tuned_bart.zip ./distilbart-summarization


  adding: distilbart-summarization/ (stored 0%)
  adding: distilbart-summarization/vocab.json (deflated 68%)
  adding: distilbart-summarization/training_args.bin (deflated 52%)
  adding: distilbart-summarization/checkpoint-3500/ (stored 0%)
  adding: distilbart-summarization/checkpoint-3500/scheduler.pt (deflated 55%)
  adding: distilbart-summarization/checkpoint-3500/trainer_state.json (deflated 75%)
  adding: distilbart-summarization/checkpoint-3500/vocab.json (deflated 68%)
  adding: distilbart-summarization/checkpoint-3500/training_args.bin (deflated 52%)
  adding: distilbart-summarization/checkpoint-3500/optimizer.pt (deflated 9%)
  adding: distilbart-summarization/checkpoint-3500/generation_config.json (deflated 47%)
  adding: distilbart-summarization/checkpoint-3500/merges.txt (deflated 53%)
  adding: distilbart-summarization/checkpoint-3500/config.json (deflated 62%)
  adding: distilbart-summarization/checkpoint-3500/tokenizer_config.json (deflated 76%)
  adding: distilbart-sum

In [None]:
from google.colab import files
files.download("fine_tuned_bart.zip")

