In [1]:
import re

def clean_article(text: str) -> str:
    """Strip news-wire boilerplate (update stamps, datelines, bylines) from an article.

    The substitutions run in a fixed order, each on the output of the previous one:

    1. every line that starts with "Updated ..." or "Last updated ..." (any case);
    2. a leading header ending in "(<anything>) -- ", e.g.
       "LONDON, England (CNN Student News) -- ";
    3. a leading run of capitals/punctuation ending in "--";
    4. a leading "... (CNN ... --" header not already caught by step 2;
    5. a first-line "By ..." byline, only when that line ends with a newline.

    Steps 2-5 are anchored at the very start of the string (no MULTILINE flag),
    so they can only ever affect the first line.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Remove "Updated ..." / "Last updated ..." lines (also present in DailyMail articles).
    text = re.sub(r"(?im)^\s*(last\s+)?updated.*(\n|$)", "", text)

    # Remove a leading "<place> (<source>) -- " header.
    # The prefix is lazy (".*?") so the match stops at the FIRST "(...) --";
    # a greedy ".*" would run to the LAST one on the line and delete the body
    # text in between.
    text = re.sub(r"^.*?\([^)]*\)\s*--\s*", "", text)

    # Remove CNN headers: all-caps location followed by "--" ...
    text = re.sub(r"^[A-Z ,()'-]{2,100}--\s*", "", text)
    # ... and any remaining "... (CNN ... --" header (lazy prefix, same reason as above).
    text = re.sub(r"^.*?\(CNN.*?--\s*", "", text)

    # Remove a CNN byline on the first line ("By John Doe, CNN ...").
    # NOTE(review): under (?i) the "[A-Z][a-z]+" part matches letters of any case,
    # so a first line such as "By now ..." also qualifies -- confirm this is intended.
    text = re.sub(r"(?i)^by [A-Z][a-z]+.*cnn.*\n", "", text)

    # Remove a DailyMail byline on the first line ("By ... Daily Mail Reporter ...").
    text = re.sub(r"(?i)^by .*Daily Mail Reporter .*\n", "", text)
    # Remove any other first-line "By <name> ..." byline.
    text = re.sub(r"(?i)^by [A-Z][a-z]+.*\n", "", text)

    return text.strip()
# Smoke test: the "<place> (CNN ...) -- " header should be stripped, leaving only the body text.
text_1 = "LONDON, England (CNN Student News) -- Some intro text"
print(clean_article(text_1))


Some intro text


## Caricamento del dataset e del tokenizer
Estrazione della lunghezza in token di ogni articolo e unione di tutte le sezioni (train, validation, test).

In [2]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to measure article lengths in tokens.
model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the dataset as a DatasetDict; a later cell merges all of its splits into a single Dataset.
cnn_dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [3]:
# facciamo pulizia degli articoli e tokenizziamo
def preprocess(batch):
    """Clean each article in a batch and record its length in tokens.

    Args:
        batch: Mapping with an "article" entry holding the raw article texts.

    Returns:
        A dict with two parallel lists: "article_clean" (the cleaned texts)
        and "article_token_len" (number of token ids produced for each one).
    """
    article_clean = list(map(clean_article, batch["article"]))
    # No truncation: the full token count of every article is what gets measured.
    encoded = tokenizer(article_clean, truncation=False)
    article_token_len = list(map(len, encoded["input_ids"]))
    return {
        "article_clean": article_clean,  # kept so the cleaned texts can be inspected
        "article_token_len": article_token_len,
    }
# Run preprocess over the dataset in batches of 1000, dropping the raw "article" and "id" columns.
processed = cnn_dataset.map(
    preprocess,
    batched=True,
    batch_size=1000,
    remove_columns=["article", "id"],
)
# Merge the three splits into one Dataset.
# NOTE(review): this rebinds `cnn_dataset` to the merged result, so re-running this
# cell requires re-running the loading cell first.
cnn_dataset = concatenate_datasets(
    [processed[split] for split in ("train", "validation", "test")]
)
print(type(cnn_dataset))

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1191 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

<class 'datasets.arrow_dataset.Dataset'>


## Data cleaning con pandas
Esclusione della colonna id, esclusione delle righe con campi vuoti.

In [None]:
import pandas as pd
# Convert the Dataset into a pandas DataFrame
df_cnn_dailymail = cnn_dataset.to_pandas()
print(type(df_cnn_dailymail))


<class 'pandas.core.frame.DataFrame'>


In [6]:


# Count missing values per column, then preview the first rows.
# NOTE(review): nothing is dropped here; the null counts are only displayed.
display(df_cnn_dailymail.isnull().sum())

display(df_cnn_dailymail.head())

highlights           0
article_clean        0
article_token_len    0
dtype: int64

Unnamed: 0,highlights,article_clean,article_token_len
0,Harry Potter star Daniel Radcliffe gets £20M f...,Harry Potter star Daniel Radcliffe gains acces...,557
1,Mentally ill inmates in Miami are housed on th...,The ninth floor of the Miami-Dade pretrial det...,808
2,"NEW: ""I thought I was going to die,"" driver sa...",Drivers who were on the Minneapolis bridge whe...,909
3,"Five small polyps found during procedure; ""non...",Doctors removed five small polyps from Preside...,526
4,"NEW: NFL chief, Atlanta Falcons owner critical...",The National Football League has indefinitely ...,1191


### Analisi sulle lunghezze di riassunti e articoli

In [7]:
# New columns holding the word counts (whitespace-separated) of articles and summaries
def _word_count(text):
    """Return the number of whitespace-separated words in text."""
    return len(text.split())

for src_col, len_col in (("article_clean", "article_len"), ("highlights", "highlights_len")):
    df_cnn_dailymail[len_col] = df_cnn_dailymail[src_col].map(_word_count)

# Summary length as a fraction of article length; keep rows where it is below 1%
summary_ratio = df_cnn_dailymail["highlights_len"] / df_cnn_dailymail["article_len"]
df_troppo_corti = df_cnn_dailymail[summary_ratio < 1/100]
df_troppo_corti.to_html("df_troppo_corti.html", index=False)
print(df_troppo_corti.shape)




(162, 5)


In [8]:

# Number of articles whose token count exceeds 1024 (the maximum sequence length
# reported by the tokenizer warning during preprocessing)
count = df_cnn_dailymail["article_token_len"].gt(1024).sum()
print("Articoli oltre i 1024 token:", count)

Articoli oltre i 1024 token: 92030
