In [34]:
import os
from transformers import pipeline
from rouge import Rouge
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\javie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Prueba con un único artículo

In [6]:
# Hugging Face summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
# Model card: https://huggingface.co/facebook/bart-large-cnn
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)




In [8]:
# Load the raw text of the first business article (platform-default encoding).
with open('data/BBC News Summary/News Articles/business/001.txt', mode='r') as file:
    contenido = file.read()

# Echo the article so it can be compared by eye with the summaries produced later.
print(contenido)

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL

In [24]:

# Summarize the article with sampling disabled; the summary length is bounded
# by the min_length / max_length arguments.
res = summarizer(
    contenido,
    do_sample=False,
    min_length=200,
    max_length=500,
)
print(res)

[{'summary_text': "Time Warner profits up 76% to $1.13bn for the three months to December. Firm now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance. For 2005, Time Warner is projecting operating earnings growth of around 5% and also expects higher revenue and wider profit margins. The firm is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe."}]


In [25]:
# The pipeline result is a list of dicts; keep the 'summary_text' of the first entry.
resumen_generado = res[0]['summary_text']

In [26]:
# Load the human-written reference summary that matches the article above
# (platform-default encoding), then display it.
with open('data/BBC News Summary/Summaries/business/001.txt', mode='r') as file:
    resumen_referencia = file.read()

print(resumen_referencia)

TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations.


In [27]:
# ROUGE scores (rouge-1, rouge-2, rouge-l) of the generated summary (hypothesis)
# measured against the reference summary.
rouge = Rouge()
scores = rouge.get_scores(
    hyps=resumen_generado,
    refs=resumen_referencia,
)
print(scores)

[{'rouge-1': {'r': 0.5092592592592593, 'p': 0.4661016949152542, 'f': 0.4867256587266035}, 'rouge-2': {'r': 0.31654676258992803, 'p': 0.2732919254658385, 'f': 0.2933333283602223}, 'rouge-l': {'r': 0.49074074074074076, 'p': 0.4491525423728814, 'f': 0.46902654368235575}}]


In [33]:
# BLEU score of the generated summary against the reference summary.
#
# The whole reference summary is a single reference. Splitting it into
# sentences and passing each fragment as an alternative reference makes
# corpus_bleu compare the full hypothesis against short fragments, which
# distorts both the n-gram clipping and the brevity penalty.
referencias_tokenizadas = [word_tokenize(resumen_referencia)]

# Tokenize the generated summary (the hypothesis).
hipotesis_tokenizada = word_tokenize(resumen_generado)

# corpus_bleu takes, for each hypothesis, a list of tokenized references:
# one hypothesis here, with one reference.
score = corpus_bleu([referencias_tokenizadas], [hipotesis_tokenizada], smoothing_function=SmoothingFunction().method1)
print(f"BLEU score: {score*100:.2f}")

BLEU score: 26.65


### Generalización a más de un artículo

In [43]:
# Folders holding the sport articles and their reference summaries.
carpeta_noticias = "data/BBC News Summary/News Articles/sport"
carpeta_resumenes = "data/BBC News Summary/Summaries/sport"

# List both folders and sort the names so the two listings are in a
# deterministic order.
archivos_noticias = os.listdir(carpeta_noticias)
archivos_noticias.sort()

archivos_resumenes = os.listdir(carpeta_resumenes)
archivos_resumenes.sort()

In [None]:
def leer_archivo(ruta, encoding='utf-8'):
    """Read a text file and return its full contents.

    Args:
        ruta: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'utf-8'``; pass another codec (e.g. ``'latin-1'``) for files
            that are not valid UTF-8.

    Returns:
        The file contents as ``str``, or ``None`` when the file does not
        exist or cannot be read/decoded. A message is printed in that case,
        so callers must check for ``None`` before using the result.
    """
    try:
        with open(ruta, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print("El archivo no fue encontrado")
    except Exception as e:
        # Best-effort reader: report the problem instead of aborting the run.
        print(f"Ocurrió un error al leer el archivo: {e}")
    # Make the failure value explicit rather than falling off the end.
    return None
    
# Evaluate the summarizer on every article of the category, collecting one
# BLEU score per successfully evaluated article.
scores = []

# The smoothing function does not depend on the article: build it once.
suavizado = SmoothingFunction().method1

# Pair each article with the summary that has the SAME file name. Zipping two
# independently sorted listings would silently mis-pair files as soon as one
# folder has a missing or extra entry.
resumenes_disponibles = set(archivos_resumenes)

for archivo_noticia in archivos_noticias:
    if archivo_noticia not in resumenes_disponibles:
        print(f"Sin resumen de referencia para {archivo_noticia}; se omite")
        continue
    archivo_resumen = archivo_noticia

    ruta_noticia = os.path.join(carpeta_noticias, archivo_noticia)
    ruta_resumen = os.path.join(carpeta_resumenes, archivo_resumen)

    noticia = leer_archivo(ruta_noticia)
    resumen_referencia = leer_archivo(ruta_resumen)

    # leer_archivo returns None when a file cannot be read; an empty file is
    # equally unusable. Skip the pair instead of crashing mid-run.
    if not noticia or not resumen_referencia:
        print(f"No se pudo evaluar {archivo_noticia}; se omite")
        continue

    # truncation=True clips articles longer than the model's input limit
    # instead of letting the model fail on them.
    res = summarizer(noticia, max_length=500, min_length=200, do_sample=False, truncation=True)

    resumen_generado = res[0]['summary_text']

    # BLEU: the whole reference summary is ONE reference. Splitting it into
    # sentences would hand corpus_bleu short fragments as alternative
    # references, distorting n-gram clipping and the brevity penalty.
    referencias_tokenizadas = [word_tokenize(resumen_referencia)]

    # Tokenize the generated summary (the hypothesis).
    hipotesis_tokenizada = word_tokenize(resumen_generado)

    # corpus_bleu takes, for each hypothesis, a list of tokenized references.
    score = corpus_bleu([referencias_tokenizadas], [hipotesis_tokenizada], smoothing_function=suavizado)
    print(f"BLEU score: {score*100:.2f}")
    scores.append(score)