In [1]:
import os
import pandas as pd
from transformers import pipeline
from rouge import Rouge
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\javie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Prueba con un único artículo

In [2]:
#Modelo de summarization de HuggingFace facebook/bart-large-cnn
#URL descripcion: https://huggingface.co/facebook/bart-large-cnn
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")




In [3]:
with open('data/News Articles/business/001.txt', 'r') as file:
    contenido = file.read()

# Mostrar el contenido del archivo
print(contenido)

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL

In [4]:

res = summarizer(contenido, max_length=500, min_length=200, do_sample=False)
print(res)

[{'summary_text': "Time Warner profits up 76% to $1.13bn for the three months to December. Firm now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance. For 2005, Time Warner is projecting operating earnings growth of around 5% and also expects higher revenue and wider profit margins. The firm is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe."}]


In [5]:
resumen_generado = res[0]['summary_text']

In [6]:
with open('data/Summaries/business/001.txt', 'r') as file:
    resumen_referencia = file.read()
print(resumen_referencia)

TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations.


In [7]:
#Clasifiación de precisión Rouge
rouge = Rouge()
scores = rouge.get_scores(resumen_generado, resumen_referencia)
print(scores)

[{'rouge-1': {'r': 0.5092592592592593, 'p': 0.4661016949152542, 'f': 0.4867256587266035}, 'rouge-2': {'r': 0.31654676258992803, 'p': 0.2732919254658385, 'f': 0.2933333283602223}, 'rouge-l': {'r': 0.49074074074074076, 'p': 0.4491525423728814, 'f': 0.46902654368235575}}]


In [8]:
#Clasificación de precisión BLEU
referencias_tokenizadas = [word_tokenize(sent) for sent in resumen_referencia.split('. ') if sent]

# Tokenizando el resumen generado
hipotesis_tokenizada = word_tokenize(resumen_generado)

# Asegurándonos de que las referencias estén en una lista de listas como espera corpus_bleu
score = corpus_bleu([referencias_tokenizadas], [hipotesis_tokenizada], smoothing_function=SmoothingFunction().method1)
print(f"BLEU score: {score*100:.2f}")

BLEU score: 26.65


### Generalización a más de un artículo

In [9]:
path_, filename_, category_, article_or_summary_,content_ = [],[],[],[],[]
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        path = os.path.join(dirname, filename).replace("\\","/")
        f = open(os.path.join(dirname, filename),"r")
        try:
            content_.append(str(f.read()))
            path_.append(path)
            filename_.append(filename)
            category_.append(path.split("/")[-2])
            article_or_summary_.append(path.split("/")[-3])
        except:
            print("ERROR ABRIENDO EL FICHERO")

In [10]:
df1 = pd.DataFrame({"path":path_, "filename":filename_, "category":category_, "article_or_summary":article_or_summary_,"content":content_}, columns=["path", "filename", "category", "article_or_summary","content"])
df1

Unnamed: 0,path,filename,category,article_or_summary,content
0,data/News Articles/business/001.txt,001.txt,business,News Articles,Ad sales boost Time Warner profit\n\nQuarterly...
1,data/News Articles/business/002.txt,002.txt,business,News Articles,Dollar gains on Greenspan speech\n\nThe dollar...
2,data/News Articles/business/003.txt,003.txt,business,News Articles,Yukos unit buyer faces loan claim\n\nThe owner...
3,data/News Articles/business/004.txt,004.txt,business,News Articles,High fuel prices hit BA's profits\n\nBritish A...
4,data/News Articles/business/005.txt,005.txt,business,News Articles,Pernod takeover talk lifts Domecq\n\nShares in...
...,...,...,...,...,...
4444,data/Summaries/tech/397.txt,397.txt,tech,Summaries,BT is introducing two initiatives to help beat...
4445,data/Summaries/tech/398.txt,398.txt,tech,Summaries,A third of them read unsolicited junk e-mail a...
4446,data/Summaries/tech/399.txt,399.txt,tech,Summaries,This goes to the heart of the European project...
4447,data/Summaries/tech/400.txt,400.txt,tech,Summaries,Amit Yoran was director of the National Cyber ...


In [11]:
df = df1[df1['article_or_summary'] == "News Articles"]
df

Unnamed: 0,path,filename,category,article_or_summary,content
0,data/News Articles/business/001.txt,001.txt,business,News Articles,Ad sales boost Time Warner profit\n\nQuarterly...
1,data/News Articles/business/002.txt,002.txt,business,News Articles,Dollar gains on Greenspan speech\n\nThe dollar...
2,data/News Articles/business/003.txt,003.txt,business,News Articles,Yukos unit buyer faces loan claim\n\nThe owner...
3,data/News Articles/business/004.txt,004.txt,business,News Articles,High fuel prices hit BA's profits\n\nBritish A...
4,data/News Articles/business/005.txt,005.txt,business,News Articles,Pernod takeover talk lifts Domecq\n\nShares in...
...,...,...,...,...,...
2219,data/News Articles/tech/397.txt,397.txt,tech,News Articles,BT program to beat dialler scams\n\nBT is intr...
2220,data/News Articles/tech/398.txt,398.txt,tech,News Articles,Spam e-mails tempt net shoppers\n\nComputer us...
2221,data/News Articles/tech/399.txt,399.txt,tech,News Articles,Be careful how you code\n\nA new European dire...
2222,data/News Articles/tech/400.txt,400.txt,tech,News Articles,US cyber security chief resigns\n\nThe man mak...


In [12]:
df_resumenes = df1[df1['article_or_summary'] == "Summaries"] 
df_resumenes

Unnamed: 0,path,filename,category,article_or_summary,content
2224,data/Summaries/business/001.txt,001.txt,business,Summaries,TimeWarner said fourth quarter sales rose 2% t...
2225,data/Summaries/business/002.txt,002.txt,business,Summaries,The dollar has hit its highest level against t...
2226,data/Summaries/business/003.txt,003.txt,business,Summaries,Yukos' owner Menatep Group says it will ask Ro...
2227,data/Summaries/business/004.txt,004.txt,business,Summaries,"Rod Eddington, BA's chief executive, said the ..."
2228,data/Summaries/business/005.txt,005.txt,business,Summaries,Pernod has reduced the debt it took on to fund...
...,...,...,...,...,...
4444,data/Summaries/tech/397.txt,397.txt,tech,Summaries,BT is introducing two initiatives to help beat...
4445,data/Summaries/tech/398.txt,398.txt,tech,Summaries,A third of them read unsolicited junk e-mail a...
4446,data/Summaries/tech/399.txt,399.txt,tech,Summaries,This goes to the heart of the European project...
4447,data/Summaries/tech/400.txt,400.txt,tech,Summaries,Amit Yoran was director of the National Cyber ...


In [14]:
categoria_business = 150
categoria_tech = 180
categoria_entertainment = 220
categoria_politics = 240
categoria_sport = 180

summary_df = pd.DataFrame(columns=['filename', 'summary_HF', 'category'])

# Iterar sobre cada categoría y generar 10 resúmenes por categoría
for category in df['category'].unique():
    try:
        category_df = df[df['category'] == category].head(15) 
        for index, row in category_df.iterrows():
            filename = row['filename']
            content = row['content']
            valores_a_filtrar = [filename,category_df]

            df_filtrado = df_resumenes[df_resumenes[['filename', 'category']].apply(tuple, axis=1).isin(valores_a_filtrar)]

            #obtener solo la columna 'content'
            contenido_filtrado = df_filtrado['content']
            
            if category == 'business':        
                res = summarizer(content, max_length=categoria_business, min_length=100, do_sample=False)
            elif category == 'tech':
                res = summarizer(content, max_length=categoria_tech, min_length=127, do_sample=False)
            elif category == 'entertainment':
                res = summarizer(content, max_length=categoria_entertainment, min_length=150, do_sample=False)
            elif category == 'politics':
                res = summarizer(content, max_length=categoria_politics, min_length=156, do_sample=False)
            elif category == 'sport':
                res = summarizer(content, max_length=categoria_sport, min_length=104, do_sample=False)
                
            resumen_generado = res[0]['summary_text']
            
            summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
    except:
        print('Hubo un error')

# Ver el DataFrame con los resúmenes generados
print(summary_df)


  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'category': category}, ignore_index=True)
  summary_df = summary_df.append({'filename': filename, 'summary_HF': resumen_generado, 'c

Hubo un error
   filename                                         summary_HF  category
0   001.txt  Time Warner profits up 76% to $1.13bn for the ...  business
1   002.txt  Dollar hits highest level against the euro in ...  business
2   003.txt  Yukos' owner Menatep Group says it will ask Ro...  business
3   004.txt  British Airways blames high fuel prices for a ...  business
4   005.txt  Allied Domecq shares in London rose 4% by 1200...  business
..      ...                                                ...       ...
63  004.txt  Nicholas Negroponte, chairman and founder of M...      tech
64  005.txt  UK telco BT has launched its Connected World i...      tech
65  006.txt  A network of community computer centres, linke...      tech
66  007.txt  Microsoft issues eight 'critical' security hol...      tech
67  008.txt  Zafi.D virus translates the Christmas greeting...      tech

[68 rows x 3 columns]


In [15]:
summary_df.to_csv("ResumenesGeneradosHF.csv",index=False)