In [63]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
import os
import pandas as pd
from transformers import pipeline
import torch
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\javie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Checkpoint name handed to from_pretrained for the model (and reused for the tokenizer).
model_name='google/flan-t5-base'

# Load the sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [3]:
# Load the tokenizer for the same checkpoint name, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

### Cargar el dataset

In [5]:
# Walk the data/ tree and collect, for every readable file, its text plus the
# metadata encoded in its path: .../<article_or_summary>/<category>/<filename>.
# The five lists are appended together, only after everything for a file has
# been computed, so they always stay the same length.
path_, filename_, category_, article_or_summary_, content_ = [], [], [], [], []
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        # Normalise Windows separators so the path can be split on "/".
        path = full_path.replace("\\", "/")

        parts = path.split("/")
        if len(parts) < 3:
            # A file sitting directly under data/ has no category / kind folders.
            print(f"FICHERO IGNORADO (ruta inesperada): {path}")
            continue

        try:
            # The context manager closes the handle even when read() fails.
            # NOTE(review): the platform default encoding is kept on purpose to
            # preserve the existing behaviour — confirm the dataset's encoding.
            with open(full_path, "r") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Report which file failed and keep going with the rest of the tree.
            print(f"ERROR ABRIENDO EL FICHERO: {path} ({exc})")
            continue

        content_.append(content)
        path_.append(path)
        filename_.append(filename)
        category_.append(parts[-2])
        article_or_summary_.append(parts[-3])

In [42]:
# One row per file read from disk; the dict's insertion order gives the column order.
df1 = pd.DataFrame({
    "path": path_,
    "filename": filename_,
    "category": category_,
    "article_or_summary": article_or_summary_,
    "content": content_,
})

# Split articles from summaries and give BOTH frames a fresh 0..n-1 index, so
# the same label can be used on each of them regardless of which folder was
# walked first.
# NOTE(review): pairing row i of one frame with row i of the other still assumes
# both folders list their files in the same order — confirm for this dataset.
df_news = df1.loc[df1['article_or_summary'] == 'News Articles'].reset_index(drop=True)
df_summaries = df1.loc[df1['article_or_summary'] == 'Summaries'].reset_index(drop=True)

### Probando el encoder

In [23]:
# Pick the row labelled 1 as a one-row DataFrame (list selector keeps it 2-D).
indice_especifico = [1]
valor = df_news.loc[indice_especifico, :]
valor


Unnamed: 0,path,filename,category,article_or_summary,content
1,data/News Articles/business/002.txt,002.txt,business,News Articles,Dollar gains on Greenspan speech\n\nThe dollar...


In [24]:
# Text of the selected article as a plain Python string.
sentence = str(valor['content'].iloc[0])
sentence


'Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government\'s willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan\'s speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman\'s taking a much more sanguine view on the current account deficit than he\'s taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He\'s taking a longer-term view, laying out a set of conditions u

+ Esto nos servirá para comprobar después si el contexto que le pasamos al modelo cabe en el prompt.

In [25]:

# Round-trip the article through the tokenizer to inspect its ids and their count.
sentence_encoded = tokenizer(sentence, return_tensors='pt')
token_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(token_ids, skip_special_tokens=True)

# Each heading is printed on its own line, followed by the matching value.
for heading, value in (
    ('ENCODED SENTENCE:', token_ids),
    ('\nDECODED SENTENCE:', sentence_decoded),
    ('\nTAMAÑO DE LA CODIFICACIÓN:', len(token_ids)),
):
    print(heading)
    print(value)

ENCODED SENTENCE:
tensor([14110, 11391,    30,  1862,     7,  2837,  5023,    37,  6816,    65,
         1560,   165,  2030,   593,   581,     8,  3983,    16,   966,   386,
          767,   227,     8,  5034,  9473,   819,   243,     8,   837,  1668,
        11724,    19,   356,    12, 20426,     7,    15,     5,   275, 12453,
         1862,     7,  2837, 12566,     8,   837,   789,    31,     7, 20463,
           12, 16110,  2887,    11,  6937,  5699,  5051,    38,  2580,    84,
          164,   199,    12,  1428,    34,     5,    86,  1480,  3415,    16,
          368,  1060,     6,     8,  6816,  3495,  1970,     5,  2577,  4450,
          581,     8,  3983,     6,    45,  1970,     5,  3166,  4581,    30,
         2721,     5,  3611,  3315,    81,     8, 11724,    65,  1560,     8,
         1442,  1549,    16,  1100,   767,     5,   461,  1701,     6,  5034,
         9473, 13404,  1363,  1862,     7,  2837,    31,     7,  5023,    16,
         1524,  2177,    13,     8,  1338,    

Modelo para hacer los resúmenes

In [26]:
#primero sin few shot, únicamente pasando el texto

In [47]:
# Separator reused by the report cells, and the rows used as demo examples.
dash_line = "------------------------------------------------------------------------------------------------------"
indices = [3, 5, 8]

# Baseline: feed the raw article to the model, with no instruction around it.
for example_number, index in enumerate(indices, start=1):
    dialogue = df_news.loc[index, 'content']
    summary = df_summaries.loc[index, 'content']

    inputs = tokenizer(dialogue, return_tensors='pt')
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Assemble the whole report first, then emit it in one go.
    report_lines = [
        dash_line,
        f'Example  {example_number}',
        dash_line,
        f'INPUT PROMPT:\n{dialogue}',
        dash_line,
        f'BASELINE HUMAN SUMMARY:\n{summary}',
        dash_line,
        f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n',
    ]
    print('\n'.join(report_lines))

------------------------------------------------------------------------------------------------------
Example  1
------------------------------------------------------------------------------------------------------
INPUT PROMPT:
High fuel prices hit BA's profits

British Airways has blamed high fuel prices for a 40% drop in profits.

Reporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of £75m ($141m) compared with £125m a year earlier. Rod Eddington, BA's chief executive, said the results were "respectable" in a third quarter when fuel costs rose by £106m or 47.3%. BA's profits were still better than market expectation of £59m, and it expects a rise in full-year revenues.

To help offset the increased price of aviation fuel, BA last year introduced a fuel surcharge for passengers.

In October, it increased this from £6 to £10 one-way for all long-haul flights, while the short-haul surcharge was raised from £2.50 to £4 a leg. Yet aviation 

In [48]:
# Zero-shot: wrap each article in a plain instruction, with no solved examples.
for i, index in enumerate(indices):
    dialogue = df_news.loc[index]['content']
    summary = df_summaries.loc[index]['content']

    # Fixed the "exper" typo so the instruction sent to the model reads correctly
    # and matches the wording used by the few-shot prompt.
    prompt = f"""
You are an expert in news summarization. Summarize the following text.

{dialogue}

Summary:
    """

    # Input constructed prompt instead of the dialogue.
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"], 
            max_new_tokens=50,
        )[0], 
        skip_special_tokens=True
    )
    
    # Show the prompt, the human reference and the model output side by side.
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)    
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

------------------------------------------------------------------------------------------------------
Example  1
------------------------------------------------------------------------------------------------------
INPUT PROMPT:

You are an exper in news summarization. Summarize the following text.

High fuel prices hit BA's profits

British Airways has blamed high fuel prices for a 40% drop in profits.

Reporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of £75m ($141m) compared with £125m a year earlier. Rod Eddington, BA's chief executive, said the results were "respectable" in a third quarter when fuel costs rose by £106m or 47.3%. BA's profits were still better than market expectation of £59m, and it expects a rise in full-year revenues.

To help offset the increased price of aviation fuel, BA last year introduced a fuel surcharge for passengers.

In October, it increased this from £6 to £10 one-way for all long-haul flights, while t

In [67]:
# Few-shot prompting
# Helper that builds the prompt
def make_prompt(example_indices_full, example_index_to_summarize, news=None, summaries=None):
    """Build a few-shot summarization prompt.

    Args:
        example_indices_full: Row labels of the solved examples; each one is
            rendered as an article followed by its summary.
        example_index_to_summarize: Row label of the article the model must
            summarize; it is rendered last, without a summary.
        news: DataFrame with a 'content' column holding the articles.
            Defaults to the notebook-level ``df_news``.
        summaries: DataFrame with a 'content' column holding the summaries,
            indexed with the same labels as ``news``. Defaults to the
            notebook-level ``df_summaries``.

    Returns:
        The prompt as a single string.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    news = df_news if news is None else news
    summaries = df_summaries if summaries is None else summaries

    solved_examples = []
    for index in example_indices_full:
        dialogue = news.loc[index, 'content']
        summary = summaries.loc[index, 'content']

        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5. Other models may have their own preferred stop sequence.
        solved_examples.append(
            "You are an expert in news summarization.\n"
            "News:\n"
            "\n"
            f"{dialogue}\n"
            "\n"
            "Summary:\n"
            f"{summary}\n"
            "\n"
            "\n"
        )

    dialogue = news.loc[example_index_to_summarize, 'content']

    # The final block is left open so the model writes the summary itself.
    question = (
        "\n"
        "News:\n"
        "\n"
        f"{dialogue}\n"
        "\n"
        "Make a 3 sentences long Summary:"
    )

    return ''.join(solved_examples) + question

In [68]:
# Preview the prompt: rows 80 and 35 act as solved examples, row 100 is the
# article left for the model to summarize.
example_indices_full = [80, 35]
example_index_to_summarize = 100

one_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_index_to_summarize,
)

print(one_shot_prompt)

You are an expert in news summarization.
News:

BMW to recall faulty diesel cars

BMW is to recall all cars equipped with a faulty diesel fuel-injection pump supplied by parts maker Robert Bosch.

The faulty part does not represent a safety risk and the recall only affects pumps made in December and January. BMW said that it was too early to say how many cars were affected or how much the recall would cost. The German company is to extend a planned production break at one of its plants due to the faulty Bosch part. The Dingolfing site will now be closed all next week instead of for just two days. The additional three-day stoppage will mean a production loss of up to 3,600 vehicles, BMW said, adding that it was confident it could make up the numbers later.

Bosch has stopped production of the part but expects to restart by 2 February. The faulty component does not represent a safety risk but causes the motor to stall after a significant amount of mileage. When asked if BMW would be seek

In [69]:
# Few-shot run: three solved examples, then the article at row 200.
example_indices_full = [40, 50, 300]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)
# Human-written reference for the article being summarized.
summary = df_summaries.loc[example_index_to_summarize, 'content']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Reference first, model output second, each under a separator line.
print(
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    dash_line,
    f'MODEL GENERATION - FEW SHOT:\n{output}',
    sep='\n',
)

------------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Swiss drugmaker Novartis has announced 5.65bn euros ($7.4bn; £3.9bn) of purchases to make its Sandoz unit the world's biggest generic drug producer.Novartis said that it would be able to make cost savings of about $200m a year following the acquisitions.Novartis said that it would merge a number of departments, adding that there may be job cuts.The deal will see Novartis' Sandoz business overtake Israel's Teva Pharmaceuticals as the world's biggest maker of generics.Novartis, which last month forecast record sales for 2005, said it had bought all of Germany's Hexal.Novartis' shares rose 1% to 57.85 Swiss francs in early trading.

------------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
Swiss drugmaker Novartis has announced $5.65bn ($7.4bn; £3.9bn) of purchases to make its Sandoz unit th

In [70]:
# BLEU score of the generated summary against the human summary.
# corpus_bleu expects, for every hypothesis, a list of alternative references,
# each one a list of tokens. There is exactly one reference here — the whole
# human summary — so it is tokenized as a unit instead of being split into
# sentence fragments that would each count as a separate reference.
referencias_tokenizadas = [word_tokenize(summary)]

# Tokenize the generated summary
hipotesis_tokenizada = word_tokenize(output)

# One hypothesis, hence one (single-element) list of references.
score = corpus_bleu(
    [referencias_tokenizadas],
    [hipotesis_tokenizada],
    smoothing_function=SmoothingFunction().method1,
)
print(f"BLEU score: {score*100:.2f}")

BLEU score: 3.63
