
# Text Summarizer

# Extractive

In [17]:
! pip install bert-extractive-summarizer

In [32]:
! pip install transformers

## BERT

In [33]:
from summarizer.bert import Summarizer ,TransformerSummarizer
import time

text= """Text classification is a classic problem in Natural Language Processing (NLP). 
The task is to assign predefined categories to a given text sequence. 
An important intermediate step is the text representation. 
Previous work uses various neural models to learn text representation, including convolution models 
(Kalchbrenner et al., 2014; Zhang et al.,2015), 
recurrent models (Liu et al., 2016; Yogatama et al.,2017; Seo et al., 2017), 
and attention mechanisms (Yang et al., 2016; Lin et al., 2017). Alternatively, 
substantial work has shown that pre-trained models on large corpus are beneficial for text classification and 
other NLP tasks, which can avoid training a new model from scratch. 
One kind of pre-trained models is the word embeddings, such as word2vec (Mikolov et al., 2013) 
and GloVe (Pennington et al., 2014), or the contextualized word embeddings, such as 
CoVe (McCann et al., 2017) and ELMo (Peters et al., 2018). These word embeddings are often used as 
additional features for the main task. Another kind of pre-training models is sentence level. 
Howard and Ruder (2018) propose ULMFiT, a fine-tuning method for pre-trained language model 
that achieves state-of-the-art results on six widely studied text classification datasets. 
More recently, pre-trained language models have shown to be useful in learning 
common language representations by utilizing a large amount of unlabeled data: e.g., 
OpenAI GPT (Radford et al., 2018) and BERT (Devlin et al., 2018). 
BERT is based on a multi-layer bidirectional Transformer (Vaswani et al., 2017) and is 
trained on plain text for masked word prediction and next sentence prediction tasks.
"""

# The clock starts before the summarizer is constructed, so the reported
# time covers both building the model and producing the summary.
t1 = time.time()

model = Summarizer()
# Ask for a three-sentence extractive summary of `text`.
result = model(
    text,
    num_sentences=3,
)
output = ''.join(result)

t2 = time.time()

print("\n", output)
print('\ntime for BERT:', t2 - t1, "sec")


## GPT2

In [29]:
import time
from summarizer.bert import Summarizer ,TransformerSummarizer

# Timing includes construction of the GPT-2 backed summarizer.
t1 = time.time()

GPT2_model = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="gpt2-medium",
)
# Three-sentence extractive summary of `text`.
gpt2_result = GPT2_model(text, num_sentences=3)
output = ''.join(gpt2_result)

t2 = time.time()
print("\n", output)

print('\ntime for GPT2:', t2 - t1, "sec")


## XLNet

In [28]:
import time

# Timing includes construction of the XLNet backed summarizer.
t1 = time.time()

model = TransformerSummarizer(
    transformer_type="XLNet",
    transformer_model_key="xlnet-base-cased",
)
# Three-sentence extractive summary of `text`.
xlnet_result = model(text, num_sentences=3)
output = ''.join(xlnet_result)

t2 = time.time()

print("\n", output)

print('\ntime for XLNet:', t2 - t1, "sec")

# Abstractive

## T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

import torch
import time

# Lower bound passed to generate(); the upper bound is five times this value.
min_length = 20

# Timing covers model loading as well as generation.
t1 = time.time()

device = torch.device('cpu')
# Keep the model on the same device the input tensor is moved to.
model = T5ForConditionalGeneration.from_pretrained('t5-large').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Collapse every whitespace run (newlines included) into one space. Simply
# deleting "\n" would glue together words on adjacent lines whenever a line
# has no trailing space.
preprocess_text = " ".join(text.split())
# T5 selects its task from a textual prefix.
t5_prepared_Text = "summarize: " + preprocess_text

# truncation=True keeps over-long inputs within the tokenizer's maximum
# length, matching what summary_T5 does in the Flask app below.
tokenized_text = tokenizer.encode(
    t5_prepared_Text,
    truncation=True,
    return_tensors="pt",
).to(device)

summary_ids = model.generate(
    tokenized_text,
    num_beams=4,
    no_repeat_ngram_size=2,
    min_length=min_length,
    max_length=min_length * 5,
    early_stopping=True,
)

# decode() takes no max_length argument, so the stray max_length=2 was dropped.
output_t5 = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

t2 = time.time()

print("\n", output_t5)

print('\ntime for T5:', t2 - t1, "sec")




In [None]:
from transformers import pipeline
import time

# Timing covers pipeline construction as well as generation.
t1 = time.time()

min_length = 20

summarization = pipeline("summarization", model="t5-large")
# Deterministic decoding (no sampling); the pipeline returns a list with one
# dict per input, from which the summary text is taken.
t5_summaries = summarization(
    text,
    max_length=min_length * 5,
    min_length=20,
    do_sample=False,
)
output_t = t5_summaries[0]['summary_text']

t2 = time.time()

print("\n", output_t)

print('\ntime for T5:', t2 - t1, "sec")


## BART

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

import torch
import time

# Lower bound passed to generate(); the upper bound is five times this value.
min_length = 20

# Timing covers model loading as well as generation.
t1 = time.time()

device = torch.device('cpu')
# Keep the model on the same device the input tensor is moved to.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Collapse every whitespace run (newlines included) into one space. Simply
# deleting "\n" would glue together words on adjacent lines whenever a line
# has no trailing space.
preprocess_text = " ".join(text.split())

# truncation=True keeps over-long inputs within the tokenizer's maximum length.
tokenized_text = tokenizer.encode(
    preprocess_text,
    truncation=True,
    return_tensors="pt",
).to(device)

summary_ids = model.generate(
    tokenized_text,
    num_beams=4,
    no_repeat_ngram_size=2,
    min_length=min_length,
    max_length=min_length * 5,
    early_stopping=True,
)

# decode() takes no max_length argument, so the stray max_length=2 was dropped.
output_br = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

t2 = time.time()

print("\n", output_br)

print('\ntime for BART:', t2 - t1, "sec")

In [None]:
from transformers import pipeline

# Timing covers pipeline construction as well as generation.
t1 = time.time()

min_length = 20

summarization = pipeline("summarization", model="facebook/bart-large-cnn")
# Deterministic decoding (no sampling); the pipeline returns a list with one
# dict per input, from which the summary text is taken.
bart_summaries = summarization(
    text,
    max_length=min_length * 5,
    min_length=20,
    do_sample=False,
)
output_b = bart_summaries[0]['summary_text']

t2 = time.time()

print("\n", output_b)

print('\ntime for BART:', t2 - t1, "sec")



## Pegasus

In [None]:

from transformers import PegasusForConditionalGeneration, PegasusTokenizer

import torch

import time

# Timing covers model loading as well as generation.
t1 = time.time()

device = torch.device('cpu')

# Lower bound passed to generate(); the upper bound is five times this value.
min_length = 20

pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail").to(device)
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")

# Collapse all whitespace runs (newlines included) into single spaces.
input_text = ' '.join(text.split())
# Call the tokenizer directly: prepare_seq2seq_batch is deprecated in
# Transformers in favour of the tokenizer's __call__, which accepts the same
# truncation / padding / return_tensors arguments.
batch = pegasus_tokenizer(
    input_text,
    truncation=True,
    padding='longest',
    return_tensors="pt",
).to(device)

summary_ids = pegasus_model.generate(
    **batch,
    num_beams=4,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    length_penalty=1,
    min_length=min_length,
    max_length=min_length * 5,
    early_stopping=True,
)

# batch_decode returns one string per sequence; only one was requested.
summ = pegasus_tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)[0]
# "<n>" is replaced with a space wherever it appears in the decoded text.
output_p = summ.replace("<n>", " ")

t2 = time.time()

print("\n", output_p)

print('\ntime for Pegasus:', t2 - t1, "sec")


In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

import time

# Lower bound passed to generate(); the upper bound is five times this value.
min_length = 20

# Timing covers model loading as well as generation.
t1 = time.time()

tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail")

# truncation=True keeps over-long inputs within the tokenizer's maximum length.
input_ids = tokenizer(text, truncation=True, return_tensors="pt").input_ids

output_Pegasus = model.generate(
    input_ids,
    min_length=min_length,
    max_length=min_length * 5,
    num_beams=4,
    early_stopping=True,
)

decode_text = tokenizer.decode(output_Pegasus[0], skip_special_tokens=True)

# "<n>" is replaced with a space wherever it appears in the decoded text.
summ = decode_text.replace("<n>", " ")

t2 = time.time()

print("\n", summ)

print('\ntime for Pegasus:', t2 - t1, "sec")


## LED

In [None]:
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration
import time

min_length = 20

# Timing covers model loading as well as generation.
t1 = time.time()

device = torch.device('cpu')

tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")

inputs_led = tokenizer.encode(text, return_tensors="pt")

# Global attention is enabled on the first token only; every other position
# keeps a zero in the mask.
global_attention_mask = torch.zeros_like(inputs_led)
global_attention_mask[:, 0] = 1

summary_ids = model.generate(
    inputs_led,
    global_attention_mask=global_attention_mask,
    num_beams=4,
    min_length=20,
    max_length=100,
)

output_led = tokenizer.decode(
    summary_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

t2 = time.time()

print("\n", output_led)

print('\ntime for LED:', t2 - t1, "sec")

## Flask API

In [None]:
from transformers import pipeline
from summarizer import Summarizer,TransformerSummarizer
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration
import torch
import json 



from flask import Flask, request, render_template

app = Flask(__name__)

@app.route('/')
def my_form():
    """Render the ``index6.html`` template for the site root."""
    page = render_template('index6.html')
    return page


@app.route("/get", methods=["POST"])
def my_form_post():
    """Summarize the submitted text with the model selected in the form.

    Reads the form fields ``input``, ``sentNumber``, ``wordCount``,
    ``absModel``, ``extModel`` and ``type`` from the POST body. When ``type``
    is ``'extractive'`` the summarizer named by ``extModel`` is called with
    the sentence count; for any other ``type`` the summarizer named by
    ``absModel`` is called with the word count.

    Returns:
        The summary text, or an error message with HTTP status 400 when a
        count is not an integer or the selected model name is not recognized.
    """
    text = request.form['input']
    try:
        num = int(request.form['sentNumber'])
        Wcount = int(request.form['wordCount'])
    except ValueError:
        # A non-numeric count used to surface as an unhandled ValueError.
        return "sentNumber and wordCount must be integers", 400
    absmodel = request.form['absModel']
    exmodel = request.form['extModel']
    type_sum = request.form['type']

    # The tables are built per request, so the summary_* functions defined
    # further down the file only need to exist by the time a request arrives.
    if type_sum == 'extractive':
        summarizers = {
            "BERT": summary_BERT,
            "GPT2": summary_GPT2,
            "XLNet": summary_XLNet,
        }
        chosen, size = exmodel, num
    else:
        summarizers = {
            "T5": summary_T5,
            "BART": summary_BART,
            "Pegasus": summary_Pegasus,
            "LED": summary_LED,
        }
        chosen, size = absmodel, Wcount

    summarize = summarizers.get(chosen)
    if summarize is None:
        # Without this guard an unrecognized model name reached the final
        # return with sum_text never assigned (UnboundLocalError).
        return "Unknown summarization model", 400

    sum_text = summarize(text, size)

    # Same console trace as before: the name of the model that was used.
    print(chosen)

    return sum_text


def summary_BERT(body, num_sent):
    """Return an extractive summary of ``body`` from the default ``Summarizer``.

    Args:
        body: Text to summarize.
        num_sent: Value passed to the summarizer as ``num_sentences``.

    Returns:
        The summarizer's output joined into a single string.
    """
    summarizer = Summarizer()
    picked = summarizer(body, num_sentences=num_sent)
    return ''.join(picked)


def summary_GPT2(body, num_sent):
    """Return an extractive summary of ``body`` using the ``gpt2-medium`` model.

    Args:
        body: Text to summarize.
        num_sent: Value passed to the summarizer as ``num_sentences``.

    Returns:
        The summarizer's output joined into a single string.
    """
    summarizer = TransformerSummarizer(
        transformer_type="GPT2",
        transformer_model_key="gpt2-medium",
    )
    picked = summarizer(body, num_sentences=num_sent)
    return ''.join(picked)


def summary_XLNet(body, num_sent):
    """Return an extractive summary of ``body`` using ``xlnet-base-cased``.

    Args:
        body: Text to summarize.
        num_sent: Value passed to the summarizer as ``num_sentences``.

    Returns:
        The summarizer's output joined into a single string.
    """
    summarizer = TransformerSummarizer(
        transformer_type="XLNet",
        transformer_model_key="xlnet-base-cased",
    )
    picked = summarizer(body, num_sentences=num_sent)
    return ''.join(picked)



def summary_T5(text, min_length):
    """Generate an abstractive summary of ``text`` with the ``t5-large`` model.

    Args:
        text: Document to summarize.
        min_length: Passed to ``generate`` as ``min_length``; ``max_length``
            is set to five times this value. NOTE(review): the route passes
            the form's ``wordCount`` here, while ``generate`` counts tokens.

    Returns:
        The decoded summary with special tokens removed.
    """
    model = T5ForConditionalGeneration.from_pretrained('t5-large')
    tokenizer = T5Tokenizer.from_pretrained('t5-large')
    device = torch.device('cpu')

    # Collapse every whitespace run (newlines, "\r\n" from form posts, tabs)
    # into one space. Deleting "\n" outright glued the last word of one line
    # onto the first word of the next.
    preprocess_text = " ".join(text.split())
    # T5 selects its task from a textual prefix.
    t5_prepared_Text = "summarize: " + preprocess_text

    tokenized_text = tokenizer.encode(
        t5_prepared_Text,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    summary_ids = model.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        min_length=min_length,
        max_length=min_length * 5,
        early_stopping=True,
    )

    full = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return full


def summary_BART(text, min_length):
    """Generate an abstractive summary of ``text`` with ``facebook/bart-large-cnn``.

    Args:
        text: Document to summarize.
        min_length: Passed to the pipeline as ``min_length``; ``max_length``
            is set to five times this value.

    Returns:
        The ``summary_text`` of the pipeline's first (only) result.
    """
    summarization = pipeline("summarization", model="facebook/bart-large-cnn")

    # truncation=True clips inputs longer than the model's maximum input
    # length instead of letting the pipeline fail on them.
    results = summarization(
        text,
        max_length=min_length * 5,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    full = results[0]['summary_text']

    return full



def summary_Pegasus(text, min_length):
    """Generate an abstractive summary with ``google/pegasus-cnn_dailymail``.

    Args:
        text: Document to summarize.
        min_length: Passed to ``generate`` as ``min_length``; ``max_length``
            is set to five times this value.

    Returns:
        The decoded summary with special tokens removed and every ``"<n>"``
        replaced by a space.
    """
    device = torch.device('cpu')

    pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail").to(device)
    pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")

    # truncation=True clips over-long inputs to the tokenizer's maximum
    # length; the ids are moved to the same device as the model.
    input_ids = pegasus_tokenizer(
        text,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to(device)

    # Beam search decoding.
    output = pegasus_model.generate(
        input_ids,
        num_beams=4,
        early_stopping=True,
        min_length=min_length,
        max_length=min_length * 5,
        no_repeat_ngram_size=2,
    )

    # decode() already returns a single string, so no join is needed.
    summ = pegasus_tokenizer.decode(output[0], skip_special_tokens=True)
    full = summ.replace("<n>", " ")

    return full



def summary_LED(text,  min_length):
    """Generate an abstractive summary with ``allenai/led-large-16384-arxiv``.

    Args:
        text: Document to summarize.
        min_length: Passed to ``generate`` as ``min_length``; ``max_length``
            is set to five times this value.

    Returns:
        The decoded summary with special tokens removed.
    """
    checkpoint = "allenai/led-large-16384-arxiv"
    device = torch.device('cpu')

    model = LEDForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = LEDTokenizer.from_pretrained(checkpoint)

    token_ids = tokenizer.encode(text, return_tensors="pt").to(device)

    # Global attention on the first token only (cf. Beltagy et al. 2020).
    global_mask = torch.zeros_like(token_ids)
    global_mask[:, 0] = 1

    generated = model.generate(
        token_ids,
        global_attention_mask=global_mask,
        num_beams=3,
        min_length=min_length,
        max_length=min_length * 5,
    )

    return tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )


# Start the Flask app only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    app.run()