In [2]:
!pip install nltk rouge-score bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ae67a0ff002f3a8847c3998b3cfb326510c87ca2359ab727330ab6d1332c1dc4
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [26]:
import numpy as np
import pandas as pd
import nltk
import os
import spacy
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import pipeline

In [4]:
# Load the "opinosis" dataset through the Hugging Face `datasets` loader and
# display the returned object (its splits and features).
from datasets import load_dataset
dataset = load_dataset("opinosis")
dataset

Downloading builder script:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/795 [00:00<?, ?B/s]

Downloading and preparing dataset opinosis/default (download: 739.65 KiB, generated: 723.90 KiB, post-processed: Unknown size, total: 1.43 MiB) to /root/.cache/huggingface/datasets/opinosis/default/1.0.0/1852a0faaef01fdbc6149f3f6c8a0c405fe27aa050e691be443f86e4e1273417...


Downloading data:   0%|          | 0.00/757k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51 [00:00<?, ? examples/s]

Dataset opinosis downloaded and prepared to /root/.cache/huggingface/datasets/opinosis/default/1.0.0/1852a0faaef01fdbc6149f3f6c8a0c405fe27aa050e691be443f86e4e1273417. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review_sents', 'summaries'],
        num_rows: 51
    })
})

In [5]:
# Convert the 'train' split to a pandas DataFrame and preview the first rows.
data = dataset['train'].to_pandas()
data.head()

Unnamed: 0,review_sents,summaries
0,", and is very, very accurate .\r\n but for the...",[This unit is generally quite accurate. \r\nS...
1,"The room was not overly big, but clean and ve...",[The rooms were not large but were clean and v...
2,After I plugged it in to my USB hub on my com...,[Battery life is exceptional.\r\nThe Kindle ca...
3,short battery life I moved up from an 8gb .\...,[The battery life is too short.\r\nThe time be...
4,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",[The battery life is longer then 5 hours.\r\nB...


In [6]:
# Inspect the 'summaries' entry of the first row.
data['summaries'][0]

array(['This unit is generally quite accurate.  \r\nSet-up and usage are considered to be very easy. \r\nThe maps can be updated, and tend to be reliable.',
       "The Garmin seems to be generally very accurate.\r\nIt's easy to use with an intuitive interface.",
       'It is very accurate, even in destination time.',
       'Very accurate with travel and destination time.\r\nNegatives are not accurate with speed limits and rural roads.',
       'Its accurate, fast and its simple operations make this a for sure buy.'],
      dtype=object)

In [7]:
# Keep only the first entry of each row's 'summaries' value.
summaries = [row_summaries[0] for row_summaries in data['summaries']]

In [8]:
# Remove the original 'summaries' column, then show the remaining structure.
data = data.drop(columns=['summaries'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   review_sents  51 non-null     object
dtypes: object(1)
memory usage: 536.0+ bytes


In [9]:
# Attach the `summaries` list as the new 'summaries' column and show the
# resulting frame structure.
data['summaries'] = summaries
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   review_sents  51 non-null     object
 1   summaries     51 non-null     object
dtypes: object(2)
memory usage: 944.0+ bytes


In [10]:
# Count missing values in the 'review_sents' column.
data['review_sents'].isnull().sum()

0

In [11]:
def calculate_redundancy(summaries):
    """Return the fraction of repeated whitespace-separated tokens.

    All summaries are pooled into one token list; the score is
    ``1 - unique_tokens / total_tokens``, so 0.0 means every token is
    distinct and values closer to 1.0 mean more repetition.

    Args:
        summaries: Iterable of summary strings.

    Returns:
        The redundancy score as a float, or 0.0 when there are no tokens
        (empty input or only empty/whitespace strings).
    """
    # Materialise the tokens once so one-shot iterables (generators) work too.
    tokens = [token for summary in summaries for token in summary.split()]
    if not tokens:
        # Nothing to repeat; also avoids dividing by zero.
        return 0.0
    return 1 - (len(set(tokens)) / len(tokens))

In [12]:
def calculate_bleu(actual_summary, predicted_summary, smoothing_function=None):
    """Compute corpus-level BLEU of predicted summaries against references.

    Args:
        actual_summary: Sequence of reference summary strings, one per sample.
        predicted_summary: Sequence of generated summary strings, in the same
            order as ``actual_summary``.
        smoothing_function: Optional smoothing callable forwarded to
            ``corpus_bleu`` (for example a method of NLTK's
            ``SmoothingFunction``). The default ``None`` keeps the unsmoothed
            behaviour, for which NLTK warns when some n-gram order has no
            overlap.

    Returns:
        The corpus BLEU score as returned by ``corpus_bleu``.
    """
    # corpus_bleu expects, per sample, a list of tokenized references; each
    # sample here has exactly one reference.
    references = [[reference.split()] for reference in actual_summary]
    hypotheses = [prediction.split() for prediction in predicted_summary]
    return corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)

In [13]:
def calculate_bert(actual_summary, predicted_summary):
    """Return the mean BERTScore F1 of the predictions against the references.

    Args:
        actual_summary: List of reference summary strings.
        predicted_summary: List of generated summary strings, in the same
            order as ``actual_summary``.

    Returns:
        The mean F1 over all samples as a Python float.
    """
    # Imported locally so the function keeps working even if the notebook
    # later rebinds the global name `score` to something else.
    from bert_score import score as bertscore

    # bert_score.score takes the candidates first and the references second.
    _, _, f1 = bertscore(predicted_summary, actual_summary, lang='en', verbose=False)
    return f1.mean().item()

In [14]:
def calculate_rouge(actual_summary, predicted_summary):
    """Compute the mean ROUGE-1 and ROUGE-2 F-measures over summary pairs.

    Args:
        actual_summary: Iterable of reference summary strings.
        predicted_summary: Iterable of generated summary strings, in the same
            order as ``actual_summary``.

    Returns:
        A ``(rouge1, rouge2)`` tuple of mean F-measures, or ``(0.0, 0.0)``
        when there are no pairs to score.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

    rouge1_scores = []
    rouge2_scores = []
    for actual, pred in zip(actual_summary, predicted_summary):
        # RougeScorer.score takes (target, prediction): reference first.
        pair_scores = scorer.score(actual, pred)
        rouge1_scores.append(pair_scores['rouge1'].fmeasure)
        rouge2_scores.append(pair_scores['rouge2'].fmeasure)

    if not rouge1_scores:
        # No pairs were scored; avoid dividing by zero.
        return 0.0, 0.0

    rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    rouge2 = sum(rouge2_scores) / len(rouge2_scores)

    return rouge1, rouge2

In [15]:
def named_entity_recognition(original_texts, hypotheses):
    """Compare named entities found in source texts and generated summaries.

    For each (source, summary) pair the entity texts detected by spaCy's
    ``en_core_web_sm`` pipeline are collected as sets. Precision is the share
    of summary entities that also occur in the source; recall is the share of
    source entities that also occur in the summary. A pair whose denominator
    set is empty contributes 0 to the corresponding sum.

    Args:
        original_texts: Iterable of source text strings.
        hypotheses: Iterable of generated summary strings, in the same order.

    Returns:
        ``(avg_precision, avg_recall)`` averaged over the pairs actually
        processed, or ``(0.0, 0.0)`` when there are none.
    """
    nlp = spacy.load("en_core_web_sm")
    precision_sum, recall_sum = 0.0, 0.0
    num_samples = 0

    for original_text, hypothesis in zip(original_texts, hypotheses):
        original_entities = {ent.text for ent in nlp(original_text).ents}
        hypothesis_entities = {ent.text for ent in nlp(hypothesis).ents}
        common_entities = original_entities & hypothesis_entities

        if hypothesis_entities:
            precision_sum += len(common_entities) / len(hypothesis_entities)
        if original_entities:
            recall_sum += len(common_entities) / len(original_entities)
        # Count the pairs zip really yields, so inputs of unequal length do
        # not skew the averages.
        num_samples += 1

    if num_samples == 0:
        return 0.0, 0.0

    return precision_sum / num_samples, recall_sum / num_samples


In [16]:
# named_entity_recognition(df['review_sents'],df['summaries'])

In [17]:
def get_metric(original, predicted, actual):
    """Run every evaluation metric for one set of generated summaries.

    Args:
        original: Source texts the summaries were generated from.
        predicted: Generated summaries.
        actual: Reference summaries.

    Returns:
        A 7-tuple ``(redundancy, bleu, bert, rouge1, rouge2, precision,
        recall)``, where precision and recall are the named-entity scores.
    """
    redundancy = calculate_redundancy(predicted)
    rouge_pair = calculate_rouge(actual, predicted)
    bert = calculate_bert(actual, predicted)
    bleu = calculate_bleu(actual, predicted)
    entity_pair = named_entity_recognition(original, predicted)

    return (redundancy, bleu, bert, *rouge_pair, *entity_pair)

In [18]:
# Drop incomplete rows *before* sampling and re-indexing so the index is a
# contiguous 0..n-1 range, and fix the seed so every run evaluates the same
# three reviews.
df = data.dropna().sample(n=3, replace=False, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,review_sents,summaries
0,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...
1,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...
2,Another feature on the 255w is a display of th...,Map and tracking speed is fast and convenient....


In [19]:
# Count missing values in the sampled frame's 'review_sents' column.
df['review_sents'].isnull().sum()

0

In [20]:
# Display the sampled review texts.
df['review_sents']

0     The room was not overly big, but clean and ve...
1     short battery life  I moved up from an 8gb .\...
2    Another feature on the 255w is a display of th...
Name: review_sents, dtype: object

In [21]:
# Identifiers of the summarization models compared in this notebook; the list
# also supplies the 'Model' column of the TOPSIS table.
models = ["facebook/bart-large-cnn", 
          "philschmid/bart-large-cnn-samsum", 
          "google/pegasus-cnn_dailymail", 
          "google/pegasus-large"
         ]

In [22]:
# pipe = pipeline("summarization", model = "google/pegasus-large")
# pipe(df['review_sents'][0], max_length=130, min_length=30, truncation= True)[0]['summary_text']

In [30]:
# Per-metric accumulators: each evaluation cell below appends one value per
# model. Re-run this cell before re-running the evaluations to avoid
# duplicated entries.
redundancy_score=[]
bleu_score=[]
bert_score=[]
rouge1_score=[]
rouge2_score=[]
named_entity_precision=[]
named_entity_recall=[]

**Model 1: facebook/bart-large-cnn**

In [24]:
# Model 1 (models[0]): taking the id from `models` keeps this run aligned with
# the 'Model' column of the TOPSIS table.
pipe = pipeline("summarization", model=models[0])
# Iterate over the review texts directly instead of looking rows up by index
# label, so the cell does not depend on the frame having a 0..n-1 index.
summary = [
    pipe(review, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for review in df['review_sents']
]
print(summary[0])

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The rooms are exceptionally clean and also the bathrooms. The bathroom was marble and we had luxurious bathrobes and really, every detail attended to. The rooms are abit smallish  , but immaculately maintained, and tastefully decorated, with excellent bathroom facilities. Although the refrigerators are honour bars the hotel is happy for you to chill your own drinks.


In [27]:
# Score the current `summary` list: get_metric(original, predicted, actual)
# receives the source reviews, the generated summaries and the references.
redundancy,bleu,bert,rouge1,rouge2,precision,recall = get_metric(df['review_sents'], summary, df['summaries'].tolist())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [28]:
# Print the metrics of the most recent evaluation, one per line.
print('PARAMETERS')
for metric_label, metric_value in (
    ('Redundancy Score: ', redundancy),
    ('BLEU Score: ', bleu),
    ('BERT Score: ', bert),
    ('Rouge-1 Score: ', rouge1),
    ('Rouge-2 Score: ', rouge2),
    ('Named Entity Precision: ', precision),
    ('Named Entity Recall: ', recall),
):
    print(metric_label, metric_value)

PARAMETERS
Redundancy Score:  0.2698412698412699
BLEU Score:  0.19797244778877948
BERT Score:  0.8480364680290222
Rouge-1 Score:  0.17152985487980502
Rouge-2 Score:  0.04245895864158388
Named Entity Precision:  0.6666666666666666
Named Entity Recall:  0.07872846108140226


In [31]:
# Append this model's metrics to the per-metric accumulators.
for metric_history, metric_value in (
    (redundancy_score, redundancy),
    (bleu_score, bleu),
    (bert_score, bert),
    (rouge1_score, rouge1),
    (rouge2_score, rouge2),
    (named_entity_precision, precision),
    (named_entity_recall, recall),
):
    metric_history.append(metric_value)

**Model 2: philschmid/bart-large-cnn-samsum**

In [32]:
# Model 2 (models[1]): taking the id from `models` keeps this run aligned with
# the 'Model' column of the TOPSIS table.
pipe = pipeline("summarization", model=models[1])
# Iterate over the review texts directly instead of looking rows up by index
# label, so the cell does not depend on the frame having a 0..n-1 index.
summary = [
    pipe(review, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for review in df['review_sents']
]
print(summary[0])

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

The rooms at Tuscan Inn are immaculately maintained and well-appointed. The rooms are clean and the bathrooms are well set up. There is a marble bathroom, mini bar, room service and an evening wine reception every day at the hotel.


In [33]:
# Score the current `summary` list against the source reviews and references.
redundancy, bleu, bert, rouge1, rouge2, precision, recall = get_metric(
    df['review_sents'], summary, df['summaries'].tolist()
)

# Report the metrics, one per line.
print('PARAMETERS')
for metric_label, metric_value in (
    ('Redundancy Score: ', redundancy),
    ('BLEU Score: ', bleu),
    ('BERT Score: ', bert),
    ('Rouge-1 Score: ', rouge1),
    ('Rouge-2 Score: ', rouge2),
    ('Named Entity Precision: ', precision),
    ('Named Entity Recall: ', recall),
):
    print(metric_label, metric_value)

# Record them in the per-metric accumulators.
for metric_history, metric_value in (
    (redundancy_score, redundancy),
    (bleu_score, bleu),
    (bert_score, bert),
    (rouge1_score, rouge1),
    (rouge2_score, rouge2),
    (named_entity_precision, precision),
    (named_entity_recall, recall),
):
    metric_history.append(metric_value)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


PARAMETERS
Redundancy Score:  0.29104477611940294
BLEU Score:  0.0806370043434534
BERT Score:  0.8552425503730774
Rouge-1 Score:  0.2326118326118326
Rouge-2 Score:  0.07300744371822802
Named Entity Precision:  0.7222222222222222
Named Entity Recall:  0.11293608352431883


**Model 3: google/pegasus-cnn_dailymail**

In [34]:
# Model 3 (models[2]): taking the id from `models` keeps this run aligned with
# the 'Model' column of the TOPSIS table.
pipe = pipeline("summarization", model=models[2])
# Iterate over the review texts directly instead of looking rows up by index
# label, so the cell does not depend on the frame having a 0..n-1 index.
summary = [
    pipe(review, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for review in df['review_sents']
]
print(summary[0])

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .<n>The second room was smaller, with a very spotless bathroom layout, but at least it was quieter and we were able to sleep .<n>The bathrooms were quite well set up, with a seperate toilet shower to basin, so whilst one guest is showering another can use the basin .


In [35]:
# Score the current `summary` list against the source reviews and references.
redundancy, bleu, bert, rouge1, rouge2, precision, recall = get_metric(
    df['review_sents'], summary, df['summaries'].tolist()
)

# Report the metrics, one per line.
print('PARAMETERS')
for metric_label, metric_value in (
    ('Redundancy Score: ', redundancy),
    ('BLEU Score: ', bleu),
    ('BERT Score: ', bert),
    ('Rouge-1 Score: ', rouge1),
    ('Rouge-2 Score: ', rouge2),
    ('Named Entity Precision: ', precision),
    ('Named Entity Recall: ', recall),
):
    print(metric_label, metric_value)

# Record them in the per-metric accumulators.
for metric_history, metric_value in (
    (redundancy_score, redundancy),
    (bleu_score, bleu),
    (bert_score, bert),
    (rouge1_score, rouge1),
    (rouge2_score, rouge2),
    (named_entity_precision, precision),
    (named_entity_recall, recall),
):
    metric_history.append(metric_value)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PARAMETERS
Redundancy Score:  0.352112676056338
BLEU Score:  0.04981113102024433
BERT Score:  0.8431407809257507
Rouge-1 Score:  0.17813209892417814
Rouge-2 Score:  0.05923400673400673
Named Entity Precision:  0.7222222222222222
Named Entity Recall:  0.07762498938969527


**Model 4: google/pegasus-large**

In [36]:
# Model 4 (models[3]): taking the id from `models` keeps this run aligned with
# the 'Model' column of the TOPSIS table.
pipe = pipeline("summarization", model=models[3])
# Iterate over the review texts directly instead of looking rows up by index
# label, so the cell does not depend on the frame having a 0..n-1 index.
summary = [
    pipe(review, max_length=130, min_length=30, truncation=True)[0]['summary_text']
    for review in df['review_sents']
]
print(summary[0])

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms . Also, the bathroom was a bit dirty , brown water came out of the bath tub faucet initially and the sink wall by the toilet was dirty . Also, the bathrooms were quite well set up, with a seperate toilet shower to basin, so whilst one guest is showering another can use the basin . The room was wonderful with great beds, a nice bathroom and we also had an excellent view of Coit Tower as well .


In [37]:
# Score the current `summary` list against the source reviews and references.
redundancy, bleu, bert, rouge1, rouge2, precision, recall = get_metric(
    df['review_sents'], summary, df['summaries'].tolist()
)

# Report the metrics, one per line.
print('PARAMETERS')
for metric_label, metric_value in (
    ('Redundancy Score: ', redundancy),
    ('BLEU Score: ', bleu),
    ('BERT Score: ', bert),
    ('Rouge-1 Score: ', rouge1),
    ('Rouge-2 Score: ', rouge2),
    ('Named Entity Precision: ', precision),
    ('Named Entity Recall: ', recall),
):
    print(metric_label, metric_value)

# Record them in the per-metric accumulators.
for metric_history, metric_value in (
    (redundancy_score, redundancy),
    (bleu_score, bleu),
    (bert_score, bert),
    (rouge1_score, rouge1),
    (rouge2_score, rouge2),
    (named_entity_precision, precision),
    (named_entity_recall, recall),
):
    metric_history.append(metric_value)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PARAMETERS
Redundancy Score:  0.3913043478260869
BLEU Score:  0.037325164292908004
BERT Score:  0.8393990397453308
Rouge-1 Score:  0.129319802237835
Rouge-2 Score:  0.04941428446542529
Named Entity Precision:  0.625
Named Entity Recall:  0.11624649859943977


# TOPSIS for Model Comparison

In [38]:
# Round every recorded metric to three decimals, in place, so the rounded
# values are the ones that end up in the TOPSIS table.
scores = [redundancy_score, bleu_score, bert_score, rouge1_score, rouge2_score, named_entity_precision, named_entity_recall]
# The loop variable must not be called `score`: that name is the function
# imported from bert_score, and rebinding it would break calculate_bert on
# any later re-run of an evaluation cell.
for metric_values in scores:
    metric_values[:] = [np.round(metric_value, 3) for metric_value in metric_values]

In [39]:
# Display the rounded metric lists (one inner list per metric, one entry per model).
scores

[[0.27, 0.291, 0.352, 0.391],
 [0.198, 0.081, 0.05, 0.037],
 [0.848, 0.855, 0.843, 0.839],
 [0.172, 0.233, 0.178, 0.129],
 [0.042, 0.073, 0.059, 0.049],
 [0.667, 0.722, 0.722, 0.625],
 [0.079, 0.113, 0.078, 0.116]]

In [40]:
# Decision matrix for TOPSIS: one row per model, one column per metric.
topsis_data= pd.DataFrame({
    'Model': models,
    'Redundancy': redundancy_score,
    'BLEU': bleu_score,
    'BERT': bert_score,
    'Rouge-1': rouge1_score,
    'Rouge-2': rouge2_score,
    'Named Entity Precision': named_entity_precision,
    'Named Entity Recall': named_entity_recall
})

In [41]:
# Positional settings, one entry per metric column of the table passed to
# topsis(): weights scale each normalized column; an impact of '-' marks a
# criterion where smaller is better (here the first column, Redundancy) and
# '+' one where larger is better.
weights = [5,2,3,4,4,3,2]
impacts = ['-','+','+','+','+','+','+']

In [42]:
def topsis(data, weight, impact):
    """Rank alternatives (rows) over criteria (columns) with TOPSIS.

    Each column is divided by its Euclidean norm and multiplied by its
    weight. The ideal-best point takes the column maximum for '+' criteria
    and the column minimum for '-' criteria; the ideal-worst point takes the
    opposite. An alternative's score is
    ``d_worst / (d_best + d_worst)``, where the distances are Euclidean.

    Args:
        data: DataFrame of numeric criteria values, one row per alternative.
        weight: Sequence with one weight per column of ``data``.
        impact: Sequence with one '+' or '-' per column of ``data``.

    Returns:
        DataFrame indexed like ``data`` with columns 'TOPSIS Score' and
        'Rank' (rank 1 is the highest score).

    Raises:
        ValueError: If ``weight`` or ``impact`` does not have exactly one
            entry per column, or ``impact`` contains anything other than
            '+' or '-'.
    """
    n_criteria = data.shape[1]
    if len(weight) != n_criteria or len(impact) != n_criteria:
        raise ValueError(
            "weight and impact must each have one entry per criterion "
            f"({n_criteria}); got {len(weight)} and {len(impact)}"
        )
    invalid_symbols = [symbol for symbol in impact if symbol not in ('+', '-')]
    if invalid_symbols:
        raise ValueError(f"impact entries must be '+' or '-'; got {invalid_symbols}")

    values = data.to_numpy(dtype=float)
    column_norms = np.linalg.norm(values, axis=0)
    # An all-zero column has norm 0; dividing by 1 instead keeps it at zero
    # rather than turning every score into NaN.
    column_norms[column_norms == 0] = 1.0
    weighted = values / column_norms * np.asarray(weight, dtype=float)

    is_benefit = np.array([symbol == '+' for symbol in impact])
    column_max = weighted.max(axis=0)
    column_min = weighted.min(axis=0)
    ideal_best = np.where(is_benefit, column_max, column_min)
    ideal_worst = np.where(is_benefit, column_min, column_max)

    separation_best = np.linalg.norm(weighted - ideal_best, axis=1)
    separation_worst = np.linalg.norm(weighted - ideal_worst, axis=1)

    # Keep the caller's index so the result aligns when assigned back.
    res = pd.DataFrame(index=data.index)
    res['TOPSIS Score'] = separation_worst / (separation_best + separation_worst)
    res['Rank'] = res['TOPSIS Score'].rank(ascending=False)

    return res

In [43]:
# Rank on the metric columns only. Also dropping the result columns (when a
# previous run of this cell already added them) keeps the cell re-runnable:
# otherwise they would be fed back into topsis() as extra criteria.
score_table = topsis_data.drop(columns=['Model', 'Topsis Score', 'Rank'], errors='ignore')
result = topsis(score_table, weights, impacts)
result['Rank'] = result['Rank'].astype(int)
topsis_data['Topsis Score'] = result['TOPSIS Score']
topsis_data['Rank'] = result['Rank']

In [45]:
# Display the metric table together with each model's TOPSIS score and rank.
topsis_data

Unnamed: 0,Model,Redundancy,BLEU,BERT,Rouge-1,Rouge-2,Named Entity Precision,Named Entity Recall,Topsis Score,Rank
0,facebook/bart-large-cnn,0.27,0.198,0.848,0.172,0.042,0.667,0.079,0.57059,2
1,philschmid/bart-large-cnn-samsum,0.291,0.081,0.855,0.233,0.073,0.722,0.113,0.634322,1
2,google/pegasus-cnn_dailymail,0.352,0.05,0.843,0.178,0.059,0.722,0.078,0.343006,3
3,google/pegasus-large,0.391,0.037,0.839,0.129,0.049,0.625,0.116,0.170212,4
