In [5]:
import os
import pandas as pd
from glob import glob

#### Load Reviews csv

In [10]:
home_dir = 'c:\\Users\\DanaTal\\projects\\nlp\\Bridezilla_NLP'
os.path.exists(home_dir)
data_dir = os.path.join(home_dir, f'data')
data_path = glob(data_dir + '\\*.csv')[0]
data_path

df = pd.read_csv(data_path)
print(f' number of reviews is: {len(df)}')
df.head()

 number of reviews is: 189


Unnamed: 0,text,stars,months_ago
0,We love The Old Phoenix - this is the fourth t...,5,5
1,What a disappointment. We have wanted to stay ...,1,4
2,We walked over to Fenix from Loutro and stayed...,5,6
3,The Old Phoenix was the low point of our two w...,1,5
4,I had a fantastic time at Old-Phoenix!\nThe be...,5,7


I used this article for the summerization task: 

https://medium.com/@sarowar.saurav10/6-useful-text-summarization-algorithm-in-python-dfc8a9d33074

In [11]:
def summarize(df: pd.DataFrame, tokenizer, model, config: dict) -> pd.DataFrame:
    """ 
    """
    for i, row in df.iterrows():
        sequence = row.text
        input = tokenizer.encode("summarize: " + sequence, return_tensors='pt', max_length=config['max_input_length'], truncation=config['trunc_input'])
        output = model.generate(input, max_length=config['max_output_length'], min_length=config['min_output_length'], length_penalty=config['length_penalty'], num_beams=config['num_beams'])
        summary = tokenizer.decode(output[0])
        # remove tresh from the str
        df.at[i, "summary"] = summary.split(">")[1].split("<")[0]
    return df

### T5 Abstractive Summarization

In [1]:
# T5 imports
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [20]:
# Load Tokenizer & model
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model =  T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)

# I chose this hypper parameters for the model
T5_config = {'max_input_length':1024, 'trunc_input':True, 'max_output_length':100, 'min_output_length':1, 'length_penalty':5, 'num_beams':2}


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Infer T5 model 
summarize(df, tokenizer, model, T5_config)


Unnamed: 0,text,stars,months_ago,summary
0,We love The Old Phoenix - this is the fourth t...,5,5,this is the fourth time we’ve visited the old...
1,What a disappointment. We have wanted to stay ...,1,4,we have wanted to stay here for years but it ...
2,We walked over to Fenix from Loutro and stayed...,5,6,we walked over to stony beach from Loutro and...
3,The Old Phoenix was the low point of our two w...,1,5,the old Phoenix was the low point of our two ...
4,I had a fantastic time at Old-Phoenix!\nThe be...,5,7,the beautiful beach on site provided the perfe...
...,...,...,...,...
184,Very beautiful!,5,24,very beautiful!..................................
185,Flawless,5,48,flawless & snn & snn & snn & snn & snn & snn &...
186,Everything is great!!,5,36,everything is great!!! everything is great!!!....
187,Amazing place,5,120,Amazing place....................................


In [16]:
# Save the output
summariztion_output_dir = os.path.join(home_dir, f"outputs\\summarization")
os.makedirs(summariztion_output_dirb, exist_ok=True)


In [22]:
summariztion_output_path = os.path.join(summariztion_output_dir, f'{model_name}.csv')
df.to_csv(summariztion_output_path)

#### BART Abstractive Summarization

In [23]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

model_name = "ainize/bart-base-cnn"
#  Load Model and Tokenize
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
bart_config = {'max_input_length':1024, 'trunc_input':True, 'max_output_length':100, 'min_output_length':1, 'length_penalty':5, 'num_beams':2}


In [24]:
df = pd.read_csv(data_path)
print(f' number of reviews is: {len(df)}')
df.head()

 number of reviews is: 189


Unnamed: 0,text,stars,months_ago
0,We love The Old Phoenix - this is the fourth t...,5,5
1,What a disappointment. We have wanted to stay ...,1,4
2,We walked over to Fenix from Loutro and stayed...,5,6
3,The Old Phoenix was the low point of our two w...,1,5
4,I had a fantastic time at Old-Phoenix!\nThe be...,5,7


In [25]:
for i, row in df.iterrows():
    sequence = row.text
    input = tokenizer.encode(sequence, return_tensors="pt")
    # Generate Summary Text Ids
    summary_tokens = model.generate(
        input_ids=input,
        bos_token_id=model.config.bos_token_id,
        eos_token_id=model.config.eos_token_id,
        length_penalty=2.0,
        max_length=100,
        min_length=1,
        num_beams=4,
    )
    summary = tokenizer.decode(summary_tokens[0], skip_special_tokens=True)
    df.at[i, "summary"] = summary

In [26]:
summariztion_output_path = os.path.join(summariztion_output_dir, f'base-bart.csv')
df.to_csv(summariztion_output_path)

### LSA

### Evaluating summarization of texts

#### 1) Evaluate using bert score

In [28]:
# !pip install bert_score


In [36]:
# ! pip install evaluate

In [44]:
references = df.summary[0]
references


'The Old Phoenix is secluded and quiet, and the swimming in the bay is sublime.\nIf you’re expecting five-star luxury, go somewhere else; this is a traditional …'

In [45]:
from evaluate import load
bertscore = load("bertscore")
hypotheses = [df.text[0]]
references = [df.summary[0]]
results= bertscore.compute(predictions=hypotheses, references=references,  model_type="distilbert-base-uncased")

In [46]:
print(results)

{'precision': [0.88301682472229], 'recall': [0.9785053133964539], 'f1': [0.9283120036125183], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.35.2)'}


#### 2) Evaluate using ROUGE (Recall-Oriented Understudy for Gisting Evaluation)


In [48]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [53]:
# Import the load function from the evaluate module
from evaluate import load

# Loading the 'rouge' metric from the library
rouge = load('rouge')


ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['absl', 'rouge_score'] using 'pip install # Here to have a nice missing dependency error message early on rouge_score' for instance'

In [54]:
import torch
from rouge import Rouge

# Define the generated summary and the reference summary
generated_summary = df.text[0]
reference_summary = df.summary[0]
# Initialize the ROUGE object
rouge = Rouge()
# Calculate ROUGE for the generated and reference summaries
scores = rouge.get_scores(generated_summary, reference_summary)
# Print the results
print(scores)

[{'rouge-1': {'r': 1.0, 'p': 0.6486486486486487, 'f': 0.7868852411287289}, 'rouge-2': {'r': 0.9259259259259259, 'p': 0.5952380952380952, 'f': 0.7246376763957153}, 'rouge-l': {'r': 1.0, 'p': 0.6486486486486487, 'f': 0.7868852411287289}}]


In [58]:
# ! pip install absl-py rouge_score


In [59]:
# Import the load function from the evaluate module
from evaluate import load

# Loading the 'rouge' metric from the library
rouge = load('rouge')

# Define your predictions and references
predictions = [df.text[0]]
references = [df.summary[0]]

# Compute the scores
results = rouge.compute(predictions=predictions, references=references)

# Print the scores
print(results)

{'rouge1': 0.763157894736842, 'rouge2': 0.7027027027027025, 'rougeL': 0.763157894736842, 'rougeLsum': 0.763157894736842}


In [82]:
def evaluate_df(pred, reference, model):

    

{'rouge1': 0.763157894736842,
 'rouge2': 0.7027027027027025,
 'rougeL': 0.763157894736842,
 'rougeLsum': 0.763157894736842}

dict_keys(['rouge-1', 'rouge-2', 'rouge-l'])

#### Evaluete models

In [62]:
summariztion_output_dir

'c:\\Users\\DanaTal\\projects\\nlp\\Bridezilla_NLP\\outputs\\summarization'

In [69]:
bart_path = glob(summariztion_output_dir + '\\*bart*.csv')[0]
t5_path = glob(summariztion_output_dir + '\\*t5*.csv')[0]
t5_path


'c:\\Users\\DanaTal\\projects\\nlp\\Bridezilla_NLP\\outputs\\summarization\\t5-small.csv'

In [70]:
bart = pd.read_csv(bart_path)
t5 = pd.read_csv(t5_path)

In [74]:
from copy import deepcopy
evaluate_df = deepcopy(bart)
evaluate_df = evaluate_df.rename(columns={"summary": "bart"}).reset_index(drop=True)
evaluate_df = pd.concat([evaluate_df, t5["summary"]], axis=1).reset_index(drop=True)
evaluate_df = evaluate_df.rename(columns={"summary": "t5"}).reset_index(drop=True)

In [81]:
evaluate_df.head()

Unnamed: 0,text,stars,months_ago,bart,t5
0,We love The Old Phoenix - this is the fourth t...,5,5,"The Old Phoenix is secluded and quiet, and the...",this is the fourth time we’ve visited the old...
1,What a disappointment. We have wanted to stay ...,1,4,We have wanted to stay here for years but it w...,we have wanted to stay here for years but it ...
2,We walked over to Fenix from Loutro and stayed...,5,6,We walked over to Fenix from Loutro and stayed...,we walked over to stony beach from Loutro and...
3,The Old Phoenix was the low point of our two w...,1,5,The Old Phoenix was the low point of our two w...,the old Phoenix was the low point of our two ...
4,I had a fantastic time at Old-Phoenix!\nThe be...,5,7,"""The beautiful beach on site provided the perf...",the beautiful beach on site provided the perfe...
