In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master


In [3]:
# Setup
!pip install transformers
!pip install sentencepiece
!pip install arxiv
!pip install textract
!pip install rouge

# Check Installation
from transformers import pipeline
print(pipeline('sentiment-analysis')('we love you'))



No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'POSITIVE', 'score': 0.9998704195022583}]


# Load Summarization + Simplification Data (Zaman 2020)

In [4]:
import pandas as pd
import numpy as np

#FILE_PATH = 'ssd-full.tsv'
FILE_PATH = 'ssd-chunked'

SEP = '^'

ssd_data = pd.read_csv(FILE_PATH, sep=SEP).dropna()
articles = ssd_data['article'].tolist()[-10:]
summaries = ssd_data['summary'].tolist()[-10:]


# Generating Intermediate Summaries with Longformer

In [5]:
# Summarization Model
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

batch_size = 1

print('Loading Model')
PEGASUS_MODEL = 'pegasus-arxiv'
model_name = "google/{}".format(PEGASUS_MODEL)
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# num_batches = len(articles) // batch_size
num_batches = 10

base_line = 1100

for i in range(num_batches):
    src_text = articles[batch_size * i: batch_size * (i + 1)]

    print('Preprocessing Input Batch: ', i)
    batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)

    print('Pegasus is Summarizing')
    translated = model.generate(**batch)
    
    pegasus_summaries = tokenizer.batch_decode(translated, skip_special_tokens=True)

    print('Saving Generated Summaries')
    pegasus_summary_df = pd.DataFrame.from_dict({PEGASUS_MODEL: pegasus_summaries})

    PEGASUS_SUMMARY_FILE_PATH = '{}-inference-{}.txt'.format(PEGASUS_MODEL, i)

    pegasus_summary_np = np.savetxt(PEGASUS_SUMMARY_FILE_PATH, np.array(pegasus_summaries), fmt='%s')
    #pegasus_summary_df.to_csv(PEGASUS_SUMMARY_FILE_PATH, sep='\t', index=False)

Loading Model
Preprocessing Input Batch:  0
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  1
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  2
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  3
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  4
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  5
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  6
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  7
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  8
Pegasus is Summarizing
Saving Generated Summaries
Preprocessing Input Batch:  9
Pegasus is Summarizing
Saving Generated Summaries


## Calculate BLEU and ROUGE and SARI for PEGASUS

In [6]:
# Simplification model / Evaluation setup
!git clone https://github.com/facebookresearch/muss.git
%cd muss/
!pip install -e .  # Install package
!python -m spacy download en_core_web_md  # Install required spacy models

fatal: destination path 'muss' already exists and is not an empty directory.
/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master/muss
Obtaining file:///content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master/muss
Collecting easse@ git+https://github.com/feralvam/easse.git
  Cloning https://github.com/feralvam/easse.git to /tmp/pip-install-onaezn8k/easse_148d271df0704e44b08cbfeadc67c1ab
  Running command git clone -q https://github.com/feralvam/easse.git /tmp/pip-install-onaezn8k/easse_148d271df0704e44b08cbfeadc67c1ab
Collecting kenlm@ git+https://github.com/kpu/kenlm.git
  Cloning https://github.com/kpu/kenlm.git to /tmp/pip-install-onaezn8k/kenlm_83917ac921e840eeadcef9b98b34c7ec
  Running command git clone -q https://github.com/kpu/kenlm.git /tmp/pip-install-onaezn8k/kenlm_83917ac921e840eeadcef9b98b34c7ec
Collecting tseval@ git+https://github.com/facebookresearch/text-simplification-evaluation

In [7]:
# Back translation to get multiple summaries for SARI calculation
!pip install -q googletrans==3.1.0a0 
import googletrans
from googletrans import Translator

languages = [
    'en', # english
    'cs',  # czech
    'de',  # german
    'es', # spanish
    'fi',  # finnish
    'fr', # french
    'hi', # hindi
    'it', # italian
    'ja', # japanese
    'pt', # portuguese
    'ru', # russian
    'vi', # vietnamese
    'zh-cn',  # chinese
    ]

translator = Translator()

def back_translate(summary, lang='fr'):
    intermediate = translator.translate(summary, src='en', dest=lang).text
    back_translated = translator.translate(intermediate, src=lang, dest='en').text
    return back_translated


In [8]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master/

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master


In [9]:
import  nltk.translate.bleu_score as bleu
from rouge import Rouge 
rouge = Rouge()

from pp import *
from easse.sari import corpus_sari

avg_bleu = 0
avg_rouge = 0
avg_sari = 0

for i in range(num_batches):  
    PEGASUS_SUMMARY_FILE_PATH = '{}-inference-{}.txt'.format(PEGASUS_MODEL, i)

    with open(PEGASUS_SUMMARY_FILE_PATH, 'r') as f:
        pegasus_summaries = f.readlines()
        
        for j in range(batch_size):
            input = [articles[i * batch_size + j]]
            output = [pegasus_summaries[j]]
            reference = [summaries[i * batch_size + j]]

            scores = rouge.get_scores(output[0], reference[0])[0]

            curr_bleu = scores['rouge-l']['p'] * 100
            curr_rouge = scores['rouge-1']['r'] * 100
            curr_rouge_f = scores['rouge-1']['f'] * 100

            reference_summaries_2 = [back_translate(curr_summary, lang='fr') for curr_summary in reference]
            reference_summaries_3 = [back_translate(curr_summary, lang='it') for curr_summary in reference]

            curr_sari = corpus_sari(orig_sents=input,  
                                sys_sents=output, 
                                refs_sents=[reference, reference_summaries_2, reference_summaries_3])


            avg_bleu += curr_bleu
            avg_rouge += curr_rouge_f

            avg_sari += curr_sari


            print('=' * 50)
            print ('BLEU: {:.3f}'.format(curr_bleu))
            print ('ROUGE: {:.3f}'.format(curr_rouge))
            print ('ROUGE-F: {:.3f}'.format(curr_rouge_f))
            print ('SARI: {:.3f}'.format(curr_sari))
            print('Article')
            print(input)
            print()
            print('Summary')
            print(output)
            print()

print('Avg. BLEU: {}'.format(avg_bleu / (num_batches * batch_size)))
print('Avg. ROUGE: {}'.format(avg_rouge / (num_batches * batch_size)))
print('Avg. SARI: {}'.format(avg_sari / (num_batches * batch_size)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


BLEU: 40.000
ROUGE: 16.443
ROUGE-F: 24.318
SARI: 35.718
Article
['background microbiome studies suggest the presence of an interaction between the human gut microbiome and soil-transmitted helminth . upon deworming , a complex interaction between the anthelminthic drug , helminths and microbiome composition might occur . to dissect this , we analyse the changes that take place in the gut bacteria profiles in samples from a double blind placebo controlled trial conducted in an area endemic for soil transmitted helminths in indonesia . either placebo or albendazole were given every three months for a period of one and a half years . helminth infection was assessed before and at 3 months after the last treatment round . in 150 subjects , the bacteria were profiled using the 454 pyrosequencing . statistical analysis was performed cross-sectionally at pre-treatment to assess the effect of infection , and at post-treatment to determine the effect of infection and treatment on microbiome comp

## Summarization Cleanup

In [11]:
del model
del tokenizer
torch.cuda.empty_cache()

# Simplification using MUSS - FB Reasearch (https://arxiv.org/pdf/2005.00352.pdf)

In [12]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master/

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master


In [13]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master/muss/resources/models
!wget https://dl.fbaipublicfiles.com/muss/muss_en_wikilarge_mined.tar.gz
!tar -xf muss_en_wikilarge_mined.tar.gz --directory muss_en_wikilarge_mined
!mv muss_en_wikilarge_mined/tmplxa7whzw/* muss_en_wikilarge_mined
!rm -rf muss_en_wikilarge_mined/tmplxa7whzw
!rm -rf muss_en_wikilarge_mined.tar.gz

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master/muss/resources/models
--2022-05-12 23:29:50--  https://dl.fbaipublicfiles.com/muss/muss_en_wikilarge_mined.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4021463948 (3.7G) [application/gzip]
Saving to: ‘muss_en_wikilarge_mined.tar.gz’


2022-05-12 23:32:31 (23.9 MB/s) - ‘muss_en_wikilarge_mined.tar.gz’ saved [4021463948/4021463948]



In [18]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master/muss/scripts

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master/muss/scripts


In [19]:
# !python simplify.py /content/drive/MyDrive/685-NLP/Project/HTSS-master/pegasus-arxiv-inference-0.txt --model-name muss_en_wikilarge_mined
!sh simplify.sh /content/drive/MyDrive/685-NLP/Project/HTSS-master muss_en_wikilarge_mined

tcmalloc: large alloc 1625161728 bytes == 0x7445e000 @  0x7f09a013db6b 0x7f09a015d379 0x7f08ce20e50e 0x7f08ce2007c2 0x7f09837bb1e5 0x7f09837b9a7e 0x5946b8 0x548cc1 0x51566f 0x549e0e 0x4bcb19 0x59582d 0x595b69 0x62026d 0x55de15 0x59af67 0x515655 0x549e0e 0x4bca8a 0x5134a6 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x549576 0x593fce 0x5118f8 0x549576
tcmalloc: large alloc 1625161728 bytes == 0xd523e000 @  0x7f09a013db6b 0x7f09a015d379 0x7f08ce20e50e 0x7f08ce2007c2 0x7f09837bb1e5 0x7f09837b9a7e 0x5946b8 0x548cc1 0x51566f 0x549e0e 0x4bcb19 0x59582d 0x595b69 0x62026d 0x55de15 0x59af67 0x515655 0x549e0e 0x4bca8a 0x5134a6 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x549576 0x593fce 0x5118f8 0x549576
  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size
--------------------------------------------------------------------------------
Original:   background microbiome studies suggest the presence of an interaction between the human gut mic

In [21]:
%cd /content/drive/MyDrive/685-NLP/Project/HTSS-master/

/content/drive/.shortcut-targets-by-id/1rvvIqcLvBtcH5kpwcbn5rWmanGNvSPOI/685-NLP/Project/HTSS-master


In [22]:
PEGASUS_MODEL = 'pegasus-arxiv'
num_batches = 10
avg_sari = 0
avg_rouge_1 = 0
avg_rouge_2 = 0
avg_rouge_l = 0

for i in range(num_batches):
    PEGASUS_SUMMARY_FILE_PATH = '{}-inference-{}.txt'.format(PEGASUS_MODEL, i)

    pegasus_summaries = None
    with open(PEGASUS_SUMMARY_FILE_PATH, 'r') as f:
        pegasus_summaries = f.readlines()
    
    MUSS_SIMPLIFIED_PATH = '{}-inference-{}-muss.txt'.format(PEGASUS_MODEL, i)
    simplified_summaries = None
    with open(MUSS_SIMPLIFIED_PATH, 'r') as f:
        simplified_summaries = f.readlines()

    reference_summaries = summaries[batch_size * i: batch_size * (i + 1)]
    reference_summaries_2 = [back_translate(curr_summary, lang='fr') for curr_summary in reference_summaries]
    reference_summaries_3 = [back_translate(curr_summary, lang='it') for curr_summary in reference_summaries]

    curr_sari = corpus_sari(orig_sents=pegasus_summaries,  
        sys_sents=simplified_summaries, 
        refs_sents=[reference_summaries, reference_summaries_2, reference_summaries_3])
    
    scores = rouge.get_scores(simplified_summaries, pegasus_summaries, avg=True)
    rouge_1 = scores['rouge-1']['f']
    rouge_2 = scores['rouge-2']['f']
    rouge_l = scores['rouge-l']['f']

    # curr_sari = compute_sari(pegasus_summaries[j], simplified_summaries[j], reference_summaries[j])
    avg_sari += curr_sari
    avg_rouge_1 += rouge_1 * batch_size * 100
    avg_rouge_2 += rouge_2 * batch_size * 100
    avg_rouge_l += rouge_l * batch_size * 100

    print('Batch: {}, SARI: {}, ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}'.format(i, curr_sari, rouge_1, rouge_2, rouge_l))

print('Avg. SARI: {}'.format(avg_sari / (num_batches * batch_size)))
print('Avg. ROUGE-1: {}'.format(avg_rouge_1 / (num_batches * batch_size)))
print('Avg. ROUGE-2: {}'.format(avg_rouge_2 / (num_batches * batch_size)))
print('Avg. ROUGE-F: {}'.format(avg_rouge_l / (num_batches * batch_size)))

Batch: 0, SARI: 17.31087385841062, ROUGE-1: 0.9556650196364873, ROUGE-2: 0.8947368371281588, ROUGE-L: 0.9556650196364873
Batch: 1, SARI: 34.29811181289784, ROUGE-1: 0.8695652124503781, ROUGE-2: 0.7985347936475734, ROUGE-L: 0.8695652124503781
Batch: 2, SARI: 30.834067761081897, ROUGE-1: 0.6881720383165684, ROUGE-2: 0.6171874954406739, ROUGE-L: 0.6881720383165684
Batch: 3, SARI: 28.094310932531517, ROUGE-1: 0.7486033472176274, ROUGE-2: 0.6542750884128191, ROUGE-L: 0.7486033472176274
Batch: 4, SARI: 26.329538896783415, ROUGE-1: 0.7857142808737244, ROUGE-2: 0.7287449345014672, ROUGE-L: 0.7857142808737244
Batch: 5, SARI: 33.33771909650167, ROUGE-1: 0.8481675343274582, ROUGE-2: 0.759999995128, ROUGE-L: 0.8481675343274582
Batch: 6, SARI: 30.56314316639943, ROUGE-1: 0.7142857094451531, ROUGE-2: 0.6507936460065508, ROUGE-L: 0.7142857094451531
Batch: 7, SARI: 29.48019355332764, ROUGE-1: 0.673366829325522, ROUGE-2: 0.5617977480873627, ROUGE-L: 0.6432160755566779
Batch: 8, SARI: 30.83290701801931,