<a href="https://colab.research.google.com/github/cicattzo/nlp_project/blob/main/6_684_project_bertsum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the project folder used by the
# rest of the notebook lives underneath this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# ---------------------------------------------------------------------------
# Run configuration: define the model you want to run here.
# ---------------------------------------------------------------------------

# wanted_data selects the dataset to train on. The options are:
#   cnn         - full cnn dataset from online
#   cnn_sample  - cnn sample dataset from the dropbox
#   bc3         - bc3 dataset from the dropbox
#   merged_data - merged dataset from the dropbox
wanted_data = 'cnn_sample'

# The two options below are currently disabled (kept for reference).
#
# pretrained_model_name determines the pretrained model to load prior to
# training. The options are:
#   bert                  - trains a bert-base-uncased to bert-base-uncased encoder decoder model
#   gpt2                  - trains a gpt2 encoder decoder model
#   pretrained_summarizer - pretrained summarization model on financial reports
# pretrained_model_name = 'gpt2'
#
# model_type determines the architecture of the model to train on. The options are:
#   original   - fine tuned model with only a single linear layer
#   bottleneck - bottleneck fine tuning with a linear layer scaling it down,
#                dropout, then scaling it back up
# model_type = 'bottleneck'

In [None]:
%%capture
"""
Install working versions of packages
"""
# NOTE(review): %%capture hides everything pip prints, and a failing `!pip`
# line does not stop the notebook, so an unresolvable pin would go unnoticed --
# confirm each pinned version below actually installs on the current runtime.
!pip install torch==1.4.1
!pip install datasets==1.0.2
# Presumably an earlier transformers pin, superseded by 4.4.2 below.
# !pip install transformers==4.0.1
!pip install transformers==4.4.2
# NOTE(review): the next three packages are unpinned, so their versions (and
# possibly their APIs) can drift between runs -- consider pinning them too.
!pip install bert-extractive-summarizer
!pip install sacrebleu
!pip install rouge_score



In [None]:
# Google Drive folder holding the project's CSV files. The data-loading cell
# builds file paths with plain string concatenation (MODEL_FOLDER + "name.csv"),
# so the trailing slash is required.
MODEL_FOLDER = "/content/gdrive/MyDrive/6864_project/"

import datasets
from transformers import BertTokenizerFast
from summarizer import Summarizer
import sacrebleu
from rouge_score import rouge_scorer
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import EncoderDecoderModel

In [None]:
%%bash
# Create the project folder on the mounted Drive. `-p` makes the cell
# idempotent: re-running it no longer fails with "File exists" when the folder
# is already there (and it creates missing parent directories as needed).
mkdir -p "/content/gdrive/MyDrive/6864_project/"
# `cd` only affects this bash subprocess, not the notebook kernel; it is kept
# as a quick check that the folder is reachable.
cd "/content/gdrive/MyDrive/6864_project/"

mkdir: cannot create directory ‘/content/gdrive/MyDrive/6864_project/’: File exists


# Download data

In [None]:
#decide which dataset we want to train on 
#
# Every branch sets:
#   train_data    - list of dict records to train on
#   full_text_key - key of the source text inside a training record
#   label_key     - key of the reference summary inside a training record
# The evaluation set is the BC3 test split for every choice; it is loaded once
# after the branch so that `test_data` is always defined (the 'cnn' option
# previously left it undefined, breaking every later cell).
if wanted_data == 'cnn':
  # First 5% of the CNN/DailyMail training split, materialised as a list of dicts.
  train_data = list(datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:5%]"))
  full_text_key = 'article'
  label_key = 'highlights'

elif wanted_data == 'bc3':
  #reading in and processing data
  bc3_df = pd.read_csv(MODEL_FOLDER+"bc3_processed.csv")
  txt = bc3_df.iloc[0]['body']
  summary = bc3_df.iloc[0]['summary']
  # keeping the subject and body separate, but they can be merged
  bc3_df['unique_key'] = bc3_df['listno'] + "-" + bc3_df['email_num'].astype(str)
  # train_data = bc3_df.groupby('unique_key').agg({'subject':lambda x: x.iloc[0], 'body':lambda x: x.iloc[0], 'summary':lambda x: x.to_list()}).to_dict('records')
  # NOTE(review): this aggregates over the whole frame instead of per
  # `unique_key` (the grouped variant is commented out above) -- confirm it
  # really yields one record per email before training on it.
  train_data = bc3_df.agg({'subject':lambda x: x.iloc[0], 'body':lambda x: x.iloc[0], 'summary':lambda x: x}).to_dict('records')
  full_text_key = 'body'
  label_key = 'summary'

elif wanted_data == 'cnn_sample':
  train_data_pd = pd.read_csv(MODEL_FOLDER+"cnn_train_data_5.csv")
  # to_dict('records') already returns a list of dicts; no extra copy needed.
  train_data = train_data_pd.to_dict('records')
  full_text_key = 'article'
  label_key = 'highlights'

elif wanted_data == 'merged_data':
  train_data_pd = pd.read_csv(MODEL_FOLDER+"train_combined.csv")
  train_data = train_data_pd.to_dict('records')
  full_text_key = 'article'
  label_key = 'highlights'

else:
  # Fail fast on a typo instead of hitting a NameError several cells later.
  raise ValueError(
      "Unknown wanted_data {!r}; expected one of 'cnn', 'cnn_sample', 'bc3', 'merged_data'".format(wanted_data))

# Shared evaluation set: list of dict records read from the BC3 test CSV.
test_data_pd = pd.read_csv(MODEL_FOLDER+"bc3_test.csv")
test_data = test_data_pd.to_dict('records')

# BERT Extractive Summarization - bert-base-uncased tokenizer

In [None]:
from transformers import BertTokenizerFast

# Fast BERT tokenizer for the uncased base checkpoint; it is handed to the
# extractive summarizer in the next cell.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Build the extractive summarizer with the bert-base-uncased tokenizer.
# No custom model is passed; presumably the library then loads its own default
# model -- confirm against the bert-extractive-summarizer docs.
model = Summarizer(custom_tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




In [None]:
# Peek at the first test record; the evaluation cells read its 'body' and
# 'summary' fields.
test_data[0]

{'body': '> In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references. \nIs this bookshelf documented on Kynn.COM, or would you consider sharing with us a short list of the references you find the most helpful? \nAl',
 'email_num': 6,
 'listno': '076-4622322',
 'subject': 'w3c-wai-ig@w3.org',
 'summary': "Al asks if Kynn's book collection is documented on Kynn's site."}

In [None]:
# Score the summarizer currently bound to `model` on every test email:
# generate a one-sentence extractive summary of the email body and compare it
# with the reference summary using ROUGE-1/ROUGE-2 and BLEU.
rscorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

rouge_score_list = []
bleu_score_list = []

print('Len of our corpus:', len(test_data))
# Iterate over the records directly (the old manual `i` bookkeeping inside a
# range loop was dead code) and show progress with the already-imported tqdm
# instead of printing one line per email.
for email in tqdm(test_data, desc='Scoring emails'):
  txt = email['body']
  summary = email['summary']
  pred = model(txt, num_sentences = 1)
  # rouge_scorer expects (target, prediction).
  rouge_score_list.append(rscorer.score(summary, pred))
  # One hypothesis against one reference; the third argument (.01) is presumably
  # the smoothing value -- confirm against the installed sacrebleu version.
  bleu_score_list.append(sacrebleu.raw_corpus_bleu([pred], [[summary]], .01).score)


# Report the mean of the per-email scores.
print('rouge2-fscore',np.mean([sc['rouge2'].fmeasure for sc in rouge_score_list]))
print('rouge1-fscore', np.mean([sc['rouge1'].fmeasure for sc in rouge_score_list]))
print('bleu-score',np.mean(bleu_score_list))

In [None]:
# Print the first six test emails next to their reference summary and the
# summarizer's one-sentence prediction. Slicing (rather than indexing 0..5 with
# a manual counter) avoids an IndexError when fewer than six records exist.
for example in test_data[:6]:
  print('body: ', example['body'])
  print('actual summary: ', example['summary'])
  print('predicted summary BERTSUM: ', model(example['body'], num_sentences = 1))
  print('\n')

body:  > In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references. 
Is this bookshelf documented on Kynn.COM, or would you consider sharing with us a short list of the references you find the most helpful? 
Al
actual summary:  Al asks if Kynn's book collection is documented on Kynn's site.
predicted summary BERTSUM:  > In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references.


body:  > Is this bookshelf documented on Kynn.COM, or would you consider sharing with us a short list of the references you find the most helpful? 
Hey, that's not a bad idea. In fact, I encourage everyone else to share their favorite bookshelf collections -- if relevant to WAI activities. I'll post mine on kynn.com tomorrow when I'm back in the office.
actual summary:  Kynn thinks that Al has a good idea and encourages everyone to share their bookshelf collections i

In [None]:
# Import only the names this cell needs. A wildcard import from transformers
# would flood the notebook namespace and could silently shadow existing
# variables.
from transformers import AutoConfig, AutoModel, AutoTokenizer

# SciBERT checkpoint used for the second summarizer; named once so the config,
# tokenizer and model cannot drift apart.
SCIBERT_CHECKPOINT = 'allenai/scibert_scivocab_uncased'

custom_config = AutoConfig.from_pretrained(SCIBERT_CHECKPOINT)
# Make the model return all hidden states; presumably required by Summarizer,
# which builds sentence embeddings from them -- see the library's README.
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
custom_model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT, config=custom_config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [None]:
# Rebind `model` to a summarizer backed by the SciBERT model and tokenizer, so
# the evaluation cells that follow score SciBERT instead of the earlier model.
model = Summarizer(custom_model = custom_model, custom_tokenizer=custom_tokenizer)

In [None]:
# Score the summarizer currently bound to `model` on every test email:
# generate a one-sentence extractive summary of the email body and compare it
# with the reference summary using ROUGE-1/ROUGE-2 and BLEU.
rscorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

rouge_score_list = []
bleu_score_list = []

print('Len of our corpus:', len(test_data))
# Iterate over the records directly (the old manual `i` bookkeeping inside a
# range loop was dead code) and show progress with the already-imported tqdm
# instead of printing one line per email.
for email in tqdm(test_data, desc='Scoring emails'):
  txt = email['body']
  summary = email['summary']
  pred = model(txt, num_sentences = 1)
  # rouge_scorer expects (target, prediction).
  rouge_score_list.append(rscorer.score(summary, pred))
  # One hypothesis against one reference; the third argument (.01) is presumably
  # the smoothing value -- confirm against the installed sacrebleu version.
  bleu_score_list.append(sacrebleu.raw_corpus_bleu([pred], [[summary]], .01).score)


# Report the mean of the per-email scores.
print('rouge2-fscore',np.mean([sc['rouge2'].fmeasure for sc in rouge_score_list]))
print('rouge1-fscore', np.mean([sc['rouge1'].fmeasure for sc in rouge_score_list]))
print('bleu-score',np.mean(bleu_score_list))

In [None]:
# Print the first six test emails next to their reference summary and the
# summarizer's one-sentence prediction. Slicing (rather than indexing 0..5 with
# a manual counter) avoids an IndexError when fewer than six records exist.
for example in test_data[:6]:
  print('body: ', example['body'])
  print('actual summary: ', example['summary'])
  print('predicted summary SCI-BERTSUM: ', model(example['body'], num_sentences = 1))
  print('\n')

body:  > In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references. 
Is this bookshelf documented on Kynn.COM, or would you consider sharing with us a short list of the references you find the most helpful? 
Al
actual summary:  Al asks if Kynn's book collection is documented on Kynn's site.
predicted summary SCI-BERTSUM:  > In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references.


body:  > Is this bookshelf documented on Kynn.COM, or would you consider sharing with us a short list of the references you find the most helpful? 
Hey, that's not a bad idea. In fact, I encourage everyone else to share their favorite bookshelf collections -- if relevant to WAI activities. I'll post mine on kynn.com tomorrow when I'm back in the office.
actual summary:  Kynn thinks that Al has a good idea and encourages everyone to share their bookshelf collectio

In [None]:
# One-sentence summary of the first test email, produced by whichever
# summarizer `model` currently refers to; displayed as the cell output.
pred = model(test_data[0]['body'], num_sentences = 1)
pred

'> In fact, I have it on the shelf behind me at work as I type this, beside my XHTML, XSLT, CSS, usability, and disability references.'

In [None]:
# Reference summary of the same first test email, shown for comparison with
# the prediction.
true_sum = test_data[0]['summary']
true_sum

"Al asks if Kynn's book collection is documented on Kynn's site."