## Connect to Google drive

In [1]:
# Make the user's Google Drive visible to this Colab runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


## TEST

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.4 MB/s[0m eta [36m0:00:0

In [3]:
import os
import re
import numpy as np
import pandas as pd
import json
import random
import nltk
nltk.download('punkt')

from IPython.display import display, HTML
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import LEDTokenizer, LEDForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = "nsi319/legal-led-base-16384"

# Load the tokenizer that is published alongside that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [5]:
# Token budgets used below: `encoder_max_length` caps the tokenized input,
# `decoder_max_length` is the default cap for generated sequences.
encoder_max_length = 2 ** 14  # 16384
decoder_max_length = 2 ** 10  # 1024

In [8]:
# Load the seq2seq checkpoint, move it to the GPU and cast it to half precision.
led = (
    AutoModelForSeq2SeqLM
    .from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)
    .to("cuda")
    .half()
)

# Default generation hyperparameters, stored on the model config.
# Individual `generate` calls may still override any of them.
generation_defaults = {
    "num_beams": 2,
    "max_length": decoder_max_length,
    "min_length": 256,
    "early_stopping": True,
    "no_repeat_ngram_size": 4,
}
for option_name, option_value in generation_defaults.items():
    setattr(led.config, option_name, option_value)

In [9]:
import pprint

## read in preprocessed input

In [10]:
import pandas as pd

# Load the pre-computed extractive summaries from Drive and keep only the
# three columns used in the rest of the notebook.
extractive_csv_path = "/content/drive/MyDrive/W266 Final Project/output/train_data_LSA_extractive_500.csv"
kept_columns = ['Index', 'Summary', 'ExtractiveSummary']
extractive_output = pd.read_csv(extractive_csv_path)
extractive_output = extractive_output[kept_columns]

## Generate Summaries Experimentation

In [11]:
import torch

# Inference only: switch the model to evaluation mode.
led.eval()

# Prompt = fixed "summarize: " prefix + the extractive summary of row 0.
text = 'summarize: ' + str(extractive_output['ExtractiveSummary'][0])

# Tokenize into a fixed-length PyTorch tensor: the text is truncated to, and
# padded up to, `encoder_max_length` tokens.
inputs = tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=encoder_max_length,
    return_tensors="pt",
)
input_ids = inputs.input_ids.to(led.device)
attention_mask = inputs.attention_mask.to(led.device)

# put global attention on <s> token (position 0); every other position stays 0
global_attention_mask = torch.zeros_like(attention_mask)
global_attention_mask[:, 0] = 1

# Baseline run: 2 beams, length_penalty 1, early stopping enabled.
outputs = led.generate(
    input_ids,
    attention_mask=attention_mask,
    global_attention_mask=global_attention_mask,
    min_length=512,
    max_length=1024,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1,
    early_stopping=True,
)

# The keyword is `clean_up_tokenization_spaces` (plural). A misspelled keyword
# is not rejected by `decode`, so it would silently have no effect.
preds = [
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in outputs
]
pprint.pprint("".join(preds))



('The United Kingdom Court of Appeal today announced that it has entered a '
 'final judgment against a Nigerian husband and wife who had been barred from '
 "appearing before the court for divorce proceedings.  According to the SEC's "
 'Complaint, filed in the U.K. District Court for the Middle Kingdom on May '
 '24, 2008, Judge John R. Coleridge J issued an order imposing a temporary '
 'restraining order preventing them from proceeding with their pending divorce '
 "proceedings. The court granted the wife's request for leave to apply for "
 'financial relief after her husband had agreed to settle the matter without '
 'admitting or denying her requests. The court found that the wife had failed '
 'to show sufficient evidence to justify her decision to withdraw her '
 'application. The court also ruled that she was entitled to a civil penalty '
 "equal to the amount of her husband's compensation.  The SEC's complaint, "
 'filed in federal court in England and Wales, charges Coleridg

In [14]:
# Whitespace-delimited word count of the generated summary.
generated_text = "".join(preds)
len(generated_text.split())

415

# Evaluation

In [12]:
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [13]:
import evaluate

# Score the generated summary against the reference summary of row 0 with ROUGE.
rouge = evaluate.load('rouge')
generated_summary = "".join(preds)
reference_summary = extractive_output["Summary"][0]
results = rouge.compute(
    predictions=[generated_summary],
    references=[reference_summary],
)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.346551724137931, 'rouge2': 0.07081174438687392, 'rougeL': 0.17068965517241377, 'rougeLsum': 0.3086206896551724}


# 2nd iteration: length penalty

In [15]:
# Second run on the same input: length_penalty raised to 2, early stopping off.
outputs2 = led.generate(
    input_ids,
    attention_mask=attention_mask,
    global_attention_mask=global_attention_mask,
    min_length=512,
    max_length=1024,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=2,
    early_stopping=False,
)

# The keyword is `clean_up_tokenization_spaces` (plural). A misspelled keyword
# is not rejected by `decode`, so it would silently have no effect.
preds2 = "".join(
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in outputs2
)
pprint.pprint(preds2)

# ROUGE of this run against the reference summary of row 0.
results2 = rouge.compute(predictions=[preds2,],
                        references=[extractive_output["Summary"][0],])
print(results2)

('The United Kingdom Court of Appeal today announced that it has entered a '
 'final judgment against a Nigerian husband and wife who had been barred from '
 "appearing before the court for divorce proceedings.  According to the SEC's "
 'Complaint, filed in the U.K. District Court for the Middle Kingdom on May '
 '24, 2008, Judge John R. Coleridge J issued an order imposing a temporary '
 'restraining order preventing them from proceeding with their pending divorce '
 "proceedings. The court granted the wife's request for leave to apply for "
 'financial relief after her husband had agreed to settle the matter without '
 'admitting or denying her requests. The court found that the wife had failed '
 'to show sufficient evidence to justify her decision to withdraw her '
 'application. The court also ruled that she was entitled to a civil penalty '
 "equal to the amount of her husband's compensation.  The SEC's complaint, "
 'filed in federal court in England and Wales, charges Coleridg

In [20]:
# Third run on the same input: 3 beams, length_penalty lowered to 0.5,
# early stopping off.
outputs3 = led.generate(
    input_ids,
    attention_mask=attention_mask,
    global_attention_mask=global_attention_mask,
    min_length=512,
    max_length=1024,
    num_beams=3,
    repetition_penalty=2.5,
    length_penalty=0.5,
    early_stopping=False,
)

# The keyword is `clean_up_tokenization_spaces` (plural). A misspelled keyword
# is not rejected by `decode`, so it would silently have no effect.
preds3 = "".join(
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in outputs3
)
pprint.pprint(preds3)

# ROUGE of this run against the reference summary of row 0.
results3 = rouge.compute(predictions=[preds3,],
                        references=[extractive_output["Summary"][0],])
print(results3)

('The Court of Appeal today announced that it has entered final judgments '
 'against a Nigerian husband and wife who sought to have their divorce '
 "annulled in a foreign country.  The court's order, which is subject to court "
 'approval, requires the parties to show cause why they should not be entitled '
 "to financial relief after a foreign divorce.  According to the SEC's "
 'complaint, filed in the U.K. District Court for the Southern District of New '
 'York, on June 2, 2005, Coleridge J issued an order appointing a receiver to '
 'determine whether England and Wales was the appropriate forum for granting a '
 'temporary restraining order or other emergency relief following a divorce.  '
 'As alleged in the complaint, the receiver had no right to make any such '
 'order and therefore did not qualify as a party to the divorce.  Without '
 "admitting or denying the SEC's allegations, the receiver agreed to pay a "
 'total of about $1.2 million in disgorgement, prejudgment intere

In [21]:
# Fourth run on the same input: 3 beams, temperature 0.5, length_penalty 4,
# early stopping on.
# NOTE(review): `temperature` is documented as a sampling parameter; no
# `do_sample=True` is passed here, so it presumably has no effect on this
# beam-search run. Confirm whether sampling was intended.
outputs4 = led.generate(
    input_ids,
    attention_mask=attention_mask,
    global_attention_mask=global_attention_mask,
    min_length=512,
    max_length=1024,
    num_beams=3,
    temperature=0.5,
    repetition_penalty=2.5,
    length_penalty=4,
    early_stopping=True,
)

# The keyword is `clean_up_tokenization_spaces` (plural). A misspelled keyword
# is not rejected by `decode`, so it would silently have no effect.
preds4 = "".join(
    tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for gen_id in outputs4
)
pprint.pprint(preds4)

# ROUGE of this run against the reference summary of row 0.
results4 = rouge.compute(predictions=[preds4,],
                        references=[extractive_output["Summary"][0],])
print(results4)

('The Court of Appeal today announced that it has entered final judgments '
 'against a Nigerian husband and wife who sought to have their divorce '
 "annulled in a foreign country.  The court's order, which is subject to court "
 'approval, requires the parties to show cause why they should not be entitled '
 "to financial relief after a foreign divorce.  According to the SEC's "
 'complaint, filed in the U.K. District Court for the Southern District of New '
 'York, on June 2, 2005, Coleridge J issued an order appointing a receiver to '
 'determine whether England and Wales was the appropriate forum for granting a '
 'temporary restraining order or other emergency relief following a divorce.  '
 'As alleged in the complaint, the receiver had no right to make any such '
 'order and therefore did not qualify as a party to the divorce.  Without '
 "admitting or denying the SEC's allegations, the receiver agreed to pay a "
 'total of about $1.2 million in disgorgement, prejudgment intere