## Connect to Google drive

In [1]:
# Mount Google Drive at /content/drive so the project CSV files stored under
# MyDrive can be read through ordinary file paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Connect to hugging face

Get your Hugging Face access token from Settings -> Access Tokens 😀

In [2]:
!pip install -q huggingface_hub

In [3]:
from huggingface_hub import notebook_login

# Shows an interactive login widget for the Hugging Face access token.
# The training arguments later set push_to_hub=True, so a login is needed
# before the trainer can upload to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Imports

In [4]:
!pip install -q datasets
!pip install -q accelerate
!pip install -q transformers
!pip install -q nltk
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q deepspeed

Run the cell below and restart the runtime if you get this error:  
```
Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`
```

In [5]:
# !pip install -q --force-reinstall -v "accelerate==0.20.3"

In [6]:
import os
import re
import numpy as np
import pandas as pd
import json
import random
import nltk
nltk.download('punkt')

from IPython.display import display, HTML
import torch
import datasets
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import LEDTokenizer, LEDForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[2023-08-05 06:29:10,879] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# Experiment 1: Fine-tune BigBird-Pegasus on legal data (1,000-example subset)

In [7]:
# Hub id of the pretrained checkpoint; the tokenizer and the model are both
# loaded from it, and the fine-tuned output name is derived from it.
model_checkpoint = "google/bigbird-pegasus-large-pubmed"

## Split train and validation set

In [111]:
import pandas as pd

# Read the extractive-summary CSV from Drive and keep only the three columns
# used downstream: the example id, the reference summary and the extractive
# summary that serves as model input.
extractive_output = pd.read_csv(
    "/content/drive/MyDrive/W266 Final Project/output/train_data_LSA_extractive_100.csv"
)[['Index', 'Summary', 'ExtractiveSummary']]

In [112]:
# Ids of the 1,000-example training subset, held as a set for fast membership tests.
subset_train = set(
    pd.read_csv("/content/drive/MyDrive/W266 Final Project/output/train_data_1000.csv")['Index']
)

In [113]:
# Keep only the rows whose Index belongs to the training subset.
extractive_output = extractive_output[extractive_output['Index'].isin(subset_train)]

In [114]:
# Sanity check: number of rows left after restricting to the subset.
len(extractive_output)

1000

In [115]:
# Hold out 20% of the examples; the fixed random_state makes the split repeatable.
import pandas as pd
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    extractive_output,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) for both partitions.
print("Training data shape:", train_df.shape)
print("Validation data shape:", val_df.shape)

Training data shape: (800, 3)
Validation data shape: (200, 3)


## Preprocess the training data

Create train, validation, and test sets, each with `input`, `summary`, and `id` columns.

In [104]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint so the token ids
# match the vocabulary the model was trained with.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [116]:
max_input_length = 4096
max_target_length = 1024


def preprocess_function(example):
    """Tokenize a batch of source texts together with their reference summaries.

    Args:
        example: Batch mapping with an ``"input"`` column (source texts) and a
            ``"summary"`` column (reference summaries).

    Returns:
        The tokenizer output for the source texts, with the token ids of the
        reference summaries stored under ``"labels"``.
    """
    source_texts = list(example["input"])
    # Sources: truncated to max_input_length and padded within this batch.
    encoded = tokenizer(
        source_texts,
        max_length=max_input_length,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Targets: reference summaries truncated to max_target_length.
    target_encoding = tokenizer(
        text_target=example["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [117]:
# Divide the held-out frame: its first 100 rows become the validation set and
# the remaining rows the test set. train_df from the split is used as it is.
validation_df, test_df = val_df.iloc[:100], val_df.iloc[100:]

In [118]:
from datasets import Dataset

# Wrap each pandas frame in a Hugging Face Dataset (train, validation, test).
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df, test_df)
)

In [119]:
# Combine them into a DatasetDict so all three splits can be transformed with
# a single call (e.g. the `map` further down).

from datasets import DatasetDict

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [120]:
# Give every split the column names expected by preprocess_function:
# Index -> id, Summary -> summary, ExtractiveSummary -> input.
# The renamed splits are built first and then written back into dataset_dict.
dataset_dict.update({
    split: split_dataset.rename_columns(
        {'Index': 'id', 'Summary': 'summary', 'ExtractiveSummary': 'input'}
    )
    for split, split_dataset in dataset_dict.items()
})


In [121]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows
# (column name -> list of values) per call instead of a single row.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Fine-tune the model

**DeepSpeed** ZeRO optimization can offload to CPU memory so that GPU memory utilization is reduced. Note that the `deepspeed` option is commented out in the training arguments below, so it is not enabled for this run. https://huggingface.co/blog/accelerate-deepspeed

https://huggingface.co/docs/transformers/main_classes/deepspeed


In [67]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence checkpoint and move it to the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")

In [68]:
import accelerate
import transformers

# One example per device per step, for both training and evaluation.
batch_size = 1
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]
# Training configuration: a single epoch with evaluation at the end of it.
args = Seq2SeqTrainingArguments(
    # Output directory (the trainer output shows a Hub repo of the same name).
    f"{model_name}-finetuned-legal",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Evaluate on generated sequences so compute_metrics can score decoded text.
    predict_with_generate=True,
    fp16=False,
    # Upload to the Hugging Face Hub; requires the login done earlier.
    push_to_hub=True,
    # DeepSpeed is currently disabled; uncomment to pass a DeepSpeed config file.
    # deepspeed="ds_config.json"
)

In [69]:
# Collator that assembles the tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [70]:
import nltk
import numpy as np
from evaluate import load
metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        Dict with every score returned by ``metric.compute`` scaled by 100,
        plus ``"gen_len"`` (mean number of non-pad tokens per prediction);
        all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids when extra outputs come along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel rather than a vocabulary id, so it cannot be
    # decoded. Replace it in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Express the scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Mean generated length, counting non-pad tokens only (the replaced -100
    # entries are therefore not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [71]:
# Wire model, arguments, tokenized splits, collator and metric function into
# the trainer. Evaluation runs on the validation split; the test split is not
# used here. With push_to_hub=True the Hub repo is cloned locally at this point
# (see the output below).
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/2.15G [00:00<?, ?B/s]

Download file runs/Aug04_00-55-39_6cc891915750/events.out.tfevents.1691110549.6cc891915750.9864.1: 100%|######…

Download file runs/Aug04_00-54-03_6cc891915750/events.out.tfevents.1691110459.6cc891915750.9864.0: 100%|######…

Download file runs/Aug04_00-25-54_6cc891915750/events.out.tfevents.1691108775.6cc891915750.1611.0: 100%|######…

Download file runs/Aug04_01-00-18_6cc891915750/events.out.tfevents.1691110823.6cc891915750.9864.3: 100%|######…

Download file training_args.bin: 100%|##########| 4.06k/4.06k [00:00<?, ?B/s]

Clean file runs/Aug04_00-55-39_6cc891915750/events.out.tfevents.1691110549.6cc891915750.9864.1:  22%|##1      …

Clean file runs/Aug04_00-54-03_6cc891915750/events.out.tfevents.1691110459.6cc891915750.9864.0:  22%|##1      …

Clean file runs/Aug04_00-25-54_6cc891915750/events.out.tfevents.1691108775.6cc891915750.1611.0:  22%|##1      …

Clean file runs/Aug04_01-00-18_6cc891915750/events.out.tfevents.1691110823.6cc891915750.9864.3:  18%|#8       …

Clean file training_args.bin:  25%|##4       | 1.00k/4.06k [00:00<?, ?B/s]

Download file runs/Aug04_00-56-42_6cc891915750/events.out.tfevents.1691110617.6cc891915750.9864.2: 100%|######…

Clean file runs/Aug04_00-56-42_6cc891915750/events.out.tfevents.1691110617.6cc891915750.9864.2:  18%|#8       …

Clean file pytorch_model.bin:   0%|          | 1.00k/2.15G [00:00<?, ?B/s]

In [72]:
import torch
# Confirm that a CUDA device is visible before starting training.
torch.cuda.is_available()

True

In [73]:
# Explicitly initialise PyTorch's CUDA state.
# NOTE(review): the model was already moved to "cuda" earlier, so this is
# presumably redundant — confirm before removing.
torch.cuda.init()

In [74]:
!export CUDA_LAUNCH_BLOCKING=1

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [75]:
# Run fine-tuning; metrics from compute_metrics are reported at the end of the epoch.
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,5.5964,4.860909,28.3788,9.014,18.3548,26.2062,240.64


TrainOutput(global_step=800, training_loss=5.409326171875, metrics={'train_runtime': 1615.5424, 'train_samples_per_second': 0.495, 'train_steps_per_second': 0.495, 'total_flos': 9242420654899200.0, 'train_loss': 5.409326171875, 'epoch': 1.0})

In [76]:
# Upload the fine-tuned model and training logs to the Hugging Face Hub repo.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/2.15G [00:00<?, ?B/s]

Upload file runs/Aug05_04-41-02_9f1aeb025fd8/events.out.tfevents.1691211580.9f1aeb025fd8.2238.0:   0%|        …

To https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal
   7b5e3aa..8a8e981  main -> main

   7b5e3aa..8a8e981  main -> main

To https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal
   8a8e981..560dbac  main -> main

   8a8e981..560dbac  main -> main



'https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal/commit/8a8e98120777bc75fa8c18b8dd857f831cd5c439'

# Experiment 2: fine-tune on more data (the complete train set)

In [7]:
# Same pretrained checkpoint as before; redefined here so this experiment can
# be run on its own after the setup cells.
model_checkpoint = "google/bigbird-pegasus-large-pubmed"

## Split train and validation set

In [8]:
import pandas as pd
# Load the full extractive-summary CSV from Drive (no subset filter in this experiment).
extractive_output = pd.read_csv("/content/drive/MyDrive/W266 Final Project/output/train_data_LSA_extractive_100.csv")
# Keep the example id, the reference summary and the extractive summary used as model input.
extractive_output = extractive_output[['Index', 'Summary', 'ExtractiveSummary']]

In [9]:
# Filter rows where each column is not null, not NaN, and is a string
extractive_output = extractive_output.dropna(subset=['Index', 'Summary', 'ExtractiveSummary'])
extractive_output = extractive_output[extractive_output.applymap(lambda x: isinstance(x, str)).all(axis=1)]
extractive_output = extractive_output.reset_index()

In [10]:
# Sanity check: number of rows left after dropping invalid entries.
len(extractive_output)

7721

In [11]:
# Hold out 20% of the examples (fixed seed for a repeatable split), then carve
# the held-out part into validation (first 1000 rows) and test (the rest).
import pandas as pd
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    extractive_output,
    test_size=0.2,
    random_state=42,
)
validation_df, test_df = val_df.iloc[:1000], val_df.iloc[1000:]

# Report (rows, columns) for the three partitions.
print("Training data shape:", train_df.shape)
print("Validation data shape:", validation_df.shape)
print("Test data shape:", test_df.shape)

Training data shape: (6176, 4)
Validation data shape: (1000, 4)
Test data shape: (545, 4)


## Preprocess the training data

Create train, validation, and test sets, each with `input`, `summary`, and `id` columns.

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint so the token ids
# match the vocabulary the model was trained with.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
max_input_length = 4096
max_target_length = 1024


def preprocess_function(example):
    """Tokenize a batch of source texts together with their reference summaries.

    Errors are raised instead of being swallowed: returning ``None`` for a
    failed batch would hide the failure and would not add the ``input_ids`` /
    ``labels`` columns that training needs.

    Args:
        example: Batch mapping with an ``"input"`` column (source texts) and a
            ``"summary"`` column (reference summaries).

    Returns:
        The tokenizer output for the source texts, with the token ids of the
        reference summaries stored under ``"labels"``.

    Raises:
        ValueError: If any ``"input"`` or ``"summary"`` entry is not a string.
    """
    documents = list(example["input"])
    summaries = list(example["summary"])

    # Report malformed rows by position so they can be fixed at the source.
    invalid_rows = [
        position
        for position, (document, summary) in enumerate(zip(documents, summaries))
        if not (isinstance(document, str) and isinstance(summary, str))
    ]
    if invalid_rows:
        raise ValueError(
            "preprocess_function expects string 'input' and 'summary' values; "
            f"non-string entries at batch positions {invalid_rows}"
        )

    # Sources: truncated to max_input_length and padded within this batch.
    model_inputs = tokenizer(documents, max_length=max_input_length, return_tensors="pt", truncation=True, padding=True)
    # Targets: reference summaries truncated to max_target_length.
    labels = tokenizer(text_target=summaries, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
from datasets import Dataset

# Wrap each pandas frame in a Hugging Face Dataset (train, validation, test).
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df, test_df)
)

In [15]:
# Combine them into a DatasetDict so all three splits can be transformed with
# a single call (e.g. the `map` further down).

from datasets import DatasetDict

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [16]:
# Give every split the column names expected by preprocess_function:
# Index -> id, Summary -> summary, ExtractiveSummary -> input.
# The renamed splits are built first and then written back into dataset_dict.
dataset_dict.update({
    split: split_dataset.rename_columns(
        {'Index': 'id', 'Summary': 'summary', 'ExtractiveSummary': 'input'}
    )
    for split, split_dataset in dataset_dict.items()
})

In [17]:
# Display the splits with their column names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['index', 'id', 'summary', 'input', '__index_level_0__'],
        num_rows: 6176
    })
    validation: Dataset({
        features: ['index', 'id', 'summary', 'input', '__index_level_0__'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['index', 'id', 'summary', 'input', '__index_level_0__'],
        num_rows: 545
    })
})

In [18]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows
# (column name -> list of values) per call instead of a single row.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/6176 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/545 [00:00<?, ? examples/s]

## Fine-tune the model



In [19]:
# Last path component of the checkpoint id.
model_name = model_checkpoint.split("/")[-1]
# Separate output name ("-2" suffix) so this run does not reuse the name from
# the first experiment.
fine_tuned_model_name = f"{model_name}-finetuned-legal-2"

In [20]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start again from the pretrained checkpoint (not from the earlier fine-tuned
# weights) and move the model to the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")

In [21]:
import accelerate
import transformers

# One example per device per step, for both training and evaluation.
batch_size = 1

# Training configuration: a single epoch with evaluation at the end of it.
args = Seq2SeqTrainingArguments(
    # Output directory, named after this experiment.
    fine_tuned_model_name,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Evaluate on generated sequences so compute_metrics can score decoded text.
    predict_with_generate=True,
    fp16=False,
    # Upload to the Hugging Face Hub; requires the login done earlier.
    push_to_hub=True,
)

In [22]:
# Collator that assembles the tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [23]:
import nltk
import numpy as np
from evaluate import load
metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        Dict with every score returned by ``metric.compute`` scaled by 100,
        plus ``"gen_len"`` (mean number of non-pad tokens per prediction);
        all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids when extra outputs come along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel rather than a vocabulary id, so it cannot be
    # decoded. Replace it in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Express the scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Mean generated length, counting non-pad tokens only (the replaced -100
    # entries are therefore not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [24]:
# Wire model, arguments, tokenized splits, collator and metric function into
# the trainer. Evaluation runs on the validation split; the test split is not
# used here.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

/content/bigbird-pegasus-large-pubmed-finetuned-legal-2 is already a clone of https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal-2. Make sure you pull the latest changes with `repo.git_pull()`.


In [25]:
import torch
# Confirm that a CUDA device is visible before starting training.
torch.cuda.is_available()

True

In [26]:
# Explicitly initialise PyTorch's CUDA state.
# NOTE(review): the model was already moved to "cuda" earlier, so this is
# presumably redundant — confirm before removing.
torch.cuda.init()

**This trainer used Colab Pro+ A100 GPU with 8.2 system RAM, 30.9 GPU RAM, and 37.7 Disk space.**

In [27]:
# Run fine-tuning on the full training split; metrics from compute_metrics are
# reported at the end of the epoch.
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.3208,3.093214,35.0046,14.6481,20.8387,32.3484,245.06


TrainOutput(global_step=6176, training_loss=3.899464879011243, metrics={'train_runtime': 14356.4594, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.43, 'total_flos': 7.135148745582182e+16, 'train_loss': 3.899464879011243, 'epoch': 1.0})

In [28]:
# Upload the fine-tuned model and training logs to the Hugging Face Hub repo.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/2.15G [00:00<?, ?B/s]

Upload file runs/Aug05_06-30-00_9f1aeb025fd8/events.out.tfevents.1691217005.9f1aeb025fd8.50312.0:   0%|       …

To https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal-2
   1500837..4e838d5  main -> main

   1500837..4e838d5  main -> main

To https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal-2
   4e838d5..99917d1  main -> main

   4e838d5..99917d1  main -> main



'https://huggingface.co/minjingzhu/bigbird-pegasus-large-pubmed-finetuned-legal-2/commit/4e838d5e1791d87c8df9710a4fe41fa98ee22c5e'