In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# After executing the cell above, Drive
# files will be present in "/content/drive/My Drive".
!ls "/content/drive/My Drive/W266 Final Project"

data	     project_notebook_bt.ipynb	project_notebook_mz.ipynb
dataset.zip  project_notebook_cw.ipynb	test


In [None]:
!unzip -u "/content/drive/My Drive/W266 Final Project/data/dataset.zip" -d "/content/drive/My Drive/W266 Final Project/test"

In [None]:
%cd /content/drive/MyDrive/

[Errno 2] No such file or directory: '/content/drive/MyDrive/'
/content


In [None]:
!pwd

/content


In [None]:
!ls "/content/drive/My Drive/W266 Final Project/test/dataset"


ls: cannot access '/content/drive/My Drive/W266 Final Project/test/dataset': No such file or directory


## Read data from UK-Abs and IN-Abs

In [None]:
data_folder_path = "/content/drive/My Drive/W266 Final Project/data/"

In [None]:
def read_txt_from_folder(folder_path):
  """Read every ``.txt`` file located directly inside ``folder_path``.

  Sub-directories are not descended into, and entries that are not regular
  files or do not end in ``.txt`` are ignored.

  Args:
    folder_path: Path of the directory to scan.

  Returns:
    A list of ``(file_name, contents)`` tuples, sorted by file name so that
    repeated runs yield the same order.
  """
  # Local import: this function is defined in a cell that runs before the
  # notebook-level `import os`.
  import os

  txt_list = []
  # os.listdir() returns entries in arbitrary order; sort for reproducibility.
  for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path) and file_name.endswith('.txt'):  # Check if the file is a text file
        # Decode explicitly as UTF-8 rather than relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            txt_list.append((file_name, file.read()))
  return txt_list

In [None]:
import os

# Each call returns a list of (file_name, text) pairs for one corpus folder.

# Training split
UK_train_judgement = read_txt_from_folder(f"{data_folder_path}UK-Abs/train-data/judgement")
UK_train_summary = read_txt_from_folder(f"{data_folder_path}UK-Abs/train-data/summary")
IN_train_judgement = read_txt_from_folder(f"{data_folder_path}IN-Abs/train-data/judgement")
IN_train_summary = read_txt_from_folder(f"{data_folder_path}IN-Abs/train-data/summary")

# Test split (the UK test summaries are read from the "summary/full" sub-folder)
UK_test_judgement = read_txt_from_folder(f"{data_folder_path}UK-Abs/test-data/judgement")
UK_test_summary = read_txt_from_folder(f"{data_folder_path}UK-Abs/test-data/summary/full")
IN_test_judgement = read_txt_from_folder(f"{data_folder_path}IN-Abs/test-data/judgement")
IN_test_summary = read_txt_from_folder(f"{data_folder_path}IN-Abs/test-data/summary")

In [None]:
# For every split: print how many documents were read, then the first
# (file_name, text) pair as a sample.
_splits = [
    ("UK train judgement", UK_train_judgement),
    ("UK train summary", UK_train_summary),
    ("IN train judgement", IN_train_judgement),
    ("IN train summary", IN_train_summary),
    ("UK test judgement", UK_test_judgement),
    ("UK test summary", UK_test_summary),
    ("IN test judgement", IN_test_judgement),
    ("IN test summary", IN_test_summary),
]
for _position, (_label, _records) in enumerate(_splits):
    if _position:
        print("\n")  # separator between consecutive splits (none after the last)
    print(f"{_label}: {len(_records)}")
    print(_records[:1])

UK train judgement: 693


UK train summary: 693
[('uksc-2009-0018.txt', 'In response to various incidents of international terrorism, including the attacks on 9/11, the UN Security Council (the UNSC) passed resolutions (UNSCRs) requiring member states to take steps to freeze the assets of: (i) Usama Bin Laden, the Taliban and their associates; and (ii) those involved in international terrorism.\nThe UNSC established a list of persons whose assets member states were obliged to freeze (the Consolidated List).\nThose included in the Consolidated List are not informed of the basis for their inclusion or afforded the right to challenge the decision before an independent and impartial judge.\nThe Appeals concern the legality of the Terrorism (United Nations Measures) Order 2006 (the TO) and the Al Qaida and Taliban (United Nations Measures) Order 2006 (the AQO).\nThe TO and AQO were made by Her Majestys Treasury (the Treasury) pursuant to s.1 of the United Nations Act 1946 (the 1946 Act), wh

## Join Judgement and Summary

In [None]:
import pandas as pd

# Convert into Pandas DataFrame
# Every source list holds (file_name, text) pairs; the file name is stored in
# the 'index' column and serves as the join key below.
_judgement_cols = ['index', 'judgement']
_summary_cols = ['index', 'summary']

UK_train_judgement_df = pd.DataFrame(data=UK_train_judgement, columns=_judgement_cols)
UK_train_summary_df = pd.DataFrame(data=UK_train_summary, columns=_summary_cols)
IN_train_judgement_df = pd.DataFrame(data=IN_train_judgement, columns=_judgement_cols)
IN_train_summary_df = pd.DataFrame(data=IN_train_summary, columns=_summary_cols)
UK_test_judgement_df = pd.DataFrame(data=UK_test_judgement, columns=_judgement_cols)
UK_test_summary_df = pd.DataFrame(data=UK_test_summary, columns=_summary_cols)
IN_test_judgement_df = pd.DataFrame(data=IN_test_judgement, columns=_judgement_cols)
IN_test_summary_df = pd.DataFrame(data=IN_test_summary, columns=_summary_cols)

# Pair each judgement with the summary that shares its file name; the inner
# join keeps only file names present on both sides.
UK_train_df = UK_train_judgement_df.merge(UK_train_summary_df, how='inner', on='index')
IN_train_df = IN_train_judgement_df.merge(IN_train_summary_df, how='inner', on='index')
UK_test_df = UK_test_judgement_df.merge(UK_test_summary_df, how='inner', on='index')
IN_test_df = IN_test_judgement_df.merge(IN_test_summary_df, how='inner', on='index')

for _merged in (UK_train_df, IN_train_df, UK_test_df, IN_test_df):
    print(_merged.shape)

(693, 3)
(7030, 3)
(100, 3)
(100, 3)


## Merge UK and IN data

In [None]:
# Add country information just in case
UK_train_df['country'] = 'UK'
IN_train_df['country'] = 'IN'
UK_test_df['country'] = 'UK'
IN_test_df['country'] = 'IN'

# ignore_index=True gives the combined frames a fresh 0..n-1 row index.
# Without it the UK and IN row labels overlap (both start at 0), so a label
# lookup such as train_df.loc[0] would return two rows.
train_df = pd.concat([UK_train_df, IN_train_df], ignore_index=True)
test_df = pd.concat([UK_test_df, IN_test_df], ignore_index=True)

print(f"train data: {train_df.shape}")
print(train_df.head(1))
print("\n")
print(f"test data: {test_df.shape}")
print(test_df.head(1))

train data: (7723, 4)
                index                                          judgement  \
0  uksc-2009-0034.txt  Part III of the Matrimonial and Family Proceed...   

                                             summary country  
0  Mr and Mrs Agbaje were married for 38 years.\n...      UK  


test data: (200, 4)
                index                                          judgement  \
0  uksc-2009-0022.txt  This appeal was heard by this Panel on 10 and ...   

                                             summary country  
0  The issue in this appeal is whether a failure ...      UK  


# Write the train/test DataFrames to CSV for quicker loading in future sessions

In [None]:
train_df.to_csv('train_data.csv')
!cp train_data.csv "/content/drive/My Drive/W266 Final Project/data/"

In [None]:
test_df.to_csv('test_data.csv')
!cp test_data.csv "/content/drive/My Drive/W266 Final Project/data/"

# T5 for generic summarization baseline

In [None]:
!pip install -q sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q evaluate
import evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# pip install rouge

In [None]:
#let's make longer output readable without horizontal scrolling
from pprint import pprint

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [None]:
t5model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [None]:
!ls "/content/drive/My Drive/W266 Final Project/data/train_data.csv"

'/content/drive/My Drive/W266 Final Project/data/train_data.csv'


In [None]:
# Read in CSV data
import pandas as pd
train_df = pd.read_csv("/content/drive/My Drive/W266 Final Project/data/train_data.csv")
# .copy() turns the two-column selection into an independent frame, so adding
# a column to it later does not raise pandas' SettingWithCopyWarning.
train_df_filter = train_df[['judgement','summary']].copy()
test_df = pd.read_csv("/content/drive/My Drive/W266 Final Project/data/test_data.csv")
test_df_filter = test_df[['judgement','summary']].copy()

In [None]:
train_df_filter.columns

Index(['judgement', 'summary'], dtype='object')

In [None]:
# Calculate the maximum length
max_length = train_df["judgement"].apply(lambda x: len(str(x))).max()
print(max_length)

808119


In [None]:
# Locate the training row(s) whose judgement text has the greatest length.
_judgement_lengths = train_df["judgement"].apply(lambda text: len(str(text)))
_is_longest = _judgement_lengths == _judgement_lengths.max()
# where() replaces every non-matching row with NaN; dropna() then removes any
# row that contains a NaN, leaving only the matching row(s).
max_length = train_df.where(_is_longest).dropna(axis=0)
max_length
# print("Maximum length:", max_length)

Unnamed: 0.1,Unnamed: 0,index,judgement,summary,country
7049,6356.0,3498.txt,: Criminal Appeal No. 279 of 1975.\n(From the ...,174 In exercise of the powers conferred by Cla...,IN


In [None]:
train_df_filter['summary'][0]

'Mr and Mrs Agbaje were married for 38 years.\nBoth Nigerian by birth, they had met in England in the 1960s and acquired UK citizenship in 1972.\nAll five of their children were born (and all but one educated) in England, and in 1975 Mr Agbaje bought a property in England called Lytton Road in which their children stayed with a nanny.\nBut for the majority of their married life Mr and Mrs Agbaje lived in Nigeria.\nThey separated in 1999, at which point Mrs Agbaje came to live in Lytton Road.\nShe has lived here ever since.\nIn 2003 Mr Agbaje issued divorce proceedings in the Nigerian courts in which Mrs Agbaje sought ancillary relief.\nThe Nigerian court awarded her a life interest in a property in Lagos (with a capital value of about 86,000) and a lump sum equivalent to about 21,000.\nPart III of the Matrimonial and Family Proceedings Act 1984 was enacted to give the English court the power to grant financial relief after a marriage has been dissolved (or annulled) in a foreign countr

In [None]:
# Check the average length of summary text
avg_length = train_df_filter['summary'].dropna().apply(len).mean()
print(avg_length)

4994.059318741096


In [None]:
# Check for nan values in the summary column
num_nan = train_df_filter['summary'].isna().sum()
# num_nan
nan_rows = train_df[train_df_filter.isna().any(axis=1)]
nan_rows

Unnamed: 0.1,Unnamed: 0,index,judgement,summary,country
4246,3553,4799.txt,TE JURISDICTION: Writ Petition (Crl) No. 708 o...,,IN
6951,6258,299.txt,Appeal No. 144 of 1953.\nAppeal from the Judgm...,,IN


In [None]:
import pandas as pd
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

def load_model_and_tokenizer(model_name="t5-base"):
    """Load a pretrained TensorFlow T5 model together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and the tokenizer. Defaults to ``"t5-base"``, the
            checkpoint this notebook was written against.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    t5model = TFT5ForConditionalGeneration.from_pretrained(model_name)
    t5tokenizer = T5Tokenizer.from_pretrained(model_name)
    return t5model, t5tokenizer

def preprocess_data(df, input_column, label_column):
    """Split ``df`` into prefixed model inputs and reference labels.

    Args:
        df: DataFrame holding both columns.
        input_column: Name of the column with the source texts.
        label_column: Name of the column with the reference summaries.

    Returns:
        Tuple ``(inputs, labels)``: ``inputs`` is the input column with
        ``"summarize: "`` prepended to the string form of every value;
        ``labels`` is the label column, returned unchanged.
    """
    add_task_prefix = "summarize: {}".format
    prefixed_inputs = df[input_column].apply(add_task_prefix)
    return prefixed_inputs, df[label_column]

def generate_summaries(t5model, t5tokenizer, inputs, window_size=512, prefix="summarize: "):
    """Summarize each text window by window and join the partial summaries.

    Every text is cut into consecutive windows of ``window_size`` characters
    (characters, not tokens). Each window is prefixed with ``prefix``, encoded,
    summarized with beam search, and the decoded window summaries are joined
    with single spaces.

    Args:
        t5model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        t5tokenizer: Object exposing ``encode_plus`` and ``decode``.
        inputs: Iterable of source texts. A text that already starts with
            ``prefix`` (as produced by ``preprocess_data``) has it removed
            before windowing, so no window is prefixed twice.
        window_size: Window length in characters. Defaults to 512.
        prefix: Task prefix prepended to every window.

    Returns:
        A list with one concatenated summary string per input text; an empty
        text yields an empty string.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive number of characters")

    generated_summaries = []

    for text in inputs:
        # The prefix is re-added per window below; strip an existing one so it
        # neither appears twice nor eats into the first window's characters.
        if prefix and text.startswith(prefix):
            text = text[len(prefix):]

        # Generate one summary per character window
        window_summaries = []
        for start in range(0, len(text), window_size):
            window = text[start:start + window_size]
            input_encoding = t5tokenizer.encode_plus(
                prefix + window,
                padding='longest',
                truncation=True,
                max_length=512,
                return_tensors="tf"
            )

            outputs = t5model.generate(
                input_ids=input_encoding["input_ids"],
                attention_mask=input_encoding["attention_mask"],
                max_length=150,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

            window_summaries.append(
                t5tokenizer.decode(outputs[0], skip_special_tokens=True)
            )

        # Concatenate the window summaries into a single summary
        generated_summaries.append(" ".join(window_summaries))

    return generated_summaries

In [None]:
train_df_filter.iloc[[0]]

Unnamed: 0,judgement,summary
0,Part III of the Matrimonial and Family Proceed...,Mr and Mrs Agbaje were married for 38 years.\n...


In [None]:
# Test with a single row
# Smoke test: run the whole load -> preprocess -> generate pipeline on the
# first training row only, before attempting the full training set.
# Load the T5 model and tokenizer
t5model, t5tokenizer = load_model_and_tokenizer()

# Preprocess the data
# iloc[[0]] (list index) keeps a one-row DataFrame rather than a Series.
inputs, labels = preprocess_data(train_df_filter.iloc[[0]], "judgement", "summary")

# Generate the summaries
generated_summaries = generate_summaries(t5model, t5tokenizer, inputs)

# # Add the generated summaries to the DataFrame
# train_df_filter["generated_summary"] = generated_summaries

# # Print the generated summaries
# print(train_df_filter["generated_summary"])

# Save the DataFrame with the generated summaries
# df.to_csv("legal_documents_with_summaries.csv", index=False)  # Replace "legal_documents_with_summaries.csv" with the desired output file path

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
generated_summaries



In [None]:
# Load the T5 model and tokenizer
t5model, t5tokenizer = load_model_and_tokenizer()

# Preprocess the data
inputs, labels = preprocess_data(train_df_filter, "judgement", "summary")

# Generate the summaries
generated_summaries = generate_summaries(t5model, t5tokenizer, inputs)

# Add the generated summaries to the DataFrame.
# assign() builds a new frame instead of writing a column into a selection of
# train_df, which is what triggers pandas' SettingWithCopyWarning.
train_df_filter = train_df_filter.assign(generated_summary=generated_summaries)

# Print the generated summaries
print(train_df_filter["generated_summary"])

# Save the DataFrame with the generated summaries
# df.to_csv("legal_documents_with_summaries.csv", index=False)  # Replace "legal_documents_with_summaries.csv" with the desired output file path

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# SUMY EXTRACTIVE

In [None]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m92.2/97.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdon

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Initialize the summarizer with the LSA algorithm (LsaSummarizer)
summarizer = LsaSummarizer()
# The sentence/word tokenizer does not depend on the document, so build it
# once instead of once per judgement.
english_tokenizer = Tokenizer("english")

baseline_output = []

for judgement in train_df_filter['judgement']:
  parser = PlaintextParser.from_string(judgement, english_tokenizer)

  # Select up to 500 sentences from the judgement
  summary = summarizer(parser.document, 500)  # You can change the number of sentences as needed
  baseline_output.append([str(sentence) for sentence in summary])

len(baseline_output)

7723

# TEST: experiments with LED, BART and T5 fine-tuning

In [None]:
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration

model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# baseline_output[0] is a list of sentences. Join them into running text;
# str() on the list would feed its Python repr (brackets, quotes, commas)
# to the model.
inputs = 'summarize: ' + " ".join(baseline_output[0])
# Tokenize the input text
input_ids = tokenizer.encode(inputs, return_tensors="pt", max_length=16384, truncation=True)

# global attention on the first token
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1

# Generate summary using the model
summary_ids = model.generate(
    input_ids,
    global_attention_mask=global_attention_mask,
    num_beams=3,
    min_length=512,
    max_length=1024,
    early_stopping=True,
)

# Decode the generated summary IDs back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

pprint(summary, compact = True)

TypeError: ignored

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import pandas as pd
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from sklearn.model_selection import train_test_split
from rouge import Rouge

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def tokenize_data(tokenizer, judgements, summaries):
    """Tokenize source documents and target summaries for seq2seq training.

    Args:
        tokenizer: Callable tokenizer returning a mapping with an
            ``'input_ids'`` tensor when called with ``return_tensors='pt'``.
        judgements: List of source texts.
        summaries: List of target texts, aligned with ``judgements``.

    Returns:
        The encoding of ``judgements`` with an added ``'labels'`` entry holding
        the token ids of ``summaries``. Padded label positions are set to -100
        so the loss skips them; map -100 back to the pad id before decoding
        the labels.
    """
    encodings = tokenizer(judgements, truncation=True, padding=True, return_tensors='pt')
    labels = tokenizer(summaries, truncation=True, padding=True, return_tensors='pt')['input_ids']

    # Without this mask the loss would also be computed on padding positions.
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
        # masked_fill returns a new tensor; the tokenizer output is not modified.
        labels = labels.masked_fill(labels == pad_token_id, -100)

    encodings['labels'] = labels
    return encodings

def prepare_dataset(encodings):
    """Wrap a mapping of indexable columns as a ``torch.utils.data.Dataset``.

    Args:
        encodings: Mapping from field name to an indexable column; it must
            contain ``'input_ids'``, whose length defines the dataset size.

    Returns:
        A dataset whose item ``idx`` is a dict with the ``idx``-th entry of
        every column.
    """
    class Dataset(torch.utils.data.Dataset):
        """Row-wise view over the column-wise ``encodings`` mapping."""

        def __init__(self, encodings):
            self.encodings = encodings

        def __len__(self):
            return len(self.encodings['input_ids'])

        def __getitem__(self, idx):
            return {name: column[idx] for name, column in self.encodings.items()}

    return Dataset(encodings)

def train_model(model, train_dataset, batch_size, num_epochs, device):
    """Fine-tune ``model`` in place with AdamW (learning rate 3e-5).

    Args:
        model: ``torch.nn.Module`` whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        train_dataset: Dataset whose items are dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        batch_size: Number of examples per optimization step.
        num_epochs: Number of passes over ``train_dataset``.
        device: ``torch.device`` on which to train.

    Returns:
        A list with the mean training loss of each epoch.
    """
    # Batches are moved to `device` below, so the weights must be there too;
    # otherwise the forward pass fails with a device-mismatch RuntimeError.
    model.to(device)
    model.train()
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    epoch_losses = []
    for _ in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # The loss comes from the model itself (it receives `labels`), so
            # no separate loss function is needed.
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        epoch_losses.append(running_loss / num_batches if num_batches else float('nan'))

    return epoch_losses

def evaluate_model(model, test_dataset, batch_size, device, tokenizer=None):
    """Generate summaries for ``test_dataset`` and score them with ROUGE.

    Args:
        model: Model exposing ``generate(input_ids=..., attention_mask=..., max_length=...)``.
        test_dataset: Dataset whose items are dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        batch_size: Number of examples per generation batch.
        device: ``torch.device`` on which to run generation.
        tokenizer: Tokenizer used to decode predictions and references. When
            omitted, the notebook-level ``tokenizer`` variable is used, which
            is what this function relied on before the parameter existed.

    Returns:
        The averaged scores returned by ``Rouge().get_scores(..., avg=True)``.

    Raises:
        ValueError: If no tokenizer is passed and none is defined globally.
    """
    if tokenizer is None:
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise ValueError("evaluate_model needs a tokenizer (argument or notebook-level variable)")

    # Keep the weights on the same device as the batches moved there below.
    model.to(device)
    model.eval()
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    rouge = Rouge()
    predictions = []
    references = []
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=100)
            predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

            # References come from this batch's own labels (previously the
            # name `labels` was never bound inside the function).
            labels = batch['labels']
            if pad_token_id is not None:
                # -100 marks loss-ignored positions and is not a valid token
                # id; restore the pad id before decoding.
                labels = labels.masked_fill(labels == -100, pad_token_id)
            references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    rouge_scores = rouge.get_scores(predictions, references, avg=True)
    return rouge_scores

# Load the pre-trained BART model and tokenizer (facebook/bart-large-cnn checkpoint)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Use the GPU when one is available, and put the model on the same device the
# batches will be moved to.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# train_df_filter.iloc[[0]]
# Tokenize the input documents and summaries (first row of each split only)
train_encodings = tokenize_data(tokenizer, train_df_filter.iloc[[0]]['judgement'].tolist(), train_df_filter.iloc[[0]]['summary'].tolist())
test_encodings = tokenize_data(tokenizer, test_df_filter.iloc[[0]]['judgement'].tolist(), test_df_filter.iloc[[0]]['summary'].tolist())

# Prepare the dataset for model training
train_dataset = prepare_dataset(train_encodings)
test_dataset = prepare_dataset(test_encodings)

# Fine-tune the BART model for sequence-to-sequence (summarization) task
train_model(model, train_dataset, batch_size=8, num_epochs=3, device=device)

# Evaluate the model using ROUGE score
rouge_scores = evaluate_model(model, test_dataset, batch_size=8, device=device)
print("ROUGE Scores:", rouge_scores)


RuntimeError: ignored

In [None]:
train_encodings

{'input_ids': tensor([[   0, 4741, 6395,  ...,   13, 3114,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [None]:
import pandas as pd
import numpy as np
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
import tensorflow as tf

def encode_data(judgement, summary, tokenizer, source_max_len=512, target_max_len=150):
    """Encode one (judgement, summary) pair as model inputs and labels.

    Args:
        judgement: Source text; ``'summarize: '`` is prepended before encoding.
        summary: Target text.
        tokenizer: Object exposing ``encode_plus``.
        source_max_len: ``max_length`` used when encoding the source.
        target_max_len: ``max_length`` used when encoding the target.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        source encoding and ``'labels'`` taken from the target's ``'input_ids'``.
    """
    # Both texts are truncated/padded to their max length and returned as
    # TensorFlow tensors.
    shared_options = dict(truncation=True, padding='max_length', return_tensors='tf')

    source_encoding = tokenizer.encode_plus(
        'summarize: ' + judgement, max_length=source_max_len, **shared_options
    )
    target_encoding = tokenizer.encode_plus(
        summary, max_length=target_max_len, **shared_options
    )

    features = {name: source_encoding[name] for name in ('input_ids', 'attention_mask')}
    features['labels'] = target_encoding['input_ids']
    return features

In [None]:
def prepare_data(df, tokenizer):
    """Encode every usable row of ``df`` into a ``tf.data.Dataset``.

    Args:
        df: DataFrame with ``'judgement'`` and ``'summary'`` columns.
        tokenizer: Tokenizer forwarded to ``encode_data``.

    Returns:
        A ``tf.data.Dataset`` of dicts with ``'input_ids'``,
        ``'attention_mask'`` and ``'labels'``; padded label positions are set
        to -100.

    Raises:
        ValueError: If no row has both a judgement and a summary.
    """
    # A missing judgement or summary (NaN) cannot be tokenized, so drop such
    # rows up front instead of failing part-way through the loop.
    usable = df.dropna(subset=['judgement', 'summary'])
    if usable.empty:
        raise ValueError("prepare_data needs at least one row with both 'judgement' and 'summary'")

    columns = {'input_ids': [], 'attention_mask': [], 'labels': []}

    for _, row in usable.iterrows():
        data = encode_data(row['judgement'], row['summary'], tokenizer)
        for key, value in data.items():
            columns[key].append(np.asarray(value))

    # Each encoding carries a leading axis of size 1; concatenating along it
    # gives one (n_rows, seq_len) array per field.
    dataset_dict = {key: np.concatenate(values, axis=0) for key, values in columns.items()}

    # Mask padding in the labels so the loss is not computed on pad tokens.
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is not None:
        labels = dataset_dict['labels']
        dataset_dict['labels'] = np.where(labels == pad_token_id, -100, labels).astype(labels.dtype)

    return tf.data.Dataset.from_tensor_slices(dataset_dict)


In [None]:
train_dataset = prepare_data(train_df_filter, t5tokenizer)
test_dataset = prepare_data(test_df_filter, t5tokenizer)

ValueError: ignored

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# No `loss` argument: the model computes its own task loss from the `labels`
# entry of each batch. `t5model.compute_loss` is not a Keras-style
# loss(y_true, y_pred) callable and should not be passed here.
t5model.compile(optimizer=optimizer)

history = t5model.fit(train_dataset.batch(8), validation_data=test_dataset.batch(8), epochs=3)

In [None]:
def generate_summary(model, judgement, tokenizer, source_max_len=512, target_max_len=150):
    """Generate a summary for a single judgement.

    Args:
        model: Object exposing ``generate(input_ids=..., attention_mask=..., max_length=...)``.
        judgement: Source text; ``'summarize: '`` is prepended before encoding.
        tokenizer: Object exposing ``encode_plus`` and ``decode``.
        source_max_len: Maximum number of source tokens; longer inputs are
            truncated. The default matches ``encode_data``.
        target_max_len: ``max_length`` passed to ``generate``.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    source = 'summarize: ' + judgement
    # Truncate explicitly: judgements can be far longer than the length used
    # when encoding the training data.
    source_encoded = tokenizer.encode_plus(
        source,
        max_length=source_max_len,
        truncation=True,
        return_tensors='tf'
    )

    output = model.generate(
        input_ids=source_encoded['input_ids'],
        attention_mask=source_encoded['attention_mask'],
        max_length=target_max_len
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    return summary