## Connect to Google drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!ls "/content/gdrive/MyDrive/W266 Final Project/data"

IN-Abs	IN-Ext	test_data.csv  test_data.gsheet  train_data.csv  UK-Abs


## Read data from UK-Abs and IN-Abs

In [None]:
data_folder_path = "/content/gdrive/MyDrive/W266 Final Project/data/"

In [None]:
def read_txt_from_folder(folder_path):
    txt_list = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if (os.path.isfile(file_path) and file_name.endswith('.txt')):
            with open(file_path, 'r') as file:
                contents = file.read()
                txt_list.append((file_name, contents))
    return txt_list

In [None]:
import os

# train data
UK_train_judgement = read_txt_from_folder(data_folder_path + "UK-Abs/train-data/judgement")
UK_train_summary   = read_txt_from_folder(data_folder_path + "UK-Abs/train-data/summary")
IN_train_judgement = read_txt_from_folder(data_folder_path + "IN-Abs/train-data/judgement")
IN_train_summary   = read_txt_from_folder(data_folder_path + "IN-Abs/train-data/summary")
# test data
UK_test_judgement = read_txt_from_folder(data_folder_path + "UK-Abs/test-data/judgement")
UK_test_summary   = read_txt_from_folder(data_folder_path + "UK-Abs/test-data/summary/full")
IN_test_judgement = read_txt_from_folder(data_folder_path + "IN-Abs/test-data/judgement")
IN_test_summary   = read_txt_from_folder(data_folder_path + "IN-Abs/test-data/summary")

In [None]:
print(f"UK train judgement: {len(UK_train_judgement)}")
print(UK_train_judgement[:1])
print("\n")
print(f"UK train summary: {len(UK_train_summary)}")
print(UK_train_summary[:1])
print("\n")
print(f"IN train judgement: {len(IN_train_judgement)}")
print(IN_train_judgement[:1])
print("\n")
print(f"IN train summary: {len(IN_train_summary)}")
print(IN_train_summary[:1])
print("\n")
print(f"UK test judgement: {len(UK_test_judgement)}")
print(UK_test_judgement[:1])
print("\n")
print(f"UK test summary: {len(UK_test_summary)}")
print(UK_test_summary[:1])
print("\n")
print(f"IN test judgement: {len(IN_test_judgement)}")
print(IN_test_judgement[:1])
print("\n")
print(f"IN test summary: {len(IN_test_summary)}")
print(IN_test_summary[:1])

## Join Judgement and Summary

In [None]:
import pandas as pd

# Convert into Pandas DataFrame
UK_train_judgement_df = pd.DataFrame(UK_train_judgement, columns=['index', 'judgement'])
UK_train_summary_df = pd.DataFrame(UK_train_summary, columns=['index', 'summary'])
IN_train_judgement_df = pd.DataFrame(IN_train_judgement, columns=['index', 'judgement'])
IN_train_summary_df = pd.DataFrame(IN_train_summary, columns=['index', 'summary'])
UK_test_judgement_df = pd.DataFrame(UK_test_judgement, columns=['index', 'judgement'])
UK_test_summary_df = pd.DataFrame(UK_test_summary, columns=['index', 'summary'])
IN_test_judgement_df = pd.DataFrame(IN_test_judgement, columns=['index', 'judgement'])
IN_test_summary_df = pd.DataFrame(IN_test_summary, columns=['index', 'summary'])

UK_train_df = pd.merge(UK_train_judgement_df, UK_train_summary_df, on='index', how='inner')
IN_train_df = pd.merge(IN_train_judgement_df, IN_train_summary_df, on='index', how='inner')
UK_test_df = pd.merge(UK_test_judgement_df, UK_test_summary_df, on='index', how='inner')
IN_test_df = pd.merge(IN_test_judgement_df, IN_test_summary_df, on='index', how='inner')

print(UK_train_df.shape)
print(IN_train_df.shape)
print(UK_test_df.shape)
print(IN_test_df.shape)

## Merge UK and IN data

In [None]:
# Add country information just in case
UK_train_df['country'] = 'UK'
IN_train_df['country'] = 'IN'
UK_test_df['country'] = 'UK'
IN_test_df['country'] = 'IN'

train_df = pd.concat([UK_train_df, IN_train_df])
test_df = pd.concat([UK_test_df, IN_test_df])

print(f"train data: {train_df.shape}")
print(train_df.head(1))
print("\n")
print(f"test data: {test_df.shape}")
print(test_df.head(1))

In [None]:
print(train_df.head(5))

                index                                          judgement  \
0  uksc-2009-0034.txt  Part III of the Matrimonial and Family Proceed...   
1  uksc-2009-0037.txt  The appellant is the brother of the late Alan ...   
2  uksc-2009-0048.txt  This is the judgment of the court.\nThe appeal...   
3  uksc-2009-0031.txt  When a court grants a decree of divorce, nulli...   
4  uksc-2009-0018.txt  On 13 December 2006 the appellant Mohammed al ...   

                                             summary country  
0  Mr and Mrs Agbaje were married for 38 years.\n...      UK  
1  The Appellants brother, who is now deceased (t...      UK  
2  RTS specialises in the supply of automated mac...      UK  
3  This appeal concerns the principles to be appl...      UK  
4  In response to various incidents of internatio...      UK  


## read in Train and Test data directly from csv

In [None]:
# Read in CSV data
import pandas as pd
train_df = pd.read_csv("/content/gdrive/MyDrive/W266 Final Project/data/train_data.csv")
train_df_filter = train_df[['judgement','summary']]
test_df = pd.read_csv("/content/gdrive/MyDrive/W266 Final Project/data/test_data.csv")
test_df_filter = test_df[['judgement','summary']]

## extractive models first

In [None]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: breadability, docopt, pycountry
  Building wheel for breadability (setup.py) ... [?25l[?25

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Initialize the summarizer with the TextRank algorithm
summarizer = LsaSummarizer()

baseline_output = []

for judgement in train_df_filter['judgement'][:5]:
  parser = PlaintextParser.from_string(judgement, Tokenizer("english"))

  # Summarize the article and get the most important sentences
  summary = summarizer(parser.document, 100)  # You can change the number of sentences as needed
  summary_sentences = [str(sentence) for sentence in summary]
  baseline_output.append(summary_sentences)

len(baseline_output)

5

## T5 baseline and Evaluation

In [None]:
!pip install -q sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m19.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q evaluate
import evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
!pip install -q torch

In [None]:
#let's make longer output readable without horizontal scrolling
from pprint import pprint

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


### LongT5ForConditionalGeneration

In [None]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration

model = (
    LongT5ForConditionalGeneration.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
)
tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")

Downloading (…)lve/main/config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
ARTICLE_TO_SUMMARIZE = "summarize: " + train_df_filter["judgement"][0]
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors="pt")

# Generate Summary
summary_ids = model.generate(inputs["input_ids"]).sequences
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))



AttributeError: ignored

In [None]:
ARTICLE_TO_SUMMARIZE = "summarize: " + train_df_filter["judgement"][0]
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors="pt")
input_ids = inputs.input_ids

outputs = model.generate(input_ids, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



this article describes the case of a woman who had been married for 38 years prior to their divorce in 2005 on the husbands petition in Nigeria. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in india. the husband was married in 2005 on the husbands petition in 2005 on the husbands petition


In [None]:
max_input_length = 16384
max_target_length = 1000

prefix = 'summarize: '

def preprocess_function(example):
    inputs = [prefix + doc for doc in example['judgement']]
    model_input = tokenizer(inputs, max_length=max_input_length, truncation = True)

    # setup the tokenizer for targets
    labels = tokenizer(text_target=example['summary'], max_length=max_target_length, truncation=True)

    model_input['labels'] = labels['input_ids']

    return model_input

In [None]:
pprint(tokenizer.decode(outputs[0], skip_special_tokens=True), compact = True)

('this article describes the case of a woman who had been married for 38 years '
 'prior to their divorce in 2005 on the husbands petition in Nigeria. the '
 'husband was married in 2005 on the husbands petition in india. the husband '
 'was married in 2005 on the husbands petition in india. the husband was '
 'married in 2005 on the husbands petition in india. the husband was married '
 'in 2005 on the husbands petition in india. the husband was married in 2005 '
 'on the husbands petition in india. the husband was married in 2005 on the '
 'husbands petition in india. the husband was married in 2005 on the husbands '
 'petition in india. the husband was married in 2005 on the husbands petition '
 'in india. the husband was married in 2005 on the husbands petition in india. '
 'the husband was married in 2005 on the husbands petition in india. the '
 'husband was married in 2005 on the husbands petition in 2005 on the husbands '
 'petition')


### LEDForConditionalGeneration

In [None]:
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration

model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
len(baseline_output[0])

100

In [None]:
inputs = 'summarize: ' + str(baseline_output[0])
# Tokenize the input text
input_ids = tokenizer.encode(inputs, return_tensors="pt", max_length=16384, truncation=True)

# global attention on the first token
# global_attention_mask = torch.zeros_like(input_ids)
# global_attention_mask[:, 0] = 1

# Generate summary using the model
summary_ids = model.generate(input_ids, num_beams=3, max_length=1024, early_stopping=True)

# Decode the generated summary IDs back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

pprint(summary, compact = True)

(' this is an account of the proceedings which led to an order for financial '
 'relief after a marriage had been dissolved ( or annulled ) in a foreign '
 'country.                                                                                                                                                                                                                              ')


In [None]:
# Function to generate summaries for a given DataFrame
def generate_summaries(data_frame, model, tokenizer):
    generated_summaries = []
    for judgment_text in data_frame['judgement']:
        inputs = 'summarize: ' + judgment_text
        # Tokenize the input text
        input_ids = tokenizer.encode(inputs, return_tensors="pt", max_length=16384, truncation=True)

        # global attention on the first token
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, 0] = 1

        # Generate summary using the model
        summary_ids = model.generate(input_ids, global_attention_mask=global_attention_mask, num_beams=3, max_length=1024, early_stopping=True)

        # Decode the generated summary IDs back to text
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        generated_summaries.append(summary)

    return generated_summaries

# Generate summaries for train_df_filter
generated_summaries = generate_summaries(train_df_filter.iloc[0], model, tokenizer)

# Add the generated summaries to the DataFrame
# train_df_filter['generated_summary'] = generated_summaries
pprint(generated_summaries, compact = True)


KeyboardInterrupt: ignored

In [None]:
rouge = evaluate.load('rouge')
predictions = summary
references = train_df_filter.iloc[0]['summary']
results = rouge.compute(predictions=predictions,
                        references=references)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ValueError: ignored