# Baseline model and evaluation

This notebook uses the `sumy` library to summarize each judgement into 100 sentences (approximately 5,000 words, which is the average length of the summaries in the training set) using the SumBasic model.

The `sumy` documentation describes SumBasic as a "Method often used as a baseline in the literature - another one used to compare the score of the algorithms. I think you can use it if you want but it has no special advantage over the LSA or TextRank." (Source: https://github.com/miso-belica/sumy/blob/main/docs/summarizators.md)

SumBasic is a simple extractive text summarization model based on word frequency and probability: sentences are scored by the average probability of the words they contain. It was introduced by Ani Nenkova and Lucy Vanderwende in "The Impact of Frequency on Summarization" (Microsoft Research technical report MSR-TR-2005-101, 2005).

# Connect to Google Drive

In [None]:
# Load the Colab helper used to mount Google Drive.
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths used by later cells live under this mount point.
# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


# Read in Train and Test data directly from csv

In [None]:
# Load the raw train/test splits from Drive, then keep only the three
# columns used by this notebook: document id, source text and reference summary.
import pandas as pd

train_df = pd.read_csv("/content/drive/MyDrive/W266 Final Project/data/train_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/W266 Final Project/data/test_data.csv")

train_df_filter = train_df.loc[:, ['index', 'judgement', 'summary']]
test_df_filter = test_df.loc[:, ['index', 'judgement', 'summary']]

# Setup Sumy

In [None]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m71.7/97.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdon

In [None]:
# Download NLTK's 'punkt' tokenizer data.
# NOTE(review): presumably required by sumy's Tokenizer("english") used in the summarization cell — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Generate summary

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer

# Number of sentences requested from the summarizer for every judgement.
SENTENCES_COUNT = 100

# Initialize the summarizer with the SumBasic algorithm (the baseline model of this notebook)
summarizer = SumBasicSummarizer()

# The tokenizer does not depend on the document, so build it once instead of once per judgement.
tokenizer = Tokenizer("english")

# One summary string per judgement, in the same order as train_df_filter.
baseline_output = []

print("train dataset length:", len(train_df_filter))

for judgement in train_df_filter['judgement']:
  parser = PlaintextParser.from_string(judgement, tokenizer)

  # Ask SumBasic for SENTENCES_COUNT sentences and join them into a single summary string
  summary = summarizer(parser.document, SENTENCES_COUNT)
  summary_sentences = " ".join(str(sentence) for sentence in summary)
  baseline_output.append(summary_sentences)

print("Summarization generated for:", len(baseline_output))

train dataset length: 7723
Summarization generated for: 7723


In [None]:
import csv

output_file_path = '/content/drive/MyDrive/W266 Final Project/output/train_data_baseline_summary.csv'

# newline='' lets the csv module control line endings itself.
with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)

    # Header row: document id, reference summary, generated baseline summary
    csv_writer.writerow(['Index', 'Summary', 'BaselineSummary'])

    # One row per document. zip pairs the columns by position, so this does not
    # rely on train_df_filter having a default 0..n-1 integer index.
    csv_writer.writerows(
        zip(train_df_filter['index'], train_df_filter['summary'], baseline_output)
    )

print("Summary sentences written to:", output_file_path)

Summary sentences written to: /content/drive/MyDrive/W266 Final Project/output/train_data_baseline_summary.csv


# Evaluate results

In [None]:
# Reload the baseline summaries from the CSV written to Drive.
# Note: this rebinds `baseline_output` from the list built in the summarization cell to a DataFrame.
import pandas as pd
baseline_output = pd.read_csv("/content/drive/MyDrive/W266 Final Project/output/train_data_baseline_summary.csv")

In [None]:
# Preview the first rows to check the Index / Summary / BaselineSummary columns.
baseline_output.head()

Unnamed: 0,Index,Summary,BaselineSummary
0,uksc-2009-0034.txt,Mr and Mrs Agbaje were married for 38 years.\n...,The wife has been living in England continuous...
1,uksc-2009-0037.txt,"The Appellants brother, who is now deceased (t...",It was a conditional suspended possession orde...
2,uksc-2009-0048.txt,RTS specialises in the supply of automated mac...,This is the judgment of the court. The judge d...
3,uksc-2009-0031.txt,This appeal concerns the principles to be appl...,The husband cross petitioned in November. Mean...
4,uksc-2009-0018.txt,In response to various incidents of internatio...,"It affects third parties too, including the sp..."


In [None]:
!pip install -q evaluate
import evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install rouge_score

Collecting rouge_score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Score the generated baseline summaries against the reference summaries with ROUGE.
rouge = evaluate.load('rouge')

predictions = baseline_output["BaselineSummary"]
references = baseline_output["Summary"]

results = rouge.compute(predictions=predictions, references=references)
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.43141278545901385, 'rouge2': 0.2884847798543292, 'rougeL': 0.2610166865396496, 'rougeLsum': 0.38124037560233726}
