In [None]:
!pip install pydot --quiet
!pip install tensorflow==2.15.0 --quiet
!pip install sentencepiece --quiet
!pip install nltk --quiet
!pip install datasets --quiet
!pip install bert-extractive-summarizer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Hugging Face hub login widget and dataset loading helpers
from huggingface_hub import notebook_login
from datasets import load_dataset, load_from_disk
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset
import string
import nltk
# NLTK resources used further down: sentence tokenizer data, the stop-word
# corpus read in extract_sentences, and WordNet for the lemmatizer
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

import nltk.corpus
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# shared lemmatizer instance, read as a global by extract_sentences
lem = WordNetLemmatizer()

# BERT-based extractive summarizer (bert-extractive-summarizer package)
from summarizer import Summarizer
import warnings
# silences every warning raised for the rest of the notebook
warnings.filterwarnings("ignore")

# NOTE(review): 0 is presumably meant to disable column truncation; pandas
# documents None for "unlimited" - confirm against the installed version
pd.set_option('display.max_colwidth', 0)
# interactive hub login; the push_to_hub call at the end of the notebook needs it
notebook_login()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
search_list = ["This bill", "this bill", "This act", "This joint resolution", "The bill", "This resolution"]

def clean_summary(df):
  """Drop the leading bill title from ``df["cur_summary"]``.

  Whitespace runs are first collapsed to single spaces. The text is then
  clipped so that it starts at the earliest occurrence of any phrase in
  ``search_list``; if none of the phrases occurs, the whitespace-normalized
  summary is returned in full.

  Returns a one-entry dict ``{"cleaned_summary": <str>}`` so the result can be
  attached to the row as a new column.
  """
  normalized = re.sub(r"\s+", " ", df["cur_summary"])
  # alternation of every opener phrase; the leftmost match in the text wins
  opener = re.search("|".join(search_list), normalized)
  if opener is None:
    return {"cleaned_summary": normalized}
  return {"cleaned_summary": normalized[opener.start():]}

def extract_sentences(df):
    """Build a word-frequency extractive summary of ``df["cur_text"]``.

    Reads the module-level ``extract_length`` (word budget of the extract) and
    the module-level lemmatizer ``lem``.

    Procedure:
      1. Collapse whitespace; if the text already has at most
         ``extract_length`` word tokens, return it unchanged.
      2. Split into sentences and lemmatize every non-stop-word token.
      3. Score each sentence by the mean corpus probability of its lemmas and
         repeatedly take the best-scoring sentence. After a sentence is taken,
         the probability of each of its lemmas is squared so that similar
         sentences rank lower on the next pass.
      4. Stop as soon as the budget is reached, no sentences remain, or the
         current best sentence would push the extract over the budget (lower
         ranked sentences are not tried in that case).
      5. Emit the chosen sentences in their original document order.

    Sentences with identical text share a single dictionary entry, so repeated
    text is selected at most once and is positioned at its last occurrence.

    Returns a one-entry dict ``{f"extracted_text_{extract_length}": <str>}``.
    """
    # collapse whitespace runs in the current text
    text = re.sub(r"\s+", " ", df["cur_text"])
    column = f"extracted_text_{extract_length}"
    # count word tokens; texts that already fit the budget are returned as-is
    if len(regexp_tokenize(text, r"\w+")) <= extract_length:
        return {column: text}

    all_sentences = sent_tokenize(text)
    # remember where each sentence appeared so the extract can be re-ordered
    sent_order = dict(zip(all_sentences, range(len(all_sentences))))
    # load the stop-word list once, as a set, instead of once per token
    stop_words = set(stopwords.words('english'))
    lem_words = []
    lem_sentences = {sentence: [] for sentence in all_sentences}

    for one_sentence in all_sentences:
        # word tokens of the lower-cased sentence
        for token in regexp_tokenize(one_sentence.lower(), r"\w+"):
            # skip punctuation-only tokens (e.g. "_") and stop words
            if token in string.punctuation or token in stop_words:
                continue
            lem_token = lem.lemmatize(token)
            lem_words.append(lem_token)
            lem_sentences[one_sentence].append(lem_token)

    # relative frequency of every lemma over the whole text
    freq = FreqDist(lem_words)
    total = sum(freq.values())
    probs = {k: v / total for k, v in freq.items()}

    summary = {}
    # Running word count of the extract. Sentences are joined with a space, so
    # the token count of the joined extract equals the sum of per-sentence counts.
    summary_words = 0
    while summary_words < extract_length and lem_sentences:
        # re-score on every pass because probs shrink as sentences are chosen;
        # a sentence without any lemma scores 0
        importance = {
            sentence: (sum(probs[word] for word in words) / len(words)) if words else 0
            for sentence, words in lem_sentences.items()
        }
        most_importance_sentence = max(importance, key=importance.get)
        candidate_words = len(regexp_tokenize(most_importance_sentence, r"\w+"))
        # stop rather than overshoot the budget
        if summary_words + candidate_words > extract_length:
            break
        summary[most_importance_sentence] = sent_order[most_importance_sentence]
        summary_words += candidate_words
        # down-weight the lemmas just used to avoid extracting similar sentences
        for word in lem_sentences[most_importance_sentence]:
            probs[word] = probs[word] * probs[word]
        # a chosen sentence must not be picked again
        del lem_sentences[most_importance_sentence]

    # restore document order before joining
    return {column: " ".join(sorted(summary, key=summary.get))}

def chunk_text(text):
    """Split ``text`` into sentence-aligned chunks of roughly ``chunk_length`` words.

    Reads the module-level ``chunk_length`` (assumed to be a positive integer).

    * If the whole text has at most ``chunk_length`` word tokens it is returned
      as a single-element list.
    * Otherwise sentences are packed greedily, in order, into chunks whose word
      count does not exceed ``chunk_length``.
    * A single sentence longer than ``chunk_length`` becomes a chunk on its own.

    NOTE: chunks assembled from several sentences start with a single leading
    space (each sentence is appended as ``" " + sentence``); oversized
    single-sentence chunks do not. This is preserved so the strings handed to
    the summarizer stay the same.

    Returns a list of chunk strings.
    """
    # short texts need no chunking
    if len(regexp_tokenize(text, r"\w+")) <= chunk_length:
        return [text]

    sentences = sent_tokenize(text)
    word_counts = [len(regexp_tokenize(sent, r"\w+")) for sent in sentences]
    chunked_text = []
    # walk the sentence list with an index instead of re-slicing it each step
    pos = 0
    while pos < len(sentences):
        # a sentence that cannot fit in any chunk is emitted as-is
        if word_counts[pos] > chunk_length:
            chunked_text.append(sentences[pos])
            pos += 1
            continue
        subtext = ""
        subtext_len = 0
        # keep adding sentences while the next one still fits in the budget
        while subtext_len < chunk_length and pos < len(sentences):
            if subtext_len + word_counts[pos] > chunk_length:
                break
            subtext = subtext + " " + sentences[pos]
            subtext_len += word_counts[pos]
            pos += 1
        chunked_text.append(subtext)
    return chunked_text

def extract_sentences_bertsum(df):
    """Summarize ``df["cur_text"]`` chunk by chunk with the global ``model``.

    The text is whitespace-normalized, split by ``chunk_text`` (which uses the
    module-level ``chunk_length``), and each chunk is passed to ``model`` with
    ``min_length=20``. The per-chunk results are joined with single spaces.

    Returns a one-entry dict ``{f"bertsum_extracted_{chunk_length}": <str>}``.
    """
    normalized = re.sub(r"\s+", " ", df["cur_text"])
    # one summarizer call per chunk, in document order
    pieces = [model(piece, min_length=20) for piece in chunk_text(normalized)]
    return {f"bertsum_extracted_{chunk_length}": " ".join(pieces)}

def reduce_bertsum_summary(df, source_column=None):
    """Re-summarize a BERTSum extract until it fits within ``extract_length`` words.

    Reads the module-level ``extract_length``, ``chunk_length`` and ``model``.

    Args:
        df: a dataset row (mapping) holding the extract to shrink.
        source_column: name of the column to read the starting extract from.
            Defaults to ``f"bertsum_extracted_{chunk_length}"`` (the original
            behavior), so existing ``df.map(reduce_bertsum_summary)`` calls are
            unaffected.

    While the extract has more than ``extract_length`` word tokens it is
    chunked with ``chunk_text`` and every chunk is summarized again. The loop
    also ends when a pass leaves the text unchanged, so the result may still
    exceed the budget in that case.

    Returns a one-entry dict
    ``{f"bertsum_extracted_{chunk_length}_{extract_length}": <str>}``.
    """
    if source_column is None:
        source_column = f"bertsum_extracted_{chunk_length}"
    summary = df[source_column]
    while len(regexp_tokenize(summary, r"\w+")) > extract_length:
        reduced = " ".join(model(chunk, min_length=20) for chunk in chunk_text(summary))
        # fixed point reached: another pass would not shrink the text further
        if reduced == summary:
            break
        summary = reduced
    return {f"bertsum_extracted_{chunk_length}_{extract_length}": summary}

## Filter out data, clean summary, perform term-frequency extractive summarization

In [None]:
# df = load_dataset("hheiden/us-congress-117-bills")
# df = df.filter(lambda x: x["cur_summary"] is not None)
# df = df.filter(lambda x: len(re.split("\s+", x["cur_summary"])) >= 10)
# df = df.map(clean_summary)
# extract_length = 500
# df = df.map(extract_sentences)
# df_val, df_test = df["test"].train_test_split(test_size=0.1).values()

In [None]:
# import datasets
# datasets.DatasetDict({"train": df["train"],
#                    "val": df_val,
#                    "test": df_test}).push_to_hub("jordanfan/processed_us_congress_117_bills")

## Create term-frequency extractive summaries for text length 375, 750, and 1000

In [None]:
#df = load_dataset("jordanfan/processed_us_congress_117_bills")
#extract_length = 375
#df = df.map(extract_sentences)

In [None]:
#extract_length = 750
#df = df.map(extract_sentences)

In [None]:
#extract_length = 1000
#df = df.map(extract_sentences)

In [None]:
#df.push_to_hub("jordanfan/processed_us_congress_117_bills_v2")

## Create BERTSum extractive summaries for text length 375, 500, 750, 1000 for chunk lengths of 250 and 375

In [None]:
#df = load_dataset("jordanfan/processed_us_congress_117_bills_v2")

In [None]:
# model = Summarizer()
# chunk_length = 250
# df = df.map(extract_sentences_bertsum)

In [None]:
# df.push_to_hub("jordanfan/processed_us_congress_117_bills_v3")

In [None]:
# model = Summarizer()
# chunk_length = 375
# df = df.map(extract_sentences_bertsum)

In [None]:
# df.push_to_hub("jordanfan/processed_us_congress_117_bills_v3")

In [None]:
# Load the v3 dataset from the hub (train / val / test splits). The reduction
# cells below read its bertsum_extracted_{chunk_length}_1000 columns, so those
# are presumably already present in this version - confirm if the hub copy changes.
df = load_dataset("jordanfan/processed_us_congress_117_bills_v3")

Downloading readme:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11277 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/3388 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# extract_length = 1000
# model = Summarizer()
# chunk_length = 375
# df = df.map(reduce_bertsum_summary)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# extract_length = 1000
# model = Summarizer()
# chunk_length = 250
# df = df.map(reduce_bertsum_summary)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# df.push_to_hub("jordanfan/processed_us_congress_117_bills_v3")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jordanfan/processed_us_congress_117_bills_v3/commit/015ff50461bcf0242b60c1a1770da3a4736eb008', commit_message='Upload dataset', commit_description='', oid='015ff50461bcf0242b60c1a1770da3a4736eb008', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
def reduce_bertsum_summary_1000(df):
    """Shrink the ``bertsum_extracted_{chunk_length}_1000`` extract to ``extract_length`` words.

    Reads the module-level ``extract_length``, ``chunk_length`` and ``model``.
    While the text has more than ``extract_length`` word tokens it is split
    with ``chunk_text`` and every chunk is summarized again with
    ``min_length=20``. The loop also stops when a pass leaves the text
    unchanged, in which case the result can still exceed the budget.

    Returns a one-entry dict
    ``{f"bertsum_extracted_{chunk_length}_{extract_length}": <str>}``.
    """
    summary = df[f"bertsum_extracted_{chunk_length}_1000"]
    while len(regexp_tokenize(summary, r"\w+")) > extract_length:
        reduced = " ".join(model(chunk, min_length=20) for chunk in chunk_text(summary))
        # no further shrinkage possible: avoid looping forever
        if reduced == summary:
            break
        summary = reduced
    return {f"bertsum_extracted_{chunk_length}_{extract_length}": summary}

# Reduce the bertsum_extracted_375_1000 column toward a 750-word budget; the
# result is written to bertsum_extracted_375_750.
# The mapped function reads extract_length, model and chunk_length as globals,
# so all three must be assigned before df.map runs.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt; a later
# cell reads bertsum_extracted_375_750, so this cell must complete on a re-run.
extract_length = 750
model = Summarizer()
chunk_length = 375
df = df.map(reduce_bertsum_summary_1000)




Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# Same 1000 -> 750 reduction for the chunk_length=250 column
# (reads bertsum_extracted_250_1000, writes bertsum_extracted_250_750).
# NOTE(review): Summarizer() is constructed again here although an earlier cell
# already built one; a single shared instance would presumably suffice - confirm.
extract_length = 750
model = Summarizer()
chunk_length = 250
df = df.map(reduce_bertsum_summary_1000)

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
def reduce_bertsum_summary_750(df):
    """Shrink the ``bertsum_extracted_{chunk_length}_750`` extract to ``extract_length`` words.

    Reads the module-level ``extract_length``, ``chunk_length`` and ``model``.
    While the text has more than ``extract_length`` word tokens it is split
    with ``chunk_text`` and every chunk is summarized again with
    ``min_length=20``. The loop also stops when a pass leaves the text
    unchanged, in which case the result can still exceed the budget.

    Returns a one-entry dict
    ``{f"bertsum_extracted_{chunk_length}_{extract_length}": <str>}``.
    """
    summary = df[f"bertsum_extracted_{chunk_length}_750"]
    while len(regexp_tokenize(summary, r"\w+")) > extract_length:
        reduced = " ".join(model(chunk, min_length=20) for chunk in chunk_text(summary))
        # no further shrinkage possible: avoid looping forever
        if reduced == summary:
            break
        summary = reduced
    return {f"bertsum_extracted_{chunk_length}_{extract_length}": summary}

# Reduce bertsum_extracted_375_750 toward a 500-word budget; the result is
# written to bertsum_extracted_375_500.
# Requires the bertsum_extracted_375_750 column produced by the earlier
# 1000 -> 750 reduction with chunk_length = 375.
extract_length = 500
model = Summarizer()
chunk_length = 375
df = df.map(reduce_bertsum_summary_750)


Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:

# Same 750 -> 500 reduction for the chunk_length=250 column
# (reads bertsum_extracted_250_750, writes bertsum_extracted_250_500).
extract_length = 500
model = Summarizer()
chunk_length = 250
df = df.map(reduce_bertsum_summary_750)


Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
def reduce_bertsum_summary_500(df):
    """Shrink the ``bertsum_extracted_{chunk_length}_500`` extract to ``extract_length`` words.

    Reads the module-level ``extract_length``, ``chunk_length`` and ``model``.
    While the text has more than ``extract_length`` word tokens it is split
    with ``chunk_text`` and every chunk is summarized again with
    ``min_length=20``. The loop also stops when a pass leaves the text
    unchanged, in which case the result can still exceed the budget.

    Returns a one-entry dict
    ``{f"bertsum_extracted_{chunk_length}_{extract_length}": <str>}``.
    """
    summary = df[f"bertsum_extracted_{chunk_length}_500"]
    while len(regexp_tokenize(summary, r"\w+")) > extract_length:
        reduced = " ".join(model(chunk, min_length=20) for chunk in chunk_text(summary))
        # no further shrinkage possible: avoid looping forever
        if reduced == summary:
            break
        summary = reduced
    return {f"bertsum_extracted_{chunk_length}_{extract_length}": summary}

# Reduce bertsum_extracted_375_500 toward a 375-word budget; the result is
# written to bertsum_extracted_375_375.
# Here extract_length equals chunk_length, so any text still over the budget is
# always split into more than one chunk by chunk_text.
extract_length = 375
model = Summarizer()
chunk_length = 375
df = df.map(reduce_bertsum_summary_500)

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# Same 500 -> 375 reduction for the chunk_length=250 column
# (reads bertsum_extracted_250_500, writes bertsum_extracted_250_375).
extract_length = 375
model = Summarizer()
chunk_length = 250
df = df.map(reduce_bertsum_summary_500)

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# Publish the dataset, now carrying the reduced-extract columns added above,
# to the same hub repository it was loaded from (requires the hub login).
df.push_to_hub("jordanfan/processed_us_congress_117_bills_v3")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jordanfan/processed_us_congress_117_bills_v3/commit/a598c0a639251f6a172ffbca4a93bf1eb55aadc1', commit_message='Upload dataset', commit_description='', oid='a598c0a639251f6a172ffbca4a93bf1eb55aadc1', pr_url=None, pr_revision=None, pr_num=None)