In [None]:
# Colab-only setup: make Google Drive available and sign in to the Hugging Face Hub.
from google.colab import drive
from huggingface_hub import notebook_login
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Display the Hugging Face login widget.
# NOTE(review): presumably the credentials entered here are what later Hub calls rely on — confirm.
notebook_login()

Mounted at /content/drive


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install pydot --quiet
!pip install tensorflow==2.15.0 --quiet
!pip install sentencepiece --quiet
!pip install nltk --quiet
!pip install datasets --quiet
!pip install bert-extractive-summarizer --quiet
!pip install transformers --quiet
#!pip install flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# --- Model / dataset tooling ---
import transformers
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForCausalLM
from huggingface_hub import notebook_login
from datasets import load_dataset, load_from_disk
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset
import string
import nltk
# Fetch the NLTK resources used below (sentence tokenizer, stopword list, WordNet).
# NOTE(review): newer NLTK releases may require the "punkt_tab" resource for
# sent_tokenize instead of "punkt" — confirm against the installed version.
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

# --- NLTK helpers used by extract_sentences / chunk_text ---
import nltk.corpus
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Module-level lemmatizer shared by extract_sentences.
lem = WordNetLemmatizer()

from summarizer import Summarizer
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): presumably 0 disables truncation of long text columns when
# DataFrames are displayed — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', 0)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def extract_sentences(summarized_text, max_words=None):
    """Extract the most informative sentences of a text within a word budget.

    Sentences are scored by the mean probability of their lemmatized,
    non-stopword tokens (probabilities are relative frequencies over the
    whole text). The best-scoring sentence is added greedily; after each pick
    the probability of every word it contains is squared so that later picks
    favour sentences covering different vocabulary.

    Args:
        summarized_text: Text to shorten.
        max_words: Maximum number of ``\\w+`` tokens in the result. When
            ``None`` the module-level ``extract_length`` is used, which keeps
            existing call sites working unchanged.

    Returns:
        The whitespace-normalized text itself when it already fits the budget;
        otherwise the selected sentences joined by single spaces, in the order
        they appear in the text. The result is an empty string when the
        top-scoring sentence alone exceeds the budget.
    """
    word_limit = extract_length if max_words is None else max_words

    # Collapse all runs of whitespace so word counting and sentence splitting are stable.
    text = re.sub(r"\s+", " ", summarized_text)
    if len(regexp_tokenize(text, r"\w+")) <= word_limit:
        return text

    all_sentences = sent_tokenize(text)
    # Remember where each sentence appears so the extract can be restored to document order.
    sent_order = {sentence: position for position, sentence in enumerate(all_sentences)}

    # Loop-invariant: build the stopword lookup once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    lem_words = []
    lem_sentences = {sentence: [] for sentence in all_sentences}
    for one_sentence in all_sentences:
        for token in regexp_tokenize(one_sentence.lower(), r"\w+"):
            # Skip punctuation-only tokens (e.g. "_") and stopwords.
            if token in string.punctuation or token in english_stopwords:
                continue
            lem_token = lem.lemmatize(token)
            lem_words.append(lem_token)
            lem_sentences[one_sentence].append(lem_token)

    # Relative frequency of every lemmatized word in the whole text.
    freq = FreqDist(lem_words)
    total = sum(freq.values())
    probs = {word: count / total for word, count in freq.items()}

    summary = {}
    # Sentences are joined with spaces, so the summary's word count is the
    # sum of the per-sentence counts; track it instead of re-tokenizing.
    summary_word_count = 0
    while summary_word_count < word_limit and lem_sentences:
        # Re-score every remaining sentence; a sentence without any scored word gets 0.
        importance = {
            sentence: (sum([probs[word] for word in words]) / len(words)) if words else 0
            for sentence, words in lem_sentences.items()
        }
        most_important_sentence = max(importance, key=importance.get)
        candidate_word_count = len(regexp_tokenize(most_important_sentence, r"\w+"))
        # Stop as soon as the best remaining sentence would overflow the budget.
        if summary_word_count + candidate_word_count > word_limit:
            break
        summary[most_important_sentence] = sent_order[most_important_sentence]
        summary_word_count += candidate_word_count
        # Down-weight words already covered to avoid extracting near-duplicate sentences.
        for word in lem_sentences[most_important_sentence]:
            probs[word] = probs[word] * probs[word]
        # A selected sentence must not be considered again.
        del lem_sentences[most_important_sentence]

    # Restore original sentence order before joining.
    return " ".join(sorted(summary, key=summary.get))

def chunk_text(text, max_words=None):
    """Split a text into sentence-aligned chunks of at most ``max_words`` words.

    Args:
        text: Text to split.
        max_words: Maximum number of ``\\w+`` tokens per chunk. When ``None``
            the module-level ``chunk_length`` is used, which keeps existing
            call sites working unchanged.

    Returns:
        A list of strings. If the whole text fits the limit it is returned
        unchanged as the only element. Otherwise consecutive sentences are
        packed greedily; each packed chunk starts with a single leading space
        (kept for backward compatibility with existing outputs). A sentence
        that is longer than the limit on its own is emitted unchanged as its
        own chunk.
    """
    word_limit = chunk_length if max_words is None else max_words

    if len(regexp_tokenize(text, r"\w+")) <= word_limit:
        return [text]

    sentences = sent_tokenize(text)
    word_counts = [len(regexp_tokenize(sentence, r"\w+")) for sentence in sentences]
    total_sentences = len(sentences)

    chunked_text = []
    # Walk the sentence list with an index instead of repeatedly re-slicing it.
    position = 0
    while position < total_sentences:
        if word_counts[position] > word_limit:
            # Oversized sentence: cannot be packed, emit it as-is.
            chunked_text.append(sentences[position])
            position += 1
            continue

        # The current sentence is known to fit, so every chunk consumes at
        # least one sentence and the loop always makes progress.
        pieces = [sentences[position]]
        subtext_len = word_counts[position]
        position += 1
        while (
            position < total_sentences
            and subtext_len < word_limit
            and subtext_len + word_counts[position] <= word_limit
        ):
            pieces.append(sentences[position])
            subtext_len += word_counts[position]
            position += 1
        chunked_text.append(" " + " ".join(pieces))
    return chunked_text

def encode_decode(chunk):
    """Summarize one text chunk with the module-level ``tokenizer`` and ``model``.

    Generation settings are read from the module-level ``model_num_beams``,
    ``model_no_repeat_ngram_size``, ``model_min_length`` and
    ``model_max_length``.

    Args:
        chunk: Text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # NOTE(review): `chunk_length` is a word count elsewhere in this notebook but
    # is used here as a token cap; confirm it does not exceed the model's limit.
    inputs = tokenizer(chunk, return_tensors="pt", max_length=chunk_length, truncation=True)
    # Keep the encoded tensors on the same device as the model so the function
    # also works after the model has been moved to a GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        num_beams=model_num_beams,
        no_repeat_ngram_size=model_no_repeat_ngram_size,
        min_length=model_min_length,
        max_length=model_max_length,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def extract_sentences_billsum(df):
    """Summarize the ``cur_text`` field of a single example chunk by chunk.

    The text is whitespace-normalized, split with ``chunk_text``, each chunk is
    summarized with ``encode_decode`` and the partial summaries are joined
    with single spaces.

    Args:
        df: Mapping with a ``"cur_text"`` string entry.

    Returns:
        A one-entry dict whose key is ``billsum_abstracted_<chunk_length>`` and
        whose value is the joined summary.
    """
    normalized = re.sub(r"\s+", " ", df["cur_text"])
    pieces = chunk_text(normalized)

    partial_summaries = []
    for piece in pieces:
        partial_summaries.append(encode_decode(piece))

    return {f"billsum_abstracted_{chunk_length}": " ".join(partial_summaries)}

def extract_sentences_billsum_batched(df):
    """Batched variant of ``extract_sentences_billsum``.

    Args:
        df: Mapping whose ``"cur_text"`` entry is a sequence of strings.

    Returns:
        A one-entry dict whose key is ``billsum_abstracted_<chunk_length>`` and
        whose value is a pandas Series holding one joined summary per input text.
    """
    def summarize_each(pieces):
        # One partial summary per chunk, in chunk order.
        return [encode_decode(piece) for piece in pieces]

    summaries = (
        pd.Series(df["cur_text"])
        .map(lambda document: re.sub(r"\s+", " ", document))
        .map(chunk_text)
        .map(summarize_each)
        .map(lambda parts: " ".join(parts))
    )
    return {f"billsum_abstracted_{chunk_length}": summaries}



In [None]:
def _count_words(example):
    """Return ``{"text_len": n}`` where n is the number of ``\\w+`` tokens in ``cur_text``."""
    normalized = re.sub(r"\s+", " ", example["cur_text"])
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    return {"text_len": len(regexp_tokenize(normalized, r"\w+"))}


# Load the dataset from the Hub and add a `text_len` column to every split.
df = load_dataset("jordanfan/processed_us_congress_117_bills_v3")
df = df.map(_count_words)

Downloading readme:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11277 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/3388 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/377 [00:00<?, ? examples/s]

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
# Pool all splits so the length percentiles describe the whole corpus.
split_frames = [pd.DataFrame(df[split_name]) for split_name in ("train", "val", "test")]
df_all = pd.concat(split_frames)

# 25th / 75th / 95th percentiles of the per-document word count.
text_len_25, text_len_75, text_len_95 = (
    df_all["text_len"].quantile(q) for q in (0.25, 0.75, 0.95)
)

# Documents strictly shorter than the 95th percentile.
df_95_perc = df.filter(lambda example: example["text_len"] < text_len_95)
# Documents within the inter-quartile range (both ends inclusive).
df_25_75_perc = df.filter(
    lambda example: text_len_25 <= example["text_len"] <= text_len_75
)

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3388 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3388 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
### Generate Abstractive Summary ###

# Settings read as globals by chunk_text / encode_decode / extract_sentences_billsum.
chunk_length = 1000
model_num_beams = 2
model_no_repeat_ngram_size = 3
model_min_length = 20  # one sentence is about 15-20 words
model_max_length = 80

# First-stage summarizer checkpoint and its tokenizer.
model_name = 'jgibb/BART_1st_STAGE_SUMMARIZER_v3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Process the longest documents first, one example at a time
# (the batched variant is extract_sentences_billsum_batched with batched=True, batch_size=150).
longest_first = df_25_75_perc.sort("text_len", reverse=True)
df_25_75_perc = longest_first.map(extract_sentences_billsum)

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

Map:   0%|          | 0/5627 [00:00<?, ? examples/s]

In [None]:
import os

# Upload the summarized dataset to the Hub.
# `HF_TOKEN` is not defined in the visible notebook, so referencing it directly
# raises NameError on a fresh run. Use it when it exists, otherwise the HF_TOKEN
# environment variable; with `token=None` the Hub client falls back to the
# credentials saved at login.
hub_token = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN")
df_25_75_perc.push_to_hub("jgibb/billsum_abstracted_us_congress_117_bills", token=hub_token)