In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords


import pandas as pd
import numpy as np
import json
import os
from zipfile import ZipFile

import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

import nltk
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import distance
from rouge_score import rouge_scorer
from rouge import Rouge
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers import AbstractSummarizer
from sumy.summarizers.luhn import LuhnSummarizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import Dataset

In [2]:
data = pd.read_json('billsum_v4_1/ca_test_data_final_OFFICIAL.jsonl', lines = True)

In [3]:
stop_words = stopwords.words("english")

# Pre-processing

In [4]:
def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.
    First split text into fragments between the semicolons. If the fragment 
    is longer than the threshold, turn the semicolon into a period. O.w treat
    it as a comma.
    Returns new text
    '''
    new_text = ""
    for subset in re.split(';', text):
        subset = subset.strip() # Clear off spaces
        # Check word count
        if len(subset.split()) > threshold:
            # Turn first char into uppercase
            new_text += ". " + subset[0].upper() + subset[1:]
        else:
            # Just append with a comma 
            new_text += ", " + subset

    return new_text

In [5]:
USC_re = re.compile('[Uu]\.*[Ss]\.*[Cc]\.]+')
PAREN_re = re.compile('\([^(]+\ [^\(]+\)')
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape('"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
BULLET_RE = re.compile('\n[\ \t]*`*\([a-zA-Z0-9]*\)')
DASH_RE = re.compile('--+')
WHITESPACE_RE = re.compile('\s+')
EMPTY_SENT_RE = re.compile('[,\.]\ *[\.,]')
FIX_START_RE = re.compile('^[^A-Za-z]*')
FIX_PERIOD = re.compile('\.([A-Za-z])')
SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

FIX_PERIOD = re.compile('\.([A-Za-z])')

SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

In [6]:
def clean_text(text):
    """
    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.
    """

    # Indicate section headers, we need them for features
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from most common acronym
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parantheticals because they are almost always references to laws 
    # We could add a special tag, but we just remove for now
    # Note we dont get rid of nested parens because that is a complex re
    #text = PAREN_re.sub('LAWREF', text)
    text = PAREN_re.sub('', text)
    

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ',text)
    
    # Clean html 
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation, that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formating
    text = DASH_RE.sub( ' ', text)

    # removing newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)
    
    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)
    
    # Attempt to create sentences from bullets 
    text = replace_semicolon(text)
    
    # Fix weird period issues + start of text weirdness
    #text = re.sub('\.(?=[A-Z])', '  . ', text)
    # Get rid of anything thats not a word from the start of the text
    text = FIX_START_RE.sub( '', text)
    # Sometimes periods get formatted weird, make sure there is a space between periods and start of sent   
    text = FIX_PERIOD.sub(". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '')

    text = remove_stopwords(text)

    return text

In [7]:
data['clean_text'] = data.text.map(clean_text)
        
data['clean_summary'] = data.summary.map(clean_text)

data['clean_title'] = data.title.map(clean_text)

In [None]:
data['clean_text'][0]

# Luhn Summarizers

In [10]:
content = data['clean_text'].values.tolist()

content_parser = [PlaintextParser.from_string(doc,Tokenizer("english")) for doc in content]

In [11]:
summarizer = LuhnSummarizer()

In [12]:
luhn_sum_train = [summarizer(doc.document, 5) for doc in content_parser]

In [14]:
luhn_summary = []

for doc in luhn_sum_train:
    summary = ""
    for s in doc:
        #print(type(s))
        summary = summary + str(s) + " "
    luhn_summary.append(summary)

In [15]:
luhn_summary[0]

'Section 501(c)(19) Internal Revenue Code related federal regulations provide exemption posts organizations war veterans, auxiliary unit society of, trust foundation for, post organization that, attributes, carries programs perpetuate memory deceased veterans members Armed Forces comfort survivors, conducts programs religious, charitable, scientific, literary, educational purposes, sponsors participates activities patriotic nature, provides social recreational activities members. Section 215.1 Revenue Taxation Code stipulates buildings, support real property buildings situated required convenient use occupation buildings, exclusively charitable purposes, owned veterans’ organization chartered Congress United States, organized operated charitable purposes, solely exclusively purpose organization, conducted profit net earnings ensures benefit private individual member thereof, exempt taxation. All buildings, real property buildings situated required convenient use occupation buildings, e

## ROUGE Evaluation

In [16]:
summary = data['clean_summary'].values.tolist()

In [17]:
rouge = Rouge()

Scores = rouge.get_scores(luhn_summary, summary, avg=True)

In [18]:
Scores

{'rouge-1': {'r': 0.3187697680337042,
  'p': 0.3214156784956868,
  'f': 0.30336462888397187},
 'rouge-2': {'r': 0.15411982992108747,
  'p': 0.16058954258756727,
  'f': 0.1471050918152946},
 'rouge-l': {'r': 0.29897174674694754,
  'p': 0.3027902355722908,
  'f': 0.28515365653969627}}