### Import Relevant Libraries

In [1]:
import nltk
from nltk.corpus import stopwords
stopWords = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

import pandas as pd
import numpy as np
import json
import re

import spacy

from gensim.models import Word2Vec
from scipy import spatial
from scipy.sparse.linalg import svds
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

import rouge
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu



### Load Dataset

In [2]:
billsum_test = pd.read_excel('../data/billsum_test.xlsx')

billsum_test['text'] = billsum_test['text'].apply(lambda x: x.lower())
billsum_test['summary'] = billsum_test['summary'].apply(lambda x: x.lower())
billsum_test.head()

Unnamed: 0,contract,text,summary
0,To amend the Internal Revenue Code of 1986 to ...,section 1. short title. this act may be cited ...,national science education tax incentive for b...
1,To amend the Internal Revenue Code of 1986 to ...,section 1. short title. this act may be cited ...,small business expansion and hiring act of 201...
2,A bill to require the Director of National Int...,section 1. release of documents captured in ir...,requires the director of national intelligence...
3,A bill to improve data collection and dissemin...,section 1. short title. this act may be cited ...,national cancer act of 2003 - amends the publi...
4,A bill to amend the Internal Revenue Code of 1...,section 1. short title. this act may be cited ...,military call-up relief act - amends the inter...


### Pre-processing

In [3]:
def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.
    First split text into fragments between the semicolons. If the fragment 
    is longer than the threshold, turn the semicolon into a period. O.w treat
    it as a comma.
    Returns new text
    '''
    new_text = ""
    for subset in re.split(';', text):
        subset = subset.strip() # Clear off spaces
        # Check word count
        if len(subset.split()) > threshold:
            # Turn first char into uppercase
            new_text += ". " + subset[0].upper() + subset[1:]
        else:
            # Just append with a comma 
            new_text += ", " + subset

    return new_text

In [4]:
USC_re = re.compile('[Uu]\.*[Ss]\.*[Cc]\.]+')
PAREN_re = re.compile('\([^(]+\ [^\(]+\)')
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape('"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
BULLET_RE = re.compile('\n[\ \t]*`*\([a-zA-Z0-9]*\)')
DASH_RE = re.compile('--+')
WHITESPACE_RE = re.compile('\s+')
EMPTY_SENT_RE = re.compile('[,\.]\ *[\.,]')
FIX_START_RE = re.compile('^[^A-Za-z]*')
FIX_PERIOD = re.compile('\.([A-Za-z])')
SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

FIX_PERIOD = re.compile('\.([A-Za-z])')

SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

In [5]:
def clean_text(text):
    """
    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.
    """

    # Indicate section headers, we need them for features
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from most common acronym
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parantheticals because they are almost always references to laws 
    # We could add a special tag, but we just remove for now
    # Note we dont get rid of nested parens because that is a complex re
    #text = PAREN_re.sub('LAWREF', text)
    text = PAREN_re.sub('', text)
    

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ',text)
    
    # Clean html 
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation, that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formating
    text = DASH_RE.sub( ' ', text)

    # removing newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)
    
    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)
    
    # Attempt to create sentences from bullets 
    text = replace_semicolon(text)
    
    # Fix weird period issues + start of text weirdness
    #text = re.sub('\.(?=[A-Z])', '  . ', text)
    # Get rid of anything thats not a word from the start of the text
    text = FIX_START_RE.sub( '', text)
    # Sometimes periods get formatted weird, make sure there is a space between periods and start of sent   
    text = FIX_PERIOD.sub(". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '')

    text = remove_stopwords(text)

    return text

In [6]:
billsum_test['clean_content'] = billsum_test.text.map(clean_text)
        
billsum_test['clean_summary'] = billsum_test.summary.map(clean_text)

billsum_test['clean_contract'] = billsum_test.contract.map(clean_text)

In [7]:
content = billsum_test['clean_content'].values.tolist()

content_parser = [PlaintextParser.from_string(doc,Tokenizer("english")) for doc in content]

### LexRank

In [8]:
summarizer_lr = LexRankSummarizer()

In [10]:
lr_sum_train = [summarizer_lr(doc.document, 8) for doc in content_parser]

In [11]:
lr_summary = []

for doc in lr_sum_train:
    summary = ""
    for s in doc:
        #print(type(s))
        summary = summary + str(s) + " "
    lr_summary.append(summary)

In [12]:
lr_summary[0]

'purposes section 38, elementary secondary science, technology, engineering, mathematics (stem) contributions credit determined section taxable year equal 100 percent qualified stem contributions taxpayer taxable year. purposes section, term `qualified stem contributions\' means "(1) stem school contributions, "(2) stem teacher externship expenses, "(3) stem teacher training expenses. term `stem school contributions\' means "(a) stem property contributions, "(b) stem service contributions. determination deduction section 170 purposes paragraph shall limitation section 170(e)(3)(b) applied stem inventory property. term `stem service contributions\' means paid incurred taxable year stem services provided united states defense dependents\' education exclusive benefit students elementary secondary school described section 170(b)(1)(a)(ii) "(a) taxpayer engaged trade business providing services commercial basis, "(b) charge imposed providing services. term `stem inventory property\' means, 

### Rouge Evaluation

In [13]:
summary = billsum_test['clean_summary'].values.tolist()

In [14]:
rouge = Rouge()

Scores = rouge.get_scores(lr_summary, summary, avg=True)

In [15]:
for score, f1 in Scores.items():
    print(f"{score}:")
    print(f"precision: {f1['p']}")
    print(f"recall: {f1['r']}")
    print(f"f1-score: {f1['f']}\n")

rouge-1:
precision: 0.27673588635396207
recall: 0.4018433578734701
f1-score: 0.30674825565409136

rouge-2:
precision: 0.12852871676430622
recall: 0.20325683823379267
f1-score: 0.1452357172061532

rouge-l:
precision: 0.26154769104756626
recall: 0.37852748421655513
f1-score: 0.28950952593283485

