### Import Relevant Libraries

In [49]:
import nltk
from nltk.corpus import stopwords
stopWords = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import tokenize
from operator import itemgetter
import math
from gensim.parsing.preprocessing import remove_stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.edmundson import EdmundsonSummarizer

import pandas as pd
import numpy as np
import json
import re

import spacy
import string

from gensim.models import Word2Vec
from scipy import spatial
from scipy.sparse.linalg import svds
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

import rouge
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

### Load Dataset

In [2]:
billsum_test = pd.read_excel('../data/billsum_test.xlsx')

billsum_test['text'] = billsum_test['text'].apply(lambda x: x.lower())
billsum_test['summary'] = billsum_test['summary'].apply(lambda x: x.lower())
billsum_test.head()

Unnamed: 0,contract,text,summary
0,To amend the Internal Revenue Code of 1986 to ...,section 1. short title. this act may be cited ...,national science education tax incentive for b...
1,To amend the Internal Revenue Code of 1986 to ...,section 1. short title. this act may be cited ...,small business expansion and hiring act of 201...
2,A bill to require the Director of National Int...,section 1. release of documents captured in ir...,requires the director of national intelligence...
3,A bill to improve data collection and dissemin...,section 1. short title. this act may be cited ...,national cancer act of 2003 - amends the publi...
4,A bill to amend the Internal Revenue Code of 1...,section 1. short title. this act may be cited ...,military call-up relief act - amends the inter...


### Pre-processing

In [3]:
def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.
    First split text into fragments between the semicolons. If the fragment 
    is longer than the threshold, turn the semicolon into a period. O.w treat
    it as a comma.
    Returns new text
    '''
    new_text = ""
    for subset in re.split(';', text):
        subset = subset.strip() # Clear off spaces
        # Check word count
        if len(subset.split()) > threshold:
            # Turn first char into uppercase
            new_text += ". " + subset[0].upper() + subset[1:]
        else:
            # Just append with a comma 
            new_text += ", " + subset

    return new_text

In [4]:
USC_re = re.compile('[Uu]\.*[Ss]\.*[Cc]\.]+')
PAREN_re = re.compile('\([^(]+\ [^\(]+\)')
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape('"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
BULLET_RE = re.compile('\n[\ \t]*`*\([a-zA-Z0-9]*\)')
DASH_RE = re.compile('--+')
WHITESPACE_RE = re.compile('\s+')
EMPTY_SENT_RE = re.compile('[,\.]\ *[\.,]')
FIX_START_RE = re.compile('^[^A-Za-z]*')
FIX_PERIOD = re.compile('\.([A-Za-z])')
SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

FIX_PERIOD = re.compile('\.([A-Za-z])')

SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

In [5]:
def clean_text(text):
    """
    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.
    """

    # Indicate section headers, we need them for features
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from most common acronym
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parantheticals because they are almost always references to laws 
    # We could add a special tag, but we just remove for now
    # Note we dont get rid of nested parens because that is a complex re
    #text = PAREN_re.sub('LAWREF', text)
    text = PAREN_re.sub('', text)
    

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ',text)
    
    # Clean html 
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation, that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formating
    text = DASH_RE.sub( ' ', text)

    # removing newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)
    
    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)
    
    # Attempt to create sentences from bullets 
    text = replace_semicolon(text)
    
    # Fix weird period issues + start of text weirdness
    #text = re.sub('\.(?=[A-Z])', '  . ', text)
    # Get rid of anything thats not a word from the start of the text
    text = FIX_START_RE.sub( '', text)
    # Sometimes periods get formatted weird, make sure there is a space between periods and start of sent   
    text = FIX_PERIOD.sub(". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '')

    text = remove_stopwords(text)

    return text

In [6]:
billsum_test['clean_content'] = billsum_test.text.map(clean_text)
        
billsum_test['clean_summary'] = billsum_test.summary.map(clean_text)

billsum_test['clean_contract'] = billsum_test.contract.map(clean_text)

In [7]:
content = billsum_test['clean_content'].values.tolist()

content_parser = [PlaintextParser.from_string(doc,Tokenizer("english")) for doc in content]

### Getting significant and redundant keywords
Currently, this is done automatically using tf-idf however, in practice, lawyers can specify what words are significant and what are not significant. Hence, this can be customised

In [8]:
def check_sent(word, sentences): 
    final = [all([w in x for w in word]) for x in sentences] 
    sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    return int(len(sent_len))

In [9]:
def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
    return result

In [16]:
def keyword_extraction(doc):
    total_words = doc.split()
    total_word_length = len(total_words)
#     print(total_word_length)
    
    total_sentences = tokenize.sent_tokenize(doc)
    total_sent_len = len(total_sentences)
#     print(total_sent_len)

    tf_score = {}
    for each_word in total_words:
        each_word = each_word.replace('.','')
        if each_word not in stopWords:
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
#     print(tf_score)

    idf_score = {}
    for each_word in total_words:
        each_word = each_word.replace('.','')
        if each_word not in stopWords:
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

#     print(idf_score)

    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
#     print(tf_idf_score)

    return get_top_n(tf_idf_score, 5)

In [91]:
def unique_keywords(top_keywords):
    string.punctuation = "\!\"#&'()*+,./:;<=>?@[\]^_`{|}~" #remove $ and % as they are needed in legal documents
    keywords = []
    for words in top_5_keywords:
        for word in words.keys():
            if (word not in keywords) and ('(' not in word) and (')' not in word):
                word = word.translate(str.maketrans('', '', string.punctuation))
                if word != '':
                    keywords.append(word)
    return keywords

In [92]:
#content
top_5_keywords = [keyword_extraction(doc.lower()) for doc in content]

In [93]:
print(top_5_keywords[0])

{'technology,': 0.022344044962009806, 'engineering,': 0.0209978425915327, 'science,': 0.01973855437421424, 'property': 0.01973855437421424, '`stem': 0.01803309738042134}


In [94]:
content_keywords = unique_keywords(top_5_keywords)

In [108]:
print(len(set(content_keywords)))

2798


In [104]:
for k in set(content_keywords):
    print(f"'{k}'", end = ', ')



In [105]:
redundant = ['following',
             'section',
             'shall',
             'which',
             'the',
             'thereof', 
             'an', 
             '-a-u-t-h-o-r-i-z-e-d', 
             'under', 
             'this', 
             '-s-u-c-h', 
             ' - ', 
             '-a-u-t-h-o-r-i-z-a-t-i-o-n', 
             'allow', 
             'held',
            'previous',
            'makes']

significant = []
for k in set(content_keywords):
    if k not in redundant:
        significant.append(k)

In [106]:
len(significant)

2782

### Edmundson Algorithm
Edmundson Implementation: https://iq.opengenus.org/edmundson-heuristic-method-for-text-summarization/

In [43]:
summarizer_edm = EdmundsonSummarizer(cue_weight=1, key_weight=1, title_weight=0, location_weight=0)

In [111]:
# bonus words are considered significant keywords that model should take note while stigma words are words that model think is significant bu
summarizer_edm.bonus_words = tuple(significant)
summarizer_edm.stigma_words = tuple(redundant)

In [118]:
edm_sum_train = [summarizer_edm(doc.document, 8) for doc in content_parser]

In [119]:
edm_summary = []

for doc in edm_sum_train:
    summary = ""
    for s in doc:
        #print(type(s))
        summary = summary + str(s) + " "
    edm_summary.append(summary)

In [120]:
edm_summary[0]

'purposes section 38, elementary secondary science, technology, engineering, mathematics (stem) contributions credit determined section taxable year equal 100 percent qualified stem contributions taxpayer taxable year. purposes section, term `qualified stem contributions\' means "(1) stem school contributions, "(2) stem teacher externship expenses, "(3) stem teacher training expenses. term `stem property contributions\' means (but subsection (f)) allowed deduction section 170 charitable contribution stem inventory property "(a) donee elementary secondary school described section 170(b)(1)(a)(ii), "(b) substantially use property donee united states defense dependents\' education educational purposes grades k-12 related purpose function donee, "(c) original use property begins donee, "(d) property fit productively donee\'s education plan, "(e) property transferred donee exchange money, property, services, shipping, installation transfer costs, "(f) donee\'s use disposition property accor

### Rouge Evaluation
Rouge Implementation Python: https://github.com/pltrdy/rouge

In [121]:
summary = billsum_test['clean_summary'].values.tolist()

In [122]:
rouge = Rouge()

Scores = rouge.get_scores(edm_summary, summary, avg=True)

In [124]:
for score, f1 in Scores.items():
    print(f"{score}:")
    print(f"precision: {f1['p']}")
    print(f"recall: {f1['r']}")
    print(f"f1-score: {f1['f']}\n")

rouge-1:
precision: 0.2550725921574602
recall: 0.4964914797929329
f1-score: 0.31916416353262794

rouge-2:
precision: 0.12584844717049415
recall: 0.26533318431856145
f1-score: 0.15992918917612772

rouge-l:
precision: 0.24110785772822635
recall: 0.46713319761869243
f1-score: 0.3012420819537806

