In [70]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords


import pandas as pd
import numpy as np
import json
import os
from zipfile import ZipFile

import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

import nltk
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import distance
from rouge_score import rouge_scorer
from rouge import Rouge

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ivyha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import Dataset

In [3]:
data = pd.read_json('billsum_v4_1/ca_test_data_final_OFFICIAL.jsonl', lines = True)

In [4]:
stop_words = stopwords.words("english")

# Pre-processing

In [5]:
def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.
    First split text into fragments between the semicolons. If the fragment 
    is longer than the threshold, turn the semicolon into a period. O.w treat
    it as a comma.
    Returns new text
    '''
    new_text = ""
    for subset in re.split(';', text):
        subset = subset.strip() # Clear off spaces
        # Check word count
        if len(subset.split()) > threshold:
            # Turn first char into uppercase
            new_text += ". " + subset[0].upper() + subset[1:]
        else:
            # Just append with a comma 
            new_text += ", " + subset

    return new_text

In [6]:
USC_re = re.compile('[Uu]\.*[Ss]\.*[Cc]\.]+')
PAREN_re = re.compile('\([^(]+\ [^\(]+\)')
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape('"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
BULLET_RE = re.compile('\n[\ \t]*`*\([a-zA-Z0-9]*\)')
DASH_RE = re.compile('--+')
WHITESPACE_RE = re.compile('\s+')
EMPTY_SENT_RE = re.compile('[,\.]\ *[\.,]')
FIX_START_RE = re.compile('^[^A-Za-z]*')
FIX_PERIOD = re.compile('\.([A-Za-z])')
SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

FIX_PERIOD = re.compile('\.([A-Za-z])')

SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

In [51]:
def clean_text(text):
    """
    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.
    """

    # Indicate section headers, we need them for features
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from most common acronym
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parantheticals because they are almost always references to laws 
    # We could add a special tag, but we just remove for now
    # Note we dont get rid of nested parens because that is a complex re
    #text = PAREN_re.sub('LAWREF', text)
    text = PAREN_re.sub('', text)
    

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ',text)
    
    # Clean html 
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation, that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formating
    text = DASH_RE.sub( ' ', text)

    # removing newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)
    
    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)
    
    # Attempt to create sentences from bullets 
    text = replace_semicolon(text)
    
    # Fix weird period issues + start of text weirdness
    #text = re.sub('\.(?=[A-Z])', '  . ', text)
    # Get rid of anything thats not a word from the start of the text
    text = FIX_START_RE.sub( '', text)
    # Sometimes periods get formatted weird, make sure there is a space between periods and start of sent   
    text = FIX_PERIOD.sub(". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '')

    text = remove_stopwords(text)

    return text

In [8]:
data['clean_text'] = data.text.map(clean_text)
        
data['clean_summary'] = data.summary.map(clean_text)

data['clean_title'] = data.title.map(clean_text)

In [9]:
data['clean_text'][0]

'The people State California enact follows: <SECTION-HEADER> The Legislature finds declares following: (1) Since 1899 congressionally chartered veterans’ organizations provided valuable service nation’s returning service members. These organizations help preserve memories incidents great hostilities fought nation, preserve strengthen comradeship members. These veterans’ organizations manage properties including lodges, posts, fraternal halls. These properties act safe haven veterans ages families gather camaraderie fellowship, share stories, seek support people understand unique experiences. This aids healing process returning veterans, ensures health happiness. As result congressional chartering veterans’ organizations, United States Internal Revenue Service created special tax exemption organizations Section 501(c)(19) Internal Revenue Code. Section 501(c)(19) Internal Revenue Code related federal regulations provide exemption posts organizations war veterans, auxiliary unit society 

# K Means

References:
https://medium.com/@akankshagupta371/understanding-text-summarization-using-k-means-clustering-6487d5d37255

In [10]:
# Tokenize Sentences
sentences = []
for doc in data['clean_text']:
  sentences.append(sent_tokenize(doc))

#k_sentences = [y for x in k_sentences for y in x] # flatten list

In [11]:
k_sentences = sentences

In [12]:
for doc in k_sentences:
    for s in doc:
        re.sub('[^a-zA-Z]`', " ", s)  

In [13]:

k_sentences = [[s.lower() for s in doc] for doc in k_sentences]

In [14]:
k_sentences = [[[words for words in sentence.split(' ') if words not in stop_words] for sentence in doc] for doc in k_sentences]

In [15]:
k_sentences = [list(map(" ".join,doc))for doc in k_sentences]

In [16]:
k_all_words = [[s.split() for s in doc] for doc in k_sentences]
k_model = [Word2Vec(doc, min_count = 1, vector_size = 300,epochs=100)  for doc in k_all_words]

#[[Word2Vec(sent, vector_size=1, min_count=1) for sent in doc] for doc in pagerank_all_senttoken]

In [17]:
k_sent_vector = []

doc_vector = []

for i in range(len(k_sentences)):
    for s in k_sentences[i]:
        plus = 0
        for j in s.split():
            plus += k_model[i].wv[j]
        plus = plus/len(s.split())

        doc_vector.append(plus)
    k_sent_vector.append(doc_vector)
    doc_vector = []


In [18]:
n_clusters = 5
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)


In [None]:
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(doc)
    Sum_of_squared_distances.append(km.inertia_)

In [19]:

y_kmeans = [kmeans.fit_predict(doc) for doc in k_sent_vector]

In [None]:
sent_list = []
my_list=[]
for h in range(len(y_kmeans)):
    for i in range(n_clusters):
        my_dict={}
        for j in range(len(y_kmeans[h])):
            
            if y_kmeans[h][j]==i:
                my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],k_sent_vector[h][j])
        min_distance = min(my_dict.values())
        sent_list.append(min(my_dict, key=my_dict.get))
    my_list.append(sent_list)
    sent_list = []
 
for doc in my_list:
    for s in sorted(doc):
        print(sentences[s])

In [60]:
k_train_sum = [[sentences[i][j] for j in sorted(my_list[i])] for i in range(len(my_list))]

In [61]:
k_train_sum = list(map(" ".join, k_train_sum))

In [63]:
data['k_sum'] = k_train_sum

### Calculate Evaluation scores

References: https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460

In [67]:
summary = data['clean_summary'].values.tolist()

In [71]:
rouge = Rouge()

Scores = rouge.get_scores(k_train_sum, summary, avg=True)

{'rouge-1': {'r': 0.1411046294012602,
  'p': 0.31130387689391087,
  'f': 0.18326353871071346},
 'rouge-2': {'r': 0.026603814750615972,
  'p': 0.08038325278963127,
  'f': 0.03708949273132507},
 'rouge-l': {'r': 0.13305651294005308,
  'p': 0.29448271643561413,
  'f': 0.17296877639168165}}