In [None]:
!pip install rouge sentence-transformers

In [None]:
import os
import random

import numpy as np
import pandas as pd
from nltk.cluster import KMeansClusterer
from rouge import Rouge
from scipy.spatial import distance_matrix
from sentence_transformers import SentenceTransformer
from sklearn.metrics import pairwise_distances_argmin_min

In [None]:
pd.options.mode.chained_assignment = None

In [None]:
import nltk

# Sentence-tokenizer data needed by nltk.sent_tokenize in the cells below.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of 'punkt'; fetching both keeps the notebook runnable on a fresh kernel.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
rouge = Rouge()

In [None]:
embedder = SentenceTransformer('distiluse-base-multilingual-cased')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
# Load the processed CSV; the cells below read its 'clean_text' and 'summary' columns.
# NOTE(review): the path lives under /content/drive, so presumably Google Drive
# has to be mounted before this cell runs — confirm.
path = "/content/drive/MyDrive/LegSuM/Data/catest_processed.csv"
data = pd.read_csv(path)

**Single document: step-by-step walkthrough on the first row of the dataset**

In [None]:
data.head()

Unnamed: 0,bill_id,clean_text,summary,sum_len,text_len,cleantext_len
0,SB 2,The people of the State of California do enact...,Existing property tax law establishes a vetera...,1181,8203,8133
1,SB 6,The people of the State of California do enact...,Existing law provides that the Board of Parole...,1435,8975,8839
2,SB 8,The people of the State of California do enact...,The Sales and Use Tax Law imposes a tax on ret...,1170,13667,13520
3,SB 9,The people of the State of California do enact...,"Existing law requires all moneys, except for f...",3050,11091,10791
4,SB 19,The people of the State of California do enact...,Existing law defines a request regarding resus...,3255,6624,6364


In [None]:
article = data['clean_text'][0]

In [None]:
# Split the article into sentences and trim surrounding whitespace from each one.
sentences = [raw_sentence.strip() for raw_sentence in nltk.sent_tokenize(article)]

In [None]:
def get_sentence_embeddings(sentence):
    """Embed a single sentence with the module-level ``embedder``.

    The sentence is wrapped in a one-element list for ``embedder.encode`` and
    the first (only) row of the result is returned.
    """
    single_item_batch = [sentence]
    vectors = embedder.encode(single_item_batch)
    return vectors[0]

In [None]:
# One row per sentence. Naming the column in the constructor matches how
# SummariseCase builds its frame and also works when `sentences` is empty
# (assigning `.columns` afterwards raises a length mismatch in that case).
tempdata = pd.DataFrame(sentences, columns=['sentence'])

In [None]:
tempdata['embeddings'] = tempdata['sentence'].apply(get_sentence_embeddings)

In [None]:
tempdata.sample(5)

Unnamed: 0,sentence,embeddings
16,Activities of a patriotic nature need faciliti...,"[0.063901715, 0.020949904, -0.035603072, 0.004..."
12,The charitable activities of a veteran service...,"[0.05369181, -0.015838014, 0.03653887, -0.0385..."
9,"In a 1994 memorandum, the State Board of Equal...","[0.15361626, 0.0011683615, 0.0314853, 0.029618..."
0,The people of the State of California do enact...,"[-0.018970909, -0.058826435, -0.036109418, -0...."
6,Section 501(c)(19) of the Internal Revenue Cod...,"[0.047679666, 0.038819768, -0.009502156, -0.03..."


In [None]:
NUM_CLUSTERS=10
iterations=25
SEED=42  # fixed seed so the k-means initialisation (and hence the summary) is reproducible

# Stack the per-sentence embedding vectors into a single 2-D array.
X = np.array(tempdata['embeddings'].tolist())

kclusterer = KMeansClusterer(
    # Never ask for more clusters than there are sentences: the clusterer
    # draws its initial means from the input vectors.
    min(NUM_CLUSTERS, len(X)),
    distance=nltk.cluster.util.cosine_distance,
    repeats=iterations,
    avoid_empty_clusters=True,
    rng=random.Random(SEED))

# One cluster id per sentence, in the same order as the rows of `tempdata`.
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

In [None]:
# Fetch the fitted cluster means once, then attach to every sentence its
# cluster id and the mean vector of that cluster.
cluster_means = kclusterer.means()
tempdata['cluster'] = pd.Series(assigned_clusters, index=tempdata.index)
tempdata['centroid'] = tempdata['cluster'].apply(lambda cluster_id: cluster_means[cluster_id])

In [None]:
def distance_from_centroid(row):
    """Distance between a row's sentence embedding and its cluster centroid.

    ``row`` must provide ``row['embeddings']`` (a sequence of numbers) and
    ``row['centroid']`` (an object with ``.tolist()``, e.g. a NumPy array).
    Returns the single entry of the 1x1 matrix produced by
    ``scipy.spatial.distance_matrix`` with its default norm.

    NOTE(review): the default norm of ``distance_matrix`` is Euclidean, while
    the clustering in this notebook is configured with cosine distance —
    confirm the mismatch is intended.
    """
    sentence_points = [row['embeddings']]
    centroid_points = [row['centroid'].tolist()]
    pairwise = distance_matrix(sentence_points, centroid_points)
    return pairwise[0][0]
    
tempdata['distance_from_centroid'] = tempdata.apply(distance_from_centroid, axis=1)

In [None]:
tempdata.sample(5)

Unnamed: 0,sentence,embeddings,cluster,centroid,distance_from_centroid
6,Section 501(c)(19) of the Internal Revenue Cod...,"[0.047679666, 0.038819768, -0.009502156, -0.03...",4,"[0.043540392, 0.029179418, 0.018243918, -0.017...",0.532451
2,These veterans’ organizations also own and man...,"[0.0003238208, -0.010293673, 0.019757222, 0.02...",6,"[0.030445447, 0.041118506, 0.0058346805, 0.060...",0.561235
17,Social and recreational activities for members...,"[0.13261652, -0.017897824, 0.03400064, 0.02967...",7,"[0.072127074, 0.008780821, 0.0085913455, 0.013...",0.657335
24,"In light of this distinction, the use of real ...","[0.056930404, 0.01727493, 0.01888562, 0.013685...",7,"[0.072127074, 0.008780821, 0.0085913455, 0.013...",0.509084
11,The State Board of Equalization’s constriction...,"[0.020662397, -0.0035497814, 0.02083965, 0.013...",3,"[0.06346818, 0.0048467666, 0.03303178, 0.01216...",0.638867


In [None]:
# Keep, for every cluster, the sentence nearest to its centroid, restore the
# original sentence order and join the picks into one string.
by_distance = tempdata.sort_values('distance_from_centroid', ascending=True)
closest_per_cluster = by_distance.groupby('cluster').head(1)
in_document_order = closest_per_cluster.sort_index()
summary = ' '.join(in_document_order['sentence'].tolist())

In [None]:
summary

'The people of the State of California do enact as follows: SECTIONHEADER The Legislature finds and declares all of the following: (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. Section 501(c)(19) of the Internal Revenue Code and related federal regulations provide for the exemption for posts or organizations of war veterans, or an auxiliary unit or society of, or a trust or foundation for, any such post or organization that, among other attributes, carries on programs to perpetuate the memory of deceased veterans and members of the Armed Forces and to comfort their survivors, conducts programs for religious, charitable, scientific, literary, or educational purposes, sponso

In [None]:
data['summary'][0]

'Existing property tax law establishes a veterans’ organization exemption under which property is exempt from taxation if, among other things, that property is used exclusively for charitable purposes and is owned by a veterans’ organization. This bill would provide that the veterans’ organization exemption shall not be denied to a property on the basis that the property is used for fraternal, lodge, or social club purposes, and would make specific findings and declarations in that regard. The bill would also provide that the exemption shall not apply to any portion of a property that consists of a bar where alcoholic beverages are served. Section 2229 of the Revenue and Taxation Code requires the Legislature to reimburse local agencies annually for certain property tax revenues lost as a result of any exemption or classification of property for purposes of ad valorem property taxation. This bill would provide that, notwithstanding Section 2229 of the Revenue and Taxation Code, no appr

In [None]:
(rouge.get_scores(summary, data['summary'][0]))[0]

{'rouge-1': {'f': 0.33939393524315886,
  'p': 0.24034334763948498,
  'r': 0.5773195876288659},
 'rouge-2': {'f': 0.15357765750049504,
  'p': 0.10501193317422435,
  'r': 0.2857142857142857},
 'rouge-l': {'f': 0.30303029887952254,
  'p': 0.2145922746781116,
  'r': 0.5154639175257731}}

**Full dataset: summarise every document and score the results against the reference summaries**

In [None]:
# Reference summaries and source texts, taken straight from the loaded frame.
GoldSummary, CaseText = data['summary'], data['clean_text']
# Collects one generated summary per document.
SystemSummary = []

In [None]:
def SummariseCase(case, cluster, seed=None):
  """Extractive summary of one document via k-means over sentence embeddings.

  The text is split into sentences, the sentences are embedded with the
  module-level ``embedder``, the embeddings are clustered, and the sentence
  closest to each cluster centroid is kept. The kept sentences are joined with
  single spaces in their original document order.

  Args:
    case: Document text to summarise.
    cluster: Requested number of clusters, i.e. the maximum number of
      sentences in the summary. Capped at the number of sentences found.
    seed: Optional seed for the k-means initialisation. ``None`` (the default)
      keeps the clusterer's own unseeded random generator.

  Returns:
    The selected sentences joined by a single space.

  Raises:
    ValueError: if no sentence can be extracted from ``case``.
  """
  sentences = [sentence.strip() for sentence in nltk.sent_tokenize(case)]
  if not sentences:
    raise ValueError('SummariseCase: no sentences found in the input text')

  # Encode all sentences in one call instead of one encode() call per sentence.
  X = np.asarray(embedder.encode(sentences))
  tempdata_ = pd.DataFrame({'sentence': sentences, 'embeddings': list(X)})

  # The clusterer draws its initial means from the input vectors, so asking for
  # more clusters than sentences fails; short documents simply get fewer clusters.
  NUM_CLUSTERS = min(cluster, len(sentences))
  iterations = 25

  kclusterer = KMeansClusterer(
      NUM_CLUSTERS,
      distance=nltk.cluster.util.cosine_distance,
      repeats=iterations,
      avoid_empty_clusters=True,
      rng=random.Random(seed) if seed is not None else None)
  assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

  # Look the fitted means up once rather than once per row.
  means = kclusterer.means()
  tempdata_['cluster'] = assigned_clusters
  tempdata_['centroid'] = [means[cluster_id] for cluster_id in assigned_clusters]
  tempdata_['distance_from_centroid'] = tempdata_.apply(distance_from_centroid, axis=1)

  # Nearest sentence per cluster, put back into document order.
  closest = (tempdata_.sort_values('distance_from_centroid', ascending=True)
             .groupby('cluster').head(1)
             .sort_index())
  summary = ' '.join(closest['sentence'].tolist())

  return summary

In [None]:
data[['clean_text', 'summary']].sample(5)

Unnamed: 0,clean_text,summary
1087,The people of the State of California do enact...,"Existing law, the Gun-Free School Zone Act of ..."
888,The people of the State of California do enact...,Existing law requires the State Air Resources ...
756,The people of the State of California do enact...,Existing law establishes the Department of Tec...
1167,The people of the State of California do enact...,"Under existing law, the State Water Resources ..."
239,The people of the State of California do enact...,Existing law with respect to claims against pu...


In [None]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of the summaries to the ones already collected.
SystemSummary = []

for i, cases in enumerate(data['clean_text']):

  try:
    summary_ = SummariseCase(cases, 10)
    SystemSummary.append(summary_)
    print(i)

  except Exception as e:
    # Best effort: keep a placeholder so positions stay aligned with the input
    # rows. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    SystemSummary.append(np.nan)
    print(e, 'for' ,i)

In [None]:
SystemSummary[23]

'The meetings of the commission shall be open and public in accordance with the provisions of Article 9 of Chapter 1 of Part 1 of Division 3 of Title 2 of the Government Code. SECTIONHEADER Section 307.1 is added to the Public Utilities Code, to read: 307.1. A contract of any size entered into by the commission for outside legal counsel in any criminal investigation shall not include terms providing for the representation of individual employees except as provided in Section 995.9 of the Government Code. The director may appoint a lead attorney who shall represent the office, and shall report to and serve at the pleasure of the director. The procedures shall include, but shall not be limited to, the development of a code of conduct and procedures for ensuring that advocates and their representatives on a particular case or proceeding are not advising decisionmakers on the same case or proceeding. Moneys from the Public Utilities Commission Utilities Reimbursement Account in the General

In [None]:
GoldSummary[23]

'The California Constitution establishes the Public Utilities Commission with jurisdiction over all public utilities, authorizes the commission PUC to establish its own procedures, subject to statutory limitations or directions and constitutional requirements of due process, and authorizes the commission PUC to fix the rates and establish rules for public utilities, subject to control by the Legislature. The The Public Utilities Act provides that the office of the PUC shall be in the City and County of San Francisco, requires that the PUC hold its sessions at least once in each calendar month in the City and County of San Francisco, and authorizes the PUC to also meet at those other times and places as may be expedient and necessary for the proper performance of its duties. This bill would require that the PUC hold its sessions at least once in each calendar month in the City and County of San Francisco or the City of Sacramento. The Public Utilities Act authorizes the PUC to appoint a

In [None]:
# zip() stops at the shorter input without complaint, so a partial or repeated
# run of the summarisation loop would otherwise go unnoticed here.
assert len(SystemSummary) == len(GoldSummary), (
    'SystemSummary has {} entries but GoldSummary has {}'.format(
        len(SystemSummary), len(GoldSummary)))

Summaries = pd.DataFrame(zip(GoldSummary, SystemSummary), columns = ['GoldSummary', 'SystemSummary'])
Summaries.sample(3)

Unnamed: 0,GoldSummary,SystemSummary
551,Existing law requires the State Department of ...,Notwithstanding subdivision (a) of Section 127...
23,The California Constitution establishes the Pu...,The meetings of the commission shall be open a...
18,Existing law establishes various career techni...,Middle skill credentials serve as the gateway ...


In [None]:
# Discard rows with a missing value in either column, then renumber from 0.
Summaries = Summaries.dropna().reset_index(drop=True)

In [None]:
def RougeScore(summaries=None, scorer=None):
    """Average ROUGE scores of the system summaries against the gold summaries.

    Args:
        summaries: DataFrame with ``GoldSummary`` and ``SystemSummary`` columns.
            Defaults to the module-level ``Summaries`` frame.
        scorer: Object exposing ``get_scores(hyps, refs, avg=True)`` that
            returns ``{metric: {'r': ..., 'p': ..., 'f': ...}}``. Defaults to
            the module-level ``rouge`` instance.

    Returns:
        DataFrame with one column per metric and the rows "recall",
        "precision" and "f-measure", in that order.
    """
    frame = Summaries if summaries is None else summaries
    rouge_scorer = rouge if scorer is None else scorer

    standard_summary = frame["GoldSummary"].tolist()
    ModelSummary = frame["SystemSummary"].tolist()

    ModelScore_ = rouge_scorer.get_scores(ModelSummary, standard_summary, avg=True)

    # Label the rows by their key rather than by position, so the labels stay
    # correct whatever order the scorer (or pandas) lists the statistics in.
    ModelDF = pd.DataFrame(ModelScore_).rename(
        index={"r": "recall", "p": "precision", "f": "f-measure"}
    )
    return ModelDF.loc[["recall", "precision", "f-measure"]]

In [None]:
KmeansRouge = RougeScore()

In [None]:
KmeansRouge

Unnamed: 0,rouge-1,rouge-2,rouge-l
recall,0.411884,0.205132,0.370737
precision,0.340667,0.174186,0.307483
f-measure,0.356994,0.176384,0.321723


In [None]:
path = "/content/drive/MyDrive/LegSuM/scores/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)
KmeansRouge.to_csv(os.path.join(path, "KmeansRouge.csv"), index=True, header=True)

------------