In [269]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

pd.set_option('display.max_columns', 1000)

In [2]:
dataset = pd.read_pickle('other_xmltodict_all_major_fields.pkl')

In [3]:
### Drop the rows with no values in section_bodies
dataset = dataset[dataset['section_bodies'] != 'Null']
print("Without Null rows: " + str(len(dataset)))

Without Null rows: 2520


In [4]:
#only keep the SOR and CRC...SI are not included in the scope
dataset['regulation_type'].value_counts()

SOR     2265
SI       246
Null       9
Name: regulation_type, dtype: int64

In [5]:
dataset = dataset[dataset['regulation_type']!= 'SI']

## 1. Creating the Corpus & Sparse Matrix

In [6]:
def make_corpus(dataset, column):
    '''This function reads in a pandas dataframe (dataset) and the column name (e.g. 'text_column'). 
    Then it strips all punctuation, makes it lower case, tokenizes it, and removes english stopwords. 
    Finally it returns two lists (either directly or via tuple unpacking. the first is the documents
    without stopwords; the second is the documents stemmed via porterstemmer.
    '''
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop_words = set(stopwords.words('english'))
    import string 
    import sys
    translator = str.maketrans('', '', string.punctuation)
    import re
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    filtered_text_list = []
    stemmed_text_list = []
    for i in range (0, len(dataset)):
        text = dataset.iloc[i][column].lower().translate(translator)
        text = re.sub('\n', ' ', text)
        word_tokens = word_tokenize(text) 
        filtered_text = [w for w in word_tokens if not w in stop_words]
        filtered_text = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_text.append(w)
        stemmed_text = []
        for w in filtered_text:
            stemmed_text.append(ps.stem(w))
        filtered_text_list.append(str(filtered_text).translate(translator))
        stemmed_text_list.append(str(stemmed_text).translate(translator))
        sys.stdout.write("\r" + "Creating Corpus.. Processing Record: " + str(i+1) + " of " + str(len(dataset)))
        sys.stdout.flush()
    print("\nDONE!")
    return filtered_text_list, stemmed_text_list

In [7]:
filtered_text, stemmed_text = make_corpus(dataset,'section_bodies')

Creating Corpus.. Processing Record: 2274 of 2274
DONE!


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(stemmed_text).toarray()

In [9]:
X.shape

(2274, 12595)

In [10]:
df_sparse = pd.DataFrame(X, columns = cv.get_feature_names())

## 2. Cleaning the Sparse Matrix

In [11]:
features = pd.Series(df_sparse.columns)

##### Removing features which contain a number

In [12]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [13]:
#remove all the features which are numeric
index = features.map(hasNumbers)

In [14]:
#how many are numeric?
index.value_counts()

False    7512
True     5083
dtype: int64

In [15]:
#list of features to drop
dropped_features = features[index]

In [16]:
len(dropped_features)

5083

In [17]:
#Drop it like its hot
df_sparse = df_sparse.drop(columns=dropped_features)

In [18]:
df_sparse.shape

(2274, 7512)

##### Removing the helping verbs

In [19]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [20]:
#There are 23 helping verbs in total
helping_verbs = ['am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 
                 'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could']

#Obtain the stemed version of each word
helping_verbs_stemmed = []
for verb in helping_verbs:
    helping_verbs_stemmed.append(ps.stem(verb).lower())

In [21]:
#Which helping verbs are in the sparse matrix?
helping_verbs_stemmed_drop = []

cols = list(df_sparse.columns)

for verb in helping_verbs_stemmed:
    if verb in cols:
        helping_verbs_stemmed_drop.append(verb)
        
print (helping_verbs_stemmed_drop)

['will', 'would', 'shall', 'may', 'might', 'must', 'could']


In [22]:
#Remove the helping verbs and check the outcomes
print ('shape before dropping helping verbs:' + str(df_sparse.shape))

df_sparse = df_sparse.drop(columns=helping_verbs_stemmed_drop)

print ('shape after dropping helping verbs:' + str(df_sparse.shape))

shape before dropping helping verbs:(2274, 7512)
shape after dropping helping verbs:(2274, 7505)


## 3. TF-IDF Vectorizer

In [23]:
#convert the count matrix to tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(df_sparse).toarray()

In [24]:
#df_sparse = pd.DataFrame(tfidf, columns = cols)

In [25]:
feature_names = df_sparse.columns

# find maximum value for each of the features over all of dataset:
max_val = df_sparse.max(axis=0).ravel()

#sort weights from smallest to biggest and extract their indices 
sort_by_tfidf = max_val.argsort()

print("Features with lowest tfidf:\n{}".format(
      feature_names[sort_by_tfidf[:20]]))

print("\nFeatures with highest tfidf: \n{}".format(
      feature_names[sort_by_tfidf[-20:]]))

Features with lowest tfidf:
Index(['northwestel', 'speech', 'premiumpay', 'spednic', 'spectat',
       'discredit', 'prepaidport', 'discriminatori', 'sphere', 'iran', 'ipc',
       'ioniz', 'ionexchang', 'dish', 'dishonest', 'sovereignti', 'ioc',
       'sparkler', 'discomfort', 'discolor'],
      dtype='object')

Features with highest tfidf: 
Index(['appeal', 'particular', 'system', 'categori', 'licenc', 'produc',
       'hazard', 'act', 'particip', 'certif', 'ferri', 'store', 'vessel',
       'subsect', 'dive', 'suppli', 'invest', 'food', 'plan', 'explos'],
      dtype='object')


## 4. Latent Semantic Analysis

In [26]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components = 100, algorithm='arpack', random_state=0)
dtm_lsa = lsa.fit_transform(tfidf)

In [27]:
#What are the top components of each concept?
for i, comp in enumerate(lsa.components_):
    terms_in_comp = zip(cols,comp)
    sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:10]
    print('Concept %d:' %i)
    
    for term in sorted_terms:
        print(term[0])
    print (' ')
    

Concept 0:
forbid
come
datum
regi
registrar
lamproom
optim
nwtsoirc
secretari
perpendicular
 
Concept 1:
lamproom
nwtsoirc
plywood
element
airport
build
stripe
appli
grown
oviduct
 
Concept 2:
board
act
perpendicular
subrul
marker
commod
author
poult
secretari
purg
 
Concept 3:
board
commod
marker
poult
exempt
protrud
interpret
tp
optim
exponentialtimeaverag
 
Concept 4:
immov
prisonerofwar
articl
convent
remarri
canada
ordinari
optim
extens
grandpar
 
Concept 5:
remarri
grandpar
gone
implic
condit
optim
custodian
pave
paddleboat
dustproof
 
Concept 6:
compani
bank
entireti
remarri
hoist
grandpar
insul
invert
acquir
subsidiari
 
Concept 7:
gone
station
canada
implic
registrar
chapter
exponentialtimeaverag
come
forbid
enabl
 
Concept 8:
permafrost
exponentialtimeaverag
gone
author
regi
implic
optim
compani
scare
bank
 
Concept 9:
insul
proclaim
miner
loadcarri
lever
applic
board
lend
compani
prioriti
 
Concept 10:
column
servant
scare
proclaim
lever
italic
amount
paus
font
pave
 
Concep

In [34]:
list_concepts = []
    
for i in range(0,len(lsa.components_)):
    list_concepts.append('Concept %d' %i)
    
df_lsa = pd.DataFrame(dtm_lsa, columns=list_concepts)

#df_lsa

###### Document Similarity using LSA

In [35]:
from sklearn.preprocessing import Normalizer
df_norm = Normalizer(copy=False).fit_transform(dtm_lsa)

In [36]:
similarity = np.asarray(np.asmatrix(df_norm) * np.asmatrix(df_norm).T)
regs = list(dataset['instrument_number'])

In [37]:
df_heatmap = pd.DataFrame(similarity,index=regs, columns=regs)

In [38]:
#df_heatmap

## 5. Extracting the highly similar documents

In [40]:
#minimum score/value for regulations to be considered similar
thresh_min = 0.95

#max score/value, above which regulations are the same document
thresh_max = 0.999999

In [35]:
import sys

In [48]:
#column names 
regs = df_heatmap.columns

#master list of similar regs to append to
similar_regs = []

#analyze each column's scores one at a time
for reg_1 in regs:
    reg_1_scores = df_heatmap[reg_1]
    
    #track the index value (integer) which meet the criteria
    index_value = 0
    list_index = []

    for score in reg_1_scores:
        if (score > thresh_min and score < thresh_max):
            list_index.append(index_value)
        index_value += 1
    
    #track the reg pairs which exceed the criteria - check for duplicates
    for i in list_index:
        reg_2 = df_heatmap.index[i]
        
        if [reg_2,reg_1] not in similar_regs:
            similar_regs.append([reg_1,reg_2])

##### Checking the outputs

Given that the formula for the number of possible cominations where order doesn't matter:

C(n,r)=n!/(n−r)!r!

In [65]:
import math
comb = int(math.factorial(len(df_heatmap))/(math.factorial((len(df_heatmap)-2))*math.factorial(2)))
print ('total combinations is: ' + str(comb))

total combinations is: 2584401


In [73]:
print ('Number of reg pairs over the threshold: ' + str(len(similar_regs)))
print ('Percent of reg pairs over the threshold: ' + str(len(similar_regs)/comb*100) + '%')

Number of reg pairs over the threshold: 4880
Percent of reg pairs over the threshold: 0.18882518618434213%


## Which Lebels in the Regs are Most Similar?

In [80]:
#convert the nested list into a DataFrame
reg_1 = []
reg_2 = []

for pair in similar_regs:
    reg_1.append(pair[0])
    reg_2.append(pair[1])

d = {'Reg_1':reg_1, 'Reg_2':reg_2}    
df_similar = pd.DataFrame(data=d)

In [81]:
df_similar

Unnamed: 0,Reg_1,Reg_2
0,SOR/93-293,SOR/93-522
1,SOR/93-293,SOR/2002-138
2,SOR/93-293,SOR/95-558
3,SOR/93-293,SOR/78-771
4,SOR/93-293,SOR/93-524
5,SOR/93-293,"C.R.C., c. 83"
6,SOR/93-293,SOR/87-706
7,SOR/93-293,SOR/93-74
8,SOR/93-293,SOR/80-803
9,SOR/93-293,SOR/93-409


# Testing how to compare Labels - LSA

In [300]:
#Set the target regs to compare
reg1 = 'SOR/93-293'
reg2 = 'SOR/93-522'

#Locate the target regs from the dataframe
df_reg1 = dataset[dataset['instrument_number']==reg1]
df_reg2 = dataset[dataset['instrument_number']==reg2]

#Extract the body text from both regs
reg1_body = list(df_reg1['section_bodies'])
reg2_body = list(df_reg2['section_bodies'])

#Extract the labels from both regs
reg1_labels = list(df_reg1['section_labels'])
reg2_labels = list(df_reg2['section_labels'])

In [301]:
#extract all the text between singular quotes because each set of quoates is a new section (or subsection) 
quoted  = re.compile("'[^']*'")
reg1_body_clean = []
reg2_body_clean = []

for section in quoted.findall(str(reg1_body)):
    reg1_body_clean.append(section)
    
for section in quoted.findall(str(reg2_body)):
    reg2_body_clean.append(section)

#Extract the section labels for each regs body (note the values are also stored between quotes)
## Also note that none include 'Label 1' so it must be added
reg1_labels_clean = [reg1 + ' Label: 1']
reg2_labels_clean = [reg2 + ' Label: 1']

for num in quoted.findall(str(reg1_labels)):
    reg1_labels_clean.append(reg1 + ' Label: ' + num.strip("'"))
    
for num in quoted.findall(str(reg2_labels)):
    reg2_labels_clean.append(reg2 + ' Label: ' + num.strip("'"))

In [310]:
#Combine the body and label lists and convert to a dataframe 
combined_body = []
combined_body.extend(reg1_body_clean)
combined_body.extend(reg2_body_clean)

combined_labels = []
combined_labels.extend(reg1_labels_clean)
combined_labels.extend(reg2_labels_clean)

df_regs = pd.DataFrame(data={'Reg_Label':combined_labels, 'Text':combined_body})

In [311]:
df_regs

Unnamed: 0,Reg_Label,Text
0,SOR/93-293 Label: 1,"'In these Regulations,'"
1,SOR/93-293 Label: 2,"'For the purposes of these Regulations, the el..."
2,SOR/93-293 Label: 3,'These Regulations apply in respect of all lan...
3,SOR/93-293 Label: 4,'No person shall erect or construct on any lan...
4,SOR/93-293 Label: 5,'No owner or lessee of any lands in respect of...
5,SOR/93-293 Label: 6,'In order to minimize bird hazards to aviation...
6,SOR/93-522 Label: 1,"'In these Regulations,'"
7,SOR/93-522 Label: 2,"'For the purposes of these Regulations, the el..."
8,SOR/93-522 Label: 3,"'These Regulations apply to all land, includin..."
9,SOR/93-522 Label: 4,"'No person shall erect or construct, on any la..."


In [305]:
len(combined_body)

12

In [135]:
df_reg2 = dataset[dataset['instrument_number']==Reg_2]
df_reg2

Unnamed: 0,consolidation_date,enabling_authority,enabling_type,file_name,instrument_number,language,last_modified_date,order_number,registration_date,regulation_maker,regulation_provision,regulation_type,repealed,section_bodies,section_labels,title
16,2018-06-11,AERONAUTICS ACT,act,Consolidation_Regs_1.2.0/EN/SOR-93-522.xml,SOR/93-522,en,2011-9-21,1993-1934,1993-12-2,P.C.,"[{'@format-ref': 'indent-0-0', '@language-alig...",SOR,False,"['In these Regulations,', 'For the purposes of...","['2', '3', '4', '5', '6']",Regulations Respecting Zoning at St. Leonard A...


In [136]:
df_reg2['section_bodies'][16]

"['In these Regulations,', 'For the purposes of these Regulations, the elevation of the airport reference point is 234\\xa0m above sea level.', 'These Regulations apply to all land, including public road allowances, that is adjacent to or in the vicinity of the airport, which land is more particularly described as follows:', 'No person shall erect or construct, on any land to which these Regulations apply, any building, structure or object or any addition to any existing building, structure or object, the highest point of which will exceed in elevation at the location of that point', 'No owner or lessee of any land to which these Regulations apply shall permit any part of that land to be used or developed in a manner that causes interference with any signals or communications to and from any aircraft or to and from any facilities used to provide services relating to aeronautics.', 'Where an object of natural growth that is on any land to which these Regulations apply grows to a height 