In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [3]:
#spacy for lemmatization
import spacy

In [4]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from pprint import pprint

  and should_run_async(code)


In [6]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


In [7]:
# NLTK for stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['tagum', 'city', 'ordinance', 'municipal', 'municipality', 'thereof'])

In [8]:
# Import Dataset
df = pd.read_csv('tagum_ordinances.csv', header = 0)
print(df)
df.head()

                                     OR  \
0        CITY ORDINANCE No. 842, s-2018   
1        CITY ORDINANCE NO. 825, s-2017   
2        CITY ORDINANCE NO. 837, s-2018   
3        CITY ORDINANCE NO. 844, s-2018   
4        CITY ORDINANCE NO. 838, s-2018   
..                                  ...   
670  MUNICIPAL ORDINANCE NO. 05, s-1974   
671  MUNICIPAL ORDINANCE NO. 04, s-1974   
672  MUNICIPAL ORDINANCE NO. 01, s-1974   
673    MUNICIPAL ORDINANCE NO. 03, 1973   
674  MUNICIPAL ORDINANCE NO. 02, S-1973   

                                                    OT  
0    “AN ORDINANCE REGULATING THE PRACTICE OF TRADI...  
1    “AN ORDINANCE RECONSTITUTING THE MEMBERS OF TH...  
2    “AN ORDINANCE ESTABLISHING FUNERAL AND BURIAL ...  
3    AN ORDINANCE AMENDING THE TITLE, SECTION 1, SE...  
4    “AN ORDINANCE CREATING THE CITY HISTORICAL, CU...  
..                                                 ...  
670  “AN ORDINANCE AMENDING SECTION 10 OF MUNICIPAL...  
671  “AN ORDINANCE REGULATI

Unnamed: 0,OR,OT
0,"CITY ORDINANCE No. 842, s-2018",“AN ORDINANCE REGULATING THE PRACTICE OF TRADI...
1,"CITY ORDINANCE NO. 825, s-2017",“AN ORDINANCE RECONSTITUTING THE MEMBERS OF TH...
2,"CITY ORDINANCE NO. 837, s-2018",“AN ORDINANCE ESTABLISHING FUNERAL AND BURIAL ...
3,"CITY ORDINANCE NO. 844, s-2018","AN ORDINANCE AMENDING THE TITLE, SECTION 1, SE..."
4,"CITY ORDINANCE NO. 838, s-2018","“AN ORDINANCE CREATING THE CITY HISTORICAL, CU..."


In [9]:
# List all ordinance titles
# Convert to list
data = list(df.OT) 
print(data[:1])

['“AN ORDINANCE REGULATING THE PRACTICE OF TRADITIONAL HOME BIRTH DELIVERY OR DELIVERIES ATTENDED BY A TRADITIONAL BIRTH ATTENDANT OR MANANABANG”.']


In [10]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]
data = [re.sub("\"", "", sent) for sent in data]

# # Remove words ending with ING like amending, regulating
# data = [re.sub('\S*ING\S*\s?', '', sent) for sent in data]


pprint(data[:5])

['“AN ORDINANCE REGULATING THE PRACTICE OF TRADITIONAL HOME BIRTH DELIVERY OR '
 'DELIVERIES ATTENDED BY A TRADITIONAL BIRTH ATTENDANT OR MANANABANG”.',
 '“AN ORDINANCE RECONSTITUTING THE MEMBERS OF THE CITY TRICYCLE FRANCHISING '
 'AND REGULATORY BOARD (CTFRB) PRESCRIBING ITS FUNCTIONS AND PROCEDURES AND '
 'PROVIDING MOTORIZED TRICYCLE FOR HIRE (MTH), TRICYCLE UTILITY VEHICLE (TUV) '
 'AND TRICYCLE UTILITY FOR HIRE (TUH) GUIDELINES, REGULATIONS AND OPERATIONS, '
 'THE COLLECTION OF FEES, CHARGES,ADMINISTRATIVE PROVISIONS AND PROVIDING '
 'PENALTIES FOR VIOLATION THEREOF”.',
 '“AN ORDINANCE ESTABLISHING FUNERAL AND BURIAL ASSISTANCE FOR INDIGENT '
 'TAGUMENYOS, PROVIDING ITS MANAGEMENT, PROCEDURES AND GUIDELINES AND '
 'APPROPRIATING FUNDS THEREFOR”.',
 'AN ORDINANCE AMENDING THE TITLE, SECTION 1, SECTION 2 AND SECTION 3 OF CITY '
 'ORDINANCE NO. 820, S-2017 “AN ORDINANCE ENTERING INTO A LOAN AGREEMENT WITH '
 'THE DEVELOPMENT BANK OF THE PHILIPPINES, IN THE AMOUNT OF SIX HUNDRED EIGH

In [11]:
#preprocess using gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:5])

[['an', 'ordinance', 'regulating', 'the', 'practice', 'of', 'traditional', 'home', 'birth', 'delivery', 'or', 'deliveries', 'attended', 'by', 'traditional', 'birth', 'attendant', 'or', 'mananabang'], ['an', 'ordinance', 'reconstituting', 'the', 'members', 'of', 'the', 'city', 'tricycle', 'franchising', 'and', 'regulatory', 'board', 'ctfrb', 'prescribing', 'its', 'functions', 'and', 'procedures', 'and', 'providing', 'motorized', 'tricycle', 'for', 'hire', 'mth', 'tricycle', 'utility', 'vehicle', 'tuv', 'and', 'tricycle', 'utility', 'for', 'hire', 'tuh', 'guidelines', 'regulations', 'and', 'operations', 'the', 'collection', 'of', 'fees', 'charges', 'administrative', 'provisions', 'and', 'providing', 'penalties', 'for', 'violation', 'thereof'], ['an', 'ordinance', 'establishing', 'funeral', 'and', 'burial', 'assistance', 'for', 'indigent', 'tagumenyos', 'providing', 'its', 'management', 'procedures', 'and', 'guidelines', 'and', 'appropriating', 'funds', 'therefor'], ['an', 'ordinance', 'a

In [12]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['an', 'ordinance', 'regulating', 'the', 'practice', 'of', 'traditional', 'home', 'birth', 'delivery', 'or', 'deliveries', 'attended', 'by', 'traditional', 'birth', 'attendant', 'or', 'mananabang']


In [13]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

pprint(data_lemmatized[:4])

[['regulate',
  'practice',
  'traditional',
  'home',
  'birth',
  'delivery',
  'delivery',
  'attend',
  'traditional',
  'birth',
  'attendant',
  'mananabang'],
 ['reconstitute',
  'member',
  'tricycle',
  'ctfrb',
  'prescribing',
  'function',
  'procedure',
  'provide',
  'tricycle',
  'utility',
  'vehicle',
  'tricycle',
  'utility',
  'hire',
  'guideline',
  'operation',
  'collection',
  'fee',
  'charge',
  'administrative',
  'provision',
  'provide',
  'penalty',
  'violation'],
 ['establish',
  'funeral',
  'burial_assistance',
  'indigent',
  'tagumenyos',
  'provide',
  'management',
  'procedure',
  'guideline',
  'appropriate',
  'fund'],
 ['amend',
  'title',
  'section',
  'section',
  'section',
  'enter',
  'loan',
  'amount',
  'acquisition',
  'equipment',
  'acquisition',
  'service',
  'vehicle',
  'acquisition',
  'continue_reading']]


In [15]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2)]]


In [16]:
id2word[0]

'attend'

In [17]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('attend', 1),
  ('attendant', 1),
  ('birth', 2),
  ('delivery', 2),
  ('home', 1),
  ('mananabang', 1),
  ('practice', 1),
  ('regulate', 1),
  ('traditional', 2)]]

In [18]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50,
                                           alpha='auto',
                                           per_word_topics=True)

In [19]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.100*"impose" + 0.065*"area" + 0.044*"require" + 0.029*"lot" + 0.029*"rate" + 0.021*"reclassify" + 0.020*"own" + 0.019*"sale" + 0.018*"commercial" + 0.017*"name"'), (1, '0.063*"service" + 0.041*"peso" + 0.037*"development" + 0.026*"amount" + 0.026*"appropriating" + 0.024*"property" + 0.023*"grant" + 0.021*"owner" + 0.019*"issuance" + 0.018*"real"'), (2, '0.044*"street" + 0.037*"portion" + 0.034*"permit" + 0.033*"mayor" + 0.022*"park" + 0.020*"grant" + 0.019*"material" + 0.017*"part" + 0.016*"temporary" + 0.015*"town"'), (3, '0.038*"increase" + 0.031*"business" + 0.026*"public" + 0.024*"establishment" + 0.023*"passenger" + 0.022*"territorial_jurisdiction" + 0.021*"series" + 0.019*"license" + 0.018*"fee" + 0.017*"penalty"'), (4, '0.102*"public" + 0.078*"market" + 0.025*"place" + 0.024*"parking" + 0.020*"include" + 0.020*"barangay" + 0.018*"continue_reading" + 0.017*"premise" + 0.016*"repeal" + 0.014*"stall"'), (5, '0.072*"provide" + 0.046*"penalty" + 0.037*"violation" + 0.034*"veh

In [20]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.6580777477097035

Coherence Score:  0.4617134678778355


In [21]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [22]:
mallet_path = '/Users/root1/Documents/lda/mallet/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)