In [1]:
# Remove deprecationWarning that I can't fix
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:

import re
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy for lemmatization
import spacy


from pprint import pprint
from nltk.corpus import stopwords


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# NLTK for removing stopwords, 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
Apart from the known english stopwords, these are the pre-determined words that are common in the corpus
stop_words.extend(['tagum', 'mayor', 'series', 'enter','entered', 'behalf', 'known', 'otherwise', 
                  'code', 'sign', 'comprehensive', 'agreement', 'resolution', 'new', 'memorandum', 'city', 
                  'davao','ordinance', 'thereof', 'section', 'municipal', 
                  'municipality', 'government'])

In [4]:
# Import CSV Ordinance Dataset
df = pd.read_csv('ordinances/davao_ordinances.csv', header = 0)
print(df)
df.head()
# List all ordinance titles
# Convert to list
data = list(df.OT) 
print(data[:1])

            OR                                                 OT
0    000172-14  An Ordinance amending Section 42 of the 2005 R...
1    000177-14  An Ordinance placing underground all electrica...
2    000178-14  An Ordinance amending City Ordinance No. 0195-...
3    000173-14  An Ordinance regulating the use of some street...
4    000227-14  An ordinance for the creation of Trust Fund Ac...
..         ...                                                ...
414  000496-13  An Ordinance authorizing the City Mayor to ent...
415  000487-16  An Ordinance amending Section 13 Article XI of...
416  000348-07  Ordinance  Establishing the Gender-Sensitive C...
417  000349-07  ORDINANCE AMENDING DAVAO CITY ORDINANCE NO. 15...
418  000355-07  An  Ordinance  for  the  "Davao  Branding  Sys...

[419 rows x 2 columns]
['An Ordinance amending Section 42 of the 2005 Revenue Code of Davao City from "Time and Place of the Payment - The tax shall be due and payable in advance to the City Treasurer or his

In [5]:
# Remove qoutes
data = [re.sub("\'", "", sent) for sent in data]
data = [re.sub("\"", "", sent) for sent in data]

# Remove words ending with ING like amending, regulating
# Implemtation to be improved / Seems to improve topics
data = [re.sub('\S*ING\S*\s?', '', sent) for sent in data]
data = [re.sub('\S*ing\S*\s?', '', sent) for sent in data]


pprint(data[:5])

#preprocess using gensim simple_preprocess and tokenize into words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

pprint(data_words[:5])

['An Ordinance Section 42 of the 2005 Revenue Code of Davao City from Time and '
 'Place of the Payment - The tax shall be due and payable in advance to the '
 'City Treasurer or his duly authorized representative before the materials '
 'are extracted and shall be based on the volume applied for in the '
 'application for quarry concession to Tax shall be due and payable every '
 'quarter of the Calendar Year to the City Treasurer or his duly authorized '
 'representative after the materials are extracted based on the volume applied '
 'for in the application for quarry concession',
 'An Ordinance underground all electrical and telecommunication wires and '
 'cables within the vicinity of City Hall and the Sangguniang Panlungsod of '
 'the City of Davao',
 'An Ordinance City Ordinance No. 0195-14, Series of 2004, otherwise known as '
 'the Amended scheme of Davao City for a 90-day experimental period',
 'An Ordinance the use of some streets near the Mintal Public Market in '
 'Baranga

  data = [re.sub('\S*ING\S*\s?', '', sent) for sent in data]
  data = [re.sub('\S*ing\S*\s?', '', sent) for sent in data]


In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
# Code snippets from Selva Prabhakaran | Topic Modeling with Gensim (Python)
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [7]:

data_words_nostops = remove_stopwords(data_words)
data_words_nostops = [x for x in data_words_nostops if "amending" not in x]

In [8]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_nostops)

# Create Corpus
texts = data_words_nostops

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 2), (3, 2), (4, 2), (5, 1), (6, 3), (7, 1), (8, 2), (9, 1), (10, 2), (11, 2), (12, 1), (13, 2), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1), (24, 3), (25, 2), (26, 1), (27, 2), (28, 2), (29, 1)]]


In [9]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('advance', 1),
  ('application', 2),
  ('applied', 2),
  ('authorized', 2),
  ('based', 2),
  ('calendar', 1),
  ('city', 3),
  ('code', 1),
  ('concession', 2),
  ('davao', 1),
  ('due', 2),
  ('duly', 2),
  ('every', 1),
  ('extracted', 2),
  ('materials', 2),
  ('ordinance', 1),
  ('payable', 2),
  ('payment', 1),
  ('place', 1),
  ('quarry', 2),
  ('quarter', 1),
  ('representative', 2),
  ('revenue', 1),
  ('section', 1),
  ('shall', 3),
  ('tax', 2),
  ('time', 1),
  ('treasurer', 2),
  ('volume', 2),
  ('year', 1)]]

In [10]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=36, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)

In [11]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(24, '0.001*"bugac" + 0.001*"wells" + 0.001*"airport" + 0.001*"safety" + 0.001*"houses" + 0.001*"theaters" + 0.001*"tour" + 0.001*"twenty" + 0.001*"refuse" + 0.001*"careless"'), (1, '0.001*"bugac" + 0.001*"wells" + 0.001*"airport" + 0.001*"safety" + 0.001*"houses" + 0.001*"theaters" + 0.001*"tour" + 0.001*"twenty" + 0.001*"refuse" + 0.001*"careless"'), (6, '0.001*"bugac" + 0.001*"wells" + 0.001*"airport" + 0.001*"safety" + 0.001*"houses" + 0.001*"theaters" + 0.001*"tour" + 0.001*"twenty" + 0.001*"refuse" + 0.001*"careless"'), (17, '0.001*"bugac" + 0.001*"wells" + 0.001*"airport" + 0.001*"safety" + 0.001*"houses" + 0.001*"theaters" + 0.001*"tour" + 0.001*"twenty" + 0.001*"refuse" + 0.001*"careless"'), (10, '0.001*"bugac" + 0.001*"wells" + 0.001*"airport" + 0.001*"safety" + 0.001*"houses" + 0.001*"theaters" + 0.001*"tour" + 0.001*"twenty" + 0.001*"refuse" + 0.001*"careless"'), (15, '0.062*"used" + 0.020*"projects" + 0.009*"cement" + 0.009*"mixers" + 0.009*"actually" + 0.008*"coverage" +

In [12]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5016297853736096


In [13]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


TypeError: Object of type complex is not JSON serializable

PreparedData(topic_coordinates=                        x                   y  topics  cluster       Freq
topic                                                                    
26     0.491205+0.000000j  0.069601+0.000000j       1        1  30.164546
25     0.456969+0.000000j  0.069084+0.000000j       2        1  23.410461
28     0.056738+0.000000j -0.068719+0.000000j       3        1   5.600818
29     0.105444+0.000000j -0.350593+0.000000j       4        1   4.718407
9      0.040533+0.000000j -0.102079+0.000000j       5        1   4.061044
35     0.051991+0.000000j  0.006809+0.000000j       6        1   2.830921
2      0.039349+0.000000j -0.111388+0.000000j       7        1   2.640249
27     0.029252+0.000000j  0.003134+0.000000j       8        1   2.580849
7     -0.008889+0.000000j  0.021047+0.000000j       9        1   2.256067
11     0.061366+0.000000j  0.053894+0.000000j      10        1   2.150003
19    -0.013167+0.000000j  0.025239+0.000000j      11        1   1.961073
34    -