# Topic Modeling Using Senate Bills' Long Title Field

In [1]:
import pandas as pd
import os
from collections import Counter
from gensim import models
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import re
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_pickle(os.path.join('data_files', 'lt_processed.pkl'))

In [3]:
df.head()

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[fiscal, regime, mining, industry]",Pending
1,17SBN-2234,"[sale, certain, land, barangay, na, ligas, cit...",Passed
2,17SBN-2233,"[excise, tax, tobacco, subject, tax, increment...",Passed
3,17SBN-2232,"[institutionalization, development, training, ...",Pending
4,17SBN-2231,[bank],Pending


## Most Common Words

In [4]:
lt_list = df.long_title.tolist()

In [5]:
lt_flat = [item for sublist in lt_list for item in sublist]

In [6]:
word_counter = Counter(lt_flat)
most_common_words = word_counter.most_common()[:10]
most_common_words

[('fund', 2859),
 ('national', 2204),
 ('public', 1243),
 ('section', 1156),
 ('program', 1091),
 ('development', 998),
 ('health', 920),
 ('education', 899),
 ('local', 852),
 ('system', 749)]

## Topic Modeling
Topic Modeling Using Latent Dirichlet Allocation(LDA)

### Create Dictionary

In [7]:
id2word = corpora.Dictionary(df.long_title)

### Create Corpus

In [8]:
texts = df.long_title

#### Term Document Frequency

In [9]:
corpus = [id2word.doc2bow(text) for text in texts]

### LDA

In [10]:
# LDA for num_topics = 15
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=15, passes=50,
                      random_state=43)

In [11]:
lda.print_topics()

[(0,
  '0.073*"additional" + 0.071*"fund" + 0.046*"court" + 0.044*"national" + 0.044*"region" + 0.043*"judiciary" + 0.041*"reorganization" + 0.038*"regional" + 0.037*"trial" + 0.034*"judicial"'),
 (1,
  '0.081*"health" + 0.051*"care" + 0.050*"medical" + 0.042*"fund" + 0.041*"family" + 0.040*"security" + 0.033*"research" + 0.032*"development" + 0.027*"center" + 0.025*"system"'),
 (2,
  '0.148*"program" + 0.054*"fund" + 0.049*"national" + 0.047*"civil" + 0.039*"comprehensive" + 0.031*"medium" + 0.030*"prevention" + 0.028*"human" + 0.023*"small" + 0.020*"modernization"'),
 (3,
  '0.070*"department" + 0.050*"fund" + 0.043*"health" + 0.033*"management" + 0.031*"water" + 0.025*"agriculture" + 0.023*"national" + 0.021*"justice" + 0.019*"discrimination" + 0.018*"rural"'),
 (4,
  '0.088*"commission" + 0.070*"labor" + 0.064*"national" + 0.050*"employment" + 0.049*"safety" + 0.033*"administration" + 0.029*"land" + 0.025*"agricultural" + 0.024*"fund" + 0.024*"board"'),
 (5,
  '0.072*"province" + 0

#### Compute Perplexity

In [12]:
print('Perplexity: ', lda.log_perplexity(corpus))

Perplexity:  -7.8540893118471145


#### Compute Coherence Score

In [13]:
coherence_model_lda = CoherenceModel(model=lda, texts=df.long_title,
                                     dictionary=id2word, coherence='c_v')

In [14]:
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.4201803694043507


#### Visualize the Topics

In [15]:
pyLDAvis.enable_notebook()

In [16]:
vis = pyLDAvis.gensim.prepare(lda, corpus, id2word, R=10)

#### Save Outputs to Disk

In [17]:
pyLDAvis.save_html(vis, os.path.join('reports', 'LDAvis.html'))

In [18]:
topic_dict = {}
for idx, i in enumerate(lda.show_topics()):
    topic_dict[idx] = re.findall('"([^"]*)"', i[1])

In [19]:
pickle.dump(topic_dict, open(os.path.join('data_files',
                                          'topic_dict.pkl'), 'wb'))