## Canada Open Data Inventory using LDA

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 800)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the inventory CSV that is loaded with pd.read_csv.
# Despite the "URL" in the name, the value is a relative filesystem path.
OPEN_DATA_URL = '../data/canada-open-data/inventory.csv'

import re

# Regex patterns. Raw strings keep sequences such as ``\w`` and ``\.`` from
# being interpreted as (invalid) Python string escapes.
HANDLE = r'@\w+'  # "@name" mentions; defined here but not applied by clean()
LINK = r'https?://t\.co/\w+'  # t.co short links
# The original alternation listed '&lt;' twice; the second entry is taken to
# be a typo for '&gt;', so both angle-bracket entities are stripped now.
SPECIAL_CHARS = r'&lt;|&gt;|&amp;|#'
PARA = r'\n+'  # a run of one or more newlines


def clean(text):
    """Return ``text`` with links and markup residue blanked out.

    Three substitutions are applied in order:

    1. every ``LINK`` match is replaced by a single space;
    2. every ``SPECIAL_CHARS`` match (``&lt;``, ``&gt;``, ``&amp;``, ``#``)
       is replaced by a single space;
    3. every run of newlines is collapsed to exactly one newline.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned text. Replacements insert spaces rather than deleting,
        so neighbouring words are never glued together.
    """
    text = re.sub(LINK, ' ', text)
    text = re.sub(SPECIAL_CHARS, ' ', text)
    text = re.sub(PARA, '\n', text)
    return text

# Load the inventory and keep only the rows that have an English description.
catalog = pd.read_csv(OPEN_DATA_URL)
catalog = catalog.dropna(subset=['description_en'])

# Write a reproducible 25% sample (fixed random_state, no replacement) of the
# descriptions to disk.
# NOTE(review): to_csv is called with its default index/header settings, so
# the file holds whatever index and header output those defaults produce in
# addition to the descriptions themselves - confirm this is intended.
file = '../data/canada-open-data/catalog.txt'
catalog['description_en'].sample(
    frac=0.25, replace=False, random_state=0
).to_csv(file, encoding='utf-8')

# Read the sample back as one raw string. The context manager guarantees the
# handle is closed even if read() raises.
with open(file, 'r', encoding='utf-8') as f:
    text = f.read()

text = clean(text)

In [None]:
import spacy
# Load the small English pipeline and run it over the whole cleaned text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Only tokens whose part-of-speech tag is in this list are kept.
pos_list = ['NOUN']

# preproc_text collects one lemma list per newline-delimited line of the
# text; preproc_sent is the list currently being filled.
preproc_text = []
preproc_sent = []

for token in doc:
    # A newline token closes the current line and starts a new one.
    if token.text == '\n':
        preproc_text.append(preproc_sent)
        preproc_sent = []
        continue

    # Drop stop words, punctuation and anything that is not a wanted POS.
    if token.is_stop or token.is_punct or token.pos_ not in pos_list:
        continue

    preproc_sent.append(token.lemma_)

# The text need not end with a newline, so flush the trailing line too.
preproc_text.append(preproc_sent)

print(preproc_text)

In [None]:
import tomotopy as tp
# Build an LDA model with a fixed number of topics and a fixed seed.
NUM_TOPICS = 20
mdl = tp.LDAModel(k=NUM_TOPICS, seed=1234)

# Each entry of preproc_text is the noun-lemma list of one line. Lines that
# contained no nouns yield empty lists; they carry no information for the
# model, so they are not added as documents.
for line in preproc_text:
    if line:
        mdl.add_doc(line)

# Train in fixed steps of 10 iterations and report progress after each step.
# The earlier code called mdl.train(i) with the growing loop variable, which
# ran 0 + 10 + ... + 100 = 550 iterations while labelling each report with
# ``i``. Here ``i`` is the true cumulative iteration count (10, 20, ..., 100).
for i in range(10, 110, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

In [None]:
# Number of words shown per topic. The header below is formatted from the same
# value so it can no longer disagree with the list that is printed (it
# previously claimed "Top 10" while requesting 7 words).
TOP_N = 7

# Run ten more training iterations before inspecting the topics. Note that
# re-executing this cell trains the model further each time.
mdl.train(10)
for k in range(mdl.k):
    print('Top {} words of topic #{}'.format(TOP_N, k))
    print(mdl.get_topic_words(k, top_n=TOP_N))

In [None]:
# Flatten every per-line lemma list into a single bag of words and wrap it in
# a document object that the trained model can run inference on.
bag_of_words = [word for sent in preproc_text for word in sent]
doc_inst = mdl.make_doc(bag_of_words)

# Run inference once and keep the first element of the result, which is used
# here as the per-topic weights. The earlier code called infer() twice and
# discarded the first result.
topic_dist = mdl.infer(doc_inst)[0]

# Topic ids ordered from the highest weight to the lowest (cell output).
np.argsort(np.array(topic_dist))[::-1]

In [None]:
# Show the 7 top-ranked words the model reports for topic 11.
topic_11_words = mdl.get_topic_words(11, top_n=7)
print(topic_11_words)

In [None]:
# Show the 7 top-ranked words the model reports for topic 17.
topic_17_words = mdl.get_topic_words(17, top_n=7)
print(topic_17_words)

In [None]:
# Show the 7 top-ranked words the model reports for topic 5.
topic_5_words = mdl.get_topic_words(5, top_n=7)
print(topic_5_words)