## Topic Fingerprints

In [None]:
import pandas as pd

# Widen text columns so long dataset descriptions are not truncated on display.
pd.options.display.max_colwidth = 800

In [None]:
# Path of the inventory CSV and the only columns this notebook reads from it.
OPEN_DATA_URL = 'data/canada-open-data/inventory.csv'
COLUMNS = ['title_en', 'description_en', 'date_released']
catalog = pd.read_csv(OPEN_DATA_URL, usecols=COLUMNS)
# Rows without a description cannot be tokenized, so drop them. Renumber the
# index afterwards: dropna leaves gaps in the labels, and later cells look
# descriptions up by small integers (e.g. catalog.description_en[0]), which
# would raise KeyError if that particular row had been dropped.
catalog = catalog.dropna(subset=['description_en']).reset_index(drop=True)

In [None]:
# Show the loaded catalog (rows with a missing description_en already removed).
catalog

In [None]:
# IPython/Jupyter magic (not Python syntax). Presumably sets the notebook
# autosave interval to 60 seconds; confirm against the %autosave documentation.
%autosave 60

In [None]:
def text_to_tokens(text):
    """Turn a raw description string into the tokens used for topic modelling.

    The text is lower-cased first, then passed through gensim's
    ``remove_stopwords`` and finally through ``simple_preprocess``.

    Args:
        text: description text; must support ``.lower()``.

    Returns:
        Whatever ``simple_preprocess`` returns for the cleaned text.
    """
    lowered = text.lower()
    without_stopwords = remove_stopwords(lowered)
    return simple_preprocess(without_stopwords)

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

# Tokenize every description; the result is a Series holding one token
# sequence per catalog row.
dataset_descriptions = catalog['description_en'].apply(text_to_tokens)

In [None]:
# Show the tokenized descriptions produced by text_to_tokens.
dataset_descriptions

In [None]:
import gensim
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string

# Vocabulary built from all tokenized descriptions.
dictionary = gensim.corpora.Dictionary(dataset_descriptions)
# Bag-of-words encoding of each description, in the same order as the Series.
corpus = list(map(dictionary.doc2bow, dataset_descriptions))

## Dictionary

In [None]:
# Print the first entries of the vocabulary as "id token" pairs. The upper
# bound is capped by the vocabulary size so a small dictionary (fewer than
# 20 tokens) does not raise KeyError.
for token_id in range(min(20, len(dictionary))):
    print(token_id, dictionary[token_id])

## Corpus

In [None]:
# Number of LDA topics, i.e. the length of every topic fingerprint.
VECTOR_SIZE=50
# Pass the dictionary as id2word so the model knows the id -> token mapping;
# without it gensim falls back to an identity mapping inferred from the
# corpus and topic listings show numeric ids instead of words.
lda_model:LdaModel = LdaModel(corpus,
                              num_topics=VECTOR_SIZE,
                              id2word=dictionary,
                              passes=4)
lda_model.num_topics

In [None]:
# Topic distribution of the first remaining description. Use positional
# access: rows were removed by dropna, so the label 0 is not guaranteed to
# exist in the index.
text = catalog.description_en.iloc[0]
tokens = text_to_tokens(text)
bag_of_words = dictionary.doc2bow(tokens)
# Indexing the model with a bag of words yields (topic, relevance) pairs.
pd.DataFrame(lda_model[bag_of_words],
             columns=['Topic','Relevance']).set_index('Topic')

In [None]:
def topic_vector(topic_model:LdaModel, text:str):
    """Return the topic fingerprint of ``text`` as a fixed-length list.

    The text is tokenized with ``text_to_tokens`` and encoded with the
    notebook-level ``dictionary``; the model is then queried for its
    (topic, probability) pairs.

    Args:
        topic_model: trained model; indexing it with a bag of words must
            yield (topic, probability) pairs.
        text: raw description text.

    Returns:
        A list of length ``topic_model.num_topics`` where position ``i`` holds
        the probability reported for topic ``i``, or 0 when the model did not
        report that topic.
    """
    bow = dictionary.doc2bow(text_to_tokens(text))
    # Topics missing from the model's answer keep the default weight of 0.
    reported = dict(topic_model[bow])
    return [reported.get(topic_id, 0)
            for topic_id in range(topic_model.num_topics)]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as style
from IPython.display import display

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
style.use('fivethirtyeight')

# Re-assigns the same value used when the LdaModel is trained; nothing in this
# cell reads it.
VECTOR_SIZE=50
# IPython magic (not Python syntax). Presumably selects the inline matplotlib
# backend so figures render in the notebook; confirm against the IPython docs.
%matplotlib inline

def show_fingerprint(topic_model, text:str, ylim_top:float=0.4):
    """Display ``text`` followed by a bar chart of its topic fingerprint.

    Args:
        topic_model: model forwarded unchanged to ``topic_vector``.
        text: raw description to display and fingerprint.
        ylim_top: upper limit of the y axis. Defaults to 0.4, the value that
            was previously hard-coded; bars taller than the limit are cut off
            at the top of the plot.
    """
    display(text)
    vector = topic_vector(topic_model, text)
    plt.figure(figsize=(14,2))
    # One bar per topic, 0.25 wide. The return value of plt.bar is not needed.
    plt.bar(range(len(vector)),
            vector,
            0.25,
            linewidth=1)
    # A shared y limit keeps fingerprints of different texts on the same scale.
    plt.ylim(top=ylim_top)
    # Hide ticks and tick labels on both axes: only the shape matters here.
    plt.tick_params(axis='both',
                    which='both',
                    left=False,
                    bottom=False,
                    top=False,
                    labelleft=False,
                    labelbottom=False)
    plt.grid(False)

In [None]:
# Positional access: after dropna the label 0 may no longer be in the index.
show_fingerprint(lda_model, catalog.description_en.iloc[0])

In [None]:
# Positional access: after dropna the label 3 may no longer be in the index.
show_fingerprint(lda_model, catalog.description_en.iloc[3])

In [None]:
# Positional access: after dropna the label 2 may no longer be in the index.
show_fingerprint(lda_model, catalog.description_en.iloc[2])

In [None]:
# Raw fingerprint values for the first remaining description (positional
# access, because dropna may have removed the row labelled 0).
topic_vector(lda_model, catalog.description_en.iloc[0])