# 2. Basic Topic Modelling
*Carlos A. Toruño Paniagua*

## Loading and preparing the data

In [2]:
import json
import pandas as pd

# Load data
# Speech-level table read from CSV.
speech_data = pd.read_csv("..//..//Data//speech_data.csv")

# Lemmatized speeches, parsed from JSON into a mapping that later cells
# iterate with .items(). The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which can garble
# non-ASCII tokens on systems whose default is not UTF-8.
# NOTE(review): assumes the JSON file was written as UTF-8 — confirm.
with open("..//..//Data//tklem_speeches.json", "r", encoding="utf-8") as file:
    lemmatized_speeches = json.load(file)

## Creating a Gensim Corpus

In [3]:
from gensim import corpora
from gensim.models import TfidfModel

# Build one gensim Dictionary per speaker from that speaker's tokenized speeches
dictmat = {
    speaker: corpora.Dictionary(token_lists)
    for speaker, token_lists in lemmatized_speeches.items()
}

# Bag-of-words corpus per speaker: every speech is converted with the
# speaker's own dictionary via doc2bow
corpus = {
    speaker: [vocab.doc2bow(tokens) for tokens in lemmatized_speeches[speaker]]
    for speaker, vocab in dictmat.items()
}

# Fit a TF-IDF model on each speaker's corpus, then apply that model to the
# same corpus to obtain the TF-IDF weighted version
tfidf_model = {
    speaker: TfidfModel(bow_docs)
    for speaker, bow_docs in corpus.items()
}
corpus_tfidf = {
    speaker: weighting[corpus[speaker]]
    for speaker, weighting in tfidf_model.items()
}

## Training an LDA Model

In [4]:
from gensim.models import LdaModel

# Number of topics to fit for each speaker
ntopics = {"Daniel" : 4,
           "Rosario": 3}

# LDA training is stochastic; fixing the seed makes the fitted topics
# reproducible across "Restart Kernel -> Run All" executions.
LDA_RANDOM_STATE = 42

# Training an LDA model per speaker on the TF-IDF weighted corpus, using the
# speaker's own dictionary to map token ids back to words
LDA = {
    sp:LdaModel(corpus_tfidf[sp],
                id2word      = dictmat[sp],
                num_topics   = n,
                random_state = LDA_RANDOM_STATE)
    for (sp,n) in ntopics.items()
}

## Visualizing the results using pyLDAvis

In [5]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the pyLDAvis data for every trained model. Iterating over LDA itself
# (instead of repeating the speaker names) keeps this cell in sync with the
# set of models that were actually trained.
# NOTE(review): corpus_tfidf holds TF-IDF weights rather than raw counts —
# confirm this is the intended input for the visualisation.
dataviz = {
    sp:gensimvis.prepare(model, corpus_tfidf[sp], dictmat[sp])
    for (sp,model) in LDA.items()
}

In [54]:
# Show the prepared pyLDAvis visualisation for the "Daniel" model as the cell output
pyLDAvis.display(dataviz["Daniel"])

In [55]:
# Show the prepared pyLDAvis visualisation for the "Rosario" model as the cell output
pyLDAvis.display(dataviz["Rosario"])

In [8]:
# Write each speaker's prepared visualisation to its own HTML file
html_targets = {
    "Daniel":  "../../assets/LDAvis_daniel.html",
    "Rosario": "../../assets/LDAvis_rosario.html",
}
for speaker, target in html_targets.items():
    pyLDAvis.save_html(dataviz[speaker], target)