In [None]:
%matplotlib inline

In [None]:
from datetime import datetime as dt

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from gensim.corpora import MmCorpus
from gensim.models import LdaModel

In [None]:
# Apply matplotlib's "ggplot" style sheet to every figure drawn below.
plt.style.use("ggplot")

In [None]:
# Load the saved LDA model and the serialized corpus from the local data/
# directory. Both files must exist before this cell is run.
lda = LdaModel.load("data/jdc-lda-model")
corpus = MmCorpus("data/jdc-corpus.mm")

In [None]:
def show_topics(model, num_topics=10, num_words=10, labels=None):
    """Tabulate the leading words of each topic, one column per topic.

    Parameters
    ----------
    model
        Object exposing ``show_topics(num_topics, num_words, formatted=False)``
        that yields ``(topic_id, [(word, weight), ...])`` pairs.
    num_topics : int
        Forwarded to ``model.show_topics``.
    num_words : int
        Forwarded to ``model.show_topics``; also the number of rows in the
        resulting table.
    labels : sequence of str, optional
        Column names looked up by topic id. When omitted (or empty) the
        columns are named ``"Topic 1"``, ``"Topic 2"``, ...

    Returns
    -------
    pandas.DataFrame
        Row ``i`` of a column holds the ``i``-th word reported for that topic.
    """
    table = pd.DataFrame(index=range(num_words))
    topic_rows = model.show_topics(num_topics, num_words, formatted=False)

    for topic_id, word_weights in topic_rows:
        # Split the (word, weight) pairs; only the words are displayed.
        terms, _weights = zip(*word_weights)
        header = labels[topic_id] if labels else "Topic %d" % (topic_id + 1)
        table[header] = terms

    return table

In [None]:
# Display the top words per topic under the default "Topic N" column names.
show_topics(lda)

In [None]:
# Human-readable topic names. show_topics looks them up by topic id, so the
# order of this list must follow the model's topic ids (0, 1, 2).
labels = ["AngloCrisis", "CamairCo", "BokoHaram"]
show_topics(lda, labels=labels)

In [None]:
def topic_by_post(model, corpus, labels=None,
                  date_path="data/jdc-date.npy", threshold=.5):
    """Flag, for every post, the single topic that dominates it.

    A post is attributed to a topic only when that topic has the highest
    score for the post *and* the score is strictly greater than
    ``threshold``; every other cell of the post's row is 0, so a row
    contains at most one 1.

    Parameters
    ----------
    model
        Topic model. ``model[bow]`` must yield ``(topic_id, score)`` pairs
        and ``model.num_topics`` must give the number of topics.
    corpus
        Iterable of bag-of-words documents, one per post.
    labels : sequence of str, optional
        Column names, one per topic id. When omitted the columns are the
        integer topic ids.
    date_path : str
        ``.npy`` file loaded with ``np.load`` and used as the row index.
        It must contain one entry per document in ``corpus``.
    threshold : float
        Minimum score (exclusive) the top topic needs to be flagged.

    Returns
    -------
    pandas.DataFrame
        One row per post, one 0/1 column per topic.

    Raises
    ------
    ValueError
        Propagated from pandas when the number of dates or labels does not
        match the data.
    """
    dates = np.load(date_path)
    topic_ids = range(model.num_topics)
    flags = {tid: [] for tid in topic_ids}

    for bow in corpus:
        score = dict(model[bow])
        # The model may report no topic at all for a document; such a post
        # simply has no dominant topic instead of crashing max().
        best = max(score, key=score.get) if score else None
        dominant = best is not None and score[best] > threshold

        for tid in topic_ids:
            flags[tid].append(int(bool(dominant) and tid == best))

    df = pd.DataFrame(flags, index=dates)
    if labels:
        df.columns = labels

    return df

In [None]:
# One row per post (indexed by date) with a 0/1 flag per labelled topic.
topics = topic_by_post(lda, corpus, labels=labels)

# Group the posts into weekly bins that include their left edge and are
# labelled by it; summing the 0/1 flags counts the posts per topic per bin.
weekly = topics.resample("W", closed="left", label="left").sum()
weekly

In [None]:
def plot_topic_evolution(df, rot=45):
    """Draw a grouped bar chart of topic counts per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per topic. Every index entry must provide a ``ctime()``
        method (e.g. the timestamps of a ``DatetimeIndex``).
    rot : int
        Rotation, in degrees, passed to ``DataFrame.plot.bar`` for the
        x tick labels.

    Returns
    -------
    None
        The figure is created as a side effect and shown by the notebook's
        inline backend.
    """
    # Imported locally because the notebook's import cells do not provide
    # ``re``; without this the function raises NameError on a fresh kernel.
    import re

    def short_date(ts):
        # ctime() gives e.g. "Sun Jan  1 00:00:00 2017": remove the clock
        # part, then drop the 4-character weekday prefix -> "Jan  1 2017".
        return re.sub(r"\d+:\d+:\d+ ", "", ts.ctime())[4:]

    # A concrete list, not a lazy ``map`` object: set_xticklabels needs a
    # sequence of strings.
    tick_labels = [short_date(ts) for ts in df.index]

    fig = plt.figure(figsize=(15, 6))
    ax = fig.add_subplot(111)
    ax.set_title("Topic evolution (no overlap)")
    df.plot.bar(ax=ax, rot=rot)
    ax.set_xticklabels(tick_labels)

In [None]:
# Bar chart of the weekly per-topic post counts computed above.
plot_topic_evolution(weekly)