In [2]:
from gensim.models import LdaModel
import pandas as pd
from pandasql import sqldf 
from random import randrange, random
import numpy as np
def pysqldf(q):
    """Run the SQL query `q` via `sqldf`, resolving names against this module's globals."""
    return sqldf(q, globals())

In [3]:
# Training settings; each one is forwarded by keyword to LdaModel when the
# model is built.
num_topics, passes = 2, 20
chunksize, iterations = 2000, 400
eval_every = None

In [4]:
# Candidate terms for the "cold war" topic.
cold_war_terms = [
    'soviet', 'cold', 'war', 'nato', 'communist',
    'capitalist', 'political', 'western', 'powers', 'propaganda',
]

# Candidate terms for the "economic policy" topic.
economic_policy_terms = [
    'financial', 'taxation', 'monetary', 'policy', 'economic',
    'interest', 'rate', 'money', 'supply', 'bank',
]

In [5]:
# Topic mix used while drawing each topic's vocabulary.
# Element 0 is the probability that a draw comes from the topic's own term
# list; any other draw comes from the opposite topic's list. Element 1 is the
# complement and is not read directly by the sampling code.
cold_war_topics = [0.75, 0.25]
economic_policy_topics = [0.75, 0.25]

In [6]:
def _draw_vocabulary(own_terms, other_terms, own_prob, n_draws=10, echo=False):
    """Draw a small, duplicate-free vocabulary for one topic.

    Each of the `n_draws` draws picks a uniformly random term from `own_terms`
    with probability `own_prob`, otherwise from `other_terms`. A term that is
    already in the vocabulary is ignored. New own-topic terms are inserted at
    the front of the list and new other-topic terms are appended at the back.

    Parameters
    ----------
    own_terms : list of str
        Terms belonging to the topic being built.
    other_terms : list of str
        Terms belonging to the opposite topic.
    own_prob : float
        Probability that a draw comes from `own_terms`.
    n_draws : int
        Number of draws (not the size of the result, since repeats are dropped).
    echo : bool
        When True, print every drawn term, including repeats.

    Returns
    -------
    list of str
        The unique terms that were drawn.
    """
    vocabulary = []
    for _ in range(n_draws):
        # Keep the call order random() -> randrange() so the stream of random
        # numbers is consumed exactly as before.
        from_own_topic = random() <= own_prob
        pool = own_terms if from_own_topic else other_terms
        word = pool[randrange(len(pool))]
        if echo:
            print(word)
        if word in vocabulary:
            continue
        if from_own_topic:
            vocabulary.insert(0, word)
        else:
            vocabulary.append(word)
    return vocabulary


# The cold-war draws are echoed so the sampling can be followed in the output.
cold_war_words = _draw_vocabulary(
    cold_war_terms, economic_policy_terms, cold_war_topics[0], echo=True
)
economic_policy_words = _draw_vocabulary(
    economic_policy_terms, cold_war_terms, economic_policy_topics[0]
)
    
    

cold
soviet
rate
economic
rate
interest
interest
powers
soviet
western


In [7]:
# Show both sampled vocabularies, one list per line.
print(cold_war_words, economic_policy_words, sep='\n')

['western', 'powers', 'soviet', 'cold', 'rate', 'economic', 'interest']
['bank', 'rate', 'money', 'economic', 'interest', 'monetary', 'political']


In [8]:
# For each topic, draw one word-probability vector from a flat Dirichlet
# (all concentration parameters equal to 1) and accumulate it into a
# cumulative distribution that can be searched with a uniform random number.
# `size=1` yields a single-row 2-D array; the cumulative sum is flattened to 1-D.
cold_war_den = np.random.dirichlet([1.0] * len(cold_war_words), size=1)
cold_war_dist = cold_war_den.cumsum()

economic_policy_den = np.random.dirichlet([1.0] * len(economic_policy_words), size=1)
economic_policy_dist = economic_policy_den.cumsum()

In [9]:
# Show the cold-war topic's per-word probabilities (the single row of the
# density array) ...
print(list(cold_war_den[0]))
# ... and the economic-policy topic's cumulative distribution.
print(economic_policy_dist)

[0.04582576241792725, 0.25051825324052573, 0.08744440572036905, 0.11521107379119272, 0.2964406279105447, 0.03312604712195362, 0.1714338297974868]
[0.3874439  0.56212799 0.57689658 0.71144367 0.74588585 0.87237972
 1.        ]


In [10]:
# One inverse-CDF draw: locate a uniform random number inside the cumulative
# distribution and show the position together with the word found there.
n = random()
index = cold_war_dist.searchsorted(n)
(index, cold_war_words[index])

(1, 'powers')

In [11]:
def _sample_documents(words, cumulative_dist, n_docs=5, doc_len=10):
    """Generate synthetic documents by inverse-CDF sampling from one topic.

    Parameters
    ----------
    words : list of str
        Topic vocabulary; position i corresponds to entry i of `cumulative_dist`.
    cumulative_dist : array-like of float
        Non-decreasing cumulative word probabilities (last entry ~1.0).
    n_docs : int
        Number of documents to generate.
    doc_len : int
        Number of words per document.

    Returns
    -------
    list of str
        `n_docs` documents, each `doc_len` space-separated words.
    """
    last = len(words) - 1
    docs = []
    for _ in range(n_docs):
        tokens = []
        for _ in range(doc_len):
            # Floating-point rounding can leave the final cumulative value a
            # hair below 1.0; clamp so a draw above it still maps to the last
            # word instead of indexing past the end of `words`.
            position = min(int(np.searchsorted(cumulative_dist, random())), last)
            tokens.append(words[position])
        docs.append(' '.join(tokens))
    return docs


# Generate 5 documents of 10 words for each topic. `topics` holds the true
# label of every document: 0 = cold war, 1 = economic policy.
cold_war_documents = _sample_documents(cold_war_words, cold_war_dist)
economic_policy_documents = _sample_documents(economic_policy_words, economic_policy_dist)

documents = cold_war_documents + economic_policy_documents
topics = [0] * len(cold_war_documents) + [1] * len(economic_policy_documents)

In [12]:
# Print every generated document on its own line.
if documents:
    print('\n'.join(documents))

rate rate powers powers interest rate cold rate rate soviet
western powers powers powers cold rate economic powers interest rate
western soviet rate rate interest powers interest economic rate cold
western rate rate cold rate powers rate rate powers rate
cold cold interest powers western economic rate powers rate interest
bank political bank political monetary bank monetary economic rate rate
bank rate rate rate bank bank monetary bank economic monetary
economic rate bank economic monetary monetary interest bank monetary bank
rate bank economic economic political economic monetary rate economic economic
rate monetary bank bank economic monetary economic rate political bank


In [13]:
# Lift pandas' display truncation so frames are shown in full:
# every column, every row, and the full width of each cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [14]:
from gensim.corpora import Dictionary

In [15]:
# Tokenise each document on whitespace.
dataset = list(map(str.split, documents))

In [16]:
#dataset

In [17]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# tokenised documents.
dictionary = Dictionary(dataset)

In [18]:
# Index the dictionary once purely for its side effect: gensim fills
# `id2token` lazily, so the mapping is only populated after the first lookup
# by id. `temp` itself is not used.
temp = dictionary[0]
# id -> token mapping, later handed to LdaModel as `id2word`.
id2word = dictionary.id2token

In [19]:
# Display the id -> token mapping.
id2word

{0: 'cold',
 1: 'interest',
 2: 'powers',
 3: 'rate',
 4: 'soviet',
 5: 'economic',
 6: 'western',
 7: 'bank',
 8: 'monetary',
 9: 'political'}

In [20]:
# Convert every tokenised document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, dataset))

In [21]:
#corpus

In [22]:
# Fit LDA on the bag-of-words corpus. alpha='auto' / eta='auto' ask gensim to
# learn the document-topic and topic-word priors from the data.
# NOTE(review): no random_state is passed, so the fitted topics (and which
# topic gets which id) can differ from run to run.
model = LdaModel(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    # training schedule
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
)

In [23]:
# Sanity check: vocabulary size and number of documents fed to the model.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 10
Number of documents: 10


In [24]:
# Each entry of top_topics is a pair: (list of (probability, word), coherence).
top_topics = model.top_topics(corpus)

# Mean coherence = sum of the per-topic coherence scores / number of topics.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics
#print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Show the second entry of the list (word probabilities plus coherence).
print(top_topics[1])

([(0.2635903, 'bank'), (0.20912604, 'economic'), (0.19089016, 'monetary'), (0.19083814, 'rate'), (0.08180606, 'political'), (0.027287923, 'interest'), (0.009150289, 'powers'), (0.009109061, 'cold'), (0.009107611, 'western'), (0.0090943035, 'soviet')], -8.230941752285617)


In [25]:
# Per-document topic assignments: one list of (topic id, probability) pairs
# for every bag-of-words document in the corpus.
l = list(map(model.get_document_topics, corpus))

In [26]:
# Dominant topic id of every document. Each element of `l` is a list of
# (topic id, probability) pairs that is NOT ordered by probability, so the
# most likely topic has to be selected explicitly instead of taking the
# first pair.
lda_topic = [max(v, key=lambda pair: pair[1])[0] for v in l]

In [27]:
# Show the topic id chosen for each document, in corpus order.
print(lda_topic)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [28]:
# One row per document: its text, the topic it was generated from, and the
# topic id assigned by the fitted model.
df = pd.DataFrame(dict(text=documents, topic=topics, lda_topic=lda_topic))

In [29]:
# Display the per-document comparison of true vs. inferred topic.
df

Unnamed: 0,text,topic,lda_topic
0,rate rate powers powers interest rate cold rate rate soviet,0,1
1,western powers powers powers cold rate economic powers interest rate,0,1
2,western soviet rate rate interest powers interest economic rate cold,0,1
3,western rate rate cold rate powers rate rate powers rate,0,1
4,cold cold interest powers western economic rate powers rate interest,0,1
5,bank political bank political monetary bank monetary economic rate rate,1,0
6,bank rate rate rate bank bank monetary bank economic monetary,1,0
7,economic rate bank economic monetary monetary interest bank monetary bank,1,0
8,rate bank economic economic political economic monetary rate economic economic,1,0
9,rate monetary bank bank economic monetary economic rate political bank,1,0


In [30]:
# Spearman rank correlation between the generating topic and the inferred one.
# LDA numbers its topics arbitrarily, so only the magnitude is informative:
# a value of -1.0 means the two labelings agree perfectly with the ids swapped.
df.topic.corr(df.lda_topic, method='spearman')

-1.0

In [31]:
# Tabulate each topic's vocabulary next to the probability it was generated
# with (the single row of the corresponding density array).
df_cold_war = pd.DataFrame(dict(term=cold_war_words, density=cold_war_den[0]))
df_economic_policy = pd.DataFrame(
    dict(term=economic_policy_words, density=economic_policy_den[0])
)

In [32]:
# Generating word probabilities of the cold-war topic, first as a
# term -> probability dict and then as a two-column frame.
wd0 = {term: prob for term, prob in zip(cold_war_words, cold_war_den[0])}
df_wd0 = pd.DataFrame(list(wd0.items()), columns=['Term', 'Prob'])

In [33]:
# Display the cold-war topic's generating word probabilities.
df_wd0

Unnamed: 0,Term,Prob
0,western,0.045826
1,powers,0.250518
2,soviet,0.087444
3,cold,0.115211
4,rate,0.296441
5,economic,0.033126
6,interest,0.171434


In [34]:
# Generating word probabilities of the economic-policy topic, first as a
# term -> probability dict and then as a two-column frame.
wd1 = {term: prob for term, prob in zip(economic_policy_words, economic_policy_den[0])}
df_wd1 = pd.DataFrame(list(wd1.items()), columns=['Term', 'Prob'])

In [35]:
# Display the economic-policy topic's generating word probabilities.
df_wd1

Unnamed: 0,Term,Prob
0,bank,0.387444
1,rate,0.174684
2,money,0.014769
3,economic,0.134547
4,interest,0.034442
5,monetary,0.126494
6,political,0.12762


In [36]:
# Word probabilities learned for model topic 0 and model topic 1.
# The topics are looked up by the model's own topic id (the same ids that
# get_document_topics reports) rather than by position in `top_topics`:
# that list is ordered by coherence, so its positions need not match the
# topic ids and can change from one fit to the next.
# show_topic returns (word, probability) pairs; the columns are reordered to
# keep the Prob / Term layout.
df_0 = pd.DataFrame(
    model.show_topic(0, topn=len(dictionary)), columns=['Term', 'Prob']
)[['Prob', 'Term']]
df_1 = pd.DataFrame(
    model.show_topic(1, topn=len(dictionary)), columns=['Term', 'Prob']
)[['Prob', 'Term']]

In [37]:
# Display the learned word probabilities held in df_1.
df_1

Unnamed: 0,Prob,Term
0,0.336406,rate
1,0.208993,powers
2,0.118149,interest
3,0.118143,cold
4,0.081787,western
5,0.063629,economic
6,0.045444,soviet
7,0.009186,bank
8,0.009145,monetary
9,0.009117,political


In [38]:
# Side-by-side comparison, one row per term of df_0:
#   wd0 / wd1 - generating probabilities taken from df_wd0 / df_wd1
#   C0  / C1  - learned probabilities taken from df_0 / df_1
# The LEFT JOINs keep every term of df_0; a term that is absent from one of
# the generating vocabularies gets a NULL (shown as NaN) in that column.
# Rows are ordered by the wd0 column, largest first.
pysqldf("""
SELECT 
    df_0.Term, df_wd0.Prob as wd0, df_wd1.Prob as wd1, df_0.Prob as C0, df_1.Prob as C1
FROM
    df_0
LEFT JOIN
    df_1
ON
    df_0.Term = df_1.Term
LEFT JOIN
    df_wd0
ON
    df_0.Term = df_wd0.Term
LEFT JOIN
    df_wd1
ON
    df_0.Term = df_wd1.Term
ORDER BY wd0 DESC

""")

Unnamed: 0,Term,wd0,wd1,C0,C1
0,rate,0.296441,0.174684,0.190838,0.336406
1,powers,0.250518,,0.00915,0.208993
2,interest,0.171434,0.034442,0.027288,0.118149
3,cold,0.115211,,0.009109,0.118143
4,soviet,0.087444,,0.009094,0.045444
5,western,0.045826,,0.009108,0.081787
6,economic,0.033126,0.134547,0.209126,0.063629
7,bank,,0.387444,0.26359,0.009186
8,monetary,,0.126494,0.19089,0.009145
9,political,,0.12762,0.081806,0.009117
