In [1]:
# Reference: gensim LDA tutorial - https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html

In [2]:
from random import randrange, random, seed

import numpy as np
import pandas as pd
from gensim.models import LdaModel
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` with pandasql's ``sqldf``, using this module's globals as the environment."""
    return sqldf(q, globals())

In [3]:
# LDA training hyper-parameters; all of them are passed to LdaModel further down.
num_topics = 2     # two synthetic topics are generated: cold war and economic policy
chunksize = 2000   # documents per training chunk
passes = 20        # passes over the corpus during training
iterations = 400   # iteration cap when inferring a document's topic distribution
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# The notebook draws random numbers from both the stdlib `random` module and
# numpy; seed both up front so a Restart & Run All regenerates the same
# vocabularies and documents.
SEED = 42
seed(SEED)
np.random.seed(SEED)

In [4]:
# Candidate terms for the "cold war" topic.
cold_war_terms = [
    'soviet',
    'cold',
    'war',
    'nato',
    'communist',
    'capitalist',
    'political',
    'western',
    'powers',
    'propaganda',
]

# Candidate terms for the "economic policy" topic.
economic_policy_terms = [
    'financial',
    'taxation',
    'monetary',
    'policy',
    'economic',
    'interest',
    'rate',
    'money',
    'supply',
    'bank',
]

In [5]:
# Topic mix used when generating each topic's vocabulary.
# Entry 0 is the probability that a drawn term comes from the topic's own
# term list; otherwise the term is drawn from the other topic's list.
cold_war_topics = [0.75, 0.25]
economic_policy_topics = [0.75, 0.25]

In [6]:
def _sample_topic_vocabulary(primary_terms, secondary_terms, primary_weight,
                             n_draws=10, verbose=False):
    """Draw the vocabulary of one synthetic topic.

    Each of the ``n_draws`` draws picks a uniformly random term from
    ``primary_terms`` when ``random() <= primary_weight`` and from
    ``secondary_terms`` otherwise. A term that was already drawn is not
    added again. New primary terms are inserted at the front of the result
    and new secondary terms are appended at the end.

    Parameters
    ----------
    primary_terms : list of str
        The topic's own terms.
    secondary_terms : list of str
        Terms borrowed from the other topic.
    primary_weight : float
        Probability of drawing from ``primary_terms``.
    n_draws : int
        Number of draws; the vocabulary has at most this many terms.
    verbose : bool
        When true, print every drawn term (including repeats).

    Returns
    -------
    list of str
        The distinct terms that were drawn.
    """
    vocabulary = []
    for _ in range(n_draws):
        # Keep the call order random() -> randrange() for every draw.
        from_primary = random() <= primary_weight
        pool = primary_terms if from_primary else secondary_terms
        term = pool[randrange(len(pool))]
        if verbose:
            print(term)
        if term in vocabulary:
            continue
        if from_primary:
            vocabulary.insert(0, term)
        else:
            vocabulary.append(term)
    return vocabulary


# Cold-war vocabulary; the drawn terms are printed so the sampling is visible.
cold_war_words = _sample_topic_vocabulary(
    cold_war_terms, economic_policy_terms, cold_war_topics[0], verbose=True
)

# Economic-policy vocabulary, drawn the same way with the term lists swapped.
economic_policy_words = _sample_topic_vocabulary(
    economic_policy_terms, cold_war_terms, economic_policy_topics[0]
)
    
    

propaganda
communist
powers
monetary
propaganda
soviet
political
nato
cold
economic


In [7]:
# Show both sampled vocabularies, one per line.
print(cold_war_words, economic_policy_words, sep='\n')

['cold', 'nato', 'political', 'soviet', 'powers', 'communist', 'propaganda', 'monetary', 'economic']
['bank', 'financial', 'economic', 'rate', 'monetary', 'war', 'capitalist']


In [8]:
# Produce, for each topic, a Dirichlet-distributed word density and its
# cumulative sum; the cumulative vector is what words are sampled from later.
# With size=1 the density has shape (1, n_words), so row 0 holds the values;
# np.cumsum without an axis flattens it to a 1-D cumulative vector.
cold_war_den = np.random.dirichlet(np.ones(len(cold_war_words)), size=1)
cold_war_dist = np.cumsum(cold_war_den)
# Floating-point rounding can leave the final cumulative value slightly below
# 1.0; a uniform draw above it would make np.searchsorted return an index one
# past the last word. Pin the last entry to exactly 1.0 to rule that out.
cold_war_dist[-1] = 1.0

economic_policy_den = np.random.dirichlet(np.ones(len(economic_policy_words)), size=1)
economic_policy_dist = np.cumsum(economic_policy_den)
economic_policy_dist[-1] = 1.0  # same guard as above

In [9]:
# Show the cold-war word density and the economic-policy cumulative distribution.
print(list(cold_war_den[0]), economic_policy_dist, sep='\n')

[0.08556672070985204, 0.16850599074840406, 0.10025797931714356, 0.1936988454410914, 0.15041043724215017, 0.04761487902621159, 0.12760929748762953, 0.06273813749146412, 0.06359771253605348]
[0.17264503 0.2514258  0.51254855 0.51673349 0.6469019  0.83652708
 1.        ]


In [10]:
# Draw a single word from the cold-war distribution by inverse-transform
# sampling: locate a uniform draw from [0, 1) in the cumulative vector.
n = random()
index = np.searchsorted(cold_war_dist,n)
# Display the sampled position together with the word at that position.
index, cold_war_words[index]

(8, 'economic')

In [11]:
def _sample_document(words, cumulative_dist, n_words=10):
    """Sample one synthetic document.

    Each word is chosen by locating a uniform draw from ``random()`` in
    ``cumulative_dist`` with ``np.searchsorted`` and taking the word at that
    position.

    Parameters
    ----------
    words : list of str
        Vocabulary to sample from.
    cumulative_dist : array-like of float
        Cumulative word probabilities, aligned with ``words``.
    n_words : int
        Number of words in the document.

    Returns
    -------
    str
        The sampled words joined by single spaces.
    """
    last_position = len(words) - 1
    sampled = []
    for _ in range(n_words):
        position = np.searchsorted(cumulative_dist, random())
        # Clamp: if rounding leaves the final cumulative value below the draw,
        # searchsorted returns len(words), which is not a valid index.
        sampled.append(words[min(position, last_position)])
    return ' '.join(sampled)


# Generate 5 documents from each category and record the generating topic
# (0 = cold war, 1 = economic policy) for every document.
docs_per_topic = 5
documents = []
topics = []

for topic_label, (topic_words, topic_dist) in enumerate([
    (cold_war_words, cold_war_dist),
    (economic_policy_words, economic_policy_dist),
]):
    documents.extend(
        _sample_document(topic_words, topic_dist) for doc_number in range(docs_per_topic)
    )
    topics.extend([topic_label] * docs_per_topic)

In [12]:
# Show the generated documents, one per line.
for document in documents:
    print(document)

nato propaganda nato nato political soviet soviet nato soviet political
nato political monetary powers soviet soviet propaganda powers communist soviet
nato economic nato powers nato nato nato powers powers powers
soviet political economic nato powers economic powers powers propaganda political
nato political nato economic propaganda soviet soviet powers propaganda soviet
monetary war monetary economic bank economic war capitalist economic bank
bank monetary capitalist capitalist economic economic war economic bank economic
economic economic monetary economic economic capitalist monetary economic economic economic
bank war capitalist bank war bank economic war war monetary
economic capitalist monetary bank capitalist capitalist economic financial capitalist financial


In [13]:
# Lift pandas' display limits so frames are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [14]:
from gensim.corpora import Dictionary

In [15]:
# Tokenise every document on whitespace.
dataset = list(map(str.split, documents))

In [16]:
#dataset

In [17]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised documents.
dictionary = Dictionary(dataset)

In [18]:
# Look up one id before reading id2token: in the gensim tutorial this lookup
# exists only to "load" the dictionary, i.e. to populate dictionary.id2token,
# which is presumably filled lazily. `temp` is not used in the visible cells.
temp = dictionary[0]
id2word = dictionary.id2token

In [19]:
# Display the id -> token mapping.
id2word

{0: 'nato',
 1: 'political',
 2: 'propaganda',
 3: 'soviet',
 4: 'communist',
 5: 'monetary',
 6: 'powers',
 7: 'economic',
 8: 'bank',
 9: 'capitalist',
 10: 'war',
 11: 'financial'}

In [20]:
# Convert every tokenised document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, dataset))

In [21]:
#corpus

In [22]:
# Train the LDA model on the synthetic corpus.
# random_state is fixed so that topic ids and document assignments are the
# same on every run; later cells index top_topics by position and would
# otherwise silently compare different topics from run to run.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=42,
)

In [23]:
# Report vocabulary size and corpus size.
print(
    'Number of unique tokens: %d' % len(dictionary),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 12
Number of documents: 10


In [24]:
# Each entry of top_topics is a pair: (list of (probability, term), coherence score).
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(coherence for _terms, coherence in top_topics) / num_topics
#print('Average topic coherence: %.4f.' % avg_topic_coherence)


# Show the second entry (its term probabilities and its coherence).
print(top_topics[1])

([(0.3124849, 'economic'), (0.16961162, 'capitalist'), (0.15175892, 'bank'), (0.13390757, 'monetary'), (0.13390447, 'war'), (0.044634435, 'financial'), (0.008988549, 'nato'), (0.008976396, 'powers'), (0.008936508, 'political'), (0.008936071, 'soviet'), (0.008932709, 'propaganda'), (0.008927882, 'communist')], -10.958492536191605)


In [25]:
# Infer the topic distribution of every document in the corpus.
l = list(map(model.get_document_topics, corpus))

In [26]:
# Assign each document its most probable topic.
# get_document_topics lists (topic_id, probability) pairs in topic-id order and
# omits topics below its minimum probability, so the first pair is not
# necessarily the dominant topic; select the pair with the highest probability.
lda_topic = [max(v, key=lambda pair: pair[1])[0] for v in l]

In [27]:
# Show the topic id assigned to each document, in document order.
print(lda_topic)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [28]:
# One row per document: its text, the generating topic and the LDA-assigned topic.
df = pd.DataFrame(
    dict(
        text=documents,
        topic=topics,
        lda_topic=lda_topic,
    )
)

In [29]:
# Display the per-document comparison table.
df

Unnamed: 0,text,topic,lda_topic
0,nato propaganda nato nato political soviet soviet nato soviet political,0,1
1,nato political monetary powers soviet soviet propaganda powers communist soviet,0,1
2,nato economic nato powers nato nato nato powers powers powers,0,1
3,soviet political economic nato powers economic powers powers propaganda political,0,1
4,nato political nato economic propaganda soviet soviet powers propaganda soviet,0,1
5,monetary war monetary economic bank economic war capitalist economic bank,1,0
6,bank monetary capitalist capitalist economic economic war economic bank economic,1,0
7,economic economic monetary economic economic capitalist monetary economic economic economic,1,0
8,bank war capitalist bank war bank economic war war monetary,1,0
9,economic capitalist monetary bank capitalist capitalist economic financial capitalist financial,1,0


In [30]:
# Spearman rank correlation between the generating topic and the LDA-assigned topic id.
# NOTE(review): LDA topic ids carry no inherent order, so presumably only the
# magnitude is meaningful here; -1.0 means the two labelings are exactly
# swapped, which is still a perfect separation of the documents.
df['topic'].corr(df['lda_topic'], method='spearman')

-1.0

In [31]:
# Generating word densities of each topic as (term, density) tables.
df_cold_war = pd.DataFrame(
    dict(term=cold_war_words, density=cold_war_den[0])
)
df_economic_policy = pd.DataFrame(
    dict(term=economic_policy_words, density=economic_policy_den[0])
)

In [32]:
# Term -> generating probability for the cold-war topic, and the same data as a table.
wd0 = {term: prob for term, prob in zip(cold_war_words, cold_war_den[0])}
df_wd0 = pd.DataFrame(wd0.items(), columns=['Term', 'Prob'])

In [33]:
# Display the generating word probabilities of the cold-war topic.
df_wd0

Unnamed: 0,Term,Prob
0,cold,0.085567
1,nato,0.168506
2,political,0.100258
3,soviet,0.193699
4,powers,0.15041
5,communist,0.047615
6,propaganda,0.127609
7,monetary,0.062738
8,economic,0.063598


In [34]:
# Term -> generating probability for the economic-policy topic, and the same data as a table.
wd1 = {term: prob for term, prob in zip(economic_policy_words, economic_policy_den[0])}
df_wd1 = pd.DataFrame(wd1.items(), columns=['Term', 'Prob'])

In [35]:
# Display the generating word probabilities of the economic-policy topic.
df_wd1

Unnamed: 0,Term,Prob
0,bank,0.172645
1,financial,0.078781
2,economic,0.261123
3,rate,0.004185
4,monetary,0.130168
5,war,0.189625
6,capitalist,0.163473


In [36]:
# Learned term probabilities per topic, taken from the (probability, term)
# lists stored in top_topics.
# Note the crossed indices: df_0 comes from top_topics[1] and df_1 from top_topics[0].
# NOTE(review): which learned topic matches which generating topic can differ
# between runs - check the displayed terms before interpreting these tables.
df_0 = pd.DataFrame(top_topics[1][0],columns=['Prob', 'Term'])
df_1 = pd.DataFrame(top_topics[0][0],columns=['Prob', 'Term'])

In [37]:
# Display the learned term probabilities stored in df_1.
df_1

Unnamed: 0,Prob,Term
0,0.241047,nato
1,0.18752,soviet
2,0.18748,powers
3,0.11608,political
4,0.098224,propaganda
5,0.080337,economic
6,0.02679,monetary
7,0.026789,communist
8,0.008935,capitalist
9,0.008933,war


In [38]:
# Side-by-side comparison per term, driven by the terms present in df_0:
#   wd0 / wd1 - generating probabilities from df_wd0 / df_wd1
#   C0  / C1  - learned probabilities from df_0 / df_1
# LEFT JOINs keep every df_0 term; a term missing from another table shows as
# a missing value. Rows are ordered by wd0, highest first.
pysqldf("""
SELECT 
    df_0.Term, df_wd0.Prob as wd0, df_wd1.Prob as wd1, df_0.Prob as C0, df_1.Prob as C1
FROM
    df_0
LEFT JOIN
    df_1
ON
    df_0.Term = df_1.Term
LEFT JOIN
    df_wd0
ON
    df_0.Term = df_wd0.Term
LEFT JOIN
    df_wd1
ON
    df_0.Term = df_wd1.Term
ORDER BY wd0 DESC

""")

Unnamed: 0,Term,wd0,wd1,C0,C1
0,soviet,0.193699,,0.008936,0.18752
1,nato,0.168506,,0.008989,0.241047
2,powers,0.15041,,0.008976,0.18748
3,propaganda,0.127609,,0.008933,0.098224
4,political,0.100258,,0.008937,0.11608
5,economic,0.063598,0.261123,0.312485,0.080337
6,monetary,0.062738,0.130168,0.133908,0.02679
7,communist,0.047615,,0.008928,0.026789
8,capitalist,,0.163473,0.169612,0.008935
9,bank,,0.172645,0.151759,0.008933
