In [9]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import MmCorpus
from gensim.corpora import Dictionary
from collections import Counter
from numpy.random import seed
from collections import defaultdict
from metadata.metadata import ACL_metadata
from metadata import Gender
import matplotlib.pyplot as plt
import numpy as np
import logging
import gensim 
import pyLDAvis.gensim as gensimvis
import pyLDAvis
seed(1)
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO

In [2]:
model = gensim.models.wrappers.ldamallet.LdaMallet.load("../ldamodelmallet.lda")

loading LdaMallet object from ../ldamodelmallet.lda
loading id2word recursively from ../ldamodelmallet.lda.id2word.* with mmap=None
loaded ../ldamodelmallet.lda


In [6]:
doc_topics = MmCorpus('corpusldamallet.mm')

loaded corpus index from corpusldamallet.mm.index
initializing corpus reader from corpusldamallet.mm
accepted corpus with 22405 documents, 100 features, 2240500 non-zero entries


In [7]:
acl = ACL_metadata()
df = acl.modeling_df

Remember to use acl.modeling_files and modeling_df for topic modeling


In [8]:
for i, file in enumerate(acl.modeling_files[:10]):
    title = df.loc[acl.get_id(acl.modeling_files[i])]['title']
    print(title)
    m = max(doc_topics[i], key = lambda x: x[1])[0]
    print(m)
    a = model.show_topic(m,10)
    print(a)

BusTUC - A Natural Language Bus Route Oracle
73
[('computer', 0.071882374296605558), ('store', 0.027242693773824652), ('align_allel', 0.026414957342530406), ('telephone_conversation', 0.023096750771464877), ('http', 0.014695952078417136), ('management', 0.013163913595933927), ('node', 0.012800871301506625), ('transfer', 0.012089308404429115), ('statistical', 0.010143401706298783), ('automata_theory', 0.0085169722272644757)]
Machine Translation Of Very Close Languages
24
[('matching', 0.2850304595391478), ('gian', 0.056287477568170874), ('beginning', 0.053836389898017242), ('alike', 0.029966138382998365), ('slavic', 0.028672950894686789), ('align', 0.019091426365904416), ('memory', 0.018773103291858488), ('open', 0.013966424873765006), ('den', 0.01392265545108369), ('imit', 0.013811242375167617)]
Cross-Language Multimedia Information Retrieval
6
[('literal', 0.0096342060802367315), ('command', 0.0092293234961861054), ('merger', 0.0089314098911315726), ('log_linear', 0.008923412210458966

In [32]:
print(a)

[('translation', 0.21969493714910318), ('translate', 0.045941604146999881), ('parallel', 0.043997283913813946), ('bilingual', 0.043669366191351582), ('corpora', 0.025090674218711185)]


In [33]:
a =model.print_topic(23,5)

In [8]:
females_topics = []
females_topics_proportions = defaultdict(int) # P(topic|gender)'
males_topics_proportions = defaultdict(int)
fem_norm = 0 # normalizer
male_norm = 0
males_topics = []
for i, file in enumerate(acl.modeling_files):
    try:
        topics = doc_topics[i]
        gender = df.loc[acl.get_id(file)]["genders"][0] #first author
        if gender == Gender.female:
            for topic, p in topics:
                females_topics_proportions[topic] += p
            fem_norm +=1
        elif gender == Gender.male:
            for topic, p in topics:
                males_topics_proportions[topic] += p
            male_norm +=1
    except KeyError:
        pass

In [9]:
for idx in range(100):
    f = females_topics_proportions[idx] / fem_norm
    m = males_topics_proportions[idx] / male_norm
    odds = (f * (1-f)) / (m * (1-m))
    if odds > 1.5:
        print("Females are {0:.2f} more likeky than men to write about topic {1:d}".format(odds, idx))
        print("Most common words for this topic are:")
        for w,p in model.show_topic(idx,10):
            print(w,)
        print("-"*30)
        print("-"*30)


Females are 1.64 more likeky than men to write about topic 4
Most common words for this topic are:
frame
role
framenet
agent
theme
thematic
alternation
verbs
object
argument
------------------------------
------------------------------
Females are 1.51 more likeky than men to write about topic 18
Most common words for this topic are:
annotation
annotate
scheme
annotator
corpora
annotated
manual
mark
annotation_scheme
guideline
------------------------------
------------------------------
Females are 1.90 more likeky than men to write about topic 24
Most common words for this topic are:
student
learner
essay
read
reading
native
grade
tutor
feedback
readability
------------------------------
------------------------------
Females are 1.60 more likeky than men to write about topic 33
Most common words for this topic are:
connective
implicit
discourse
causal
explicit
indicator
license
argument
sentential
creative
------------------------------
------------------------------
Females are 1.9

In [11]:
for idx in range(100):
    f = females_topics_proportions[idx] / fem_norm
    m = males_topics_proportions[idx] / male_norm
    odds = (m * (1-m)) / (f * (1-f))
    if odds > 1.5:
        print("Males are {0:.2f} more likeky than females to write about topic {1:d}".format(odds, idx))
        print("Most common words for this topic are:")
        for w,p in model.show_topic(idx,10):
            print(w,)
        print("-"*30)
        print("-"*30)


Males are 1.67 more likeky than females to write about topic 0
Most common words for this topic are:
span
forest
beam
prune
decoding
derivation
pruning
chart
decode
beam_search
------------------------------
------------------------------
Males are 1.72 more likeky than females to write about topic 3
Most common words for this topic are:
japanese
korean
expression
particle
fig
kanji
candidate
bunsetsu
thai
morpheme
------------------------------
------------------------------
Males are 1.65 more likeky than females to write about topic 26
Most common words for this topic are:
joint
linear
loss
inference
objective
update
vector
perceptron
prediction
log
------------------------------
------------------------------
Males are 1.94 more likeky than females to write about topic 34
Most common words for this topic are:
path
finite
transducer
lattice
finite_state
string
transition
regular
arc
automaton
------------------------------
------------------------------
Males are 1.50 more likeky th

In [21]:
 model.show_topic(20,10)

[('article', 0.15098623060851826),
 ('news', 0.10720219783835359),
 ('story', 0.056648625528484708),
 ('paragraph', 0.031700857091154361),
 ('title', 0.027793771694605753),
 ('compression', 0.027555233849342785),
 ('news_article', 0.018095974468225112),
 ('headline', 0.015266422096829914),
 ('topic', 0.01157319821672397),
 ('newspaper', 0.011137249740898547)]