In [24]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import numpy as np
import os

tokenizer = RegexpTokenizer(r'\w+')

In [25]:
# create English stop words list
en_stop = get_stop_words('en')

In [26]:
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    

In [27]:
paper_directory = "../2014/papers_text/"

#paper_array = names of all papers in dataset ordered lexically
paper_array = []
for filename in os.listdir(paper_directory):
    if filename.endswith(".txt"): 
        #print(os.path.join(directory, filename))
        paper_array.append(filename[:-4])
paper_array = sorted(paper_array)

filenames = [paper_directory + x + ".txt" for x in paper_array]
doc_list = [open(f).read() for f in filenames[:2]]

In [28]:
# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_list:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # removing words of length < 3 (some integers were interfering in topics otherwise)
    filtered_tokens = [i for i in stemmed_tokens if len(i)>2]
    
    # add tokens to list
    texts.append(filtered_tokens)

In [29]:
texts[:5]

[['bustuc',
  'natura',
  u'anguag',
  u'rout',
  'rac',
  'tore',
  u'ambl',
  'dept',
  u'comput',
  u'inform',
  u'scienc',
  u'univers',
  'trondheim',
  'norway',
  '7491',
  u'ambl',
  'idi',
  'ntnu',
  'abstract',
  'paper',
  u'describ',
  u'natur',
  u'anguag',
  u'base',
  'expert',
  'system',
  u'rout',
  'advisor',
  'public',
  'transport',
  'trondheim',
  'norway',
  'system',
  u'avail',
  'internet',
  u'intstal',
  'com',
  u'pani',
  'web',
  'server',
  u'sinc',
  u'begin',
  '1999',
  'system',
  u'bilingu',
  u'reli',
  u'intern',
  u'anguag',
  u'independ',
  'logic',
  u'represent',
  'introduct',
  'ion',
  u'natur',
  u'anguag',
  u'interfac',
  u'comput',
  u'databas',
  u'provid',
  u'user',
  u'capabl',
  u'obtain',
  u'format',
  u'store',
  u'databas',
  u'queri',
  'system',
  u'natur',
  u'languag',
  u'natur',
  u'languag',
  u'mean',
  u'commun',
  'com',
  'puter',
  'system',
  u'user',
  'can',
  'make',
  'question',
  'statement',
  'way',
  u'

In [30]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [31]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]


In [32]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)


In [34]:
ldamodel.save('ldamodel')

In [35]:
m1 = gensim.models.ldamodel.LdaModel.load('ldamodel')

In [36]:
print(m1.print_topics(num_topics=2, num_words=5))

[(0, u'0.034*"translat" + 0.022*"system" + 0.021*"languag" + 0.015*"czech" + 0.009*"word"'), (1, u'0.018*"system" + 0.011*"languag" + 0.011*"grammar" + 0.010*"natur" + 0.008*"queri"')]


In [33]:
print(ldamodel.print_topics(num_topics=2, num_words=5))

[(0, u'0.034*"translat" + 0.022*"system" + 0.021*"languag" + 0.015*"czech" + 0.009*"word"'), (1, u'0.018*"system" + 0.011*"languag" + 0.011*"grammar" + 0.010*"natur" + 0.008*"queri"')]


In [46]:
ldamodel[corpus[0]]

[(0, 0.84984623341586552), (1, 0.1501537665841344)]

In [47]:
ldamodel.inference([corpus[0]])

(array([[ 1698.84685903,   300.15314097]]), None)

In [11]:
def get_doc_topics(lda, bow):
    gamma, _ = lda.inference([bow])
    
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
    
    return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)]

In [49]:
get_doc_topics(ldamodel, corpus[0])

[(0, 0.84985114445041054), (1, 0.15014885554958948)]

In [12]:
topic_dist = []
for j in xrange(len(corpus)):
    topic_dist.append(k[1] for k in get_doc_topics(ldamodel, corpus[j]))
for entry in topic_dist:
    print entry

<generator object <genexpr> at 0x7f6f6277adc0>
<generator object <genexpr> at 0x7f6f6277ae10>
<generator object <genexpr> at 0x7f6f6277aeb0>
<generator object <genexpr> at 0x7f6f60b74050>
<generator object <genexpr> at 0x7f6f60b74190>
<generator object <genexpr> at 0x7f6f60b74320>
<generator object <genexpr> at 0x7f6f60b740a0>
<generator object <genexpr> at 0x7f6f60b741e0>
<generator object <genexpr> at 0x7f6f60b74370>
<generator object <genexpr> at 0x7f6f60b743c0>
<generator object <genexpr> at 0x7f6f60b74410>
<generator object <genexpr> at 0x7f6f60b744b0>
<generator object <genexpr> at 0x7f6f60b74280>
<generator object <genexpr> at 0x7f6f60b74640>
<generator object <genexpr> at 0x7f6f60b745a0>
<generator object <genexpr> at 0x7f6f60b74690>
<generator object <genexpr> at 0x7f6f60b742d0>
<generator object <genexpr> at 0x7f6f60b745f0>
<generator object <genexpr> at 0x7f6f60b747d0>
<generator object <genexpr> at 0x7f6f60b74730>
<generator object <genexpr> at 0x7f6f60b74500>
<generator ob

In [61]:
pd = []
count=0
for entry in topic_dist:
    count+=1
    pd.append([j[1] for j in entry ])
for pds in pd:    
    print pds
print count

[0.84984995176808187, 0.15015004823191813]
[0.025795282646404571, 0.97420471735359537]
[0.11072053148144659, 0.88927946851855355]
[0.00069883687122690148, 0.99930116312877304]
[0.0062364830610904219, 0.99376351693890952]
[0.00038104989391370418, 0.99961895010608637]
[0.00025209656514479288, 0.99974790343485509]
[0.00031290535553427096, 0.99968709464446581]
[0.00031579517634078096, 0.99968420482365927]
[0.00036250533407085567, 0.9996374946659291]
[0.99873271257267804, 0.0012672874273220842]
[0.86330788424849714, 0.13669211575150278]
[0.11591977183050814, 0.88408022816949183]
[0.00020479465608878983, 0.99979520534391131]
[0.00034154391733669199, 0.99965845608266335]
[0.00025776237353681571, 0.99974223762646319]
[0.00031285936275076949, 0.99968714063724928]
[0.0011355365630097115, 0.99886446343699031]
[0.037567849360063334, 0.9624321506399367]
[0.0085653911813352891, 0.99143460881866474]
[0.99971773863276037, 0.00028226136723967385]
[0.99923063727003758, 0.00076936272996245246]
[0.9983302

In [41]:
ldamodel.get_document_topics(corpus[0])

[(0, 0.99961207911446837)]

In [33]:
ldamodel[corpus[0]]

[(0, 0.94263649115196291), (1, 0.057363508848037148)]

In [52]:
#JSDiv_0
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np

def JSD(P, Q):
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [53]:
# JSDiv_1
def jsdiv(P, Q):
    """Compute the Jensen-Shannon divergence between two probability distributions.

    Input
    -----
    P, Q : array-like
        Probability distributions of equal length that sum to 1
    """

    def _kldiv(A, B):
        return np.sum([v for v in A * np.log2(A/B) if not np.isnan(v)])

    P = np.array(P)
    Q = np.array(Q)

    M = 0.5 * (P + Q)

    return 0.5 * (_kldiv(P, M) +_kldiv(Q, M))

In [34]:
a = (9,0)

In [35]:
type(a)

tuple

In [36]:
a[1]

0

In [66]:
JSD(pd[0],pd[1])

0.41400665037249312

In [67]:
JSD(pd[0],pd[4])

0.4523300746974277

In [68]:
JSD(pd[2],pd[4])

0.029809960143430105

In [69]:
JSD(pd[3],pd[4])

0.0012743512921435996

In [70]:
JSD(pd[0],pd[2])

0.30687543131537759

In [71]:
JSD(pd[2],pd[2])

0.0

In [63]:
div_mat = np.zeros((50,50))
for i in xrange(50):
    for j in xrange(50):
        div_mat[i][j] = JSD(pd[i],pd[j]) 

In [64]:
div_mat[0]

array([  0.00000000e+00,   4.14006650e-01,   3.06875431e-01,
         4.67564559e-01,   4.52330075e-01,   4.68714447e-01,
         4.69214960e-01,   4.68975633e-01,   4.68964416e-01,
         4.68784871e-01,   5.18164066e-02,   1.84341683e-04,
         3.01718669e-01,   4.69406146e-01,   4.68865046e-01,
         4.69192375e-01,   4.68975811e-01,   4.66101139e-01,
         3.95243674e-01,   4.46955636e-01,   5.41480108e-02,
         5.29105483e-02,   5.10064896e-02,   5.08351359e-02,
         5.43090286e-02,   4.55309427e-01,   5.42758858e-02,
         4.41764894e-02,   4.66047355e-01,   5.38739949e-02,
         5.41304044e-02,   2.82220088e-01,   5.42326995e-02,
         5.38171180e-02,   5.36343690e-02,   5.38022389e-02,
         5.42302027e-02,   5.38349435e-02,   4.77551884e-02,
         5.40461886e-02,   5.40044171e-02,   5.38787465e-02,
         1.42216051e-01,   5.38797870e-02,   5.31012976e-02,
         4.68787974e-01,   4.69355245e-01,   4.69106167e-01,
         4.15899348e-01,

In [65]:
for entry in div_mat:
    print entry

[  0.00000000e+00   4.14006650e-01   3.06875431e-01   4.67564559e-01
   4.52330075e-01   4.68714447e-01   4.69214960e-01   4.68975633e-01
   4.68964416e-01   4.68784871e-01   5.18164066e-02   1.84341683e-04
   3.01718669e-01   4.69406146e-01   4.68865046e-01   4.69192375e-01
   4.68975811e-01   4.66101139e-01   3.95243674e-01   4.46955636e-01
   5.41480108e-02   5.29105483e-02   5.10064896e-02   5.08351359e-02
   5.43090286e-02   4.55309427e-01   5.42758858e-02   4.41764894e-02
   4.66047355e-01   5.38739949e-02   5.41304044e-02   2.82220088e-01
   5.42326995e-02   5.38171180e-02   5.36343690e-02   5.38022389e-02
   5.42302027e-02   5.38349435e-02   4.77551884e-02   5.40461886e-02
   5.40044171e-02   5.38787465e-02   1.42216051e-01   5.38797870e-02
   5.31012976e-02   4.68787974e-01   4.69355245e-01   4.69106167e-01
   4.15899348e-01   5.35624995e-02]
[  4.14006650e-01   0.00000000e+00   1.51958484e-02   7.64695397e-03
   3.25473691e-03   8.15884280e-03   8.39993837e-03   8.28297907e-0

In [11]:
s = "¿qué?"
u = unicode(s, "utf-8")


In [12]:
u


u'\xbfqu\xe9?'

In [10]:
if u:
    print 6

6


In [None]:
def gen_ind_graph(paper_id, out_net, in_net,facet):
    
    nodes = []
    out_1 = out_net[paper_id][facet]
    in_1 = in_net[paper_id][facet]
    nodes=nodes+out_1+in_1                           ## adds all 1-hop papers...
   
                               
    for paper in out_1:                              ## adds all 2-hop papers...
        nodes=nodes+out_net[paper][facet]
        nodes=nodes+in_net[paper][facet]
    for paper in in_1:
        nodes=nodes+out_net[paper][facet]
        nodes=nodes+in_net[paper][facet]
    
    nodes = nodes + cos_sim_top(paper_id)            ## adds papers above COS_LIMIT

    node_set = set(nodes)                            ## takes unique values only...
    nodes = list(node_set)
    
    tpm = np.zeros((len(nodes),len(nodes)))
    graph = []
    
    for i in range(1,len(nodes)):
        for j in range(0,i):
            p1 = nodes[i]
            p2 = nodes[j]
            if (p1 in out_net[p2][facet]) or (p2 in out_net[p1][facet]) or (cos_sim(p1,p2)>COS_LIMIT):
                tpm[i][j] = cos_sim(p1,p2)
                tpm[j][i] = tpm[i][j]
    for i in xrange(len(nodes)):
        tpm[i] = tpm[i]/float(np.sum(tpm[i]))*0.9 
        tpm[i]+= (1-np.sum(tpm[i]))/len(nodes)
        
    for i in range(0,len(nodes)):
        for j in range(0,len(nodes)):    
            graph.append([nodes[i], nodes[j], tpm[i,j]])
                
    return graph