In [56]:
from __future__ import unicode_literals
import os
from gensim import corpora, models
import string
import nltk
import numpy as np
from langdetect import detect
import re
import pickle
from pattern.text.fr import parse
import math

from whoosh import query
import whoosh.index as index
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir
from whoosh.query import Term, SpanNear, And

import sys  

# Python 2 idiom: reload `sys` and then switch the interpreter-wide default
# string encoding to UTF-8.
# NOTE(review): presumably needed for implicit str/unicode conversions on the
# accented French text further down — confirm it is still required.
reload(sys)  
sys.setdefaultencoding('utf8')

In [2]:
%install_ext autotime.py
%load_ext autotime

# Exécution du topic modeling
Importer les données des pickles : la structure texts contenant la liste de listes de mots, infopath contenant les noms et chemins des documents et infolength contenant la taille des documents.
Construction du dictionnaire et de la structure corpus, puis exécution du LDA.


In [3]:
# Load the three pickled corpus structures from the "5000" directory located
# in the current working directory:
#   texts      - list of documents, each a list of words
#   infopath   - names and paths of the documents
#   infolength - sizes of the documents
# Each file is opened in a `with` block so its handle is closed right after
# loading (the previous version left all three handles open).
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
data_dir = os.path.join(os.getcwd(), "5000")
with open(os.path.join(data_dir, "texts.obj"), "rb") as pickle_file:
    texts = pickle.load(pickle_file)
with open(os.path.join(data_dir, "infopath.obj"), "rb") as pickle_file:
    infopath = pickle.load(pickle_file)
with open(os.path.join(data_dir, "infolength.obj"), "rb") as pickle_file:
    infolength = pickle.load(pickle_file)

In [4]:
import unicodedata # used to replace accented characters
# Decompose every word with NFD normalisation, then encode to ASCII while
# ignoring the characters that cannot be encoded; `texts` is rebound to the result.
texts = [[unicodedata.normalize('NFD', word).encode('ascii', 'ignore') for word in text] for text in texts]

In [5]:
# Build the gensim dictionary used by LDA from the tokenised documents
dictionary = corpora.Dictionary(texts)
# Convert every document to its bag-of-words form via the dictionary
corpus = [dictionary.doc2bow(text) for text in texts] #term_frequency

In [6]:
num_topics=30 # number of topics
len_vocab=len(dictionary.token2id) # number of words in the vocabulary

In [7]:
# Run LDA on the bag-of-words corpus
# NOTE(review): no random seed is passed here, so the fitted topics presumably
# differ from one run to the next — confirm and pin a seed if reproducibility matters.
lda = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, workers =2,chunksize=10000, passes=1)

In [14]:
# Display 4 topics with their 10 top words and the associated weights
lda.print_topics(num_topics=4, num_words=10)
#lda.show_topic(29, topn=10)

[(u'faire', 0.011940197752507507),
 (u'france', 0.0088737718992821565),
 (u'a', 0.0085395776033475555),
 (u'grand', 0.0073334296902422807),
 (u'pouvoir', 0.0070702249485926141),
 (u'devoir', 0.0060057841170735116),
 (u'europeen', 0.0055063935114626769),
 (u'ministre', 0.0052916304673419458),
 (u'an', 0.0047609720792872717),
 (u'pays', 0.0047118960566896444)]

# Construction de l'index
Nous construisons un index sur l'ensemble des documents afin de faciliter l'estimation des probabilités liées aux mots et aux topics.

In [15]:
# Build the Whoosh index over the whole corpus
def create_index(texts, indexPath = "index"):
    """Create a Whoosh index containing one document per entry of ``texts``.

    Parameters
    ----------
    texts : sequence
        Documents to index. Entry ``i`` is stored in the ``content`` field
        and its position ``i`` (as a unicode string) in the ``nid`` field.
    indexPath : str
        Directory that receives the index. It is created, together with any
        missing parent directory, when it does not exist yet.
    """
    schema = Schema(content=TEXT(stored=True), nid=ID(stored=True))
    if not os.path.exists(indexPath):
        os.makedirs(indexPath)
    # Named `ix` rather than `index` so the local does not hide a module
    # imported under that name.
    ix = create_in(indexPath, schema)
    # Used as a context manager, the writer commits when the block succeeds
    # and cancels when it raises, so a failing add_document no longer leaves
    # an open, uncommitted writer behind.
    with ix.writer() as writer:
        for doc_id, content in enumerate(texts):
            writer.add_document(content=content, nid=unicode(doc_id))
# Index the whole corpus into the default "index" directory
create_index(texts)

In [16]:
idx=index.open_dir("index") # open the index stored in the "index" directory
searcher=idx.searcher() # searcher reused by all the PMI / coherence functions below

# Récupération des distributions
Nous récupérons les distributions de probabilités dans des matrices : distributions des topics sur les documents et distributions des mots sur les topics.

In [32]:
# Document-topic distribution matrix
def get_doc_topic_dists(corpus, lda, num_topics):
    """Return one row of per-topic values for every document of ``corpus``.

    Every row has ``num_topics`` entries initialised to 0. Entry ``k`` is then
    overwritten with the value that ``lda.get_document_topics`` (called with
    ``minimum_probability=0``) reports for topic ``k`` of that document.
    """
    matrix = []
    for bow in corpus:
        row = [0] * num_topics
        for topic_id, weight in lda.get_document_topics(bow, minimum_probability=0):
            row[topic_id] = weight
        matrix.append(row)
    return matrix

# Topic-document distribution matrix
def get_topic_doc_dists(doc_topic_dists):
    """Transpose the document-topic matrix.

    ``result[k][d]`` equals ``doc_topic_dists[d][k]``. The number of topics
    is taken from the length of the first row, so an empty input raises
    ``IndexError``.
    """
    n_topics = len(doc_topic_dists[0])
    return [[row[k] for row in doc_topic_dists] for k in range(n_topics)]

In [33]:
# Document x topic matrix, then its transpose (topic x document)
doc_topic_dists=get_doc_topic_dists(corpus, lda, num_topics)
topic_doc_dists=get_topic_doc_dists(doc_topic_dists)

In [27]:
# Topic-term distribution matrix
def get_topic_term_dists(lda, num_topics, len_vocab):
    """Return one row of per-term values for every topic.

    Every row has ``len_vocab`` entries initialised to 0. Entry ``t`` is then
    overwritten with the value that ``lda.get_topic_terms(topic, topn=len_vocab)``
    reports for term id ``t``.
    """
    matrix = []
    for topic_id in range(num_topics):
        row = [0] * len_vocab
        for term_id, weight in lda.get_topic_terms(topic_id, topn=len_vocab):
            row[term_id] = weight
        matrix.append(row)
    return matrix

# Term-topic distribution matrix
def get_term_topic_dists(topic_term_dists):
    """Transpose the topic-term matrix.

    ``result[w][k]`` equals ``topic_term_dists[k][w]``. The vocabulary size
    is taken from the length of the first row, so an empty input raises
    ``IndexError``.
    """
    n_terms = len(topic_term_dists[0])
    return [[row[w] for row in topic_term_dists] for w in range(n_terms)]


In [34]:
# Topic x term matrix, then its transpose (term x topic)
topic_term_dists=get_topic_term_dists(lda, num_topics, len_vocab)
term_topic_dists=get_term_topic_dists(topic_term_dists)

# Information mutuelle et entropie
Dans cette partie nous définissons les fonctions qui calculent l'information mutuelle (pmi, npmi, pmi-average) et l'entropie (relevance ou pertinence) des mots

In [60]:
# PMI for two words
def pmi(searcher, w1, w2,epsilon=0.1, window=10):
    """Return the smoothed pointwise mutual information of ``w1`` and ``w2``.

    Computes ``log((p(w1, w2) + epsilon) / (p(w1) * p(w2)))`` where

    * ``p(w)`` is the document frequency of ``w`` in the ``content`` field
      divided by the number of documents of the searcher;
    * ``p(w1, w2)`` is the number of hits of a ``SpanNear`` query on both
      terms (``slop=window``) divided by the number of documents.

    The previous version evaluated ``(p12 + epsilon) / p1 * p2``, i.e. it
    multiplied by ``p(w2)`` instead of dividing by it.

    Raises
    ------
    ZeroDivisionError
        When the index is empty or one of the words has a document
        frequency of zero.
    """
    doc_count = float(searcher.doc_count())
    p_w1 = searcher.doc_frequency("content", w1) / doc_count
    p_w2 = searcher.doc_frequency("content", w2) / doc_count
    # NOTE(review): SpanNear is left with its default ordering option; if that
    # default is "ordered", only w1-before-w2 matches are counted — confirm
    # this is the intended notion of co-occurrence.
    near_query = SpanNear(query.Term("content", w1), query.Term("content", w2), slop=window)
    p_joint = len(searcher.search(near_query)) / doc_count
    # Parenthesise the denominator: PMI divides by the PRODUCT of the marginals.
    return math.log((p_joint + epsilon) / (p_w1 * p_w2))
# NPMI for two words
def npmi(searcher, w1, w2,epsilon=0.1, window=20):
    """Return the PMI of ``w1`` and ``w2`` normalised by ``-log(p(w1, w2) + epsilon)``.

    ``p(w1, w2)`` is the number of hits of a ``SpanNear`` query on both terms
    (``slop=window``) divided by the number of documents of the searcher.
    The numerator is ``pmi(searcher, w1, w2, epsilon, window)``.
    """
    n_docs = float(searcher.doc_count())
    near_query = SpanNear(query.Term("content", w1), query.Term("content", w2), slop=window)
    p_joint = float(len(searcher.search(near_query))) / n_docs  # co-occurrence probability
    score = pmi(searcher, w1, w2, epsilon, window)
    normaliser = -math.log(p_joint + epsilon)
    return score / normaliser

In [37]:
# Compute the weight used as an estimate of p(k|w)
def marginal_k_w(doc_dist_topic_k,infolength,word_proba_topic_k):
    """Return ``word_proba_topic_k * sum(a * b)`` over the paired entries of
    ``doc_dist_topic_k`` and ``infolength``.

    Pairing uses ``zip``, so it stops at the shorter of the two sequences.
    NOTE(review): presumably ``doc_dist_topic_k[d]`` is p(k|d) and
    ``infolength[d]`` the length of document d — confirm against the caller.
    """
    weighted_mass = sum(share * length for share, length in zip(doc_dist_topic_k, infolength))
    return word_proba_topic_k * weighted_mass

# Compute the "entropy" of a word from the topic-document distributions and
# the distribution of that word over the topics
def entropy(topic_doc_dist,infolength,topic_dist_word_w):
    """Return the sum over topics ``k`` of
    ``marginal_k_w(topic_doc_dist[k], infolength, topic_dist_word_w[k])``.

    Despite its name the value is a plain sum of those terms (no logarithm
    is applied here). Returns 0 when ``topic_doc_dist`` is empty.
    """
    total = 0
    for k, doc_dist_k in enumerate(topic_doc_dist):
        total += marginal_k_w(doc_dist_k, infolength, topic_dist_word_w[k])
    return total

# Relevance of a word for topic `topic_id`
def relevance(topic_id,topic_doc_dist,infolength, topic_dist_word_w):
    """Return ``topic_dist_word_w[topic_id] / exp(entropy(...))``.

    ``topic_dist_word_w`` holds the value of the word for every topic; the
    entry for ``topic_id`` is divided by the exponential of the word's
    ``entropy`` computed over all topics.
    """
    penalty = np.exp(entropy(topic_doc_dist, infolength, topic_dist_word_w))
    return topic_dist_word_w[topic_id] / penalty
    

In [41]:
# Average PMI
# For every word of the topic, average its PMI with all the other words
def pmi_average(searcher,topic_words, epsilon=0.1, window=10):
    """Return, for each word of ``topic_words``, its mean PMI with the others.

    Parameters
    ----------
    searcher : index searcher forwarded to ``pmi``.
    topic_words : sequence of words forming the topic.
    epsilon, window : forwarded to ``pmi``; the defaults are the values that
        were previously hard-coded in the body.

    Returns
    -------
    list of float, aligned with ``topic_words``. A topic with fewer than two
    words has no pair to score, so every entry is 0.0 (the previous version
    divided by zero for a single-word topic).
    """
    n_words = len(topic_words)
    if n_words < 2:
        return [0.0] * n_words
    # The result list no longer reuses the function's own name.
    averages = []
    for i, w1 in enumerate(topic_words):
        total = 0.0
        for j, w2 in enumerate(topic_words):
            if i != j:
                total += pmi(searcher, w1, w2, epsilon=epsilon, window=window)
        averages.append(total / (n_words - 1))
    return averages
                


## Récupération des topics (fréquences)
Nous récupérons les ensembles de mots qui constituent les topics, en prenant les TOP N mots de chaque topic par ordre de fréquence.

In [None]:
# For a given topic, return the ids of its topn top terms (bag of ids)
def get_topic_terms_list_id(lda,topicid, topn=10):
    """Return the term ids yielded by ``lda.get_topic_terms(topicid, topn)``,
    in the order the model returns them; the associated weights are dropped.
    """
    return [term_id for term_id, _weight in lda.get_topic_terms(topicid, topn)]

# For a given topic, return its topn top terms as words (bag of words)
def get_topic_terms_list_alpha(lda,topicid, topn=10):
    """Return the words yielded by ``lda.show_topic(topicid, topn)``, in the
    order the model returns them; the associated weights are dropped.
    """
    return [term for term, _weight in lda.show_topic(topicid, topn)]

# List every topic as a bag of words made of its topn top terms
def get_list_of_topics_alpha_frequency(lda,num_topics, topn=10):
    """Return ``get_topic_terms_list_alpha(lda, k, topn=topn)`` for every
    topic id ``k`` in ``range(num_topics)``, as a list of word lists.
    """
    return [get_topic_terms_list_alpha(lda, topic_id, topn=topn) for topic_id in range(num_topics)]


## Récupération des topics (pertinence)
Nous récupérons les ensembles de mots qui constituent les topics, en prenant les TOP N mots de chaque topic par ordre de pertinence (relevance).

In [39]:

# For a given topic, return the ordered list of the topn most relevant terms
# among its 50 top terms
def get_relevant_topic_terms(lda,topicid,topn, topic_doc_dist, infolength, word_topic_dist):
    """Rank the 50 top terms of ``topicid`` by ``relevance`` and keep ``topn``.

    The candidate words and their ids are fetched with two separate calls
    (``get_topic_terms_list_alpha`` / ``get_topic_terms_list_id``) and paired
    by position. Returns a list of ``(relevance, word)`` tuples sorted by
    decreasing relevance; at most 50 entries can be returned.
    """
    candidate_words = get_topic_terms_list_alpha(lda, topicid, topn=50)
    candidate_ids = get_topic_terms_list_id(lda, topicid, topn=50)
    scored = []
    for position, word in enumerate(candidate_words):
        term_row = word_topic_dist[candidate_ids[position]]
        scored.append((relevance(topicid, topic_doc_dist, infolength, term_row), word))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[:topn]

# List every topic as its topn most relevant terms
def get_list_of_topics_alpha_relevance(lda,nu_topics,infolength,topic_doc_dists, term_topic_dists,topn=10):
    """Return ``get_relevant_topic_terms`` for every topic id in ``range(nu_topics)``.

    Parameters
    ----------
    lda : topic model forwarded to ``get_relevant_topic_terms``.
    nu_topics : int
        Number of topics to list. The (misspelled) name is kept so existing
        keyword callers keep working; the body now actually uses it instead
        of silently reading a global ``num_topics``.
    infolength, topic_doc_dists, term_topic_dists : forwarded unchanged.
    topn : int
        Number of terms kept per topic.

    Returns
    -------
    list with one entry per topic, each a list of ``(relevance, word)`` tuples.
    """
    return [
        get_relevant_topic_terms(lda, topic_id, topn, topic_doc_dists, infolength, term_topic_dists)
        for topic_id in range(nu_topics)
    ]

# Mesures d'évaluation de cohérence

In [42]:
#================================== Mesures de cohérences pour un seul topic

# UCI coherence of one topic
def coherence_uci(searcher, topic_words,epsilon=0.1, window=20):
    """Return the mean ``pmi`` over all unordered word pairs of the topic.

    Pairs ``(i, j)`` with ``i < j`` are scored with
    ``pmi(searcher, w_i, w_j, epsilon, window)`` and the sum is divided by
    ``N * (N - 1) / 2``. Raises ``ZeroDivisionError`` when the topic has
    fewer than two words.
    """
    total = 0
    N = len(topic_words)
    for i, first in enumerate(topic_words):
        for second in topic_words[i + 1:]:
            total += pmi(searcher, first, second, epsilon, window)
    return (total * 2) / (N * (N - 1))

# NPMI coherence of one topic
def coherence_npmi(searcher, topic_words,epsilon=0.1, window=10):
    """Return the mean ``npmi`` over all unordered word pairs of the topic.

    Pairs ``(i, j)`` with ``i < j`` are scored with
    ``npmi(searcher, w_i, w_j, epsilon, window)`` and the sum is divided by
    ``N * (N - 1) / 2``. Raises ``ZeroDivisionError`` when the topic has
    fewer than two words.
    """
    total = 0
    N = len(topic_words)
    for i, first in enumerate(topic_words):
        for second in topic_words[i + 1:]:
            total += npmi(searcher, first, second, epsilon, window)
    return (total * 2) / (N * (N - 1))

# UMass coherence of one topic
def coherence_umass(searcher,topic_words,epsilon=0.1):
    """Return the mean of ``log((p(w_i, w_j) + epsilon) / p(w_j))`` over the
    pairs ``j < i`` of the topic.

    ``p(w_i, w_j)`` is the number of hits of an ``And`` query on both terms
    and ``p(w_j)`` the number of hits of a query on ``w_j`` alone, each
    divided by the number of documents of the searcher. Raises
    ``ZeroDivisionError`` when the topic has fewer than two words.
    """
    total = 0
    N = len(topic_words)
    n_docs = float(searcher.doc_count())

    for i in range(1, N):
        later_word = topic_words[i]
        for j in range(i):
            later_term = query.Term("content", later_word)
            earlier_term = query.Term("content", topic_words[j])
            joint_hits = searcher.search(And([later_term, earlier_term]))
            p_joint = float(len(joint_hits)) / n_docs  # co-occurrence probability
            earlier_hits = searcher.search(earlier_term)
            p_earlier = float(len(earlier_hits)) / n_docs  # probability of the earlier-ranked word
            total += math.log((p_joint + epsilon) / p_earlier)
    return (total * 2) / (N * (N - 1))

#================================== Mesures de cohérences pour une liste de topics dans l'ordre

# UCI
def matrixuci(searcher,list_of_topics):
    """Score every topic with ``coherence_uci`` (epsilon=0.1, window=10).

    Returns a list of ``(coherence, topic_words, position)`` tuples sorted by
    decreasing coherence, where ``position`` is the index of the topic in
    ``list_of_topics``.
    """
    ranked = [
        (coherence_uci(searcher, topic, epsilon=0.1, window=10), topic, position)
        for position, topic in enumerate(list_of_topics)
    ]
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    return ranked
# Author's timing note: about 5 minutes

# NPMI
def matrixnpmi(searcher,list_of_topics):
    """Score every topic with ``coherence_npmi`` (epsilon=0.1, window=10).

    Returns a list of ``(coherence, topic_words, position)`` tuples sorted by
    decreasing coherence, where ``position`` is the index of the topic in
    ``list_of_topics``.
    """
    ranked = [
        (coherence_npmi(searcher, topic, epsilon=0.1, window=10), topic, position)
        for position, topic in enumerate(list_of_topics)
    ]
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    return ranked
# Author's timing note: about 11 minutes
# UMASS
def matrixumass(searcher,list_of_topics):
    """Score every topic with ``coherence_umass`` (epsilon=0.1).

    Returns a list of ``(coherence, topic_words, position)`` tuples sorted by
    decreasing coherence, where ``position`` is the index of the topic in
    ``list_of_topics``.
    """
    ranked = [
        (coherence_umass(searcher, topic, epsilon=0.1), topic, position)
        for position, topic in enumerate(list_of_topics)
    ]
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    return ranked
# Author's timing note: about 5 minutes

# Tests et résultats

## Topics organisés par fréquence

In [44]:
# Word lists of every topic, keeping the 10 top words of each
topics_by_frequency=get_list_of_topics_alpha_frequency(lda,num_topics, topn=10)

In [57]:
# UCI coherence of the frequency-ordered topics, sorted by decreasing score
ucifreq=matrixuci(searcher,topics_by_frequency)

In [58]:
# NPMI coherence of the frequency-ordered topics, sorted by decreasing score
npmifreq=matrixnpmi(searcher,topics_by_frequency)

In [59]:
# UMass coherence of the frequency-ordered topics, sorted by decreasing score
umassfreq=matrixumass(searcher,topics_by_frequency)

## Topics organisés par pertinence

In [46]:
# For every topic, the 10 most relevant terms as (relevance, word) tuples
dic_by_relevance=get_list_of_topics_alpha_relevance(lda,num_topics,infolength,topic_doc_dists, term_topic_dists,topn=10)

In [48]:
# Keep only the word of every (relevance, word) pair, topic by topic
topics_by_relevance = [[word for _score, word in topic] for topic in dic_by_relevance]

In [51]:
# UCI coherence of the relevance-ordered topics, sorted by decreasing score
ucirelev=matrixuci(searcher,topics_by_relevance)

[u'francais',
 u'faire',
 u'france',
 u'an',
 u'pouvoir',
 u'dernier',
 u'president',
 u'homme',
 u'sarkozy',
 u'ministre']

In [None]:
# NPMI coherence of the relevance-ordered topics, sorted by decreasing score
npmirelev=matrixnpmi(searcher,topics_by_relevance)

## Labellisation des topics

In [63]:
# The most relevant word of a topic (its first word) is used as the label:
# each entry is [label, topic_words]
liste_labels_pertinence = [[topic[0], topic] for topic in topics_by_relevance]


In [65]:
# The word with the highest average PMI is used as the label:
# each entry is [label, topic_words]
liste_labels_pmi = [
    [topic[np.argmax(pmi_average(searcher, topic))], topic]
    for topic in topics_by_relevance
]