In [3]:
import gensim
from gensim import corpora, models, similarities

import numpy as np
import pandas as pd

from tabulate import tabulate

import dit
from dit.divergences import jensen_shannon_divergence

from collections import defaultdict

import pickle

from collections import Counter

import json

In [4]:
# Directory holding the saved LDA models. File names are appended with plain
# string concatenation in load_lda_models, so the trailing slash is required.
model_path = '/home/yyang/models/'

In [5]:
# Directory holding the corpora and dictionaries. Corpus file names are appended
# with plain string concatenation when the corpora are loaded, so the trailing
# slash is required.
corp_path = '/home/yyang/corpora_and_dictionaries/'

#load models and corpora

In [6]:
# File names of the saved 10-topic LDA models, one per time slice (1506 .. 1511).
lda_10topics_names = ['lda_{0}_10topics.model'.format(slice_tag)
                      for slice_tag in ('1506', '1507', '1508', '1509', '1510', '1511')]

In [7]:
# File names of the saved 15-topic LDA models, one per time slice (1506 .. 1511).
lda_15topics_names = ['lda_{0}_15topics.model'.format(slice_tag)
                      for slice_tag in ('1506', '1507', '1508', '1509', '1510', '1511')]

In [8]:
# File names of the saved 25-topic LDA models, one per time slice (1506 .. 1511).
lda_25topics_names = ['lda_{0}_25topics.model'.format(slice_tag)
                      for slice_tag in ('1506', '1507', '1508', '1509', '1510', '1511')]

In [9]:
def load_lda_models(models, model_names):
    """Load saved LDA models and append them to ``models`` in place.

    Each entry of ``model_names`` is concatenated onto the module-level
    ``model_path`` and loaded with ``LdaModel.load``, in the order given.
    Nothing is returned; the caller's list is extended.
    """
    models.extend(
        gensim.models.ldamodel.LdaModel.load(model_path + file_name)
        for file_name in model_names
    )

In [10]:
# NOTE: this rebinds `models`, which the import cell brought in from gensim;
# from here on `models` is the list of loaded LDA models (one per time slice).
models = []
load_lda_models(models, lda_10topics_names)

In [11]:
# File names of the serialized corpora, one per time slice (1506 .. 1511).
corpora_names = ['corp_{0}.mm'.format(slice_tag)
                 for slice_tag in ('1506', '1507', '1508', '1509', '1510', '1511')]

In [12]:
# Open every corpus file as a gensim MmCorpus, in the same order as corpora_names.
corps = [corpora.MmCorpus(corp_path + file_name) for file_name in corpora_names]

#lda result exploration

### show topics in models

In [None]:
def show_lda_topics(wordnum, lda_models=None):
    """Print one table per time slice listing the top words of every topic.

    wordnum    -- number of top words requested per topic; each table has one
                  row per word rank and one column per topic.
    lda_models -- optional sequence of LDA models to show; defaults to the
                  module-level ``models`` list.

    Returns None; the tables are written to stdout.
    """
    if lda_models is None:
        lda_models = models
    for count, model in enumerate(lda_models):
        num_topics = len(model.show_topics(-1))
        header = ['topic {}'.format(i) for i in range(num_topics)]
        # Fetch every topic once (instead of once per word) and keep element [1]
        # of each entry, which this notebook treats as the word.
        alist = np.asarray([[entry[1] for entry in model.show_topic(i, wordnum)]
                            for i in range(num_topics)])
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('time slice {}'.format(count))
        print(tabulate(pd.DataFrame(alist.T), headers=header, tablefmt='psql') + '\n')

In [None]:
# Print the top 20 words of every topic, one table per time slice.
show_lda_topics(20)

### load topics from models

In [480]:
# Number of top words taken from each topic. Used when the topic word
# distributions are built and for the 'common words' column of the
# topic/time matrix.
topn_words = 20

In [481]:
# For every model (time slice) build a list with one {word: probability} dict
# per topic, restricted to the topn_words most probable words.
topic_list = []
for model in models:
    alist = []
    for i in range(len(model.show_topics(-1))):
        # show_topic entries are treated as (probability, word) pairs here, as in
        # the rest of this notebook. The mapping is built keyed by word directly:
        # passing through a probability-keyed dict first would silently drop
        # words whose probabilities happen to tie.
        sublist = dict((word, prob) for prob, word in model.show_topic(i, topn_words))
        alist.append(sublist)
    topic_list.append(alist)

### normalise word probabilities in topics

In [482]:
# Rescale the truncated word weights of every topic so that they sum to 1.
# The dicts inside topic_list are updated in place.
for slice_topics in topic_list:
    for word_probs in slice_topics:
        mass = float(sum(word_probs.values()))
        for word in list(word_probs):
            word_probs[word] = word_probs[word] / mass

### get word distributions in topics

In [483]:
# Wrap every normalised {word: probability} dict in a dit ScalarDistribution,
# keeping the per-time-slice / per-topic nesting of topic_list.
dists = [
    [dit.ScalarDistribution(word_probs) for word_probs in slice_topics]
    for slice_topics in topic_list
]

### calculate the divergence between topics in different months to find similar topics across time

In [484]:
# Two topics from different time slices are treated as the same topic when
# their Jensen-Shannon divergence is below this value.
same_topic_max_divergence = 0.5

# Each entry is [time_a, topic_a, time_b, topic_b]: topic_b is the topic of the
# later slice time_b with the smallest divergence from topic_a of slice time_a,
# recorded only when that divergence is below the threshold.
similar_topics = []

for count_a in range(len(dists)):
    a = dists[count_a]
    for count_b in range(count_a + 1, len(dists)):
        b = dists[count_b]
        newlist = []  # divergence table: one row per topic of a, one column per topic of b
        for index_count, i in enumerate(a):
            sublist = [jensen_shannon_divergence([i, j]) for j in b]
            newlist.append(sublist)
            if not sublist:
                continue  # slice b has no topics, nothing to match against
            # argmin returns the first minimum, same as list.index(min(...)).
            closest = int(np.argmin(sublist))
            if sublist[closest] < same_topic_max_divergence:
                similar_topics.append([count_a, index_count, count_b, closest])
        # Labelled table kept for inspection only. Labels use the same 0-based
        # topic numbers as similar_topics: rows come from slice a, columns from b.
        index = ['time_{0}_{1}'.format(count_a, n) for n in range(len(a))]
        header = ['time_{0}_{1}'.format(count_b, n) for n in range(len(b))]
        df = pd.DataFrame(newlist, index=index)
        # To inspect a pair of slices:
        #     print(tabulate(df, headers=header, tablefmt='psql'))

###separate divergence for similar topics in adjacent months and nonadjacent months

In [485]:
# Similar-topic pairs whose two topics sit in consecutive time slices
# (target slice == source slice + 1).
adjacent_topics = [pair for pair in similar_topics if pair[0] == pair[2] - 1]

# reshape keeps the array 2-D with four columns even when no pair qualifies, so
# the column slices applied to it later (e.g. [:, 0:2]) also work on an empty
# result instead of raising IndexError on a 1-D array.
adjacent_topics = np.array(adjacent_topics).reshape(-1, 4)

In [486]:
# Similar-topic pairs whose source topic (time, topic) is not the source of any
# adjacent pair. Only the first pair found for each source topic is kept.

# Source (time, topic) of every adjacent pair, computed once. The reshape makes
# the slice safe when adjacent_topics is an empty 1-D array.
adjacent_sources = set(
    tuple(source)
    for source in np.asarray(adjacent_topics).reshape(-1, 4)[:, 0:2].tolist()
)

jumping_topics = []
seen_sources = set()
for pair in similar_topics:
    source = tuple(pair[0:2])
    if source in adjacent_sources or source in seen_sources:
        continue
    seen_sources.add(source)
    jumping_topics.append(pair)

# Always 2-D with four columns, so later column slices work on an empty result.
jumping_topics = np.array(jumping_topics).reshape(-1, 4)

###construct the topic vs time matrix to show topic changes

In [488]:
# Build the topic-vs-time matrix: one column per time slice plus a
# 'common words' column. Rows are labelled 'topic N' and are created as new
# topics are encountered; a topic linked to an earlier one is written into the
# row(s) of that earlier topic instead.
columns = ['time slice {}'.format(i) for i in list(xrange(len(models)))]
topic_time_matrix = pd.DataFrame(columns=columns)
topic_time_matrix['common words'] = ''

# count: number of the next row to create.
# adict: maps a '<time><topic>' key to the row numbers that topic was written to.
count = 0
adict = defaultdict(list)

for time in range(len(models)):
    for topic in range(len(models[time].show_topics(-1))):
        # Cell value: time index and topic index concatenated (time 1, topic 3 -> '13').
        astring = '{0}{1}'.format(time,topic)
#         for word in range(topn_words):
#             astring += str(models[time].show_topic(topic,topn_words)[word][1]) + ' '
            
        # Not the target of any adjacent or jumping pair: start a new row holding
        # this topic and its top words as the initial common-word set.
        # NOTE(review): DataFrame.set_value was removed in pandas 1.0 (.at is the
        # replacement); this cell needs an older pandas as written.
        if [time, topic] not in adjacent_topics[:,2:4].tolist() and [time, topic] not in jumping_topics[:,2:4].tolist():
            topic_time_matrix.set_value('topic {}'.format(count), 'time slice {}'.format(time), str(astring))
            topic_time_matrix.set_value('topic {}'.format(count), 'common words', 
                              set([models[time].show_topic(topic,topn_words)[i][1] for i in range(topn_words)]))
            
            adict['{0}{1}'.format(time,topic)].append(count)
            count += 1
            
        else:
            # Target of at least one pair: for every pair pointing at this topic,
            # write it into each row occupied by the pair's source topic, narrow
            # that row's common words to the intersection with this topic's top
            # words, and record the row under this topic's key as well.
            # NOTE(review): adict.get returns None (not []) for a key that was
            # never recorded, which would make the inner loop raise; presumably
            # every source topic was registered in an earlier time slice - confirm.
            for i in range(len(jumping_topics)):
                if (jumping_topics[i,2:4].tolist() == [time,topic]):
                        topic_ids = adict.get('{0}{1}'.format(jumping_topics[i,0:2][0],jumping_topics[i,0:2][1]))
                        for topic_id in topic_ids:
                            topic_time_matrix['time slice {}'.format(time)]['topic {}'.format(topic_id)] = str(astring)
                            topic_time_matrix['common words']['topic {}'.format(topic_id)] = \
                                topic_time_matrix['common words']['topic {}'.format(topic_id)]\
                                .intersection([models[time].show_topic(topic,topn_words)[i][1] \
                                               for i in range(topn_words)])
                            adict['{0}{1}'.format(time,topic)].append(topic_id)
                            
            # Same treatment for pairs between consecutive time slices.
            for i in range(len(adjacent_topics)):
                if adjacent_topics[i,2:4].tolist() == [time, topic]:
                    topic_ids = adict.get('{0}{1}'.format(adjacent_topics[i,0:2][0],adjacent_topics[i,0:2][1]))
                    for topic_id in topic_ids:
                        topic_time_matrix['time slice {}'.format(time)]['topic {}'.format(topic_id)] = str(astring)
                        topic_time_matrix['common words']['topic {}'.format(topic_id)] = \
                            topic_time_matrix['common words']['topic {}'.format(topic_id)]\
                            .intersection([models[time].show_topic(topic,topn_words)[i][1] for i in range(topn_words)])
                        adict['{0}{1}'.format(time,topic)].append(topic_id)

# Empty cells become a single space; later cells test against ' ' to detect them.
topic_time_matrix = topic_time_matrix.replace(np.nan,' ', regex=True)

###cluster merged topics together

In [489]:
# Reorder rows so that rows sharing a topic end up next to each other.
# Columns are scanned from the last one (which includes 'common words') down to
# the first; whenever rows i < j hold the same non-blank value in the scanned
# column, row j is moved to sit directly after row i.
for col in range(topic_time_matrix.shape[1]-1,-1,-1):
    for i in range(topic_time_matrix.shape[0]):
        for j in range(i+1,topic_time_matrix.shape[0]):
            if topic_time_matrix.iloc[:,col][i] == topic_time_matrix.iloc[:,col][j] and \
               topic_time_matrix.iloc[:,col][j] != ' ':
                    # Copy row j into a one-row frame, drop it, split off the rows
                    # after i, then rebuild as rows[0..i] + row j + remaining rows.
                    # ignore_index renumbers the rows, so the 'topic N' labels are
                    # replaced by integers from the first move onwards.
                    same = pd.DataFrame(columns=topic_time_matrix.columns.values.tolist())
                    same.loc[0] = topic_time_matrix.iloc[j]
                    topic_time_matrix.drop(topic_time_matrix.index[j], inplace=True)
                    temp = topic_time_matrix.iloc[i+1:]
                    topic_time_matrix.drop(topic_time_matrix.index[i+1:], inplace=True)
                    topic_time_matrix = pd.concat([topic_time_matrix, same], ignore_index=True)
                    topic_time_matrix = pd.concat([topic_time_matrix, temp], ignore_index=True)
                    # Only the first matching j is handled for this i; the scan
                    # then continues with the next i.
                    break

In [490]:
topic_time_matrix

Unnamed: 0,time slice 0,time slice 1,time slice 2,time slice 3,time slice 4,time slice 5,common words
0,0.0,15.0,26.0,37.0,48.0,58.0,"{credit, account, please, thank, card}"
1,1.0,,,,,,"{status, timothy, process, remote, description..."
2,2.0,13.0,21.0,36.0,45.0,,"{receive, please}"
3,4.0,13.0,21.0,36.0,45.0,,"{information, use, intend, sender, confidentia..."
4,3.0,,,30.0,,,"{do, ser, favor, support, tier, eu, mail, tick..."
5,5.0,14.0,28.0,33.0,44.0,55.0,"{right, thank, anything, get, else, go, see, c..."
6,,16.0,,,40.0,55.0,"{set, like, get, send, one, see, ticket}"
7,6.0,10.0,22.0,31.0,43.0,53.0,"{customer, thank, get, support, advocate, plea..."
8,9.0,10.0,22.0,31.0,43.0,53.0,"{customer, thank, get, support, advocate, know..."
9,,11.0,22.0,31.0,43.0,53.0,"{customer, thank, get, support, advocate, plea..."


In [456]:
# Persist the matrix in pandas' pickle format (the '.mm' suffix notwithstanding).
topic_time_matrix.to_pickle('/home/yyang/data/lda_topic_time_matrix_10.mm')

#lda result processing

##calculate topic popularity

###calculate topic distribution for documents

In [57]:
# A topic counts as present in a document when its probability exceeds this value.
doc_topic_threshold = 0.1

# all_list[t][d] is a dict {topic_id: 1} holding the topics present in
# document d of time slice t.
assert len(models) == len(corps), "expected one corpus per model"

all_list = []
for model, corpus in zip(models, corps):
    topics_list = []
    # Documents are streamed one at a time rather than materialising every
    # distribution first; this also avoids rebinding the global `dists`, which
    # the divergence cell relies on.
    for doc in corpus:
        topics_dict = {}
        for topic_id, prob in model.get_document_topics(doc):
            if prob > doc_topic_threshold:
                topics_dict[topic_id] = 1
        topics_list.append(topics_dict)
    all_list.append(topics_list)

###aggregate topic distribution across documents to get topic popularity

In [59]:
# For every time slice, add up the per-document {topic_id: 1} dicts so that each
# topic id maps to the number of documents it is present in.
topic_dicts = []
for slice_docs in all_list:
    totals = Counter()
    for doc_topics in slice_docs:
        totals = totals + Counter(doc_topics)
    topic_dicts.append(dict(totals))

In [60]:
# Re-key every per-slice count dict from the bare topic id to the
# '<time slice><topic id>' string. New dicts are built rather than popping and
# inserting while looping over the same dict's keys, which raises on Python 3.
# Run once per freshly computed topic_dicts: running it again prefixes the keys
# a second time.
topic_dicts = [
    dict(('{0}{1}'.format(slice_index, topic_id), n_docs)
         for topic_id, n_docs in slice_counts.items())
    for slice_index, slice_counts in enumerate(topic_dicts)
]

In [61]:
# Flatten the per-slice dicts into one {topic key: document count} mapping;
# if a key occurred in several slices the later slice would win.
popularity = dict(
    (topic_key, n_docs)
    for slice_counts in topic_dicts
    for topic_key, n_docs in slice_counts.items()
)

In [62]:
popularity

{'00': 5376,
 '01': 6729,
 '02': 9041,
 '03': 3214,
 '04': 7814,
 '05': 13399,
 '06': 14261,
 '07': 5561,
 '08': 3622,
 '09': 16096,
 '10': 16226,
 '11': 10167,
 '12': 11106,
 '13': 6745,
 '14': 10321,
 '15': 8038,
 '16': 6602,
 '17': 5427,
 '18': 1983,
 '19': 9049,
 '20': 10180,
 '21': 6271,
 '22': 15043,
 '23': 4333,
 '24': 8176,
 '25': 12300,
 '26': 5574,
 '27': 3808,
 '28': 7835,
 '29': 15063,
 '30': 4391,
 '31': 22365,
 '32': 3036,
 '33': 13136,
 '34': 12303,
 '35': 6362,
 '36': 7015,
 '37': 7952,
 '38': 2899,
 '39': 11452,
 '40': 15177,
 '41': 8287,
 '42': 2433,
 '43': 18504,
 '44': 11355,
 '45': 7507,
 '46': 7505,
 '47': 4256,
 '48': 9540,
 '49': 14785,
 '50': 9009,
 '51': 9476,
 '52': 9071,
 '53': 17461,
 '54': 5371,
 '55': 20685,
 '56': 4033,
 '57': 6390,
 '58': 9734,
 '59': 6681}

In [63]:
# Persist the popularity dict with pickle (the '.mm' suffix notwithstanding);
# the file is opened in binary mode for pickle.
with open('/home/yyang/data/lda_topic_popularity_10.mm', 'wb') as f:
    pickle.dump(popularity, f)

##construct json objects for visualisation

###load models

In [63]:
# Re-create `models` from scratch with the 10-topic models (presumably so this
# section can be run without the cells above - confirm).
models = []
load_lda_models(models, lda_10topics_names)

###load topic vs time matrix

In [65]:
# Read back the matrix saved with DataFrame.to_pickle.
# NOTE: unpickling can execute arbitrary code; only load files this notebook produced.
topic_time_matrix = pd.read_pickle('/home/yyang/data/lda_topic_time_matrix_10.mm') 

In [66]:
# Relabel the columns: six integer labels (apparently YYYYMM) for the time
# slices and 'words' for the last column. Requires the matrix to have exactly
# seven columns in this order.
topic_time_matrix.columns = [201506, 201507, 201508, 201509, 201510, 201511, 'words']

###load topic popularity

In [64]:
# Read back the popularity dict saved with pickle.dump.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('/home/yyang/data/lda_topic_popularity_10.mm', 'rb') as f:
    popularity = pickle.load(f)

###combine matrix and popularity to create json object

In [67]:
# Build `array`: one list per matrix row, each made of cells
# [column label, topic key, popularity]. Popularity is ' ' for cells handled by
# the "shared" branch below; those cells are additionally collected, with their
# popularity filled in, into an extra row appended after the row itself.
# NOTE(review): `range(i) + range(...)` below is Python-2-only list concatenation.
array = []

for i in range(topic_time_matrix.shape[0]):
    data = []      # cells of matrix row i
    new_row = []   # cells of row i that went through the shared branch
    # NOTE(review): count_same is reset per row, not per column, so once one
    # column of the row is shared every later column of the row takes the shared
    # branch too - confirm this is intended.
    count_same = 0
    
    for j in range(topic_time_matrix.shape[1]-1):  # every column except the last one
        cell = []
        new_cell = []
        
        if topic_time_matrix.iloc[i].iloc[j] != ' ':
            cell.append(topic_time_matrix.columns[j])
            cell.append(topic_time_matrix.iloc[i].iloc[j])
            
            # Count the other rows that hold the same non-blank value in this column.
            for k in range(i) + range(i+1,topic_time_matrix.shape[0]):
                if topic_time_matrix.iloc[:,j][i] == topic_time_matrix.iloc[:,j][k] and \
                   topic_time_matrix.iloc[:,j][k] != ' ':
                        count_same += 1
            
            if count_same == 0:
                # Unshared so far: the cell carries its popularity directly.
                cell.append(int(round(popularity[topic_time_matrix.iloc[i].iloc[j]])))
            else:
                # Shared: blank the popularity here and keep a full copy in new_row.
                cell.append(' ')
                new_cell.append(topic_time_matrix.columns[j])
                new_cell.append(topic_time_matrix.iloc[i].iloc[j])
                new_cell.append(int(round(popularity[topic_time_matrix.iloc[i].iloc[j]])))
                new_row.append(new_cell)  
 
            data.append(cell)
    array.append(data)
    
    # Append the shared-cells row unless an identical row is already present.
    if count_same != 0:
        count_add = 0
        for z in range(len(array)):
            if new_row == array[z]:
                count_add += 1
        if count_add == 0:
            array.append(new_row)

# Second pass: when a cell with a popularity value appears (as an equal list) in
# two rows i < j, blank the popularity in the longer of the two rows - from cell
# k to the end of row j when row j is longer, or in the matching cell(s) of row i
# when row i is longer. Rows of equal length are left untouched.
for i in range(len(array)):
    for j in range(i+1, len(array)):
        for k in range(len(array[j])):
            if array[j][k] in array[i] and array[j][k][2] != ' ':
                if len(array[j]) > len(array[i]):
                    for l in range(k,len(array[j])):
                        array[j][l][2] = ' '
                if len(array[i]) > len(array[j]):
                    for l in range(len(array[i])):
                        if array[i][l] == array[j][k]:
                            array[i][l][2] = ' '

In [68]:
# Turn every row of `array` into a JSON string of the form
# {"topics": [cells...], "words": "<space-separated common words>"}.
complete_array = []

for row in array:
    # None means "no topic seen yet". An empty set is a legitimate result (the
    # topics share no word) and must not be confused with the initial state.
    common_words = None

    for cell in row:
        topic_key = cell[1]
        # topic_key is '<time slice digit><topic index>': the first character
        # selects the model, the rest is the topic number within that model.
        topic = models[int(topic_key[0])].show_topic(int(topic_key[1:]), 20)

        # Only cells that carry a popularity value get the topic payload attached.
        if cell[2] != ' ':
            cell.append(topic)

        # The words of every cell take part in the intersection. `topic` is
        # looked up for each cell, so a cell without popularity no longer reuses
        # the topic of the previous cell (or of the previous row).
        words = set(entry[1] for entry in topic)
        if common_words is None:
            common_words = words
        else:
            common_words = common_words.intersection(words)

    data = {
        'topics': row,
        # Sorted so that the output is stable from run to run.
        'words': ' '.join(sorted(common_words or ())),
    }
    complete_array.append(json.dumps(data))

In [69]:
# Write the result file. Every element of complete_array is already a JSON
# string (json.dumps output), so the file is a JSON list of JSON-encoded
# strings and a reader has to decode each element a second time.
with open('/home/yyang/data/lda_result_10.json', 'w') as f:
     json.dump(complete_array, f)