In [1]:
# coding: utf-8
import numpy as np
import pandas as pd
import time
from configparser import ConfigParser, ExtendedInterpolation

In [2]:
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
config.read('config.ini')
xlsfile = config['General']['output_file'] # output of generate trends is input here

In [3]:
uni_topics = pd.read_excel(xlsfile, sheetname=0).iloc[:,-1]
uni_topics = (uni_topics[~uni_topics.isnull()])[:100].to_list()

In [4]:
def readData(filename):
    print('Reading data....')
    start = time.time()
    df = pd.read_pickle(filename)
    end = time.time()
    print(f'Read finished in {end-start:.2f} seconds.\n')
    return df

In [5]:
inputfile = config['Text Cleaning']['tokenized_file']
df = readData(inputfile)

Reading data....
Read finished in 3.38 seconds.



In [6]:
data = df.reset_index().Unigrams
idxs = []
for idx, row in enumerate(data.to_list()):
    for topic in row:
        if topic in uni_topics:
            idxs.append(idx)

In [7]:
texts = df.ix[idxs].reset_index().Unigrams
texts.shape

(16376,)

In [8]:
from gensim.models import LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary
# from gensim.models.wrappers import LdaMallet
# import pyLDAvis.gensim

In [9]:
dictionary = Dictionary(texts)

In [10]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [12]:
#use lda multicore

In [11]:
ldamodel = LdaModel(corpus=corpus, num_topics=6, id2word=dictionary)
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

In [13]:
ldamodel.show_topics()

[(0,
  '0.024*"mulled" + 0.014*"ive" + 0.011*"speeches" + 0.011*"phone" + 0.010*"intelligence" + 0.010*"big" + 0.010*"fake" + 0.009*"still" + 0.009*"5g" + 0.009*"fold"'),
 (1,
  '0.041*"5g" + 0.024*"ways" + 0.023*"unleash" + 0.022*"essay" + 0.014*"objectivity" + 0.014*"ai-powered" + 0.014*"launch" + 0.013*"airpods" + 0.011*"first" + 0.011*"code-breaking"'),
 (2,
  '0.032*"blockchains" + 0.025*"computers" + 0.024*"pi" + 0.018*"raspberry" + 0.016*"falling" + 0.015*"ai" + 0.014*"took" + 0.013*"model" + 0.013*"two" + 0.012*"un"'),
 (3,
 (4,
  '0.029*"learning" + 0.025*"quantum" + 0.021*"reality" + 0.021*"carbon" + 0.019*"photon" + 0.016*"nature" + 0.015*"world" + 0.015*"machine" + 0.014*"physicists" + 0.013*"researchers"'),
 (5,
  '0.029*"creativity" + 0.015*"blood" + 0.013*"runs" + 0.011*"algorithm" + 0.008*"create" + 0.008*"ive" + 0.008*"skywalker" + 0.008*"pixel" + 0.008*"heart" + 0.007*"process"')]

In [15]:
hdpmodel.show_topics()

[(0,
  '0.003*intelligence + 0.003*artificial + 0.003*darker + 0.003*side + 0.003*string + 0.003*dumber + 0.003*revealed + 0.002*controversies + 0.001*two + 0.001*pacman + 0.001*5g + 0.001*carbon + 0.001*lurking + 0.001*hole + 0.001*heavy + 0.001*raspberry + 0.001*ai + 0.000*pi + 0.000*figured + 0.000*took'),
 (1,
 (2,
  '0.004*researchers + 0.004*nature + 0.004*physicists + 0.004*reality + 0.004*forcing + 0.004*objectivity + 0.004*reconsider + 0.001*5g + 0.001*first + 0.001*lexicon + 0.001*carbon + 0.001*mod + 0.001*pi + 0.001*mobypicture + 0.001*raspberry + 0.001*matchmaking + 0.000*reveal + 0.000*mind-control + 0.000*game + 0.000*biopsies'),
 (3,
 (4,
  '0.001*5g + 0.001*federighi + 0.001*blood + 0.001*pi + 0.001*revolution + 0.001*dragon + 0.001*mechanisms + 0.001*time + 0.001*needed + 0.001*passwords + 0.000*first + 0.000*breast + 0.000*carbon + 0.000*hosted + 0.000*forcing + 0.000*revolutionize + 0.000*prostate-specific + 0.000*fails + 0.000*terrible + 0.000*ftc'),
 (5,
 (6,
  '0