In [1]:
# coding: utf-8
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from configparser import ConfigParser, ExtendedInterpolation
from gensim.models import LdaModel, LsiModel, HdpModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
import pyLDAvis.gensim
import pickle

In [2]:
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
config.read('config.ini')
xlsfile = config['General']['output_file'] # output of generate trends is input here

In [3]:
def readData(filename):
    print('Reading data....')
    start = time.time()
    df = pd.read_pickle(filename)
    end = time.time()
    print(f'Read finished in {end-start:.2f} seconds.\n')
    return df

Nas sheets de unigramas e bigramas, os tópicos estão na última coluna.

In [4]:
inputfile = config['Text Cleaning']['tokenized_file']
df = readData(inputfile)
print('Generating models....')
start = time.time()
# for gram in [0,1]:
for gram in [0]:
    column =  pd.read_excel(xlsfile, sheet_name=gram).iloc[:,-1]    
#     for size in [30, 100, 500]:
    for size in [30]:
        trends = (column[~column.isnull()])[:size].to_list()
        
#         for mask in [('01-01-2010', '30-06-2019'), ('01-01-2019', '30-06-2019')]:
        for mask in [('01-01-2010', '30-06-2019')]:
            start_date, end_date = mask
            data = df.reset_index()
            data['Date'] = pd.to_datetime(data['Date'])
            mask = (data['Date']>=start_date) & (data['Date']<=end_date)
            column_name=['Unigrams', 'Bigrams'][gram]
            data = data[mask][column_name]
            idxs = []
            #filtering rows with trend in trends
            for idx, row in tqdm(enumerate(data.to_list())):
                for trend in row:
                    if trend in trends:
                        idxs.append(idx)
            texts = df.ix[idxs].reset_index()[column_name]
            dictionary = Dictionary(texts)
            corpus = [dictionary.doc2bow(text) for text in texts]
#             for num_topics in [6, 12, 24]:
            for num_topics in [12]:
                startmodel = time.time()
                ldamodel = LdaModel(corpus=corpus,
                    num_topics=num_topics, 
                    id2word=dictionary, 
                    random_state=42, 
                    update_every=1, 
                    chunksize=100, 
                    passes=10, 
                    alpha='auto', 
                    per_word_topics=True)
                endmodel = time.time()
                filename = f'./models/lda_{column_name}_{size}_d{start_date}d_d{end_date}d_{num_topics}'
                with open(filename+'.model', 'wb') as handle:
                    pickle.dump(ldamodel, handle, protocol=pickle.HIGHEST_PROTOCOL)
                with open(filename+'.corpus', 'wb') as handle:
                    pickle.dump(corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)
                with open(filename+'.dict', 'wb') as handle:
                    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
                print(f'Generated model {filename} in {endmodel-startmodel:.2f} seconds. \n')
end = time.time()
print(f'Generated in {end-start:.2f} seconds.\n')

Reading data....
Read finished in 2.90 seconds.

Generating models....


518145it [00:02, 216346.73it/s]
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Generated model ./models/lda_Unigrams_30_d01-01-2010d_d30-06-2019d_12 in 77.49 seconds. 

Generated in 81.61 seconds.



In [5]:
# Visualize the topics
pyLDAvis.enable_notebook()

In [None]:
start = time.time()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
end = time.time()
print(f'Prepared visualization in {end-start:.2f} seconds.\n')

In [None]:
vis

In [None]:
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

In [None]:
hdpmodel.show_topics()

In [None]:
import os
os.environ.update({'MALLET_HOME':"/usr/local/opt/mallet-2.0.8/bin/mallet"})# update this path accordingly

mallet_path = "/usr/local/opt/mallet-2.0.8/bin/mallet" # update this path accordingly

In [None]:
ldamallet = LdaMallet(mallet_path, 
                        corpus=corpus, 
                        num_topics=12, 
                        id2word=dictionary,   
                        workers=4)