In [1]:
import sys
sys.path.append('../../..')
import src.data.data_loader as dl
from src.features.vectorizer import Vectorizer
from src.models.topic_models import TopicModel
import pandas as pd
import sklearn.utils as skutil
pd.set_option('display.max_rows', None)



In [2]:
# --- Corpus selection: which articles the data loader is asked for ---
language, typex = 'english', 'editorial'

# --- Vectorization: document-frequency bounds handed to the Vectorizer ---
min_df, max_df = 0.005, 0.9

# --- Topic modeling ---
algorithm = 'lda'
num_topics = 130

# Share of the (shuffled) document rows that goes into the training slice;
# the remainder becomes the test slice.
train_percentage = 0.9

# Hyperparameters forwarded to TopicModel.
alpha, eta = 'auto', 0.01
iterations, passes = 200, 60
chunksize = 5000
# Forwarded as `decay` (kappa) and `offset` (tau_0).
# NOTE(review): the training log reports rho=1.000000 with these values --
# confirm that zero decay/offset is intended.
kappa, tau_0 = 0, 0

In [3]:
# Fetch the articles for the configured language and article type.
# NOTE(review): kind="wobigrams" presumably selects the "without bigrams"
# preprocessing variant (the saved file names use "without_bigrams") -- confirm
# against src.data.data_loader.
data = dl.get_articles_by_type(
    language,
    typex,
    kind="wobigrams",
)

# Only the article texts are used from the returned data below.
texts = data['article_texts']

def min_length(texts, min_characters):
    """Drop short tokens from every document.

    Each document is split on whitespace, tokens with fewer than
    ``min_characters`` characters are discarded, and the surviving tokens are
    re-joined with single spaces.

    Args:
        texts: Iterable of document strings.
        min_characters: Minimum token length (inclusive) for a token to be kept.

    Returns:
        list: One filtered string per input document, in input order. A
        document without any surviving token becomes an empty string.
    """
    def keep_long_tokens(document):
        # str.split() without arguments also collapses runs of whitespace.
        return " ".join(
            word for word in document.split() if len(word) >= min_characters
        )

    return [keep_long_tokens(document) for document in texts]

# Keep only tokens with at least three characters. The result is a plain list
# that replaces `texts`; re-running this cell on the filtered list leaves it
# unchanged, since every remaining token already meets the threshold.
texts = min_length(texts, min_characters=3)

In [4]:
# Build the vectorizer on the filtered texts with the configured
# document-frequency bounds.
# NOTE(review): 'tf' presumably selects raw term-frequency weighting -- confirm
# in src.features.vectorizer.
vec = Vectorizer(
    'tf',
    texts,
    min_df=min_df,
    max_df=max_df,
)

# Persist the vectorizer under a name derived from the run configuration.
vec.save(
    '{}_{}_{}_{}.pkl'.format(
        algorithm,
        language,
        typex,
        "without_bigrams",
    )
)

# Document-by-term matrix for the same texts, plus the id -> token lookup that
# the topic model receives.
document_term_matrix = vec.get_document_token_matrix(texts)
id2token = vec.get_id2token_mapping()

In [5]:
# Shuffle the document rows with a fixed seed so the split is repeatable.
# NOTE: the shuffled matrix overwrites `document_term_matrix`, so running this
# cell a second time without re-running the vectorization cell shuffles the
# already-shuffled rows again and yields a different split.
document_term_matrix = skutil.shuffle(
    document_term_matrix,
    random_state=1,
)

num_docs, num_terms = document_term_matrix.shape

# Row index where the training slice ends and the test slice begins.
train = int(num_docs * train_percentage)

train_document_term_matrix = document_term_matrix[:train, :]
test_document_term_matrix = document_term_matrix[train:, :]

In [None]:
# Train the topic model on the training slice; the held-out slice is handed
# over as `test_document_term_matrix`.
# The model type comes from the `algorithm` config value (previously the
# literal 'lda'), so the trained model always matches the name it is saved
# under.
# NOTE(review): kappa/tau_0 are forwarded as decay/offset; the training log
# shows rho=1.000000 -- confirm that this update schedule is intended.
model = TopicModel(
    algorithm,
    num_topics,
    train_document_term_matrix,
    id2token,
    alpha=alpha,
    eta=eta,
    iterations=iterations,
    passes=passes,
    chunksize=chunksize,
    test_document_term_matrix=test_document_term_matrix,
    decay=kappa,
    offset=tau_0,
)

# Persist the trained model under a name derived from the run configuration.
model.save(
    '{}_{}_{}_{}_articles_{}.pkl'.format(
        algorithm,
        language,
        typex,
        "without_bigrams",
        num_topics,
    )
)

2018-07-31 17:59:03,233 : INFO : using autotuned alpha, starting with [0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.0076923077, 0.007

2018-07-31 17:59:33,677 : INFO : topic #32 (0.009): 0.017*"farm" + 0.008*"local" + 0.008*"year" + 0.007*"animal" + 0.007*"grow" + 0.006*"farmer" + 0.006*"good" + 0.006*"consumer" + 0.006*"chicken" + 0.006*"produce"
2018-07-31 17:59:33,689 : INFO : topic diff=inf, rho=1.000000
2018-07-31 17:59:35,480 : INFO : Epoch 1: perplexity estimate: 111421.48630553547
2018-07-31 17:59:35,587 : INFO : PROGRESS: pass 2, at document #2112/2112
2018-07-31 17:59:44,360 : INFO : optimized alpha [0.006841876, 0.0072139744, 0.006951603, 0.0070570484, 0.0084408, 0.006956318, 0.0070883413, 0.0072453483, 0.0071490523, 0.0072795167, 0.007507634, 0.0071884217, 0.0076539936, 0.007020023, 0.008009104, 0.00681844, 0.007240817, 0.0068853917, 0.0071018785, 0.0071944855, 0.0069225156, 0.0069230357, 0.006960597, 0.007299197, 0.006750143, 0.00784297, 0.0093360115, 0.006892861, 0.007018824, 0.0071029277, 0.006986739, 0.0070944303, 0.008841424, 0.0068918653, 0.0073671923, 0.0074445806, 0.0071553676, 0.0069238795, 0.0066

2018-07-31 18:00:04,443 : INFO : topic #65 (0.006): 0.033*"beef" + 0.021*"cereal" + 0.018*"grow" + 0.014*"raise" + 0.014*"vegetable" + 0.012*"breakfast" + 0.012*"come" + 0.011*"sustainably" + 0.011*"garden" + 0.010*"magazine"
2018-07-31 18:00:04,455 : INFO : topic #50 (0.006): 0.062*"hormone" + 0.031*"estrogen" + 0.023*"therapy" + 0.017*"compound" + 0.016*"child" + 0.016*"woman" + 0.013*"soy" + 0.011*"health" + 0.011*"supplement" + 0.009*"future"
2018-07-31 18:00:04,484 : INFO : topic #62 (0.011): 0.025*"farm" + 0.015*"grow" + 0.014*"farmer" + 0.011*"local" + 0.011*"year" + 0.009*"land" + 0.009*"farming" + 0.007*"crop" + 0.007*"work" + 0.007*"produce"
2018-07-31 18:00:04,499 : INFO : topic #39 (0.011): 0.013*"world" + 0.012*"agriculture" + 0.011*"farming" + 0.008*"farmer" + 0.007*"need" + 0.007*"people" + 0.007*"global" + 0.007*"change" + 0.006*"make" + 0.006*"health"
2018-07-31 18:00:04,505 : INFO : topic #26 (0.012): 0.032*"store" + 0.017*"foods" + 0.017*"grocery" + 0.017*"price" + 0

2018-07-31 18:00:35,101 : INFO : topic #65 (0.006): 0.031*"beef" + 0.022*"cereal" + 0.020*"grow" + 0.016*"vegetable" + 0.014*"raise" + 0.013*"sustainably" + 0.012*"come" + 0.012*"breakfast" + 0.012*"magazine" + 0.011*"garden"
2018-07-31 18:00:35,103 : INFO : topic #118 (0.006): 0.031*"autoimmune" + 0.028*"gut" + 0.026*"pot" + 0.025*"disease" + 0.021*"use" + 0.019*"inflammation" + 0.018*"immune" + 0.012*"product" + 0.011*"stress" + 0.010*"state"
2018-07-31 18:00:35,106 : INFO : topic #41 (0.014): 0.035*"people" + 0.028*"think" + 0.020*"thing" + 0.018*"really" + 0.017*"make" + 0.016*"just" + 0.016*"know" + 0.016*"like" + 0.013*"lot" + 0.012*"want"
2018-07-31 18:00:35,114 : INFO : topic #39 (0.015): 0.016*"world" + 0.014*"agriculture" + 0.013*"farming" + 0.010*"farmer" + 0.008*"need" + 0.008*"global" + 0.007*"change" + 0.007*"production" + 0.007*"people" + 0.006*"feed"
2018-07-31 18:00:35,117 : INFO : topic #26 (0.016): 0.034*"store" + 0.017*"grocery" + 0.016*"price" + 0.014*"market" + 0.