In [1]:
import pandas as pd
import numpy as np
import os, sys
from datetime import datetime
from pprint import pprint
import numpy as np
import tqdm
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# For Visualization
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
df = pd.read_pickle('uniqueData13mill_part2.pkl')

In [3]:
# Set up log to external log file
import logging
logging.basicConfig(filename='lda_1mill_model_part2Hyper.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
df

Unnamed: 0,hashtags,text
3939113,IPCC climatechange sr15 ipcc30,"[retweeted, ipcc, ipcc, statement, medium, rep..."
3939117,,"[woohoo, salute, solid, yes]"
3939120,environment fishing climatechange,"[climate, change, spark, global, fish, war]"
3939126,itstimetochange,"[emission, control, c, climate, target, cost, ..."
3939130,,"[global, warming, cause, iithe, sequence, preq..."
...,...,...
318587,,"[dont, talk, politics, funnier, lol]"
318589,ClimateChange Coal Trump,"[great, news, environment, im, sad, american, ..."
318592,QandA,"[craig, kelly, climate, change, denialist, hom..."
318594,TakeYourSeat ClimateChange DavidAttenborough G...,"[today, it, time, people, amp, participate, co..."


In [5]:
texts = df['text'].tolist()

In [6]:
# Create Dictionary
id2word = corpora.Dictionary(texts)

In [7]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [8]:
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1)]]


In [9]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=15, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=2,
                                       workers=7,
                                       per_word_topics=True)

In [10]:
# Print the Keyword in the 15 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.049*"global" + 0.044*"warming" + 0.021*"climate" + 0.017*"change" + '
  '0.016*"mean" + 0.011*"hot" + 0.011*"long" + 0.010*"human" + 0.009*"time" + '
  '0.009*"start"'),
 (1,
  '0.034*"climate" + 0.029*"change" + 0.025*"thanks" + 0.022*"latest" + '
  '0.016*"news" + 0.016*"daily" + 0.014*"science" + 0.010*"forest" + '
  '0.010*"deal" + 0.009*"read"'),
 (2,
  '0.025*"climate" + 0.022*"amp" + 0.021*"change" + 0.019*"water" + '
  '0.016*"day" + 0.011*"world" + 0.011*"food" + 0.009*"flood" + '
  '0.008*"according" + 0.008*"flooding"'),
 (3,
  '0.032*"global" + 0.030*"warming" + 0.021*"yes" + 0.014*"thing" + '
  '0.011*"time" + 0.010*"control" + 0.010*"need" + 0.009*"water" + '
  '0.008*"forest" + 0.008*"california"'),
 (4,
  '0.044*"climate" + 0.039*"change" + 0.019*"amp" + 0.019*"believe" + '
  '0.013*"dont" + 0.008*"farmer" + 0.008*"matter" + 0.008*"help" + '
  '0.008*"doesnt" + 0.007*"drought"'),
 (5,
  '0.060*"climate" + 0.057*"change" + 0.026*"it" + 0.017*"people" + '
  '0.0

In [11]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3298266121168467


In [12]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -11.209593985947043


In [13]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [14]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=2,
                                           workers=7,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [15]:
# Hyperparamter Tuning
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 10
max_topics = 22
step_size = 3
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

 44%|████▍     | 240/540 [19:12<24:00,  4.80s/it]


In [16]:
tuning = pd.read_csv('lda_tuning_results.csv')

In [17]:
tuning

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,10,0.01,0.01,0.310444
1,75% Corpus,10,0.01,0.31,0.343230
2,75% Corpus,10,0.01,0.61,0.337183
3,75% Corpus,10,0.01,0.9099999999999999,0.307721
4,75% Corpus,10,0.01,symmetric,0.323008
...,...,...,...,...,...
235,100% Corpus,19,asymmetric,0.01,0.311208
236,100% Corpus,19,asymmetric,0.31,0.408466
237,100% Corpus,19,asymmetric,0.61,0.418818
238,100% Corpus,19,asymmetric,0.9099999999999999,0.397556


In [18]:
tuning.sort_values('Coherence',ascending=False)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
207,100% Corpus,16,asymmetric,0.61,0.423071
237,100% Corpus,19,asymmetric,0.61,0.418818
206,100% Corpus,16,asymmetric,0.31,0.415708
217,100% Corpus,19,0.31,0.61,0.415051
146,100% Corpus,10,asymmetric,0.31,0.413323
...,...,...,...,...,...
48,75% Corpus,13,0.9099999999999999,0.9099999999999999,0.204224
47,75% Corpus,13,0.9099999999999999,0.61,0.201493
18,75% Corpus,10,0.9099999999999999,0.9099999999999999,0.201418
78,75% Corpus,16,0.9099999999999999,0.9099999999999999,0.198256


In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Hashtags
hashtags = df['hashtags']
hashtags = hashtags.reset_index()

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

# Merge
df_with_hashtags = df_dominant_topic.join(hashtags)
df_with_hashtags = df_with_hashtags.drop('index', 1)

In [None]:
df_with_hashtags

In [None]:
import pickle
lda_model.save('lda1mill_part1.model')

In [None]:
# # To Load again later
# # later on, load trained model from file
# model =  models.LdaModel.load('lda.model')

In [None]:
# save the model to disk
filename = '1Million_model_part1.sav'
pickle.dump(lda_model, open(filename, 'wb'))

In [None]:
# # To Load again later
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)