In [1]:
import pandas as pd
import numpy as np
import os, sys
from datetime import datetime
from pprint import pprint
import numpy as np
import tqdm
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# For Visualization
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
df = pd.read_pickle('uniqueData13mill_part1.pkl')

In [3]:
# Set up log to external log file
import logging
logging.basicConfig(filename='lda_1mill_model_part1.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
df

Unnamed: 0,hashtags,text
0,,"[gotta, love, fact]"
1,,"[great, day, action, message, dear, dan, toola..."
2,,"[harvey, norman, reckons, climate, change, bun..."
3,,"[skepticism, immigration, walk, hand, hand, sk..."
4,climatechange,"[th, november, creative, citizen, w, practical..."
...,...,...
3939094,climatechange,"[texas, longer, afford, act, state, projected,..."
3939095,,"[dont, allow, negative, climate, dictate, futu..."
3939097,ClimateChange fascism WhiteSupremacy racism POC,"[money, time, throw, resource, problem, exist,..."
3939098,climatechange txlege,"[texas, longer, afford, act, state, projected,..."


In [5]:
texts = df['text'].tolist()

In [6]:
# Create Dictionary
id2word = corpora.Dictionary(texts)

In [7]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [8]:
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1)]]


In [9]:
%%time
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=15, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=2,
                                       workers=7,
                                       per_word_topics=True)

CPU times: user 2h 4min 55s, sys: 27min 36s, total: 2h 32min 31s
Wall time: 2h 24min 33s


In [10]:
# Print the Keyword in the 15 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.272*"global" + 0.226*"warming" + 0.018*"ipcc" + 0.013*"little" + '
  '0.011*"stupid" + 0.011*"called" + 0.011*"having" + 0.011*"e" + '
  '0.011*"truth" + 0.009*"nature"'),
 (1,
  '0.044*"let" + 0.034*"company" + 0.034*"carbon" + 0.025*"cant" + '
  '0.025*"thanks" + 0.024*"ice" + 0.023*"beer" + 0.023*"hope" + 0.021*"best" + '
  '0.021*"melting"'),
 (2,
  '0.105*"climate" + 0.094*"change" + 0.033*"news" + 0.020*"denier" + '
  '0.018*"solution" + 0.016*"natural" + 0.016*"story" + 0.015*"political" + '
  '0.015*"th" + 0.014*"science"'),
 (3,
  '0.033*"c" + 0.031*"city" + 0.029*"im" + 0.024*"pollution" + 0.022*"live" + '
  '0.022*"air" + 0.020*"sure" + 0.019*"tree" + 0.018*"help" + 0.018*"water"'),
 (4,
  '0.033*"weather" + 0.033*"yes" + 0.028*"making" + 0.026*"getting" + '
  '0.025*"thought" + 0.022*"show" + 0.022*"worse" + 0.021*"place" + '
  '0.020*"case" + 0.016*"role"'),
 (5,
  '0.051*"trump" + 0.040*"like" + 0.020*"it" + 0.018*"look" + 0.016*"thats" + '
  '0.015*"wont" + 0.0

In [11]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.33138300587039


In [12]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -13.373275264253412


In [13]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
# Hyperparamter Tuning
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 10
max_topics = 22
step_size = 3
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

In [14]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [15]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Hashtags
hashtags = df['hashtags']
hashtags = hashtags.reset_index()

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

# Merge
df_with_hashtags = df_dominant_topic.join(hashtags)
df_with_hashtags = df_with_hashtags.drop('index', 1)

In [16]:
df_with_hashtags

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,hashtags
0,0,11.0,0.5167,"climate, change, it, dont, people, real, scien...","[gotta, love, fact]",
1,1,12.0,0.5678,"climate, change, amp, action, policy, fight, l...","[great, day, action, message, dear, dan, toola...",
2,2,3.0,0.3770,"c, city, im, pollution, live, air, sure, tree,...","[harvey, norman, reckons, climate, change, bun...",
3,3,2.0,0.7100,"climate, change, news, denier, solution, natur...","[skepticism, immigration, walk, hand, hand, sk...",
4,4,5.0,0.4066,"trump, like, it, look, thats, wont, people, pr...","[th, november, creative, citizen, w, practical...",climatechange
...,...,...,...,...,...,...
999994,999994,14.0,0.5388,"level, sea, hurricane, fuel, gas, fossil, big,...","[texas, longer, afford, act, state, projected,...",climatechange
999995,999995,8.0,0.3170,"climate, change, challenge, were, saying, cont...","[dont, allow, negative, climate, dictate, futu...",
999996,999996,11.0,0.3099,"climate, change, it, dont, people, real, scien...","[money, time, throw, resource, problem, exist,...",ClimateChange fascism WhiteSupremacy racism POC
999997,999997,14.0,0.3863,"level, sea, hurricane, fuel, gas, fossil, big,...","[texas, longer, afford, act, state, projected,...",climatechange txlege


In [17]:
import pickle

In [18]:
lda_model.save('lda1mill_part1.model')

In [19]:
# # To Load again later
# # later on, load trained model from file
# model =  models.LdaModel.load('lda.model')

In [20]:
# save the model to disk
filename = '1Million_model_part1.sav'
pickle.dump(lda_model, open(filename, 'wb'))

In [21]:
# # To Load again later
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)

In [22]:
df_with_hashtags.to_csv('uniqueLabelled_1.csv')