In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd

In [None]:
# Load cnn_FINAL.csv and keep only the 'clean_text' column (as a one-column DataFrame).
df1 = pd.read_csv("cnn_FINAL.csv")
df2 = df1[['clean_text']]

# info() prints its summary itself, whereas head() is only rendered when it is
# the last expression of the cell - so head() has to come last to be visible.
df2.info()
df2.head(5)

In [None]:
# Write every row's text to its own file "<n>.txt" (n = 0, 1, 2, ...) in the
# current working directory, one document per file.
# NOTE(review): the next cell loads the corpus from './cnntext' - confirm that the
# files written here actually end up in that folder.
for i, text in enumerate(df2['clean_text']):
    # The context manager closes the file even if the write raises.
    with open(str(i) + '.txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Build the LDA inputs with the project-local preprocess2 helpers (their internals are not visible here).
import preprocess2
# NOTE(review): the corpus is read from './cnntext', while the export loop above writes
# '<n>.txt' files into the working directory - confirm both refer to the same folder.
cnn_corpus = preprocess2.load_corpus('./cnntext')
# presumably one token list per document - TODO confirm against preprocess2
cnn_docs = preprocess2.corpus2docs(cnn_corpus)

import gensim
# Dictionary built from cnn_docs; it is passed to docs2vecs here and used as id2word by the LDA models below.
sg_dictionary = gensim.corpora.Dictionary(cnn_docs)
sg_vecs = preprocess2.docs2vecs(cnn_docs, sg_dictionary)

In [None]:

# Train an LDA model with 8 topics on the document vectors; random_state is pinned to 20 so every run uses the same seed.
sg_lda = gensim.models.ldamodel.LdaModel(corpus=sg_vecs, id2word=sg_dictionary, num_topics=8, random_state=20)


In [None]:
# Every topic of the model (8 here) with its 5 highest-probability words
topics = sg_lda.show_topics(sg_lda.num_topics, 5)

# Iterate over what was actually returned instead of a hard-coded index range.
for topic in topics:
    print(topic)

In [None]:

# pyLDAvis is used in the following cells to build and save a visualisation of the LDA model.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# presumably switches pyLDAvis to inline notebook rendering - see the pyLDAvis docs
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis data from the 8-topic model, the document vectors and the dictionary.
visual= gensimvis.prepare(sg_lda, sg_vecs, sg_dictionary)

In [None]:

# Write the prepared visualisation to topic_viz.html.
pyLDAvis.save_html(visual, "topic_viz.html")

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (a negative, log-scale
# value), not the perplexity itself; gensim defines perplexity = 2 ** (-bound).
per_word_bound = sg_lda.log_perplexity(sg_vecs, total_docs=None)
perplex = 2 ** (-per_word_bound)


In [None]:
# Report the value stored in `perplex` by the previous cell.
print('\nPerplexity for LDAModel: ', perplex)


In [None]:
# Compute the 'c_v' coherence of the 8-topic model (sg_lda) against the documents in cnn_docs
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=sg_lda, texts=cnn_docs, dictionary=sg_dictionary, coherence='c_v')
# get_coherence() performs the actual computation and returns the score
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score LDAModel: ', coherence_lda)

In [None]:
# Train one LDA model per topic count (2..20) and record its perplexity.
# Perplexity lower better
import datetime
print(datetime.datetime.now())

from gensim.models import CoherenceModel

model_list = []
perplexity_values = []
model_topics = []

for num_topics in range(2, 21, 1):
    sg_lda_x = gensim.models.ldamodel.LdaModel(corpus=sg_vecs, id2word=sg_dictionary, num_topics=num_topics,  random_state = 20)
    # log_perplexity() returns the per-word likelihood bound, for which HIGHER is
    # better; convert it to the actual perplexity (2 ** -bound) so that the
    # "lower is better" reading of the stored values is correct.
    per_word_bound = sg_lda_x.log_perplexity(sg_vecs, total_docs=None)
    perplex = 2 ** (-per_word_bound)
    model_topics.append(num_topics)
    model_list.append(sg_lda_x)
    perplexity_values.append(perplex)
    print("#Topics: " + str(num_topics) + " Score: " + str(perplex))

# The two timestamps bracket the sweep so its wall-clock cost is visible.
print(datetime.datetime.now())

In [None]:
# Train one LDA model per topic count (2..20) and record its c_v coherence.
# Coherence higher better
import datetime
print(datetime.datetime.now())

from gensim.models import CoherenceModel

# These lists are rebuilt from scratch: model_list[k] is the model with k + 2 topics.
model_list = []
coherence_values = []
model_topics = []

for num_topics in range(2, 21, 1):
    sg_lda_x = gensim.models.ldamodel.LdaModel(corpus=sg_vecs, id2word=sg_dictionary, num_topics=num_topics,  random_state = 20)
    coherencemodel = CoherenceModel(model=sg_lda_x, texts=cnn_docs, dictionary=sg_dictionary, coherence='c_v')
    # get_coherence() is expensive - evaluate it once and reuse the result
    # for both the stored value and the progress line.
    coherence = coherencemodel.get_coherence()
    model_topics.append(num_topics)
    model_list.append(sg_lda_x)
    coherence_values.append(coherence)
    print("#Topics: " + str(num_topics) + " Score: " + str(coherence))

# The two timestamps bracket the sweep so its wall-clock cost is visible.
print(datetime.datetime.now())

In [None]:
# Show graph for perplexity

import matplotlib.pyplot as plt

# Same topic-count range as the sweep that filled perplexity_values.
limit = 21
start = 2
step = 1
x = range(start, limit, step)
# Label the line itself: passing the bare string ("perplexity_values") to
# legend() is treated as a sequence of characters, so the entry would read "p".
plt.plot(x, perplexity_values, label="perplexity_values")
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
plt.legend(loc='best')
plt.show()

In [None]:
# Print the perplexity scores, one line per topic count.
# x is the topic-count range defined in the plotting cell; it is paired
# positionally with the values recorded by the sweep.
for m, cv in zip(x, perplexity_values):
    print("Num Topics =", m, " has Perplexity Value of", round(cv, 4))

In [None]:
# Show graph for coherence

import matplotlib.pyplot as plt

# Same topic-count range as the sweep that filled coherence_values.
limit = 21
start = 2
step = 1
x = range(start, limit, step)
# Label the line itself: passing the bare string ("coherence_values") to
# legend() is treated as a sequence of characters, so the entry would read "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# Print the coherence scores, one line per topic count.
# x is the topic-count range defined in the plotting cell; it is paired
# positionally with the values recorded by the sweep.
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
#Find most dominant topic

import numpy as np
import pandas as pd

def format_topics_sentences(ldamodel, corpus, data):
    """Build a per-document table of the dominant topic and its keywords.

    Parameters
    ----------
    ldamodel : object
        Must support ``ldamodel[corpus]`` (yielding, for each document, an
        iterable of ``(topic_id, weight)`` pairs) and
        ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)`` pairs).
    corpus : object
        The document vectors handed to ``ldamodel[...]``.
    data : sequence
        The original documents in corpus order; appended as the last column.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 8 decimals), ``Topic_Keywords`` and
        the document itself in an unnamed column (label ``0``). A document for
        which the model reports no topic gets ``NaN`` / ``NaN`` / ``""`` so
        that the remaining rows stay aligned with ``data``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> joined keyword string, looked up once per topic
    records = []

    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            # Keep a placeholder row; skipping the document would shift every
            # later row against the text column added below.
            records.append((float("nan"), float("nan"), ""))
            continue

        # Dominant topic = the pair with the largest weight (first one wins on ties).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 8), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(data)
    return pd.concat([sent_topics_df, contents], axis=1)

# model_list is filled in topic-count order starting at 2 topics, so index 8 is the 10-topic model.
# NOTE(review): the model saved further down is model_list[6] (8 topics) - confirm the 10-topic choice here is intended.
df_topic_sents_keywords = format_topics_sentences(ldamodel=model_list[8], corpus=sg_vecs, data=cnn_docs)

# Format: turn the row index into an explicit document-number column and name all five columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first 10 rows
df_dominant_topic.head(10)


In [None]:
# model_list starts at 2 topics, so index 6 is the model trained with 8 topics (matching the file name below).
sg_lda_sav=model_list[6]

# Persist the selected model under the name "model_8Topics"
sg_lda_sav.save("model_8Topics")

# Load it back from disk; lda_disk is not used again in the cells shown here
lda_disk=gensim.models.ldamodel.LdaModel.load("model_8Topics")

In [None]:
import pandas as pd

# Apply the saved model to every document vector and dump the result to doc_topic_mixture.csv.
# NOTE(review): presumably each document yields a variable-length list of (topic, probability)
# pairs, in which case the CSV columns are positional rather than one per topic - verify the layout.
vec=sg_lda_sav[sg_vecs]
pd.DataFrame(vec).to_csv("doc_topic_mixture.csv")

In [None]:

# Flatten the 10 top words of every topic of the saved model into
# (topic, word, probability) rows and write them to top_words.csv.
top_words_per_topic = [
    (topic_id, word, weight)
    for topic_id in range(sg_lda_sav.num_topics)
    for word, weight in sg_lda_sav.show_topic(topic_id, topn = 10)
]

top_words_df = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
top_words_df.to_csv("top_words.csv")
