In [None]:
import pandas as pd

In [None]:
# Load the corpus. Malformed rows are reported with a warning and skipped instead of
# aborting the load. `on_bad_lines` is the replacement for `error_bad_lines`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv('Corpus.csv', on_bad_lines='warn')

In [None]:
# Number of rows and columns of the loaded corpus, as a (rows, columns) tuple
data.shape


In [None]:
# Preview the first rows. `head` has to be called: a bare `data.head` only displays
# the bound method's repr, not the data.
data.head()

In [None]:
# Detect the language of every document and count documents per detected language.
# NOTE(review): nothing is filtered here even though the goal was to keep English only —
# this cell just adds the 'lang' column; confirm whether non-English rows should be dropped.
from langdetect import detect
from tqdm import tqdm_notebook
# Hooks tqdm into pandas so that Series expose progress_map (used here and in later cells).
tqdm_notebook().pandas()
data['lang'] = data.content.progress_map(detect)
data.lang.value_counts()



In [None]:
# Word cloud of the raw text, before stop-word removal.
# The wordcloud code below is currently commented out; only matplotlib is imported here.
import matplotlib.pyplot as plt

#from wordcloud import WordCloud
# Join the different processed titles together.
#long_string = ','.join(list(data['content'].values))
# Create a WordCloud object
#wordcloud = WordCloud(background_color="black", max_words=5000, contour_width=10, contour_color='steelblue')
# Generate a word cloud
#wordcloud.generate(long_string)
# Visualize the word cloud
##wordcloud.to_image()
#wordcloud.to_file('N.png')

#changes





In [None]:
# Tokenization, step 1: split each document's text into sentences.
# (Splitting sentences into words, lowercasing and punctuation removal happen in later cells.)
import nltk
# 'punkt' is the NLTK resource downloaded for sentence tokenization.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# progress_map is the tqdm-enabled variant of Series.map; result: one list of sentences per document.
data['sentences'] = data.content.progress_map(sent_tokenize)
#data['sentences'].head(1).tolist()[0][:3] # Print the first three sentences of the first row

In [None]:

# Tokenization, step 2: split every sentence of every document into word tokens.
from nltk.tokenize import word_tokenize


def _tokenize_sentences(sentences):
    """Return one list of word tokens per sentence in ``sentences``."""
    return [word_tokenize(sentence) for sentence in sentences]


data['tokens_sentences'] = data['sentences'].progress_map(_tokenize_sentences)
# Uncomment to inspect the tokens of the first three sentences of the first document:
# print(data['tokens_sentences'].head(1).tolist()[0][:3])

In [None]:
# Lemmatization, step 1: POS tagging.
# Each token is tagged with its part of speech so that the lemmatization cells
# further down can lemmatize it with the matching WordNet POS.
import nltk
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger')


def _tag_sentences(tokens_sentences):
    """Return the ``pos_tag`` result for each tokenized sentence of one document."""
    return [pos_tag(tokens) for tokens in tokens_sentences]


data['POS_tokens'] = data['tokens_sentences'].progress_map(_tag_sentences)
# Uncomment to inspect the tagged tokens of the first three sentences of the first document:
# print(data['POS_tokens'].head(1).tolist()[0][:3])


In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching ``wordnet`` POS constant.

    The tag is matched on its first letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag (including an empty string) yields ``''``.
    """
    # Attribute names are resolved lazily so only the matching constant is looked up.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return ''

from nltk.stem.wordnet import WordNetLemmatizer
# Single module-level lemmatizer instance, reused for every token by the lemmatization cell.
lemmatizer = WordNetLemmatizer()

In [None]:

# Lemmatize each token with its POS tag, sentence by sentence.
def _lemmatize_tagged_sentence(tokens_POS):
    """Lemmatize one POS-tagged sentence.

    ``tokens_POS`` is iterated as ``(token, treebank_tag)`` pairs. A token whose tag
    maps to a WordNet POS (via ``get_wordnet_pos``) is lemmatized with that POS;
    every other token is returned unchanged.
    """
    lemmas = []
    for token, treebank_tag in tokens_POS:
        # Resolve the WordNet POS once per token and reuse it for both the test and the call.
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos != '':
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            lemmas.append(token)
    return lemmas


data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        _lemmatize_tagged_sentence(tokens_POS) for tokens_POS in list_tokens_POS
    ]
)

In [None]:
# Inspect the lemmatized tokens of (up to) the first three sentences of the first document.
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

In [None]:
# Regroup tokens and remove stop words.
# NLTK is a package for building Python programs that work with human language data.
from itertools import chain  # flattens a list of sentences of tokens into one list of tokens
from nltk.corpus import stopwords

# Project-specific stop words. The file content is split into individual words so that
# list.extend() below adds whole words; extending with one long string would add its
# single characters instead and the custom stop words would never match.
# NOTE(review): assumes stopwords.txt separates words by whitespace/newlines and/or commas — confirm.
with open('stopwords.txt', 'r') as file:
    dataExtendedStopwords = file.read().replace(',', ' ').split()

stop_words = stopwords.words('english')
stop_words.extend(dataExtendedStopwords)
# Tokens are compared in lowercase, once per token: use a lowercase set for O(1) look-ups.
stop_word_set = {word.lower() for word in stop_words}

# One flat token list per document.
data['tokens'] = data['tokens_sentences_lemmatized'].map(
    lambda sentences: list(chain.from_iterable(sentences))
)
# Keep lowercase, purely alphabetic tokens longer than one character that are not stop words.
data['tokens'] = data['tokens'].map(
    lambda tokens: [
        token.lower()
        for token in tokens
        if token.isalpha() and len(token) > 1 and token.lower() not in stop_word_set
    ]
)
print(data['tokens'])



In [None]:
#LDA preparation
#Prepare bi-grams and tri-grams
from gensim.models import Phrases
# Train a bigram model on the cleaned token lists, then a trigram model on the
# bigram-transformed documents, and finally apply both to every document.
unigram_docs = data['tokens'].tolist()
bigram_model = Phrases(unigram_docs)
trigram_model = Phrases(bigram_model[unigram_docs], min_count=1)
tokens = [phrased_doc for phrased_doc in trigram_model[bigram_model[unigram_docs]]]


In [None]:
#Prepare objects for the gensim LDA implementation
from gensim import corpora
# Build the dictionary from the phrase-merged documents, then encode each document with doc2bow.
dictionary_LDA = corpora.Dictionary(tokens)
# Optional pruning, currently disabled:
# dictionary_LDA.filter_extremes(no_below=3)
corpus = list(map(dictionary_LDA.doc2bow, tokens))
print(corpus[:2])

In [None]:
#Running LDA
from gensim import models
import numpy as np
# Fix the seeds (NumPy's global one and the model's own random_state) before training.
np.random.seed(123456)
num_topics = 10
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary_LDA,
    num_topics=num_topics,
    chunksize=450,
    passes=10,
    alpha='auto',
    # eta is passed explicitly: one 0.01 entry per dictionary key.
    eta=[0.01] * len(dictionary_LDA.keys()),
    random_state=500,
    per_word_topics=True,
)

In [None]:
# Print every topic id followed by the formatted description of its ten top words.
topic_descriptions = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10)
for topic_id, description in topic_descriptions:
    print(f"{topic_id}: {description}")
  

In [None]:
# Collect a (topic id, word, probability) tuple for the ten top words of every topic
# and export the table to CSV.
top_words_per_topic = [
    (topic_id,) + word_and_prob
    for topic_id in range(lda_model.num_topics)
    for word_and_prob in lda_model.show_topic(topic_id, topn=10)
]

top_words_table = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
top_words_table.to_csv("10Topics.csv")

In [None]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
# importing os module 
#OS module in Python provides functions for interacting with the operating system
import os 
# Visualize the topics: build the pyLDAvis view of the fitted model, cache it on disk
# and export it as a standalone HTML page.
pyLDAvis.enable_notebook()

# The cache file is named after the number of topics (e.g. "10").
LDAvis_data_filepath = str(num_topics)

# Preparing the visualization is time consuming. Keep this True to (re)compute it;
# set it to False to reuse the pickle written by a previous run.
RECOMPUTE_LDAVIS = True

if RECOMPUTE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary_LDA)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # NOTE: unpickling can execute arbitrary code — only load cache files written by this notebook.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, str(num_topics) +'.html')
LDAvis_prepared




In [None]:
#What is the Dominant topic and its percentage contribution in each document
#In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. 
#The below code extracts this dominant topic for each sentence and shows the weight of the topic and the keywords in a nicely formatted output.
#This way, you will know which document belongs predominantly to which topic.




def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a per-document table of the dominant topic, its weight and its keywords.

    Parameters
    ----------
    ldamodel : model object
        Indexed with ``corpus`` to obtain the topic weights of each document; must
        expose ``per_word_topics`` and ``show_topic``. Required in practice: leaving
        it as ``None`` raises ``TypeError``.
    corpus : iterable
        Documents in the form accepted by ``ldamodel[corpus]``.
    texts : sequence
        Per-document payload; wrapped in ``pd.Series`` and appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 decimals) and
        ``Topic_Keywords``, followed by the column built from ``texts``. Rows follow
        the order of ``corpus``.

    Raises
    ------
    TypeError
        If ``ldamodel`` is ``None``.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a fitted ldamodel, got None")

    keywords_by_topic = {}  # topic id -> "word1, word2, ..." (computed once per topic)
    rows = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first item is the
        # (topic id, weight) list; otherwise the result is that list itself.
        topic_weights = row_list[0] if ldamodel.per_word_topics else row_list

        if not topic_weights:
            # Keep a placeholder so the rows stay aligned with `texts`.
            rows.append((float('nan'), float('nan'), ''))
            continue

        # Dominant topic = highest weight; on ties max() keeps the first one, exactly
        # like taking the head of a stable descending sort.
        topic_num, prop_topic = max(topic_weights, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go instead of appending row by row.
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# One row per document: dominant topic, its contribution, its keywords and the document's tokens.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=tokens)

# Format: expose the row index as a document-number column and rename all columns.
# The assignment is positional, so the list order must match the frame's column order.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
#The most representative sentence for each topic
# Display setting to show more characters in column

# Let DataFrame cells display up to 100 characters.
pd.options.display.max_colwidth = 100

# For every dominant topic, keep the single row with the highest contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
best_row_per_topic = [
    topic_rows.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, topic_rows in sent_topics_outdf_grpd
]
# The leading empty frame keeps the result a (possibly empty) DataFrame even without groups.
sent_topics_sorteddf_mallet = pd.concat([pd.DataFrame(), *best_row_per_topic], axis=0)

# Renumber the rows from zero.
sent_topics_sorteddf_mallet = sent_topics_sorteddf_mallet.reset_index(drop=True)

# Positional rename of the four columns.
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
#Let’s plot the word counts and the weights of each keyword in the same chart.
from collections import Counter
# Topic keywords with their weights, as (topic id, [(word, weight), ...]) pairs.
topics = lda_model.show_topics(formatted=False)

# Count how often every token occurs across all documents.
data_flat = []
for doc_tokens in tokens:
    data_flat.extend(doc_tokens)
counter = Counter(data_flat)

# One record per (topic, keyword): word, topic id, weight in the topic, corpus-wide count.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic_words in topics
    for word, weight in topic_words
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
import matplotlib.colors as mcolors
# One panel per topic on a 5 x 2 grid: wide translucent bars show the word counts (left
# axis), narrow solid bars show the keyword weights (right, twin axis).
fig, axes = plt.subplots(5, 2, figsize=(10,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())

for i, ax in enumerate(axes.flatten()):
    topic_rows = df.loc[df.topic_id == i, :]
    topic_color = cols[i]

    ax.bar(x='word', height="word_count", data=topic_rows, color=topic_color,
           width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=topic_color,
                width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=topic_color)
    # Fixed axis limits so that all panels share the same scale.
    ax_twin.set_ylim(0, 0.030)
    ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=topic_color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=10)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()

