# NRC Emotions lexicon based text and sentiment analysis between two text corpuses
This analyses the two corpuses PA and YT on the basis of association of words with eight emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The [NRC lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) is used.


# Load the python packages and initialize global variables

In [1]:
import docx
import gensim
import os
import pandas as pd
import nltk
import numpy as np
import re
import spacy
import sys
from scipy.stats import mannwhitneyu
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

nlp = spacy.load('en', disable=['parser', 'ner'])
sys.getdefaultencoding()

NRC_EMOTION_LEXICON_PATH = "Input/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
nouns = ['NNS', 'NNPS']  # + 'NN' + 'NNP',
adjectives = ['JJ', 'JJR', 'JJS']

OUTPUT_PATH = "Output"
PA_CORPUS_TEXT = "PACorpus.txt"
YT_CORPUS_TEXT = "YTCorpus.txt"
INPUT_PATH = "Input"
INPUT_FILE = "US3_ALL_TRANSCRIPTS.docx"
WORD_LIST = ['i', 'me', 'our', 'my', 'we', 'us', 'you', 'your', 'she', 'her', 'he', 'him', 'his', 'they','them', 'their']

# Read given input document file 
Read the given input file and return list of lines

In [2]:
def read_input_file(file_path):
    doc = docx.Document(file_path)
    total_lines = list()
    for i in doc.paragraphs:
        total_lines.append(i.text)
    return total_lines
filename = 'Input/US3_ALL_TRANSCRIPTS.docx'
lines = read_input_file(filename)

# Grouping of Corpuses
This method groups the given text into two groups (PA and YT) and removes unnecessary lines and characters. We have list of sentences read from the docx file as argument.

In [3]:
def group_to_corpuses(lines_in_input):
    yt_group = list()
    pa_group = list()
    initial_group_flag = None
    regex = re.compile('^P[0-9]+$')

    for line in lines_in_input:
        # remove empty line
        if not line.strip():
            continue
        # remove line starting with Joni or Jim
        if line.strip().startswith("Joni: ") or line.strip().startswith("Jim:"):
            continue
            
        # remove line starting with R: or R : since it is the interviwer part. There is no uniformity in starting character
        # so I have to use different srating characters
        if line.strip().startswith("R:") or line.strip().startswith("R: ") or line.strip().startswith("R :"):
            continue
            
        # If R: is present in a group of lines, remove the one sentence starting with R: and leave others
        if "R:" in line:
            line_list = line.splitlines()
            lines_list = line_list
            for single_line in line_list:
                if single_line.startswith("R:"):
                    lines_list.remove(single_line)
            line = ",".join(lines_list)

        # some line have P0, P1, (P+Number) so we use regex to find matching and remove those sentences
        if re.match(regex, line):
            continue
        # some line with date and time stamp information of inverview are removed
        if line.strip().startswith('2018-11-') or line.strip().startswith('Total experiment talk time:'):
            continue
            
        # regex removing text between brackets
        line = re.sub(r'\[[^()]*\]', '', line)

        # replace special characters given below with comma
        line = line.replace('...', ' ,')
        line = line.replace('…', ' ,')

        # this is a case of non-alphanumeric character present in sentence which does not start with P: or YT or PA
        # convert non-alphanumeric to numeric and remove sentences starting with R : or R4. or P10(that is left over 
        # due to non-alpha numeric character)
        if not line.strip().startswith("P:") and not line.strip().startswith('YT') and not line.strip().startswith(
                'PA'):
            line = "".join([i if ord(i) < 128 else ' ' for i in line])
            if line.strip().startswith("R :") or line.strip().startswith("R4.") or line.strip().startswith("P10"):
                continue

        # check if line start with YT or PA
        # if it starts with YT set group flag to YT and keep all sentences to YT group until flag is changed to PA
        # if flag is PA, keep all sentences to PA group until flag is changed to YT
        if line.strip().lower() == 'yt':
            initial_group_flag = 'yt'
            continue
        elif line.strip().lower() == 'pa':
            initial_group_flag = 'pa'
            continue
        elif line:
            if initial_group_flag == 'pa':
                pa_group.append(line)
            else:
                yt_group.append(line)
    return pa_group, yt_group
pa_group, yt_group = group_to_corpuses(lines)

# Write to each corpus file 
write the classified text to each corpus files

In [4]:
def write_to_corpus_file(data, _type=PA_CORPUS_TEXT):
    text_file_path = os.path.join(OUTPUT_PATH, _type)
    with open(text_file_path, 'w') as outfile:
        for line in data:
            outfile.write("%s\n" % line)

In [5]:
# write to pa corpus text file
write_to_corpus_file(pa_group, PA_CORPUS_TEXT)

# write to yt corpus text file
write_to_corpus_file(yt_group, YT_CORPUS_TEXT)

# Tokenize and Clean-up
We consider:
- Each text is a list of sentences
- Each sentence is a list of tokens
- Each token is a tuple of three elements: a word form (the exact word that appeared in the text), a word lemma (a generalized version of the word), and a list of associated tags

Tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether. Gensim’s ```simple_preprocess()``` is great for this. Additionally I have set ```deacc=True``` to remove the punctuations.

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))
pa_sentences_group = sent_to_words(pa_group)
yt_sentences_group = sent_to_words(yt_group)

# Lemmatization
Lemmatization convert words to its root word. This method lemmatizes the tokenized words and joins them into sentences.
For example: ‘Studying’ becomes ‘Study’, ‘Meeting becomes ‘Meet’, ‘Better’ and ‘Best’ becomes ‘Good’.

The advantage of this is, we get to reduce the total number of unique words in the dictionary. As a result, the number of columns in the document-word matrix (created by CountVectorizer in the further step) will be denser with lesser columns.

We can expect better topics to be generated in the end.

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join(
            [token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
pa_sentences_group = lemmatization(pa_sentences_group)
pa_sentences_group = ". ".join(pa_sentences_group)
yt_sentences_group = lemmatization(yt_sentences_group)
yt_sentences_group = ". ".join(yt_sentences_group)

# Splitting
Splitter class use english pickle splitter to split paragraph into list of sentences and tokenize each sentence into words using ```nltk.tokenize```

In [8]:
class Splitter(object):
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences

# POS Tagging
A POS tag is a tag that indicates the part of speech for a word. POS tags provide linguistic signal on how a word is being used within the scope of a phrase, sentence, or document. 

What I mean by this is that the word “run” can be used as a verb “I run 5 miles every day” or as a noun “I went for a run”. Sometimes the POS is very very useful in cases where it distinguishes the word sense (the meaning of the word). In other cases, it is still useful in explaining the syntactic role of a word and we can often infer semantic information from this due to our knowledge of how this syntactic role is commonly used semantically.

If we donot POS tag, performance on tasks like word sense disambiguation and sentiment analysis would be radically worse (as there are many results that show that POS provides linguistic signal that is essential for strong performance on these tasks).

Parts of Speech tagging of splitted words in a sentence by ```Splitter()```.

In [9]:
class POSTagger(object):
    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged tokens has a
        form, a lemma, and a list of tags
            e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                    [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """

        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt format
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos

# Dictionary Tagging of NRC emotion lexicon 
Read the NRC emotion lexicon and tag the words with emotion and lexicon. Use this tagged data to find the emotions and sentiment of each word of the POS tagged PA and YT corpus sentences.

In [10]:
class DictionaryTagger(object):
    def __init__(self, file_path):
        self.dictionary = dict()
        self.max_key_size = 0
        with open(file_path, "r", encoding="utf-8") as nrc_file:
            for line in nrc_file.readlines():
                if not line.strip():
                    continue
                line = re.sub(r'\s+', '\t', line)
                splited = line.replace("\n", "").split("\t")
                word, emotion, value = splited[0], splited[1], splited[2]

                if word in self.dictionary.keys():
                    self.dictionary[word].append((emotion, int(value)))
                else:
                    self.dictionary[word] = [(emotion, int(value))]

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    # self.logger.debug("found: %s" % literal)
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

# MannWhitney-U test
Perform MannWhitney-U test to compare the emotions and sentiments between PA corpus and YT corpus

In [11]:
def mann_whitney_u_test(group_pa, group_yt):
    print("Mann Whitney-u Test:")
    output = dict()
    for key, yt_value in group_yt.items():
        pa_value = group_pa.get(key)
        try:
            mw_stat, mw_p = mannwhitneyu([pa_value], [yt_value], use_continuity=False, alternative="greater")
        except ValueError:
            mw_stat = -1  # in case of ties, Mann-Whitney cannot rank, and so cannot calculate U
            mw_p = -1
        output.update({
            key: {
                'PA': pa_value,
                'YT': yt_value,
                'U-value': mw_stat,
                'P-value': mw_p
            }
        })
    return output

# Sentiment and Emotion Score Calculation
Get sentiments and emotions for each word from dictionary tagged data.

In [12]:
def get_sentiment(sentiment):
    if not isinstance(sentiment, tuple):
        return dict()
    results = dict()
    results.update({sentiment[0]: sentiment[1]})
    return results

Aggregate the sentiment score and emotions

In [13]:
def sentiment_score(dict_tagged_sentences):
    emotions = dict()
    for sentence in dict_tagged_sentences:
        for token in sentence:
            for tag in token[2]:
                value = get_sentiment(tag)
                if not value:
                    continue
                emotions.update({tag[0]: emotions.get(tag[0], 0) + value.get(tag[0], 0)})
    return emotions

Split the sentences in each corpus using the ```Splitter()``` we defined and POS tag the words using our ```POSTagger()``` before tagging these wors with sentiments and emotions using ```DictionaryTagger()```.

In [14]:
# read NRC emotion lexicon and tag these words with the given emotions and sentiments
dicttagger = DictionaryTagger(NRC_EMOTION_LEXICON_PATH)
# initialize 
postagger = POSTagger()
splitter = Splitter()

# split the paragraphs pa corpus into sentences and each sentence are tokenized to words list
splitted_pa_sentences = splitter.split(pa_sentences_group)

# POS tagging of tokenized words in sentences
pos_tagged_pa_sentences = postagger.pos_tag(splitted_pa_sentences)

# associating emotions and sentiment to the words in PA corpus using the loaded word emotion-sentiment from NRC emmotion lexicon
dict_tagged_pa_sentences = dicttagger.tag(pos_tagged_pa_sentences)

In [15]:
# split the paragraphs yt corpus into sentences and each sentence are tokenized to words list
splitted_yt_sentences = splitter.split(yt_sentences_group)

# POS tagging of tokenized words in sentences
pos_tagged_yt_sentences = postagger.pos_tag(splitted_yt_sentences)

# associating emotions and sentiment to the words in YT corpus using the loaded word emotion-sentiment from NRC emmotion lexicon
dict_tagged_yt_sentences = dicttagger.tag(pos_tagged_yt_sentences)

In [16]:
# calculate the emotions and sentiment score for PA corpus
pa_sentiment = sentiment_score(dict_tagged_pa_sentences)

# calculate the emotions and sentiment score for YT corpus
yt_sentiment = sentiment_score(dict_tagged_yt_sentences)

print("Is PA's Postive Sentiment > YT's Postive Sentiment? %s\n" % (pa_sentiment.get('positive') > yt_sentiment.get('positive')))

# calculate the MannWhitney-U test
output = mann_whitney_u_test(pa_sentiment, yt_sentiment)
pd.DataFrame.from_dict({(i): output[i] for i in output.keys()}, orient='index')

Is PA's Postive Sentiment > YT's Postive Sentiment? True

Mann Whitney-u Test:


Unnamed: 0,PA,YT,U-value,P-value
anger,57,55,1.0,0.158655
anticipation,352,460,0.0,0.841345
disgust,202,92,1.0,0.158655
fear,153,205,0.0,0.841345
joy,210,168,1.0,0.158655
negative,167,122,1.0,0.158655
positive,939,635,1.0,0.158655
sadness,235,93,1.0,0.158655
surprise,149,87,1.0,0.158655
trust,493,416,1.0,0.158655


Null Hypothesis (𝐻0): PA has less positive sentiment than YT.

Alternative hypothesis (𝐻𝑎)= PA has more positive sentiment than YT.

Null hypothesis is ``accepted for higher P-value``, and ``rejected for low P-values``.

Thus with low P-values for positive sentiment, we can say ``PA has more positive sentiment than YT``.

# Removing Special characters
It is done because previous tokenization and cleaning up removes all pronuns and words not needed for word analysis eg conjunction.

This method removes the special characters, articles, un-necessary words before steming and lemmatization.

In [17]:
def remove_special_characters_from_lines(lines):
    total_clean_word_list = list()
    line_wise_word_list = list()
    for sentence in lines:
        tokens = nltk.word_tokenize(sentence)
        if 'P' in tokens:
            tokens.remove('P')
        tokens = list(filter(lambda x: x, map(lambda x: re.sub(r'[^A-Za-z0-9]+', '', x), tokens)))
        # remove articles
        tokens = [token for token in tokens if token.lower() not in ['a', 'an', 'the']]
        tagged = nltk.pos_tag(tokens)

        # lemmatize and stem the words
        stemmer = nltk.stem.PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        clean_word_list = list()
        for x, y in tagged:
            x = x.lower()
            if y in adjectives:
                clean_word_list.append(lemmatizer.lemmatize(x, pos='a'))
            elif y in verbs:
                clean_word_list.append(lemmatizer.lemmatize(x, pos='v'))
            elif y in nouns:
                clean_word_list.append(stemmer.stem(x))
            else:
                clean_word_list.append(x)
        total_clean_word_list.extend(clean_word_list)
        line_wise_word_list.append(clean_word_list)
    return total_clean_word_list, line_wise_word_list

tokenize, remove special characters, stem and lemmatize

In [18]:
pa_cleaned_up, _ = remove_special_characters_from_lines(pa_group)
yt_cleaned_up, _ = remove_special_characters_from_lines(yt_group)

# Lexical Diversity Analyser

In [19]:
def lexical_diversity_analyser(words_grouping):
    return len(set(words_grouping)) / len(words_grouping)

In [20]:
# calculate the lexical diversity
pa_ld = lexical_diversity_analyser(pa_cleaned_up)
yt_ld = lexical_diversity_analyser(yt_cleaned_up)

print("PA lexical diversity: %s" % pa_ld)
print("YT lexical diversity: %s" % yt_ld)

print("\nPA>YT? %s" % (pa_ld > yt_ld))

PA lexical diversity: 0.08002443494196701
YT lexical diversity: 0.08059488245226652

PA>YT? False


# Personal Pronoun Analysis
This method counts the number of personal pronouns in two separate corpuses for analysis. The pronouns are ```'i' 'me' 'our' 'my' 'we' 'us' 'you' 'your' 'she' 'her' 'he' 'him' 'his' 'they' 'them' 'their'```

In [21]:
def personal_pronoun_analysis(words_grouping):
    pronouns_to_count = WORD_LIST
    return dict((x, words_grouping.count(x)) for x in set(pronouns_to_count))

In [22]:
pa_personal_pronouns = personal_pronoun_analysis(pa_cleaned_up)

yt_personal_pronouns = personal_pronoun_analysis(yt_cleaned_up)

print("Count of SHE in PA > YT ? %s" % (pa_personal_pronouns.get('she') > yt_personal_pronouns.get('she')))
print("Count of HE in PA > YT ? %s" % (pa_personal_pronouns.get('he') > yt_personal_pronouns.get('he')))
print("Count of HER in PA > YT ? %s" % (pa_personal_pronouns.get('her') > yt_personal_pronouns.get('her')))
print("Count of HIS in PA > YT ? %s\n" % (pa_personal_pronouns.get('his') > yt_personal_pronouns.get('his')))


# calculate MannWhitney-U test
output = mann_whitney_u_test(pa_personal_pronouns, yt_personal_pronouns)
pd.DataFrame.from_dict({(i): output[i] for i in output.keys()}, orient='index')

Count of SHE in PA > YT ? True
Count of HE in PA > YT ? True
Count of HER in PA > YT ? True
Count of HIS in PA > YT ? True

Mann Whitney-u Test:


Unnamed: 0,PA,YT,U-value,P-value
he,281,28,1.0,0.158655
her,6,1,1.0,0.158655
him,34,3,1.0,0.158655
his,59,6,1.0,0.158655
i,766,842,0.0,0.841345
me,63,77,0.0,0.841345
my,44,50,0.0,0.841345
our,37,80,0.0,0.841345
she,5,2,1.0,0.158655
their,14,16,0.0,0.841345


Null Hypothesis (𝐻0): PA have less counts of personal pronouns than YT.

Alternative hypothesis (𝐻𝑎)= PA have greater counts of personal pronouns than YT.

Null hypothesis is ``accepted for higher P-value``, and ``rejected for low P-values``.

Thus with low P-values for third person singular pronouns, we can say PA refers more to third person singular (she, he, her, his) than YT.

# Get the top 10 keywords from each topic
Top 10 keywords that are representative of the topic are created by ```show_topics()```

In [23]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Create the Document-Word matrix
The LDA topic model algorithm requires a document word matrix as the main input.

We can create one using CountVectorizer. In the below code, I have configured the CountVectorizer to consider words that has occurred at least 10 times (min_df), remove built-in english stopwords, convert all words to lowercase, and a word can contain numbers and alphabets of at least length 3 in order to be qualified as a word.

So, to create the doc-word matrix, we need to first initialise the CountVectorizer class with the required configuration and then apply fit_transform to actually create the matrix.

Sparsicity is nothing but the percentage of non-zero datapoints in the document-word matrix, that is data_vectorized.

In [24]:
def get_vectorized_data(data):
    data_words = list(sent_to_words(data))

    # Do lemmatization keeping only Noun, Adj, Verb, Adverb
    data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # print(data_lemmatized)

    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=10,  # minimum reqd occurences of a word
        stop_words='english',  # remove stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
    )

    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    # Materialize the sparse data
    data_dense = data_vectorized.todense()

    # Compute Sparsicity = Percentage of Non-Zero cells
    print("Sparsicity: ", ((data_dense > 0).sum() / data_dense.size) * 100, "%")

    return vectorizer, data_vectorized

# GridSearch to find the best LDA model
The most important tuning parameter for LDA models is n_components (number of topics). In addition, learning_decay (which controls the learning rate) is important as well.

Besides these, other possible search params could be learning_offset (downweigh early iterations. Should be > 1) and max_iter.

The best topic model and its parameters are printed.

```Warning: process can consume a lot of time and resources.```

In [25]:
def get_optimized_lda(data):
    vectorizer, data_vectorized = get_vectorized_data(data)

    # Define Search Param
    search_params = {'n_components': [10, 15, 20, 25, 30, 35, 40], 'learning_decay': [0.1, 0.3, .5, .7, .9]}

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params, cv=3, iid=True)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_
    print(best_lda_model)
    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
    return best_lda_model, data_vectorized, vectorizer

 # Finding dominant topic in each document
 
To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.

```Note: Document here refers to each sentence of interviewee```

In [26]:
def create_document_topic_matrix(best_lda_model, data_vectorized):
    # Create Document - Topic Matrix

    lda_output = best_lda_model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

    # index names
    docnames = ["Sentence" + str(i) for i in range(int(lda_output.size/best_lda_model.n_components))]
    
    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    # Apply Style
    df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)

    return df_document_topics, df_document_topic, topicnames


# Review topics distribution across documents

In [27]:
def get_topic_distribution_over_documents(df_document_topic):
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Number', 'Number of Sentences']
    return df_topic_distribution

# Visualize the LDA model with pyLDAvis
The pyLDAvis offers the best visualization to view the topics-keywords distribution.

A good topic model will have non-overlapping, fairly big sized blobs for each topic.

In [28]:
def visualize_lda_model(best_lda_model, data_vectorized, vectorizer):
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
    return panel


# Get Topic’s keywords
The weights of each keyword in each topic is contained in lda_model.components_ as a 2d array. The names of the keywords itself can be obtained from vectorizer object using get_feature_names().

Let’s use this info to construct a weight matrix for all keywords in each topic.

In [29]:
def get_topic_keywords(best_lda_model, vectorizer, topicnames):
    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)

    # Assign Column and Index
    df_topic_keywords.columns = vectorizer.get_feature_names()
    df_topic_keywords.index = topicnames
    return df_topic_keywords

# Get the top 10 keywords of each topic

In [30]:
def get_top_ten_keywords_in_topics(best_lda_model, vectorizer):
    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    return df_topic_keywords

# Build LDA model with sklearn
Latent Dirichlet Allocation (LDA) is initialised and fit_transform() is called to build the LDA model.

```Note: This is not used here but we can use the optimized params from GridSearch Algorithm give above to initialized and fit the model.```

In [31]:
def analyser(data):
    _, data_vectorized = get_vectorized_data(data)
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)

    print(lda_output)

    # Log Likelyhood: Higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))

    # See model parameters
    pprint(lda_model.get_params())

In [32]:
# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)


def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

Get the optimized LDA parameters for each corpuses

In [33]:
print("Optimal LDA Parameters for PA corpus:")
best_lda_model_pa, data_vectorized_pa, vectorizer_pa = get_optimized_lda(pa_group)

Optimal LDA Parameters for PA corpus:
Sparsicity:  2.959816003294264 %
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.1,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Best Model's Params:  {'learning_decay': 0.1, 'n_components': 10}
Best Log Likelihood Score:  -6603.29771211544
Model Perplexity:  106.21942876092282


In [34]:
print("Optimal LDA Parameters for YT corpus:")
best_lda_model_yt, data_vectorized_yt, vectorizer_yt = get_optimized_lda(yt_group)

Optimal LDA Parameters for YT corpus:
Sparsicity:  3.286978508217446 %
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.5,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -6086.15535114086
Model Perplexity:  105.13397706825653


In [35]:
print("Sentence-topic matrix for PA corpus:")
df_document_topics_pa, df_document_topic_pa, topicnames_pa = create_document_topic_matrix(best_lda_model_pa, data_vectorized_pa)
df_document_topics_pa

Sentence-topic matrix for PA corpus:


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Sentence0,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,3
Sentence1,0.01,0.01,0.01,0.01,0.38,0.01,0.01,0.39,0.14,0.01,7
Sentence2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Sentence3,0.37,0.37,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Sentence4,0.02,0.02,0.22,0.42,0.22,0.02,0.02,0.02,0.02,0.02,3
Sentence5,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,9
Sentence6,0.01,0.01,0.01,0.01,0.3,0.42,0.01,0.19,0.01,0.01,5
Sentence7,0.01,0.01,0.16,0.01,0.01,0.01,0.01,0.01,0.3,0.44,9
Sentence8,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,0.03,0.03,5
Sentence9,0.0,0.11,0.0,0.06,0.0,0.62,0.11,0.0,0.08,0.0,5


In [36]:
print("Sentence-topic matrix for YT corpus:")
df_document_topics_yt, df_document_topic_yt, topicnames_yt = create_document_topic_matrix(best_lda_model_yt, data_vectorized_yt)
df_document_topics_yt

Sentence-topic matrix for YT corpus:


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Sentence0,0.22,0.02,0.02,0.02,0.02,0.02,0.62,0.02,0.02,0.02,6
Sentence1,0.01,0.01,0.01,0.01,0.01,0.01,0.27,0.01,0.46,0.19,8
Sentence2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Sentence3,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,0.01,8
Sentence4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Sentence5,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.37,0.37,0.03,7
Sentence6,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.77,0.03,8
Sentence7,0.01,0.01,0.23,0.26,0.01,0.01,0.46,0.01,0.01,0.01,6
Sentence8,0.02,0.02,0.02,0.02,0.02,0.82,0.02,0.02,0.02,0.02,5
Sentence9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,0.05,8


In [37]:
print("Topics distribution over sentences in PA corpus:")
df_topic_distribution_pa = get_topic_distribution_over_documents(df_document_topic_pa)
df_topic_distribution_pa

Topics distribution over sentences in PA corpus:


Unnamed: 0,Topic Number,Number of Sentences
0,0,338
1,6,82
2,2,82
3,3,68
4,9,67
5,5,66
6,4,61
7,7,50
8,1,46
9,8,37


In [38]:
print("Topics distribution over sentences in YT corpus:")
df_topic_distribution_yt = get_topic_distribution_over_documents(df_document_topic_yt)
df_topic_distribution_yt

Topics distribution over sentences in YT corpus:


Unnamed: 0,Topic Number,Number of Sentences
0,0,268
1,3,99
2,8,76
3,2,65
4,5,59
5,6,51
6,7,50
7,4,48
8,9,40
9,1,35


In [39]:
print("Visualization of topics and most influencial words in the topics in PA corpus:")
pa_panel = visualize_lda_model(best_lda_model_pa, data_vectorized_pa, vectorizer_pa)
filename = 'Output/lda_pa.html'
pyLDAvis.save_html(pa_panel, filename)
pyLDAvis.display(pa_panel)

Visualization of topics and most influencial words in the topics in PA corpus:


In [40]:
print("Visualization of topics and most influencial words in the topics in YT corpus:")
yt_panel = visualize_lda_model(best_lda_model_yt, data_vectorized_yt, vectorizer_yt)
pyLDAvis.save_html(yt_panel, 'Output/lda_yt.html')
pyLDAvis.display(yt_panel)

Visualization of topics and most influencial words in the topics in YT corpus:


In [41]:
print("Weights of each keyword in each topic in PA corpus:")
df_topic_keywords_pa = get_topic_keywords(best_lda_model_pa, vectorizer_pa, topicnames_pa)
df_topic_keywords_pa.head()

Weights of each keyword in each topic in PA corpus:


Unnamed: 0,actually,age,american,analytic,audience,average,base,big,click,college,...,united,use,video,view,want,watch,work,write,year,youtube
Topic0,6.075188,0.100008,4.837155,0.100011,0.100009,0.100017,14.734043,0.100002,12.099965,0.1,...,0.100005,8.756649,0.100014,0.100024,0.100009,4.096332,0.10002,0.100011,0.100002,0.10002
Topic1,9.276368,0.100005,0.100056,0.100005,0.100005,0.100003,0.100012,1.240332,0.100006,3.060798,...,0.100004,0.100015,2.852568,3.161691,0.100006,0.100016,0.100011,0.100012,0.100003,0.100007
Topic2,0.100021,0.100038,0.100017,0.1,0.100003,0.100004,0.100002,0.100001,0.100003,0.100011,...,0.100008,0.100005,0.100011,0.100008,0.100016,0.10001,3.211776,9.398381,0.100002,0.100007
Topic3,8.244376,0.100027,0.100016,1.671503,0.100008,0.100011,0.10001,0.100004,0.100009,0.100008,...,0.100005,2.781822,0.100009,0.100035,7.345402,0.100003,6.026908,0.100017,0.100009,2.528608
Topic4,0.100009,39.626587,0.100014,0.100005,0.100014,0.100018,0.100011,0.1,0.100001,6.827783,...,0.100014,0.100004,0.100024,0.100017,0.100005,0.100012,0.100007,0.599884,0.100003,0.100006


In [42]:
print("Weights of each keyword in each topic in YT corpus:")
df_topic_keywords_yt = get_topic_keywords(best_lda_model_yt, vectorizer_yt, topicnames_yt)
df_topic_keywords_yt.head()

Weights of each keyword in each topic in YT corpus:


Unnamed: 0,actually,age,analytic,audience,average,big,bit,channel,click,come,...,view,viewer,want,watch,way,woman,work,write,year,youtube
Topic0,0.100008,8.452175,0.100002,0.100023,17.589971,11.041212,0.100011,0.100004,7.795185,11.352018,...,42.145721,0.100019,0.100015,20.619627,0.1,0.100006,0.100009,0.100003,3.943871,3.955746
Topic1,3.374296,7.49945,0.100002,2.685908,0.952563,2.158697,0.100027,1.09991,0.100005,0.100007,...,9.181564,11.533218,0.100006,23.521182,0.100014,6.87788,0.1,0.1,0.100007,0.816617
Topic2,0.100014,0.100008,0.10001,0.100014,1.56594,0.100006,2.951572,0.100009,1.266147,5.081311,...,9.577562,0.100006,0.100007,45.391348,0.100002,0.100001,0.100003,2.099901,0.100003,0.10001
Topic3,9.237356,0.100013,0.100006,6.989475,0.100004,0.100003,4.099948,0.10001,11.868525,5.425213,...,0.100017,1.397044,0.100011,11.284581,5.099969,2.120734,7.480872,21.100054,0.1,8.424013
Topic4,0.10001,0.100004,23.099949,3.005906,0.100001,0.100019,0.100011,0.1,0.100023,0.100006,...,1.501162,0.100007,0.100005,0.100005,0.100012,1.301351,6.719057,0.100008,0.100005,21.102709


In [43]:
print('Top 10 keywords in each topic in PA corpus:')
df_topic_keywords_pa = get_top_ten_keywords_in_topics(best_lda_model_pa, vectorizer_pa)
df_topic_keywords_pa

Top 10 keywords in each topic in PA corpus:


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,just,really,know,people,thing,stuff,sure,let,base,don
Topic 1,think,good,second,really,news,actually,probably,easy,mean,view
Topic 2,know,look,don,remember,right,like,guy,guess,person,easy
Topic 3,just,say,search,try,think,interesting,people,know,sort,right
Topic 4,target,group,age,topic,thank,important,man,college,grad,comment
Topic 5,interested,social,medium,politic,syria,issue,research,news,innovation,maybe
Topic 6,video,minute,youtube,watch,time,view,write,read,title,average
Topic 7,research,year,male,innovation,state,united,election,phil,old,work
Topic 8,audience,size,news,reach,general,big,current,interested,egypt,just
Topic 9,persona,content,jordan,want,look,need,kind,say,interested,similar


In [44]:
print('Top 10 keywords in each topic in YT corpus:')
df_topic_keywords_yt = get_top_ten_keywords_in_topics(best_lda_model_yt, vectorizer_yt)
df_topic_keywords_yt

Top 10 keywords in each topic in YT corpus:


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,jordan,view,video,watch,average,just,duration,let,datum,guess
Topic 1,content,watch,number,look,video,long,male,man,viewer,don
Topic 2,time,watch,really,need,thing,sure,look,minute,just,exactly
Topic 3,know,don,video,write,news,mean,email,like,click,think
Topic 4,say,analytic,youtube,male,persona,easy,use,female,work,high
Topic 5,people,interested,didn,good,maybe,comment,story,actually,look,know
Topic 6,audience,target,try,state,united,demographic,look,want,male,people
Topic 7,just,want,say,youtube,thank,watch,tell,video,right,male
Topic 8,look,think,jordan,year,man,age,just,information,okay,try
Topic 9,group,age,male,watch,target,view,total,jordan,demographic,try
