# NRC Emotions lexicon based text and sentiment analysis between two text corpuses
This analyses the two corpuses PA and YT on the basis of association of words with eight emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The [NRC lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) is used.


# Load the python packages and initialize global variables

In [1]:
import docx
import gensim
import os
import pandas as pd
import nltk
import numpy as np
import re
import spacy
import sys
from scipy.stats import mannwhitneyu
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spaCy pipeline used by lemmatization(); the parser and NER components are
# disabled when the model is loaded.
nlp = spacy.load('en', disable=['parser', 'ner'])
# Bare expression: the returned encoding name is not stored or used.
sys.getdefaultencoding()

# Path of the NRC word-level emotion lexicon handed to DictionaryTagger.
NRC_EMOTION_LEXICON_PATH = "Input/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

# POS-tag groups compared against nltk.pos_tag() output in
# remove_special_characters_from_lines(): verbs and adjectives are lemmatized,
# the listed noun tags are stemmed.
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
nouns = ['NNS', 'NNPS']  # + 'NN' + 'NNP',
adjectives = ['JJ', 'JJR', 'JJS']

# Directory and file names of the two corpus text files written by
# write_to_corpus_file().
OUTPUT_PATH = "Output"
PA_CORPUS_TEXT = "PACorpus.txt"
YT_CORPUS_TEXT = "YTCorpus.txt"
# Directory and file name of the transcript document.
INPUT_PATH = "Input"
INPUT_FILE = "US3_ALL_TRANSCRIPTS.docx"
# Lower-case pronouns counted by personal_pronoun_analysis().
WORD_LIST = ['i', 'me', 'our', 'my', 'we', 'us', 'you', 'your', 'she', 'her', 'he', 'him', 'his', 'they','them', 'their']

# Read given input document file 
Read the given input file and return list of lines

In [2]:
def read_input_file(file_path):
    """Return the text of every paragraph of a .docx document.

    :param file_path: path of the document, passed to ``docx.Document``.
    :return: list with one string per paragraph, in document order.
    """
    document = docx.Document(file_path)
    return [paragraph.text for paragraph in document.paragraphs]
# Build the transcript path from the INPUT_PATH / INPUT_FILE constants so the
# location is defined in exactly one place.
filename = os.path.join(INPUT_PATH, INPUT_FILE)
lines = read_input_file(filename)

# Grouping of Corpuses
This method groups the given text into two groups (PA and YT) and removes unnecessary lines and characters.

In [3]:
def group_to_corpuses(lines_in_input):
    """Split transcript lines into the PA group and the YT group.

    A line whose stripped, lower-cased text is exactly ``pa`` or ``yt`` switches
    the current group; every following kept line is appended to that group.
    Lines seen before the first marker go to the YT group.

    Dropped lines: blank lines, lines starting with ``Joni: `` / ``Jim:`` /
    ``R:`` / ``R :``, bare ``P<number>`` lines, lines starting with
    ``2018-11-`` or ``Total experiment talk time:``, and (after clean-up)
    unlabelled lines starting with ``R :``, ``R4.`` or ``P10``.

    Clean-up applied to kept lines: embedded sub-lines starting with ``R:``
    are removed, every ``[...]`` span is removed, ellipses become `` ,`` and,
    for lines not starting with ``P:``, ``YT`` or ``PA``, non-ASCII characters
    are replaced by spaces.

    :param lines_in_input: iterable of text lines.
    :return: tuple ``(pa_group, yt_group)`` of lists of cleaned lines.
    """
    yt_group = []
    pa_group = []
    current_group = None
    participant_id = re.compile('^P[0-9]+$')
    # Each bracket pair is matched on its own; a greedy "anything but
    # parentheses" class would swallow the text between two separate pairs.
    bracketed = re.compile(r'\[[^\]]*\]')

    for line in lines_in_input:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith(("Joni: ", "Jim:")):
            continue
        if stripped.startswith(("R:", "R :")):
            continue
        if "R:" in line:
            # Build a new list instead of removing while iterating, which
            # skipped the element following each removed one.
            kept_parts = [part for part in line.splitlines() if not part.startswith("R:")]
            line = ",".join(kept_parts)

        if participant_id.match(line):
            continue
        if line.strip().startswith(('2018-11-', 'Total experiment talk time:')):
            continue
        line = bracketed.sub('', line)

        line = line.replace('...', ' ,')
        line = line.replace('…', ' ,')

        if not line.strip().startswith(("P:", 'YT', 'PA')):
            line = "".join(char if ord(char) < 128 else ' ' for char in line)
            if line.strip().startswith(("R :", "R4.", "P10")):
                continue

        marker = line.strip().lower()
        if marker == 'yt':
            current_group = 'yt'
        elif marker == 'pa':
            current_group = 'pa'
        elif line:
            if current_group == 'pa':
                pa_group.append(line)
            else:
                yt_group.append(line)
    return pa_group, yt_group
# Split the transcript lines into the PA and YT line lists used by every later cell.
pa_group, yt_group = group_to_corpuses(lines)

# Write to each corpus file 
Write the classified text of each group to its own corpus file.

In [4]:
def write_to_corpus_file(data, _type=PA_CORPUS_TEXT):
    """Write each item of *data* on its own line to ``OUTPUT_PATH/_type``.

    The output directory is created when missing and the file is written as
    UTF-8, so lines that still contain non-ASCII characters do not depend on
    the platform's default encoding.

    :param data: iterable of lines (each is formatted with ``%s``).
    :param _type: file name inside ``OUTPUT_PATH``; defaults to the PA corpus file.
    """
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    text_file_path = os.path.join(OUTPUT_PATH, _type)
    with open(text_file_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines("%s\n" % line for line in data)

In [5]:
# Write the PA group lines to the PA corpus text file (one line per entry).
write_to_corpus_file(pa_group, PA_CORPUS_TEXT)

# Write the YT group lines to the YT corpus text file (one line per entry).
write_to_corpus_file(yt_group, YT_CORPUS_TEXT)

# Tokenize and Clean-up
Tokenize each sentence into a list of words, removing punctuation and unnecessary characters altogether. Gensim’s ```simple_preprocess()``` is great for this. Additionally, I have set ```deacc=True``` to strip accent marks from the tokens.

In [6]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` and tokenized with ``deacc=True``.

    :param sentences: iterable of sentences.
    :return: generator yielding one token list per sentence.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
# sent_to_words() is a generator function, so each of these can be consumed only once.
pa_sentences_group = sent_to_words(pa_group)
yt_sentences_group = sent_to_words(yt_group)

# Lemmatization
Lemmatization converts each word to its root form. This method lemmatizes the tokenized words and joins them back into sentences.

In [7]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized sentences, keeping only the allowed parts of speech.

    Each token list is joined with spaces, run through the module-level
    ``nlp`` pipeline, and rebuilt from the lemmas of the tokens whose ``pos_``
    is in *allowed_postags*. A ``-PRON-`` lemma is replaced by an empty string.

    :param texts: iterable of token lists.
    :param allowed_postags: collection of ``pos_`` values to keep. The default
        is an immutable tuple so it cannot be altered between calls.
    :return: list with one space-joined lemma string per input sentence.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            '' if token.lemma_ == '-PRON-' else token.lemma_
            for token in doc
            if token.pos_ in allowed_postags
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out
# Lemmatize each corpus, then join the per-line results with ". " so that the
# sentence splitter used later can separate them again.
pa_sentences_group = lemmatization(pa_sentences_group)
pa_sentences_group = ". ".join(pa_sentences_group)
yt_sentences_group = lemmatization(yt_sentences_group)
yt_sentences_group = ". ".join(yt_sentences_group)

# Splitting
The Splitter class uses NLTK's English punkt sentence splitter to split the joined text into sentences, and then tokenizes each sentence into words.

In [8]:
class Splitter(object):
    """Split a text into sentences and each sentence into word tokens."""

    def __init__(self):
        # Sentence splitter loaded from NLTK's English punkt resource.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        # Word tokenizer applied to every sentence.
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize *text* sentence by sentence.

        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        word_tokenize = self.nltk_tokenizer.tokenize
        return [word_tokenize(sentence) for sentence in self.nltk_splitter.tokenize(text)]

# POS Tagging
Part-of-speech tagging of the split words in each sentence.

In [9]:
class POSTagger(object):
    """Attach an NLTK part-of-speech tag to every token."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        Tag each sentence with ``nltk.pos_tag``.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
        ``(form, lemma, [tag])`` triple; no lemmatization is done here, so the
        lemma slot repeats the word form
            e.g: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ'])]]
        """
        return [
            [(word, word, [postag]) for word, postag in nltk.pos_tag(sentence)]
            for sentence in sentences
        ]

# Dictionary Tagging of NRC emotion lexicon 
Read the NRC emotion lexicon and tag the words with emotion and lexicon. Use this tagged data to find the emotions and sentiment of each word of the POS tagged PA and YT corpus sentences.

In [10]:
class DictionaryTagger(object):
    """Tag POS-tagged tokens with the ``(emotion, value)`` pairs of a lexicon file."""

    def __init__(self, file_path):
        """Load the lexicon stored at *file_path*.

        Every non-blank line must hold at least three whitespace-separated
        fields: word, emotion and an integer value. ``self.dictionary`` maps
        each word to the list of its ``(emotion, value)`` tuples.

        ``self.max_key_size`` is the largest number of space-separated words
        in any key. It is computed once here, so tagging never has to guess
        the window size from a sentence and never mutates the tagger.
        """
        self.dictionary = dict()
        with open(file_path, "r", encoding="utf-8") as nrc_file:
            for line in nrc_file:
                if not line.strip():
                    continue
                line = re.sub(r'\s+', '\t', line)
                splited = line.replace("\n", "").split("\t")
                word, emotion, value = splited[0], splited[1], splited[2]
                self.dictionary.setdefault(word, []).append((emotion, int(value)))
        self.max_key_size = max(
            (len(key.split(' ')) for key in self.dictionary), default=0)

    def tag(self, postagged_sentences):
        """Return ``tag_sentence`` applied to every sentence."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        Tag one sentence of ``(form, lemma, tags)`` tokens.

        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        A matched expression is emitted as ``(form, lemma, taggings)`` with
        form and lemma lower-cased; unmatched tokens are passed through as-is.
        With ``tag_with_lemmas=True`` the lemmas are looked up instead of the forms.
        """
        tag_sentence = []
        N = len(sentence)
        i = 0
        while i < N:
            # Longest candidate window starting at i, clipped to the sentence.
            j = min(i + self.max_key_size, N)
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    # Copy, so extending below never alters the lexicon entry.
                    taggings = list(self.dictionary[literal])
                    if is_single_token:
                        # a single-token match keeps the token's previous tags
                        taggings.extend(sentence[original_position][2])
                    tag_sentence.append((expression_form, expression_lemma, taggings))
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

# MannWhitney-U test
Perform MannWhitney-U test to compare the emotions and sentiments between PA corpus and YT corpus

In [11]:
def mann_whitney_u_test(group_pa, group_yt):
    """Run a two-sided Mann-Whitney U test on the values of two dicts.

    Prints a heading, then compares ``group_pa.values()`` with
    ``group_yt.values()`` using continuity correction.

    :return: ``(U statistic, p-value)``, or ``(-1, -1)`` when
        ``mannwhitneyu`` raises ``ValueError``.
    """
    print("Mann Whitney-u Test:")
    pa_values, yt_values = (list(group.values()) for group in (group_pa, group_yt))
    try:
        statistic, p_value = mannwhitneyu(
            pa_values, yt_values, use_continuity=True, alternative="two-sided")
    except ValueError:
        # Sentinel pair reported when the test cannot be computed.
        return -1, -1
    return statistic, p_value

# Sentiment and Emotion Score Calculation
Get sentiments and emotions for each word from dictionary tagged data.

In [12]:
def get_sentiment(sentiment):
    """Convert an ``(emotion, value)`` tuple into a one-entry dict.

    Anything that is not a tuple (e.g. a plain POS-tag string) yields an
    empty dict.
    """
    if isinstance(sentiment, tuple):
        return {sentiment[0]: sentiment[1]}
    return {}

Aggregate the sentiment score and emotions

In [13]:
def sentiment_score(dict_tagged_sentences):
    """Sum the lexicon values per emotion over all tagged sentences.

    :param dict_tagged_sentences: list of sentences; each token is a triple
        whose third item is the list of its tags.
    :return: dict mapping each emotion/sentiment name to its summed value.
        Tags for which ``get_sentiment`` returns an empty dict are ignored.
    """
    totals = {}
    every_tag = (
        tag
        for sentence in dict_tagged_sentences
        for token in sentence
        for tag in token[2]
    )
    for tag in every_tag:
        scored = get_sentiment(tag)
        if scored:
            label = tag[0]
            totals[label] = totals.get(label, 0) + scored.get(label, 0)
    return totals

Split the sentences in each corpus using the ```Splitter()``` we defined and POS tag the words using our ```POSTagger()``` before tagging these words with sentiments and emotions using ```DictionaryTagger()```.

In [14]:
# Load the NRC emotion lexicon; the tagger maps each lexicon word to its
# (emotion, value) pairs.
dicttagger = DictionaryTagger(NRC_EMOTION_LEXICON_PATH)
# Create the POS tagger and the sentence/word splitter (shared by both corpora).
postagger = POSTagger()
splitter = Splitter()

# Split the joined PA corpus text into sentences, each a list of words.
splitted_pa_sentences = splitter.split(pa_sentences_group)

# POS-tag the words of each PA sentence.
pos_tagged_pa_sentences = postagger.pos_tag(splitted_pa_sentences)

# Tag the PA corpus words with the emotions and sentiments loaded from the NRC emotion lexicon.
dict_tagged_pa_sentences = dicttagger.tag(pos_tagged_pa_sentences)

In [15]:
# Split the joined YT corpus text into sentences, each a list of words.
splitted_yt_sentences = splitter.split(yt_sentences_group)

# POS-tag the words of each YT sentence.
pos_tagged_yt_sentences = postagger.pos_tag(splitted_yt_sentences)

# Tag the YT corpus words with the emotions and sentiments loaded from the NRC emotion lexicon.
dict_tagged_yt_sentences = dicttagger.tag(pos_tagged_yt_sentences)

In [16]:
# calculate the emotions and sentiment score for PA corpus
pa_sentiment = sentiment_score(dict_tagged_pa_sentences)

# calculate the emotions and sentiment score for YT corpus
yt_sentiment = sentiment_score(dict_tagged_yt_sentences)

print("PA sentiment: %s" % pa_sentiment)
print("\nYT sentiment: %s\n" % yt_sentiment)
# Default to 0 so a corpus without any 'positive' hit compares as zero instead
# of raising TypeError on a None comparison.
print("Is PA's Postive Sentiment > YT's Postive Sentiment? %s\n" % (pa_sentiment.get('positive', 0) > yt_sentiment.get('positive', 0)))

# calculate the MannWhitney-U test
mw_stat, mw_p = mann_whitney_u_test(pa_sentiment, yt_sentiment)
print("MannWhitney U Value: %s" % mw_stat)
# The second value returned by the test is its p-value, not a rho coefficient.
print("MannWhitney p Value: %s" % mw_p)

PA sentiment: {'anger': 57, 'anticipation': 353, 'disgust': 202, 'fear': 153, 'joy': 211, 'negative': 167, 'positive': 940, 'sadness': 235, 'surprise': 150, 'trust': 494}

YT sentiment: {'anger': 55, 'anticipation': 461, 'disgust': 92, 'fear': 205, 'joy': 169, 'negative': 122, 'positive': 636, 'sadness': 93, 'surprise': 88, 'trust': 417}

Is PA's Postive Sentiment > YT's Postive Sentiment? True

Mann Whitney-u Test:
MannWhitney U Value: 62.0
MannWhitney rho Value: 0.3846730627355087


# Removing Special characters
This method removes special characters, articles, and unnecessary words before stemming and lemmatization.

In [17]:
def remove_special_characters_from_lines(lines):
    """Tokenize, clean and normalize every line.

    Per line: tokenize with ``nltk.word_tokenize``, drop the first ``'P'``
    token if present, strip every non-alphanumeric character from each token,
    discard tokens left empty and the articles a/an/the, POS-tag the rest and
    lower-case them. Tokens tagged as adjectives or verbs are lemmatized,
    tokens tagged with one of the ``nouns`` tags are stemmed, all others are
    kept as they are.

    :param lines: iterable of text lines.
    :return: tuple ``(all_words, words_per_line)``: a flat list of cleaned
        words and a list holding one cleaned-word list per input line.
    """
    total_clean_word_list = []
    line_wise_word_list = []
    # Built once: both objects are independent of the line being processed.
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    articles = ('a', 'an', 'the')

    for sentence in lines:
        tokens = nltk.word_tokenize(sentence)
        if 'P' in tokens:
            tokens.remove('P')
        stripped_tokens = (re.sub(r'[^A-Za-z0-9]+', '', token) for token in tokens)
        tokens = [token for token in stripped_tokens
                  if token and token.lower() not in articles]

        clean_word_list = []
        for word, postag in nltk.pos_tag(tokens):
            word = word.lower()
            if postag in adjectives:
                clean_word_list.append(lemmatizer.lemmatize(word, pos='a'))
            elif postag in verbs:
                clean_word_list.append(lemmatizer.lemmatize(word, pos='v'))
            elif postag in nouns:
                clean_word_list.append(stemmer.stem(word))
            else:
                clean_word_list.append(word)
        total_clean_word_list.extend(clean_word_list)
        line_wise_word_list.append(clean_word_list)
    return total_clean_word_list, line_wise_word_list

tokenize, remove special characters, stem and lemmatize

In [18]:
# Keep only the flat word list of each corpus; the per-line lists are discarded.
pa_cleaned_up, _ = remove_special_characters_from_lines(pa_group)
yt_cleaned_up, _ = remove_special_characters_from_lines(yt_group)

# Lexical Diversity Analyser

In [19]:
def lexical_diversity_analyser(words_grouping):
    """Return the ratio of distinct words to total words.

    :param words_grouping: sequence of words.
    :return: ``len(set(words)) / len(words)``, or ``0.0`` for an empty
        sequence (instead of raising ``ZeroDivisionError``).
    """
    if not words_grouping:
        return 0.0
    return len(set(words_grouping)) / len(words_grouping)

In [20]:
# Lexical diversity = distinct cleaned words / total cleaned words, per corpus.
pa_ld = lexical_diversity_analyser(pa_cleaned_up)
yt_ld = lexical_diversity_analyser(yt_cleaned_up)

print("PA lexical diversity: %s" % pa_ld)
print("YT lexical diversity: %s" % yt_ld)

# Report whether the PA corpus is the more diverse one.
print("\nPA>YT? %s" % (pa_ld > yt_ld))

PA lexical diversity: 0.08002443494196701
YT lexical diversity: 0.08059488245226652

PA>YT? False


# Personal Pronoun Analysis
This method counts the number of personal pronouns in two separate corpuses for analysis. The pronouns are ```'i' 'me' 'our' 'my' 'we' 'us' 'you' 'your' 'she' 'her' 'he' 'him' 'his' 'they' 'them' 'their'```

In [21]:
def personal_pronoun_analysis(words_grouping, pronouns=None):
    """Count how often each pronoun occurs in *words_grouping*.

    :param words_grouping: iterable of (already lower-cased) words.
    :param pronouns: words to count; defaults to the module-level
        ``WORD_LIST``. Duplicates are counted once.
    :return: dict mapping every requested pronoun to its number of
        occurrences (0 when absent), in the order the pronouns were given.
    """
    from collections import Counter

    pronouns_to_count = WORD_LIST if pronouns is None else pronouns
    # One pass over the words instead of one full scan per pronoun.
    frequencies = Counter(words_grouping)
    return {pronoun: frequencies[pronoun] for pronoun in dict.fromkeys(pronouns_to_count)}

In [22]:
# Count the personal pronouns in the cleaned PA and YT word lists.
pa_personal_pronouns = personal_pronoun_analysis(pa_cleaned_up)
print("PA personal pronouns: %s" % pa_personal_pronouns)

yt_personal_pronouns = personal_pronoun_analysis(yt_cleaned_up)
print("\nYT personal pronouns: %s\n" % yt_personal_pronouns)

print("Count of SHE in PA > YT ? %s" % (pa_personal_pronouns.get('she') > yt_personal_pronouns.get('she')))
print("Count of HE in PA > YT ? %s" % (pa_personal_pronouns.get('he') > yt_personal_pronouns.get('he')))
print("Count of HER in PA > YT ? %s" % (pa_personal_pronouns.get('her') > yt_personal_pronouns.get('her')))
print("Count of HIS in PA > YT ? %s\n" % (pa_personal_pronouns.get('his') > yt_personal_pronouns.get('his')))


# calculate MannWhitney-U test
mw_stat, mw_p = mann_whitney_u_test(pa_personal_pronouns, yt_personal_pronouns)
print("MannWhitney U Value: %s" % mw_stat)
# The second value returned by the test is its p-value, not a rho coefficient.
print("MannWhitney p Value: %s" % mw_p)

PA personal pronouns: {'they': 96, 'i': 766, 'she': 5, 'my': 44, 'we': 127, 'them': 28, 'our': 37, 'her': 6, 'your': 12, 'him': 34, 'his': 59, 'he': 281, 'their': 14, 'me': 63, 'you': 164, 'us': 58}

YT personal pronouns: {'they': 171, 'i': 842, 'she': 2, 'my': 50, 'we': 149, 'them': 29, 'our': 80, 'her': 1, 'your': 8, 'him': 3, 'his': 6, 'he': 28, 'their': 16, 'me': 77, 'you': 130, 'us': 45}

Count of SHE in PA > YT ? True
Count of HE in PA > YT ? True
Count of HER in PA > YT ? True
Count of HIS in PA > YT ? True

Mann Whitney-u Test:
MannWhitney U Value: 145.0
MannWhitney rho Value: 0.5339542575755475


# Get the top 10 keywords from each topic
Top 10 keywords that are representative of the topic are created by ```show_topics()```

In [23]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the *n_words* highest-weighted keywords of every topic.

    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
        or, on older scikit-learn releases, ``get_feature_names()``.
    :param lda_model: fitted model whose ``components_`` holds one row of
        keyword weights per topic.
    :param n_words: number of keywords to keep per topic.
    :return: list with one array of keywords per topic, highest weight first.
    """
    # Prefer the current accessor; fall back to the legacy one when the
    # installed scikit-learn does not provide it.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    keywords = np.array(feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so that argsort yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Create the Document-Word matrix
The LDA topic model algorithm requires a document word matrix as the main input.

We can create one using CountVectorizer. In the code below, I have configured the CountVectorizer to consider only words that occur in at least 10 documents (min_df), remove the built-in English stopwords, convert all words to lowercase, and require a word to consist of at least 3 letters or digits in order to qualify as a word.

So, to create the doc-word matrix, we need to first initialise the CountVectorizer class with the required configuration and then apply fit_transform to actually create the matrix.

Sparsicity is nothing but the percentage of non-zero datapoints in the document-word matrix, that is data_vectorized.

In [24]:
def get_vectorized_data(data):
    """Build the document-word count matrix for *data*.

    The lines are tokenized with ``sent_to_words``, lemmatized keeping only
    nouns, adjectives, verbs and adverbs, and vectorized with a
    ``CountVectorizer``. The share of non-zero cells is printed.

    :param data: iterable of text lines (one document per line).
    :return: tuple ``(vectorizer, data_vectorized)``: the fitted vectorizer
        and the sparse document-word matrix.
    """
    data_words = list(sent_to_words(data))

    # Do lemmatization keeping only Noun, Adj, Verb, Adverb
    data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=10,  # keep only words found in at least 10 documents
        stop_words='english',  # remove stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # tokens of 3 or more letters/digits
    )

    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    # Compute Sparsicity = Percentage of Non-Zero cells. The count is taken
    # from the sparse matrix itself, so no dense copy has to be allocated.
    n_docs, n_terms = data_vectorized.shape
    non_zero_cells = data_vectorized.count_nonzero()
    print("Sparsicity: ", (non_zero_cells / (n_docs * n_terms)) * 100, "%")

    return vectorizer, data_vectorized

# GridSearch to find the best LDA model
The most important tuning parameter for LDA models is n_components (number of topics). In addition, learning_decay (which controls the learning rate) is important as well.

Besides these, other possible search params could be learning_offset (downweigh early iterations. Should be > 1) and max_iter.

The best topic model and its parameters are printed.

```Warning: process can consume a lot of time and resources.```

In [25]:
def get_optimized_lda(data):
    """Grid-search an LDA model for *data* and print the best result.

    The search covers ``n_components`` and ``learning_decay`` with 3-fold
    cross-validation. The best estimator, its parameters, its
    cross-validated score and its perplexity on the full matrix are printed.

    :param data: iterable of text lines (one document per line).
    :return: tuple ``(best_lda_model, data_vectorized, vectorizer)``.
    """
    vectorizer, data_vectorized = get_vectorized_data(data)

    # Define Search Param
    search_params = {'n_components': [10, 15, 20, 25, 30, 35, 40], 'learning_decay': [0.1, 0.3, .5, .7, .9]}

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class. ``iid`` is not accepted by newer scikit-learn
    # releases, so retry without it instead of failing with TypeError.
    try:
        model = GridSearchCV(lda, param_grid=search_params, cv=3, iid=True)
    except TypeError:
        model = GridSearchCV(lda, param_grid=search_params, cv=3)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_
    print(best_lda_model)
    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
    return best_lda_model, data_vectorized, vectorizer

 # Finding dominant topic in each document
To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.

In [26]:
def create_document_topic_matrix(best_lda_model, data_vectorized):
    """Build the document-topic table and mark each document's dominant topic.

    :param best_lda_model: fitted LDA model.
    :param data_vectorized: document-word matrix the model is applied to.
    :return: tuple ``(styled_head, df_document_topic, topicnames)``: the
        styled first 15 rows, the full DataFrame (topic weights rounded to
        two decimals plus a ``dominant_topic`` column) and the topic column names.
    """
    doc_topic_weights = best_lda_model.transform(data_vectorized)
    topic_count = best_lda_model.n_components

    # Column labels (one per topic) and row labels (one per document).
    topicnames = ["Topic" + str(topic) for topic in range(topic_count)]
    docnames = ["Doc" + str(doc) for doc in range(len(doc_topic_weights))]

    df_document_topic = pd.DataFrame(
        np.round(doc_topic_weights, 2), columns=topicnames, index=docnames)

    # The dominant topic of a document is the column with the highest weight.
    df_document_topic['dominant_topic'] = df_document_topic.values.argmax(axis=1)

    # Highlight the first 15 rows for display.
    preview = df_document_topic.head(15)
    df_document_topics = preview.style.applymap(color_green).applymap(make_bold)

    return df_document_topics, df_document_topic, topicnames


# Review topics distribution across documents

In [27]:
def get_topic_distribution_over_documents(df_document_topic):
    """Count how many documents have each topic as their dominant topic.

    :param df_document_topic: DataFrame with a ``dominant_topic`` column.
    :return: DataFrame with columns ``Topic Num`` and ``Num Documents``,
        ordered by descending document count.
    """
    counts = df_document_topic['dominant_topic'].value_counts()
    return pd.DataFrame(
        {'Topic Num': counts.index, 'Num Documents': counts.values},
        columns=['Topic Num', 'Num Documents'],
    )

# Visualize the LDA model with pyLDAvis
The pyLDAvis offers the best visualization to view the topics-keywords distribution.

A good topic model will have non-overlapping, fairly big sized blobs for each topic.

In [28]:
def visualize_lda_model(best_lda_model, data_vectorized, vectorizer):
    """Prepare the pyLDAvis panel for a fitted LDA model.

    Notebook display is enabled first; the panel is then prepared with the
    ``tsne`` multidimensional-scaling option.

    :return: the prepared pyLDAvis panel.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.sklearn.prepare(
        best_lda_model, data_vectorized, vectorizer, mds='tsne')


# Get Topic’s keywords
The weights of each keyword in each topic are contained in lda_model.components_ as a 2d array. The names of the keywords themselves can be obtained from the vectorizer object using get_feature_names().

Let’s use this info to construct a weight matrix for all keywords in each topic.

In [29]:
def get_topic_keywords(best_lda_model, vectorizer, topicnames):
    """Return the topic-keyword weight matrix as a labelled DataFrame.

    :param best_lda_model: fitted model whose ``components_`` holds one row of
        keyword weights per topic.
    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
        or, on older scikit-learn releases, ``get_feature_names()``.
    :param topicnames: row labels, one per topic.
    :return: DataFrame with one row per topic and one column per keyword.
    """
    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)

    # Prefer the current accessor; fall back to the legacy one when the
    # installed scikit-learn does not provide it.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names

    # Assign Column and Index
    df_topic_keywords.columns = feature_names()
    df_topic_keywords.index = topicnames
    return df_topic_keywords

# Get the top 10 keywords of each topic

In [30]:
def get_top_ten_keywords_in_topics(best_lda_model, vectorizer):
    """Return a table of the ten highest-weighted keywords of every topic.

    :return: DataFrame with one row per topic (``Topic 0``, ``Topic 1``, ...)
        and one column per keyword rank (``Word 0``, ``Word 1``, ...).
    """
    top_words = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(top_words)
    topic_count, word_count = df_topic_keywords.shape
    df_topic_keywords.columns = ['Word {}'.format(rank) for rank in range(word_count)]
    df_topic_keywords.index = ['Topic {}'.format(topic) for topic in range(topic_count)]
    return df_topic_keywords

# Build LDA model with sklearn
Latent Dirichlet Allocation (LDA) is initialised and fit_transform() is called to build the LDA model.

```Note: This is not used here, but we can use the optimized params from the GridSearch algorithm given above to initialize and fit the model.```

In [31]:
def analyser(data):
    """Fit a fixed-parameter LDA model on *data* and print its diagnostics.

    Prints the document-topic output, the log likelihood, the perplexity and
    the model parameters. Nothing is returned.
    """
    data_vectorized = get_vectorized_data(data)[1]

    # Build LDA Model
    lda_settings = dict(
        n_components=20,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )
    lda_model = LatentDirichletAllocation(**lda_settings)
    lda_output = lda_model.fit_transform(data_vectorized)

    print(lda_output)

    # Log Likelihood: Higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))

    # See model parameters
    pprint(lda_model.get_params())

In [32]:
# Styling
def color_green(val):
    """Return the CSS text colour for a cell: green above 0.1, black otherwise."""
    if val > .1:
        color = 'green'
    else:
        color = 'black'
    return 'color: {col}'.format(col=color)


def make_bold(val):
    """Return the CSS font weight for a cell: 700 above 0.1, 400 otherwise."""
    if val > .1:
        weight = 700
    else:
        weight = 400
    return 'font-weight: {weight}'.format(weight=weight)

Get the optimized LDA parameters for each corpus

In [33]:
# Grid-search an LDA model on the PA lines; keep the best model, the
# document-word matrix and the vectorizer for the cells below.
print("Optimal LDA Parameters for PA corpus:")
best_lda_model_pa, data_vectorized_pa, vectorizer_pa = get_optimized_lda(pa_group)

Optimal LDA Parameters for PA corpus:
Sparsicity:  2.959816003294264 %
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.3,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Best Model's Params:  {'learning_decay': 0.3, 'n_components': 10}
Best Log Likelihood Score:  -6614.630427130501
Model Perplexity:  108.7358250496893


In [34]:
# Grid-search an LDA model on the YT lines; keep the best model, the
# document-word matrix and the vectorizer for the cells below.
print("Optimal LDA Parameters for YT corpus:")
best_lda_model_yt, data_vectorized_yt, vectorizer_yt = get_optimized_lda(yt_group)

Optimal LDA Parameters for YT corpus:
Sparsicity:  3.286978508217446 %
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.1,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Best Model's Params:  {'learning_decay': 0.1, 'n_components': 10}
Best Log Likelihood Score:  -6077.405460494027
Model Perplexity:  104.32752629843496


In [35]:
# Build the PA document-topic table; the last line displays the styled first rows.
print("Document-topic matrix for PA corpus:")
df_document_topics_pa, df_document_topic_pa, topicnames_pa = create_document_topic_matrix(best_lda_model_pa, data_vectorized_pa)
df_document_topics_pa

Document-topic matrix for PA corpus:


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,5
Doc1,0.52,0.01,0.01,0.01,0.01,0.01,0.24,0.14,0.01,0.01,0
Doc2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc3,0.03,0.03,0.4,0.33,0.03,0.03,0.03,0.03,0.03,0.03,2
Doc4,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.82,0.02,8
Doc5,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0
Doc6,0.01,0.01,0.01,0.01,0.61,0.01,0.29,0.01,0.01,0.01,4
Doc7,0.2,0.01,0.01,0.01,0.01,0.01,0.01,0.3,0.4,0.01,8
Doc8,0.03,0.03,0.03,0.03,0.77,0.03,0.03,0.03,0.03,0.03,4
Doc9,0.0,0.0,0.0,0.0,0.55,0.11,0.14,0.0,0.0,0.19,4


In [36]:
# Build the YT document-topic table; the last line displays the styled first rows.
print("Document-topic matrix for YT corpus:")
df_document_topics_yt, df_document_topic_yt, topicnames_yt = create_document_topic_matrix(best_lda_model_yt, data_vectorized_yt)
df_document_topics_yt

Document-topic matrix for YT corpus:


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.02,0.02,0.02,0.02,0.38,0.46,0.02,0.02,0.02,0.02,5
Doc1,0.01,0.01,0.01,0.01,0.29,0.62,0.01,0.01,0.01,0.01,5
Doc2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc3,0.3,0.01,0.26,0.01,0.15,0.24,0.01,0.01,0.01,0.01,0
Doc4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc5,0.37,0.37,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc6,0.03,0.03,0.03,0.03,0.27,0.53,0.03,0.03,0.03,0.03,5
Doc7,0.15,0.58,0.08,0.01,0.01,0.01,0.01,0.15,0.01,0.01,1
Doc8,0.02,0.02,0.61,0.02,0.02,0.02,0.23,0.02,0.02,0.02,2
Doc9,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,2


In [37]:
# Count the PA documents per dominant topic and display the table.
print("Topics distribution over documents in PA corpus:")
df_topic_distribution_pa = get_topic_distribution_over_documents(df_document_topic_pa)
df_topic_distribution_pa

Topics distribution over documents in PA corpus:


Unnamed: 0,Topic Num,Num Documents
0,0,322
1,6,91
2,3,81
3,4,74
4,5,72
5,1,72
6,2,60
7,8,58
8,9,42
9,7,25


In [38]:
# Count the YT documents per dominant topic and display the table.
print("Topics distribution over documents in YT corpus:")
df_topic_distribution_yt = get_topic_distribution_over_documents(df_document_topic_yt)
df_topic_distribution_yt

Topics distribution over documents in YT corpus:


Unnamed: 0,Topic Num,Num Documents
0,0,263
1,5,79
2,6,72
3,4,68
4,8,57
5,9,54
6,2,51
7,1,51
8,7,48
9,3,48


In [39]:
# Prepare and display the pyLDAvis panel for the PA model.
print("Visualization of topics and most influencial words in the topics in PA corpus:")
pa_panel = visualize_lda_model(best_lda_model_pa, data_vectorized_pa, vectorizer_pa)
pyLDAvis.display(pa_panel)

Visualization of topics and most influencial words in the topics in PA corpus:


In [40]:
# Prepare and display the pyLDAvis panel for the YT model.
print("Visualization of topics and most influencial words in the topics in YT corpus:")
yt_panel = visualize_lda_model(best_lda_model_yt, data_vectorized_yt, vectorizer_yt)
pyLDAvis.display(yt_panel)

Visualization of topics and most influencial words in the topics in YT corpus:


In [41]:
# Topic-keyword weight matrix of the PA model; only the first rows are displayed.
print("Weights of each keyword in each topic in PA corpus:")
df_topic_keywords_pa = get_topic_keywords(best_lda_model_pa, vectorizer_pa, topicnames_pa)
df_topic_keywords_pa.head()

Weights of each keyword in each topic in PA corpus:


Unnamed: 0,actually,age,american,analytic,audience,average,base,big,click,college,...,united,use,video,view,want,watch,work,write,year,youtube
Topic0,5.635397,16.841967,0.100011,0.100004,0.100015,0.100009,0.439559,0.100005,0.616796,0.100008,...,19.50718,0.100005,0.100015,0.100016,0.100003,0.10001,0.100011,0.10001,8.139892,0.100015
Topic1,0.100007,0.100014,0.100001,15.09998,25.493491,0.100004,0.1,5.009875,0.100006,0.1,...,0.100002,2.932328,0.100014,0.100008,0.10001,0.100008,0.100014,0.100006,0.100006,31.215764
Topic2,0.100008,0.10001,0.100005,0.100002,0.100007,21.099926,1.982613,0.100005,0.100016,0.100376,...,3.577023,0.100005,17.912952,30.377934,1.669384,34.041549,5.909637,0.100001,18.020622,0.100011
Topic3,0.100016,0.100012,0.100013,0.1,0.100009,0.100014,0.100009,0.100002,1.370255,0.100005,...,0.100004,10.722951,0.100011,0.100011,0.100016,0.100008,0.100028,0.100011,0.100006,0.100003
Topic4,0.100024,0.100006,8.226265,0.1,0.100003,0.10002,0.100005,0.100029,0.100032,0.100012,...,1.050307,0.100005,0.10008,4.31493,0.100008,0.100029,11.490395,0.10002,0.65101,0.100004


In [42]:
# Topic-keyword weight matrix of the YT model; only the first rows are displayed.
print("Weights of each keyword in each topic in YT corpus:")
df_topic_keywords_yt = get_topic_keywords(best_lda_model_yt, vectorizer_yt, topicnames_yt)
df_topic_keywords_yt.head()

Weights of each keyword in each topic in YT corpus:


Unnamed: 0,actually,age,analytic,audience,average,big,bit,channel,click,come,...,view,viewer,want,watch,way,woman,work,write,year,youtube
Topic0,0.100008,2.318745,4.094406,7.1189,1.29061,0.1,3.563917,0.10476,8.771525,0.100009,...,0.100026,0.100026,0.100005,6.56257,1.412544,1.315136,3.464388,0.100005,0.100002,0.100014
Topic1,0.388992,11.665545,0.10002,11.58735,0.100002,10.925728,0.100007,1.357933,0.100008,5.68238,...,9.767087,10.781321,38.748034,0.100015,0.100007,0.10005,0.100022,0.100001,1.833867,0.100012
Topic2,18.664636,6.723185,0.100006,0.100018,0.100003,0.100004,0.100006,0.100012,0.100011,0.100004,...,0.100006,0.100014,0.100002,0.100023,0.100004,0.100052,0.100007,0.100002,0.100009,0.100006
Topic3,0.100004,4.204143,0.100009,0.100012,0.100001,0.869881,0.111331,0.100003,0.100011,26.516288,...,33.789799,0.10001,0.100002,7.197191,0.1,0.1,0.100007,0.100001,0.100002,15.080127
Topic4,0.100006,5.352408,18.566183,6.910934,0.1,0.100009,0.1,0.100009,12.422384,0.101295,...,2.768929,3.631722,0.100009,9.200962,0.1,2.199933,0.100007,0.100008,0.100013,16.405297


In [43]:
# Top-10 keyword table of the PA model.
# NOTE: this rebinds df_topic_keywords_pa, which until here held the
# keyword-weight matrix returned by get_topic_keywords().
print('Top 10 keywords in each topic in PA ccorpus:')
df_topic_keywords_pa = get_top_ten_keywords_in_topics(best_lda_model_pa, vectorizer_pa)
df_topic_keywords_pa

Top 10 keywords in each topic in PA ccorpus:


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,persona,interesting,male,say,state,united,look,people,age,interested
Topic 1,jordan,youtube,audience,size,try,just,persona,analytic,reach,egypt
Topic 2,time,minute,good,watch,view,average,stuff,year,video,like
Topic 3,know,don,look,remember,just,thing,use,let,comment,quote
Topic 4,interested,research,innovation,like,iraq,election,syria,news,social,read
Topic 5,think,just,mean,write,video,actually,sure,sort,title,current
Topic 6,target,group,social,medium,news,interested,politic,video,issue,thank
Topic 7,audience,come,base,work,size,big,relate,people,guess,politic
Topic 8,right,want,topic,person,guy,content,look,need,think,search
Topic 9,say,maybe,people,know,college,important,number,interested,grad,email


In [44]:
# Top-10 keyword table of the YT model.
# NOTE: this rebinds df_topic_keywords_yt, which until here held the
# keyword-weight matrix returned by get_topic_keywords().
print('Top 10 keywords in each topic in YT ccorpus:')
df_topic_keywords_yt = get_top_ten_keywords_in_topics(best_lda_model_yt, vectorizer_yt)
df_topic_keywords_yt

Top 10 keywords in each topic in YT ccorpus:


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,look,content,need,try,information,understand,lot,just,target,time
Topic 1,say,want,people,know,interested,age,audience,big,viewer,just
Topic 2,think,look,maybe,actually,didn,gender,difficult,interested,help,great
Topic 3,video,view,come,kind,comment,youtube,thing,make,day,lot
Topic 4,jordan,group,target,let,analytic,try,video,youtube,good,male
Topic 5,male,year,age,audience,just,man,look,target,female,consume
Topic 6,know,don,state,united,just,remember,think,select,sorry,mean
Topic 7,watch,time,really,thank,number,relate,man,male,youtube,woman
Topic 8,write,sure,youtube,think,thing,email,video,know,just,work
Topic 9,watch,minute,view,time,jordan,average,demographic,video,just,duration
