# Import Libraries

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# nltk for the stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennyraikakou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from imp import reload


# Load the Dataset

In [2]:
# Read the cleaned app-review export; the path is relative to this notebook.
df = pd.read_csv(
    filepath_or_buffer="../../data/ubs-mobile-app-reviews-clean.csv",
)

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(1570, 13)

In [4]:
# List the column labels; `clean_content` is the column used for modelling below.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'reviewId', 'userName', 'userImage',
       'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at',
       'replyContent', 'repliedAt', 'clean_content'],
      dtype='object')

In [5]:
# Preview the first five rows.
# NOTE(review): the rendered preview includes the userName and userImage
# columns — consider dropping them before sharing the notebook.
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,clean_content
0,0,0,gp:AOqpTOFKM8vyKDl8bQv21U8i2O8m6EdIkpCn8XNWJYj...,yoann mii,https://play-lh.googleusercontent.com/a/AATXAJ...,doesnt work on many phones and full of bugs,1,0,12.5.64086,2022-06-02 17:06:19,Thank you for your feedback & please excuse th...,2022-06-03 13:51:16,doesnt work phones bugs
1,1,1,gp:AOqpTOGpGnFdSoTVSpYBE-XY0td_ZsoQX9lbL_aHZAc...,Pratik Gilda,https://play-lh.googleusercontent.com/a-/AOh14...,Update on 2-june-22: unable to login with acce...,1,1,12.5.64086,2022-06-02 16:06:12,Thank you very much for your patience and plea...,2022-06-03 14:52:29,update june unable login access app update
2,2,2,gp:AOqpTOEPnZAw5fgoez35bU4IvWwSKSrfuWkwFs4USPK...,Radosław Kania,https://play-lh.googleusercontent.com/a/AATXAJ...,app is very slow. additional app for access in...,3,0,12.5.64086,2022-06-01 22:39:43,Thank you for your feedback & please excuse th...,2022-06-02 16:19:20,app slow additional app access shocker
3,3,3,gp:AOqpTOEFeYQ8CVjsVT-13q2tNJWFbabIEiOupsx5hkb...,adi leist,https://play-lh.googleusercontent.com/a/AATXAJ...,Lots of bugs since the last release. 1) QR pay...,1,0,12.5.64086,2022-06-01 16:59:37,,,lots bugs release payments scanning function d...
4,4,4,gp:AOqpTOHuzvbKRc8XzFCdmtUOlm95WTBzIDbDiLJ-na3...,Escoffery Babatunde,https://play-lh.googleusercontent.com/a/AATXAJ...,Can't pay the ebills if the payments feature d...,2,4,12.5.64086,2022-06-01 10:51:45,Please excuse the inconvenience. The problem r...,2022-06-01 16:14:48,pay bills payments feature doesnt work shows b...


In [6]:
# Pull the pre-cleaned review text into a plain list of strings.
# Missing values in `clean_content` (NaN) are replaced by "" first: otherwise
# the str() cast applied during tokenization would turn each of them into the
# literal token "nan" and pollute the vocabulary.
data = df["clean_content"].fillna("").astype(str).tolist()

## Tokenize words and cleanup the text

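Use Gensim's `simple_preprocess()` to tokenize each review; the tokenization itself lowercases the text and drops punctuation, and setting `deacc=True` additionally strips accent marks.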



In [7]:
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Each item is cast to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the resulting
    token list is yielded, one per input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens


In [8]:
# Materialise the token generator so the documents can be indexed and re-used.
data_words = [tokens for tokens in sent_to_words(data)]
# Show the first tokenised review.
print(data_words[0:1])

[['doesnt', 'work', 'phones', 'bugs']]


## Creating Bigram and Trigram models

Bigrams are two words that frequently occur together in a document; trigrams are three words that frequently occur together. Many other techniques that are important in an NLP pipeline are explained in part 1 of the blog, which is worth reading. The two arguments for Phrases are min_count and threshold. The higher the values of these parameters, the harder it is for words to be combined into a bigram.


**Bigrams** are two words frequently occurring together in the document. **Trigrams** are 3 words frequently occurring.

Typical examples of such phrases are: ‘front_bumper’, ‘oil_leak’, ‘maryland_college_park’, etc.
Gensim’s Phrases model can build and apply bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold. The higher the values of these parameters, the harder it is for words to be combined into bigrams.


In [9]:
# Train the phrase detectors; a higher threshold yields fewer phrases.
bigram = gensim.models.phrases.Phrases(sentences=data_words, min_count=5, threshold=100)
# The trigram detector is trained on documents whose bigrams are already merged.
trigram = gensim.models.phrases.Phrases(sentences=bigram[data_words], threshold=100)

# Wrap both detectors in Phraser objects, the faster way to apply them.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(detector) for detector in (bigram, trigram)
)

# Show the first document after bigram and then trigram merging.
print(trigram_mod[bigram_mod[data_words[0]]])

['doesnt', 'work', 'phones', 'bugs']


## Remove Stopwords, make bigrams and lemmatize

Using lemmatization instead of stemming especially pays off in topic modeling, because lemmatized words tend to be more human-readable than stemmed ones.


In [12]:
# NLTK's English stop-word list followed by five additional words.
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']


In [13]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: iterable of documents. Each document is cast with ``str()``
            before being passed to ``simple_preprocess``, so both raw strings
            and lists of tokens are accepted.

    Returns:
        A list with one token list per input document, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level list would be a linear scan for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
    in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level ``nlp`` pipeline; the ``lemma_`` of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.

    Args:
        texts: iterable of token lists.
        allowed_postags: collection of part-of-speech tags to keep. The
            default is an immutable tuple so that it cannot be altered
            between calls; lists passed by callers work as before.

    Returns:
        A list with one list of lemmas per input document, in input order.

    See https://spacy.io/api/annotation for the tag scheme.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
# Step 1: drop stop words from every tokenised review.
data_words_nostops = remove_stopwords(data_words)

# Step 2: merge detected word pairs into single bigram tokens.
data_words_bigrams = make_bigrams(data_words_nostops)

# Step 3: load the 'en_core_web_sm' spaCy pipeline with the parser and NER
# components disabled (for efficiency).
# Install once with: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Step 4: lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_bigrams, ['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[0:1])

[['work', 'phone', 'bug']]


## Create Dictionary and Corpus needed for Topic Modeling


In [None]:
# Build the token <-> id dictionary from the lemmatized reviews.
id2word = corpora.Dictionary(documents=data_lemmatized)
# The lemmatized reviews are the documents being modelled.
texts = data_lemmatized
# Bag-of-words encoding of every review.
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first encoded review.
print(corpus[0:1])


In [None]:
# Train a 5-topic LDA model on the bag-of-words corpus with a fixed seed.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)


In [None]:
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]
# Pretty-print the keyword/weight summary of each topic.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)


In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood *bound* (higher is better),
# not the perplexity itself. The perplexity is 2 ** (-bound), and for that
# number lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v measure, computed on the lemmatized reviews)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


In [None]:
# Render the interactive topic visualisation inline.
pyLDAvis.enable_notebook()
# Use the gensim adapter that the import cell binds to `gensimvis`
# (pyLDAvis.gensim_models); `pyLDAvis.gensim` is never imported in this
# notebook, so referencing it fails.
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


# Using Gensim for LDA

I will use the Latent Dirichlet Allocation (LDA) implementation from the Gensim package along with Mallet’s implementation (via Gensim). Mallet has an efficient implementation of LDA; it is known to run faster and to give better topic segregation.

We will also extract the volume and percentage contribution of each topic to get an idea of how important a topic is.

Let’s begin!




In [None]:
# NOTE: this cell rebuilds the same phrase models as the earlier
# "Creating Bigram and Trigram models" section, with identical parameters.
# Step 1: learn bigrams from the tokenised reviews (higher threshold -> fewer phrases).
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)
# Step 2: learn trigrams on top of the bigram-merged reviews.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap each trained detector in a Phraser, the faster way to apply it.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Show the first review after bigram and trigram merging.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
def make_bigrams(texts):
    """Return ``bigram_mod[doc]`` for every document in ``texts``, in order.

    NOTE: redefines the function of the same name declared earlier in this
    notebook.
    """
    return list(map(lambda tokens: bigram_mod[tokens], texts))

def make_trigrams(texts):
    """Return ``trigram_mod[bigram_mod[doc]]`` for every document, in order.

    NOTE: redefines the function of the same name declared earlier in this
    notebook.
    """
    return list(map(lambda tokens: trigram_mod[bigram_mod[tokens]], texts))

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Joins each document's tokens with spaces, runs the result through the
    module-level ``nlp`` pipeline and keeps the ``lemma_`` of every token
    whose ``pos_`` is in ``allowed_postags``.

    Args:
        texts: iterable of token lists.
        allowed_postags: collection of part-of-speech tags to keep; the
            default is an immutable tuple rather than a shared mutable list.

    Returns:
        A list with one list of lemmas per input document, in input order.

    NOTE: redefines the function of the same name declared earlier in this
    notebook. Tag scheme: https://spacy.io/api/annotation
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


In [None]:
# Form Bigrams
# Relies on `data_words_nostops`, which is only produced by the earlier
# stop-word removal cell — that cell must have been run first.
data_words_bigrams = make_bigrams(data_words_nostops)


## Lemmatization

In [None]:

# Load the 'en_core_web_sm' spaCy pipeline with the parser and NER components
# disabled (for efficiency).
# Install once with: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize the bigram-merged reviews, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_bigrams, ['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[0:1])



## Create Corpora and Dictionary

In [None]:
# Create Dictionary
# NOTE(review): the dictionary is built from the raw tokens in `data_words`,
# not from `data_lemmatized` computed in the cell above, so the stop-word
# removal, bigram and lemmatization steps are not reflected here — confirm
# this is intended.
id2word = corpora.Dictionary(data_words)


In [None]:
# Inspect the tokens of the second review.
data_words[1]

In [None]:
# Bag-of-words encoding of that same review, using the dictionary built above.
id2word.doc2bow(data_words[1]) 

In [None]:
# The documents modelled in this section are the raw tokenised reviews.
texts = data_words
# Encode every review as a bag of words.
corpus = list(map(id2word.doc2bow, texts))
# Show at most the first 30 entries of the first encoded review.
print(corpus[0:1][0][0:30])

## LDA model training

- Gensim 

To keep things simple, we’ll keep all the parameters to default except for inputting the number of topics. For this tutorial, we will build a model with 10 topics where each topic is a combination of keywords, and each keyword contributes a certain weightage to the topic.

In [None]:
from pprint import pprint

# number of topics
num_topics = 10
# Build LDA model.
# random_state is fixed (same seed as the LdaModel trained earlier in this
# notebook) so that every re-run starts from the same initialisation instead
# of a fresh random one.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=100)

# Print the keywords of the topics
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]


## Analyzing LDA model results


In [None]:

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation; as the cell's last expression
# it is displayed directly.
gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)