# Import Libraries

In [14]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from langdetect import detect
import scipy.sparse

# Gensim
import gensim
from gensim import matutils, models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# nltk for the stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk import pos_tag

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennyraikakou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Fetch the extra NLTK resources used by this notebook
# (presumably for WordNetLemmatizer and pos_tag imported above — confirm).
for resource in ("omw-1.4", "averaged_perceptron_tagger"):
    downloaded = nltk.download(resource)
# Keep the status of the last download as the cell output.
downloaded

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jennyraikakou/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jennyraikakou/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Load the Dataset

In [3]:
# Cleaned UBS mobile app reviews, relative to this notebook's directory.
REVIEWS_CSV = "../../data/ubs-mobile-app-reviews-clean.csv"
df = pd.read_csv(REVIEWS_CSV)

In [4]:
# Show (rows, columns) of the loaded reviews table.
df.shape

(1570, 13)

In [5]:
# Preview the first five reviews to check the columns that were loaded.
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,clean_content
0,0,0,gp:AOqpTOFKM8vyKDl8bQv21U8i2O8m6EdIkpCn8XNWJYj...,yoann mii,https://play-lh.googleusercontent.com/a/AATXAJ...,doesnt work on many phones and full of bugs,1,0,12.5.64086,2022-06-02 17:06:19,Thank you for your feedback & please excuse th...,2022-06-03 13:51:16,doesnt work phones bugs
1,1,1,gp:AOqpTOGpGnFdSoTVSpYBE-XY0td_ZsoQX9lbL_aHZAc...,Pratik Gilda,https://play-lh.googleusercontent.com/a-/AOh14...,Update on 2-june-22: unable to login with acce...,1,1,12.5.64086,2022-06-02 16:06:12,Thank you very much for your patience and plea...,2022-06-03 14:52:29,update june unable login access app update
2,2,2,gp:AOqpTOEPnZAw5fgoez35bU4IvWwSKSrfuWkwFs4USPK...,Radosław Kania,https://play-lh.googleusercontent.com/a/AATXAJ...,app is very slow. additional app for access in...,3,0,12.5.64086,2022-06-01 22:39:43,Thank you for your feedback & please excuse th...,2022-06-02 16:19:20,app slow additional app access shocker
3,3,3,gp:AOqpTOEFeYQ8CVjsVT-13q2tNJWFbabIEiOupsx5hkb...,adi leist,https://play-lh.googleusercontent.com/a/AATXAJ...,Lots of bugs since the last release. 1) QR pay...,1,0,12.5.64086,2022-06-01 16:59:37,,,lots bugs release payments scanning function d...
4,4,4,gp:AOqpTOHuzvbKRc8XzFCdmtUOlm95WTBzIDbDiLJ-na3...,Escoffery Babatunde,https://play-lh.googleusercontent.com/a/AATXAJ...,Can't pay the ebills if the payments feature d...,2,4,12.5.64086,2022-06-01 10:51:45,Please excuse the inconvenience. The problem r...,2022-06-01 16:14:48,pay bills payments feature doesnt work shows b...


In [6]:
# Keep only the reviews that actually have text in the "content" column.
has_review_text = df["content"].notna()
scope_df = df.loc[has_review_text]

In [7]:
# Show (rows, columns) after dropping reviews with missing content.
scope_df.shape

(1570, 13)

# Prepare the Data for the Topic Modeling

## Create document-term matrix

### sklearn & Gensim Implementation

In [9]:
# Vectorize the null-filtered reviews (scope_df) rather than the raw frame:
# CountVectorizer cannot handle missing documents, so the filter must be honoured.
data = scope_df["content"]

In [10]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()
# Learn the vocabulary from the review texts and build the sparse
# document-term count matrix (one row per review, one column per term).
data_cv = cv.fit_transform(raw_documents=data)

In [11]:
# Show (documents, vocabulary terms) of the document-term matrix.
data_cv.shape

(1570, 3888)

In [12]:
# Expand the sparse counts into a dense DataFrame with one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` and only fall back to the old name on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Rows of data_cv follow the documents that were vectorized, so label them with
# the index of `data` (the Series passed to the vectorizer).
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data.index)



In [13]:
# Display the dense document-term counts (the rendered table is truncated).
data_stop

Unnamed: 0,05,06,08,0800,09,10,1000,100chf,1080,10th,...,öffnen,über,überarbeitete,überblick,überprüft,übersichtlich,übersichtlichkeit,überweisung,überweisungen,überzeugt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**What is a sparse matrix?**

A sparse matrix is a matrix comprised mostly of zero values. Matrices in which most values are non-zero are called dense matrices. The document-term matrix above is sparse: each review contains only a few of the 3,888 vocabulary terms, so most of its counts are zero.

In [15]:
# Gensim expects a term-document matrix (terms as rows, documents as columns).
# Transpose the sparse document-term matrix directly instead of round-tripping
# through the dense DataFrame: same values, no dense copy.
sparse_counts = scipy.sparse.csr_matrix(data_cv.transpose())

**Create the corpus for the LDA**


In [20]:
# Wrap the term-document matrix as a streamed gensim corpus;
# each column of sparse_counts is treated as one document.
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [21]:
# Invert the vectorizer's term -> column-id vocabulary into the
# id -> term mapping that gensim uses to label topic words.
id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}

# Perform LDA

In [25]:
# Train a 4-topic LDA model with 10 passes over the corpus.
# LDA is randomly initialised; fixing random_state makes the topics
# reproducible across kernel restarts.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    random_state=42,
)
# Show the top weighted words of every topic.
lda.print_topics()

[(0,
  '0.027*"app" + 0.024*"good" + 0.019*"and" + 0.017*"banking" + 0.014*"not" + 0.013*"very" + 0.011*"user" + 0.010*"in" + 0.009*"it" + 0.009*"friendly"'),
 (1,
  '0.041*"to" + 0.033*"the" + 0.028*"it" + 0.023*"app" + 0.021*"and" + 0.017*"is" + 0.013*"you" + 0.012*"in" + 0.009*"this" + 0.009*"for"'),
 (2,
  '0.057*"the" + 0.035*"to" + 0.031*"app" + 0.024*"and" + 0.023*"it" + 0.020*"is" + 0.013*"not" + 0.011*"of" + 0.010*"with" + 0.010*"on"'),
 (3,
  '0.021*"app" + 0.016*"die" + 0.011*"und" + 0.009*"nicht" + 0.009*"de" + 0.008*"ist" + 0.007*"der" + 0.007*"la" + 0.007*"excellent" + 0.006*"das"')]