In [7]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on:
#   'punkt'     - needed in order to use the tokenizer
#   'stopwords' - the list of stop words that gets filtered out
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# One shared Porter stemmer instance, used by process_text
stemmer = PorterStemmer()

# Stop-word sets keyed by language. Filled lazily on first use so the NLTK
# corpus is read once, instead of once per token.
_STOPWORD_SETS = {}


def process_text(text):
    """Turn a piece of text into a list of stemmed, stop-word-free tokens.

    Args:
        text: The string to process (e.g. one review or sentence).

    Returns:
        A list of words: lowercase, alphabetic-only, with English stop words
        removed and every remaining word reduced to its Porter stem.
    """
    # Make the string lowercase and replace non alphabetic characters with spaces
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    # Tokenize the text; this is, separate it into a list of words.
    # Since the text is already split into sentences you don't have to call sent_tokenize
    tokenized_text = word_tokenize(text)

    # stopwords.words() builds a fresh list on every call; calling it inside
    # the filter below would rebuild and linearly scan that list for each
    # token. Build a frozenset once and reuse it for O(1) membership tests.
    stop_words = _STOPWORD_SETS.get('english')
    if stop_words is None:
        stop_words = _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))

    # Remove the stopwords and stem each remaining word to its root.
    # Remember, this final output is a list of words
    return [
        stemmer.stem(word) for word in tokenized_text
        if word not in stop_words
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\enlik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enlik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import re
import pandas as pd

# bolt_google_playstore_review.csv 17MB (21 Dec 2020)
url='https://drive.google.com/file/d/1qWuyf3UrpaU5xnxLmO3GMFa6zybSFYQh/view?usp=sharing'
# The second-to-last path segment of the share link is the file id; plug it
# into the "uc?id=" form of the URL, which is what gets passed to read_csv.
url2='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url2, usecols=['content','at'])

# astype(str) below would turn a missing review into the literal word "nan",
# which would then show up as a token in the vocabulary. Blank it out instead;
# the row is kept so that texts and timestamps stay aligned.
df['content'] = df['content'].fillna('')

# remove emoji (drops every non-ASCII character from both columns)
# https://stackoverflow.com/a/57514515/2670476
df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))


def _clean_review(content):
    """Strip URLs, @-mentions and non-letters from one review; lowercase it."""
    # Drop URLs, then lowercase
    content = re.sub(r"http\S+", "", content).lower()
    # Drop whitespace-separated tokens that start with "@"
    content = " ".join(tok for tok in content.split() if not tok.startswith("@"))
    # Replace runs of non-letters with one space and trim the ends
    return " ".join(re.sub("[^a-zA-Z]+", " ", content).split())


# One pass over the column instead of three row-wise DataFrame.apply calls
df['content'] = df['content'].map(_clean_review)
texts = df['content'].tolist()
timestamps = df['at'].tolist()

In [8]:
from gensim import corpora
# Read the cleaned strings from df rather than from `texts`: `texts` is
# rebound to token lists on the next line, so feeding it back into
# process_text would fail the second time this cell is run.
raw_texts = df['content'].tolist()
texts = [process_text(text) for text in raw_texts]

# Vocabulary: maps every distinct token to an integer id
dictionary = corpora.Dictionary(texts)
# Bag-of-words representation of every document
corpus = [dictionary.doc2bow(text) for text in texts]

In [9]:
from gensim import models
# LDA training is randomized; pin the seed so the topics printed below can be
# reproduced when the notebook is re-run.
model = models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print every topic with its 3 highest-weighted words
topics = model.print_topics(num_words=3)
for topic in topics:
    print(topic)


(0, '0.160*"good" + 0.072*"great" + 0.058*"app"')
(1, '0.032*"driver" + 0.030*"app" + 0.015*"use"')
(2, '0.022*"use" + 0.020*"easi" + 0.016*"like"')


In [14]:
from gensim import models
# LDA training is randomized; pin the seed so the topics printed below can be
# reproduced when the notebook is re-run.
model = models.ldamodel.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Print every topic with its 3 highest-weighted words
topics = model.print_topics(num_words=3)
for topic in topics:
    print(topic)


(0, '0.061*"driver" + 0.036*"time" + 0.026*"ride"')
(1, '0.025*"ok" + 0.016*"de" + 0.013*"bad"')
(2, '0.245*"good" + 0.111*"great" + 0.067*"app"')
(3, '0.039*"app" + 0.020*"use" + 0.017*"taxifi"')
(4, '0.105*"love" + 0.103*"nice" + 0.041*"cool"')
