Importing the required libraries

In [90]:
import numpy as np
import pandas as pd

import re
import string
import emoji

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from pymongo import MongoClient

import pickle
import os

import warnings
from pandas.core.generic import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

Loading the English model

In [2]:
nlp = spacy.load("en_core_web_lg")

Creating a connection to MongoDB

In [3]:
%%time

client = MongoClient(
    os.environ["MONGODB_URL"],
    serverSelectionTimeoutMS=300000
)
db = client["vidio"]
collection = db["google_play_store_reviews"]
df_original = pd.DataFrame(list(collection.find()))
df_original = df_original.drop("_id", axis=1)

CPU times: total: 2.75 s
Wall time: 1min 21s


Removing unnecessary rows and columns

In [67]:
df = df_original.copy()
df = df.replace("empty", np.nan)
df["repliedAt"] = pd.to_datetime(df["repliedAt"])
df = df.sort_values("at", ascending=False)
df = df[df["score"] <= 3]
df = df[df["content_formal_indonesian"].notna()]
df = df[~df["content_original"].str.match(r"[\u263a-\U0001f645]")]
df = df[["reviewId", "userName", "at", "content_original", "content_formal_indonesian", "content_english", "score"]]
df.head()

Unnamed: 0,reviewId,userName,at,content_original,content_formal_indonesian,content_english,score
241854,f9199a90-9330-4ed2-9ea0-e7f98ea65ec2,Devi Nurfalisyah,2023-04-22 07:12:54,Banyak iklan,Terlalu banyak iklan.,Excessive advertising.,1
241857,27a2af58-142d-4d45-901b-88ab09705987,Ariii Arii,2023-04-22 05:40:09,Nyesel aku download aplikasi ini mana iklan ny...,Saya sangat menyesal telah mengunduh aplikasi ...,I deeply regret downloading this application b...,1
241858,e1d00b2f-a5c8-44cc-9d05-6dbadb086d0e,randi nugraha,2023-04-22 05:13:19,Pliss kurangin iklan nya ya,Tolong kurangi iklannya ya.,Please reduce the number of ads displayed.,1
241860,e6ad63a8-7378-4c62-8117-a9b32541c6ac,DEDExs 1319,2023-04-22 03:19:38,Gilaaaa ini mau nonton tv malah jadi nonton ik...,"Sungguh tidak masuk akal, saya ingin menonton ...","It's really ridiculous, I want to watch TV but...",1
241862,b8eecd63-d6bc-4571-8bb2-23f3ee884743,Taufik Kurrahim,2023-04-22 01:34:44,"Gajelas, masa udah beli paket langganan belum ...",Saya merasa sangat kecewa dengan Gajelas. Masa...,"I am very disappointed with Gajelas. How come,...",1


In [68]:
df.shape

(20428, 7)

Cleaning the reviews

In [69]:
for i in ["content_formal_indonesian", "content_english"]:
    df[i] = df[i].str.replace("Baris 1", "")
    df[i] = df[i].str.replace("Baris 2", "")
    df[i] = df[i].str.replace("(", "", regex=True)
    df[i] = df[i].str.replace(")", "", regex=True)
    df[i] = df[i].apply(lambda x: emoji.replace_emoji(x, ""))

In [70]:
df["content_english"] = df["content_english"].str.replace("ads", "advertisements")

In [71]:
replace1 = list(df[df["content_english"].str.contains("Note")].index)
replace2 = list(df[(df["content_english"].str.contains("Indonesian", case=False)) & (df["content_english"].str.contains("dictionary", case=False))].index)
replace3 = list(df[(df["content_english"].str.contains("Indonesian", case=False)) & (df["content_english"].str.contains("language", case=False))].index)

replace = sorted(list(set(replace1 + replace2 + replace3)))

df.loc[df.index.isin(replace), ["content_formal_indonesian", "content_english"]] = df.loc[df.index.isin(replace), "content_original"]

In [72]:
def clean(text):
    text = text.lower()
    text = re.sub(r'\$\w*', '',str(text ))
    text = re.sub(r'\bRT\b', '', text)
    text = re.sub('b\'', '', text)
    text = re.sub(r'\.{2,}', ' ', text)
    text = re.sub('@[^\s]+','',text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub('[0-9]+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'(\s)#\w+', r'\1', text)
    text = text.strip(' "\'')
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans("","",string.punctuation))
    text = text.replace("\n",' ')
    return text 

df["content_cleaned"] = df["content_english"].apply(clean)
df = df[df["content_cleaned"].str.strip().replace("\s+", "") != ""]
df = df[df["content_cleaned"] != ""]
df.head()

Unnamed: 0,reviewId,userName,at,content_original,content_formal_indonesian,content_english,score,content_cleaned
241854,f9199a90-9330-4ed2-9ea0-e7f98ea65ec2,Devi Nurfalisyah,2023-04-22 07:12:54,Banyak iklan,Terlalu banyak iklan.,Excessive advertising.,1,excessive advertising
241857,27a2af58-142d-4d45-901b-88ab09705987,Ariii Arii,2023-04-22 05:40:09,Nyesel aku download aplikasi ini mana iklan ny...,Saya sangat menyesal telah mengunduh aplikasi ...,I deeply regret downloading this application b...,1,i deeply regret downloading this application b...
241858,e1d00b2f-a5c8-44cc-9d05-6dbadb086d0e,randi nugraha,2023-04-22 05:13:19,Pliss kurangin iklan nya ya,Tolong kurangi iklannya ya.,Please reduce the number of advertisements dis...,1,please reduce the number of advertisements dis...
241860,e6ad63a8-7378-4c62-8117-a9b32541c6ac,DEDExs 1319,2023-04-22 03:19:38,Gilaaaa ini mau nonton tv malah jadi nonton ik...,"Sungguh tidak masuk akal, saya ingin menonton ...","It's really ridiculous, I want to watch TV but...",1,its really ridiculous i want to watch tv but a...
241862,b8eecd63-d6bc-4571-8bb2-23f3ee884743,Taufik Kurrahim,2023-04-22 01:34:44,"Gajelas, masa udah beli paket langganan belum ...",Saya merasa sangat kecewa dengan Gajelas. Masa...,"I am very disappointed with Gajelas. How come,...",1,i am very disappointed with gajelas how come w...


Lemmatizing the reviews

In [73]:
%%time

def lemmatization(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc if not token.is_stop and token.lang_ == 'en'])
    return lemmatized_text

df["content_cleaned"] = df["content_cleaned"].apply(nlp)
df["content_cleaned"] = df["content_cleaned"].apply(lambda x: lemmatization(x.text))
df.head()

CPU times: total: 3min 2s
Wall time: 5min 33s


Unnamed: 0,reviewId,userName,at,content_original,content_formal_indonesian,content_english,score,content_cleaned
241854,f9199a90-9330-4ed2-9ea0-e7f98ea65ec2,Devi Nurfalisyah,2023-04-22 07:12:54,Banyak iklan,Terlalu banyak iklan.,Excessive advertising.,1,excessive advertising
241857,27a2af58-142d-4d45-901b-88ab09705987,Ariii Arii,2023-04-22 05:40:09,Nyesel aku download aplikasi ini mana iklan ny...,Saya sangat menyesal telah mengunduh aplikasi ...,I deeply regret downloading this application b...,1,deeply regret download application advertiseme...
241858,e1d00b2f-a5c8-44cc-9d05-6dbadb086d0e,randi nugraha,2023-04-22 05:13:19,Pliss kurangin iklan nya ya,Tolong kurangi iklannya ya.,Please reduce the number of advertisements dis...,1,reduce number advertisement display
241860,e6ad63a8-7378-4c62-8117-a9b32541c6ac,DEDExs 1319,2023-04-22 03:19:38,Gilaaaa ini mau nonton tv malah jadi nonton ik...,"Sungguh tidak masuk akal, saya ingin menonton ...","It's really ridiculous, I want to watch TV but...",1,ridiculous want watch tv endless advertisement...
241862,b8eecd63-d6bc-4571-8bb2-23f3ee884743,Taufik Kurrahim,2023-04-22 01:34:44,"Gajelas, masa udah beli paket langganan belum ...",Saya merasa sangat kecewa dengan Gajelas. Masa...,"I am very disappointed with Gajelas. How come,...",1,disappointed gajela come buy subscription pack...


Performing another cleaning

In [74]:
for i in ["content_cleaned"]:
    df[i] = df[i].str.replace("world", "", case=False)
    df[i] = df[i].str.replace("cup", "", case=False)
    df[i] = df[i].str.replace("final", "", case=False)
    df[i] = df[i].str.replace("wc", "", case=False)
    df[i] = df[i].str.replace("football", "", case=False)
    df[i] = df[i].str.replace("app ", " ", case=False)
    df[i] = df[i].str.replace("application", "", case=False)
    df[i] = df[i].str.replace("apk", "", case=False)

Creating a CountVectorizer to convert a collection of text documents into a matrix of token counts

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
dtm = cv.fit_transform(df["content_cleaned"])

Creating an LDA model with 4 topics

In [76]:
%%time

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=4, random_state=42)
LDA.fit(dtm)

CPU times: total: 23.7 s
Wall time: 41.1 s


Showing the top 15 words on each of the 4 topics

In [78]:
df_top_15_words = pd.DataFrame(
    data=[cv.get_feature_names_out()[i][::-1] for i in [topic.argsort()[-15:] for topic in LDA.components_]],
    columns=[f"Words {index}" for index in range(15)],
    index=[f"Topic {index}" for index in range(len(LDA.components_))]
)
df_top_15_words

Unnamed: 0,Words 0,Words 1,Words 2,Words 3,Words 4,Words 5,Words 6,Words 7,Words 8,Words 9,Words 10,Words 11,Words 12,Words 13,Words 14
Topic 0,bad,watch,download,good,live,slow,star,streaming,broadcast,try,pay,free,time,code,useless
Topic 1,package,purchase,watch,buy,payment,unable,subscription,pay,premium,credit,subscribe,balance,deduct,use,premier
Topic 2,advertisement,watch,video,tv,fix,good,quality,play,movie,network,update,like,poor,time,open
Topic 3,watch,error,subscribe,log,pay,account,match,experience,suddenly,want,difficult,subscription,ve,try,login


Attaching the discovered topics to the original reviews

In [79]:
topic_results = LDA.transform(dtm)
df["topic"] = topic_results.argmax(axis=1)

Calculating the number of reviews that are associated with each of the 4 topics

In [80]:
df["topic"].value_counts()

3    6052
2    5218
0    4571
1    4522
Name: topic, dtype: int64

Adding labels to the topics

In [82]:
def topic_names(x):
    if x == 0:
        return "Bad Application"
    elif x == 1:
        return "Package"
    elif x == 2:
        return "Advertisement"
    else:
        return "Watching Experience"
    
df["topic"] = df["topic"].apply(topic_names)
df

Unnamed: 0,reviewId,userName,at,content_original,content_formal_indonesian,content_english,score,content_cleaned,topic
241854,f9199a90-9330-4ed2-9ea0-e7f98ea65ec2,Devi Nurfalisyah,2023-04-22 07:12:54,Banyak iklan,Terlalu banyak iklan.,Excessive advertising.,1,excessive advertising,Advertisement
241857,27a2af58-142d-4d45-901b-88ab09705987,Ariii Arii,2023-04-22 05:40:09,Nyesel aku download aplikasi ini mana iklan ny...,Saya sangat menyesal telah mengunduh aplikasi ...,I deeply regret downloading this application b...,1,deeply regret download advertisement unclear ...,Bad Application
241858,e1d00b2f-a5c8-44cc-9d05-6dbadb086d0e,randi nugraha,2023-04-22 05:13:19,Pliss kurangin iklan nya ya,Tolong kurangi iklannya ya.,Please reduce the number of advertisements dis...,1,reduce number advertisement display,Advertisement
241860,e6ad63a8-7378-4c62-8117-a9b32541c6ac,DEDExs 1319,2023-04-22 03:19:38,Gilaaaa ini mau nonton tv malah jadi nonton ik...,"Sungguh tidak masuk akal, saya ingin menonton ...","It's really ridiculous, I want to watch TV but...",1,ridiculous want watch tv endless advertisement...,Advertisement
241862,b8eecd63-d6bc-4571-8bb2-23f3ee884743,Taufik Kurrahim,2023-04-22 01:34:44,"Gajelas, masa udah beli paket langganan belum ...",Saya merasa sangat kecewa dengan Gajelas. Masa...,"I am very disappointed with Gajelas. How come,...",1,disappointed gajela come buy subscription pack...,Package
...,...,...,...,...,...,...,...,...,...
32916,1709ac0b-7f24-4a06-a48c-fc2b399fc84a,hartono ang,2022-08-27 20:16:12,Aplikasi yang buruk .. nonton liga inggris nge...,Aplikasi yang buruk mengalami lag saat menonto...,"The application is bad, it lags while watching...",2,bad lag watch english league match provide sa...,Package
32914,2f0085d1-e91f-4b6c-8ce3-de8e341dde2f,Alif Choirul Maftuchin,2022-08-27 20:16:12,"Gak bisa buat nonton, PADAHAL SUDAH LANGGANAN ...","Tidak dapat digunakan untuk menonton, WALAUPUN...","Cannot be used for watching, EVEN THOUGH ALREA...",1,watch subscribe goblk,Watching Experience
32547,cf112c5b-446b-4b30-b5d2-c06e3cd6ea81,Ristyan Vidya,2022-08-27 20:16:06,Gua udh byar full 1 thun.. error kga bsa nnton...,Saya telah melakukan pembayaran penuh selama s...,"I have paid in full for one year, but I cannot...",1,pay year watch poor quality,Advertisement
32915,a384369c-7aa9-48d7-ac28-e2244ea2cb65,eric abednego maranatha,2022-08-27 20:15:52,"Parah lg seru"" nonton liga Inggris eh ilang pa...","Nonton liga Inggris menjadi semakin seru, teta...",Watching the Premier League was getting even m...,1,watch premier league get exciting suddenly dis...,Watching Experience


Merging the topics to the original dataframe

In [83]:
df_merged = pd.merge(df_original, df[["topic"]], left_index=True, right_index=True, how="outer")
df_merged = df_merged.fillna("empty")
df_merged.head(10)

Unnamed: 0,reviewId,userName,userImage,content_original,content_formal_indonesian,content_english,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,topic
0,e2a5116c-6baa-4a19-94b1-0563d0f49823,Bibah Garuda,https://play-lh.googleusercontent.com/a/AGNmyx...,"Saya baik min,walaupun suka ngefreeze and bany...","Saya merasa baik min, walaupun sering mengalam...","I am doing well, despite frequently experienci...",3,0,6.1.10-11c7d4bab7,2023-04-07 06:25:21,"Hai kak Bibah, ada banyak tayangan yang bisa k...",2023-04-07 06:33:41,Advertisement
1,aa568a26-9a64-43ac-acc0-551e9723645a,Nuke Yuli,https://play-lh.googleusercontent.com/a/AGNmyx...,Vidio Ini Sangat Bagus Dan KualitasNya Dan Apl...,"Video ini sangat bagus dan kualitasnya, dan ap...",This video is very good in quality and the app...,5,0,6.1.10-11c7d4bab7,2023-04-07 06:12:32,"Hai kak Nuke, thank you for your five stars. F...",2023-04-07 06:33:15,empty
2,bdb23d0e-47bf-4e89-92eb-2061460ad6c0,reva hasan,https://play-lh.googleusercontent.com/a/AGNmyx...,Saya bisa nonton tv tanpa menggunakan tv,Saya dapat menonton tv tanpa menggunakan tv.,I can watch TV without using a TV.,3,0,6.1.10-11c7d4bab7,2023-04-07 05:58:47,"Hai Kak Reva, terima kasih ya atas rating dan ...",2023-04-07 06:32:50,Advertisement
3,d2518700-29b4-4ea2-9538-ce92e95ae318,Ucu Baan,https://play-lh.googleusercontent.com/a/AGNmyx...,baik,Baiklah,Kindness,4,0,5.90.11-313f2fa36c,2023-04-07 05:56:22,"Hai Kak Ucu, thank you for your positive feedb...",2023-04-07 06:31:53,empty
4,85e286b1-db43-44c7-86f7-f82e329e4ae8,Hafiz Fadillah,https://play-lh.googleusercontent.com/a/AGNmyx...,Ada vidio magic 5,Ada video magic 5.,Please download this application to watch videos.,5,0,empty,2023-04-07 05:52:38,"Hai kak Hafiz, thank you for your 5 stars. Kam...",2023-04-07 06:32:13,empty
5,4bd5f616-3c50-442d-a0a8-92cc80783357,Va Ra,https://play-lh.googleusercontent.com/a/AGNmyx...,Saya belum dowload si saya kasih aja bintang l...,"Saya belum mengunduhnya, tapi saya beri lima b...","I haven't downloaded it yet, but I'll give it ...",5,0,empty,2023-04-07 04:34:26,"Hai Sahabat Vidio, terima kasih atas bintang l...",2023-04-07 04:45:15,empty
6,010cc55b-358c-4a8a-b789-c87a08209c7e,MaZ ambon,https://play-lh.googleusercontent.com/a/AGNmyx...,Siilp,Seni Rupa,Fine Arts,3,0,empty,2023-04-07 03:11:36,"Hai Sahabat Vidio, terima kasih sudah mengguna...",2023-04-07 03:16:40,Advertisement
7,6f181258-4462-4d1a-a387-9dfa504f4181,online “Nobila Syalwa Rahmanita” arsitektur,https://play-lh.googleusercontent.com/a-/ACB-R...,Bagusss bagettt filimnya magic 5,Filmnya Bagusss bagiett magic 5.,"The movie is really good, especially Magic 5.",5,0,6.0.5-f68a5157fa,2023-04-07 03:55:22,"Hai Sahabat Vidio, terima kasih atas bintang l...",2023-04-07 04:45:09,empty
8,0d59dd1d-cb45-45cb-b9a4-3a4e540bfbc6,Miranda Randa,https://play-lh.googleusercontent.com/a/AGNmyx...,Saya emosi sama apk ini kebanyakan iklan mendi...,Saya merasa kesal dengan aplikasi ini karena t...,I am frustrated with this application because ...,1,0,6.0.5-f68a5157fa,2023-04-07 02:55:41,"Hai kak Miranda, terima kasih telah menggunaka...",2023-04-07 03:03:32,Advertisement
9,f99f20e2-c07e-4e10-a821-a712adf7043a,Mau Kana,https://play-lh.googleusercontent.com/a/AGNmyx...,Mending hapus aja apk ini,Mohon hapus aplikasi ini.,I suggest deleting this application.,1,0,empty,2023-04-07 02:02:48,"Hai Sahabat Vidio, mohon maaf ketidaknyamanan ...",2023-04-07 02:06:48,Bad Application


Saving both the CountVectorizer and LDA model

In [88]:
with open("count_vectorizer.pkl", "wb") as f:
    pickle.dump(cv, f)
    
with open("lda_model.pkl", "wb") as f:
    pickle.dump(LDA, f)

Replacing existing values with new ones

In [93]:
%%time

client = MongoClient(
    os.environ["MONGODB_URL"],
    serverSelectionTimeoutMS=300000
)
db = client["vidio"]
collection = db["google_play_store_reviews"]
df_merged_dict = df_merged.to_dict("records")
collection.delete_many({})

batch_size = 1_000
num_records = len(df_merged_dict)
num_batches = num_records // batch_size

if num_records % batch_size != 0:
    num_batches += 1

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_records)
    batch = df_merged_dict[start_idx:end_idx]
    
    if batch:
        collection.insert_many(batch)

print("Data replaced successfully")

Data replaced successfully
CPU times: total: 5.81 s
Wall time: 11min 10s
