# Install packages here

In [None]:
#!pip install gensim
#!pip install spacy
#!pip install symspellpy
#!python -m pip install fuzzywuzzy
#!python -m pip install octis
#%pip install gensim
#%pip install nltk
#%pip install tqdm
#%pip install openpyxl
#%pip install python-Levenshtein
#%pip install pandas
#%pip install numba
#%pip install symspellpy
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#%pip install emoji

# Import modules here

In [1]:
import re
import gensim
import pkg_resources
from symspellpy import SymSpell, Verbosity
import nltk
from tqdm import tqdm
import pandas as pd
import emoji
from polyglot.detect import Detector
from nltk.tokenize import RegexpTokenizer
import ast
import itertools
from nltk import FreqDist
import numpy as np

# Initialize variables here

## Tokenizer

It extracts any word with or without an apostrophe. It can extract the following words:

Examples: better, havn't, isn't, bug, problem

In [2]:
tokenizer = RegexpTokenizer(r'[a-zA-Z]+[\']*[a-zA-Z]*')

## Spellcorrector

The spellcorrector with a custom dictionary that is created from 88 app reviews and merged with the default dictionary of symspellpy.

In [36]:
sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7, count_threshold=1)
dictionary_path = 'clean_dict_v3.txt'
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, separator=',', encoding='utf-8')
del dictionary_path

# Define functions here

## Remove repeating words in text - only removes repeating words in a row 

remove_repeating_words('test word test test') -> test word test

remove_repeating_words('test test test') -> test

In [4]:
def remove_repeating_words(text):
    pattern = r'\b(\w+)(\s+\1\b)+'
    regex = re.compile(pattern, re.IGNORECASE)
    result = regex.sub(r'\1', text)
    
    progress_bar.update(1) # remove this if you don't need a progress bar
    
    return result

## Spellcorrection function which uses the symspell package

In [42]:
def enchant(sentence):
    corrected_sentence = sentence
    tokens = tokenizer.tokenize(sentence)
    
    for word in tokens:
        suggestions = sym_spell.lookup(word, Verbosity.TOP, max_edit_distance=1, include_unknown=False)
        if suggestions:
            corrected_sentence = corrected_sentence.replace(word, suggestions[0].term)
    
    # Update the progress_bar
    progress_bar.update(1)

    return corrected_sentence

## Tokenizer function with a progress bar

In [6]:
def tokenize(sentence):
    progress_bar.update(1)
    return tokenizer.tokenize(sentence)

## Remove repeating characters in the text - shortens every repeating character sequence to two characters (since we do not use a contextual spellchecker, we rely only on a spell correction which is based on the levenshtein algorithm)

In [7]:
def remove_repeating_characters(text):

    pattern = r'([a-zA-Z])\1+'
    regex = re.compile(pattern, re.IGNORECASE)

    result = regex.sub(r'\1\1', text)

    if 'soo' in result: # unique word that needs special treatment (is also a stopword, if corrected will be detected and removed by BERTopic)
        result = result.replace('soo', 'so')

    progress_bar.update(1)

    return result

## Remove emojis

In [8]:
def demoji_progress(data, repl):
    corrected_text = emoji.replace_emoji(data, repl)
    progress_bar.update(1)
    return corrected_text

## Detect non-english reviews

In [10]:
def detect_language(text):
    try:
        lan = [x.name for x in Detector(text).languages]
        progress_bar.update(1)
        return lan
    except Exception as e:
        # Handle the UnknownLanguage exception here
        return ['unknown']

# English App Review specific dictionary

## Load entire dataset of 88 apps into memory (large dataset roughly 4.57 GB - make sure you have enough ram available on your system)

In [13]:
apps = [
    'youtube',
    'whatsapp',
    'telegram',
    'instagram',
    'tiktok',
    'com.zzkko',
    'com.snapchat.android',
    'com.amazon.avod.thirdpartyclient',
    'com.lemon.lvoverseas',
    'com.gamma.scan',
    'paypal',
    'de.hafas.android.db',
    'com.ebay.kleinanzeigen',
    'de.dhl.paket',
    'com.sec.android.easyMover',
    'de.cellular.ottohybrid',
    'com.google.android.apps.translate',
    'de.ingdiba.bankingapp',
    'amazon_shop',
    'com.duolingo',
    'com.starfinanz.mobile.android.pushtan',
    'com.scaleup.chatai',
    'com.disney.disneyplus',
    'fr.doctolib.www',
    'bloodpressure.bloodpressureapp.bloodpressuretracker',
    'com.starfinanz.smob.android.sfinanzstatus',
    'whale.vpn.free',
    'com.lidl.eci.lidlplus',
    'com.facebook.orca',
    'com.spotify.music',
    'com.facebook.katana',
    'com.teacapps.barcodescanner',
    'videoplayer.videodownloader.downloader',
    'com.whatsapp.w4b',
    'com.google.android.apps.walletnfcrel',
    'com.myklarnamobile',
    'com.pinterest',
    'com.azure.authenticator',
    'net.wrightflyer.le.reality',
    'com.ai.polyverse.mirror',
    'net.diflib.recorderx',
    'io.faceapp',
    'com.booking',
    'de.dm.meindm.android',
    'netflix',
    'com.goodreads',
    'com.google.android.apps.maps',
    'com.google.android.apps.subscriptions.red',
    'com.tinder',
    'com.dazn',
    'tv.twitch.android.app',
    'de.komoot.android',
    'com.bumble.app',
    'com.crunchyroll.crunchyroid',
    'com.yazio.android',
    'com.babbel.mobile.android.en',
    'com.microsoft.skydrive',
    'net.lovoo.android',
    'de.prosiebensat1digital.seventv',
    'com.badoo.mobile',
    'com.dropbox.android',
    'com.calimoto.calimoto',
    'com.fitbit.FitbitMobile',
    'sg.bigo.live',
    'me.fup.joyapp',
    'com.reddit.frontpage',
    'com.groundspeak.geocaching.intro',
    'com.discord',
    'com.nordvpn.android',
    'de.mobiletrend.lovidoo',
    'de.exaring.waipu',
    'de.spiegel.android.app.spon',
    'de.dwins.financeguru',
    'deezer.android.app',
    'com.sgiggle.production',
    'com.zattoo.player',
    'com.colt',
    'com.cbs.ca',
    'com.grindrapp.android',
    'com.netbiscuits.bild.android',
    'com.canva.editor',
    'com.azarlive.android',
    'com.blinkslabs.blinkist.android',
    'com.naver.linewebtoon',
    'com.iViNi.bmwhatLite',
    'com.tomtom.gplay.navapp',
    'com.kms.free',
    'us.zoom.videomeetings'
    ]

In [14]:
frames = [pd.read_json(f'app_reviews/{app}_reviews_all.json') for app in apps]
df = pd.concat(frames)

## Preprocess data

### Remove rows which do not contain any value in the column content

In [15]:
df = df.dropna(subset=['content'])

### turn everything in the content column to lowercase

In [16]:
df.content = df.content.apply(lambda x: x.lower())

### Remove repeating words in a row within a review

In [17]:
progress_bar = tqdm(total=len(df))
df.content = df.content.apply(lambda x: remove_repeating_words(x))
del progress_bar

100%|██████████| 8607928/8607928 [00:46<00:00, 185738.66it/s]


### Remove non-english reviews

In [18]:
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

In [19]:
progress_bar = tqdm(total=len(df))
df['lan_code'] = df['content'].apply(lambda x: detect_language(x))
del progress_bar

 99%|█████████▊| 8487284/8607928 [02:31<00:02, 56149.73it/s]


In [20]:
for idx in range(0, 2):
    df['lan_code'].apply(lambda x: x.remove('un') if 'un' in x else print(x))

['English', 'Korean', 'Greek']
['unknown']
['unknown']
['English', 'Inuktitut', 'Kannada']
['unknown']
['unknown']
['unknown']
['unknown']
['English', 'Armenian', 'Georgian']
['unknown']
['English', 'Kannada', 'Greek']
['unknown']
['Hindi', 'Indonesian', 'English']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['English', 'Greek', 'Danish']
['unknown']
['English', 'Russian', 'Hebrew']
['unknown']
['unknown']
['unknown']
['unknown']
['Russian', 'English', 'Czech']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['

In [21]:
df['contains_only_english'] = df['lan_code'].apply(lambda x: 'English' in x and len(x) == 1)

In [22]:
df.drop(df[df['contains_only_english'] == False].index, inplace=True)

### Create a column with the tokenized words

In [23]:
progress_bar = tqdm(total=len(df))
df['word_list'] = df['content'].apply(lambda x: tokenize(x))
del progress_bar

100%|██████████| 2053583/2053583 [00:09<00:00, 206222.31it/s]


### NOTE that this is only necessary if you created the column word_list and loaded a saved csv file which contained the column word_list. It converts the column word_list into a string representation of lists, which needs to be converted into lists first before flattening the column as seen in the cell after this. This is what the code below does. We uncommented it by default.

In [None]:

#df.word_list = df.word_list.apply(ast.literal_eval)

### Now we need to flatten the word_list column and create a frequency distribution using the nltk package

In [24]:
flat_list = itertools.chain(*list(df['word_list']))
word_list = list(flat_list)

freq_dist = FreqDist(word_list)
most_common = freq_dist.most_common()

df_dict = pd.DataFrame(most_common, columns=['word', 'freq'])

### Save the dictionary as a csv file for further use

In [25]:
df_dict.to_csv('review_english_dict_v2.csv', index=False)

### To skip the above you can also load the final list here

In [None]:
df = pd.read_csv('review_english_dict_v2.csv.csv')

# Preprocess every app review data

## Load the data

In [11]:
df_netflix = pd.read_json(f'app_reviews/netflix_reviews_all.json')
df_youtube = pd.read_json(f'app_reviews/youtube_reviews_all.json')
df_whatsapp = pd.read_json(f'app_reviews/whatsapp_reviews_all.json')
df_paypal = pd.read_json(f'app_reviews/paypal_reviews_all.json')
df_amazon = pd.read_json(f'app_reviews/amazon_shop_reviews_all.json')

# please do not change the order of these values, as other code depends on the specific positions
app_frames = [df_netflix, df_youtube, df_whatsapp, df_paypal, df_amazon]

## Remove any row in the column content, which does not contain a value

In [12]:
for frame in app_frames:
    frame.dropna(subset=['content'], inplace=True)

## Remove any rows that do not contain app version

In [13]:
for frame in app_frames:
    frame.dropna(subset=['reviewCreatedVersion'], inplace=True)

## Create the column major_versions

In [14]:
for idx, frame in enumerate(app_frames):
    if idx != 2: # does not equal whatsapp
        frame['major_version'] = frame['reviewCreatedVersion'].apply(lambda x: int(x.split('.')[0]))
    else:
        frame['major_version'] = frame['reviewCreatedVersion'].apply(lambda x: '.'.join(x.split('.')[0:2]))

## Removal of experimental app versions

Only Netflix and YouTube contain experimental versions

In [15]:
for idx, frame in enumerate(app_frames[0:2]):
    if idx == 0: # equals netflix
        frame['reviewCreatedVersion'] = frame['reviewCreatedVersion'].apply(lambda x: x if 'Dogfooding' not in x and 'D1' not in x and '1.0' not in x else None)
    elif idx == 1: # equals youtube
        frame['reviewCreatedVersion'] = frame['reviewCreatedVersion'].apply(lambda x: x if '-DOGFOOD' not in x and 'E' not in x else None)
    frame.dropna(subset=['reviewCreatedVersion'], inplace=True)

## Removal of repeating words inside a review

In [16]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['content'] = frame['content'].apply(lambda x: remove_repeating_words(x))
    del progress_bar

100%|██████████| 325297/325297 [00:01<00:00, 221858.54it/s]
100%|██████████| 149952/149952 [00:01<00:00, 90635.94it/s] 
100%|██████████| 101498/101498 [00:01<00:00, 99818.72it/s]
100%|██████████| 104460/104460 [00:00<00:00, 142489.14it/s]
100%|██████████| 126953/126953 [00:00<00:00, 167996.27it/s]


## Shortening of repeating characters inside the review
### Shortening of repeating characters in words (examples: gooooood, toooooooooooo and veeeeeeeeeeeeeeeeeery, soooooo)

In [17]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['content'] = frame['content'].apply(lambda x: remove_repeating_characters(x))
    del progress_bar

100%|██████████| 325297/325297 [00:01<00:00, 204887.57it/s]
100%|██████████| 149952/149952 [00:01<00:00, 85031.57it/s] 
100%|██████████| 101498/101498 [00:01<00:00, 86477.32it/s]
100%|██████████| 104460/104460 [00:00<00:00, 134890.57it/s]
100%|██████████| 126953/126953 [00:00<00:00, 154279.76it/s]


## Removal of emojis from review text

In [18]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['content'] = frame['content'].apply(lambda x: demoji_progress(x, ''))
    del progress_bar

100%|██████████| 325297/325297 [00:02<00:00, 138291.73it/s]
100%|██████████| 149952/149952 [00:03<00:00, 44610.98it/s]
100%|██████████| 101498/101498 [00:02<00:00, 49719.08it/s]
100%|██████████| 104460/104460 [00:01<00:00, 78609.48it/s]
100%|██████████| 126953/126953 [00:01<00:00, 93138.72it/s] 


## Remove any rows that contain only 1 words, 2 words, ...

### Create a column, which lists the token length

In [19]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['token_length'] = frame['content'].apply(lambda x: len(tokenize(x)))
    del progress_bar

100%|██████████| 325297/325297 [00:00<00:00, 506669.85it/s]
100%|██████████| 149952/149952 [00:00<00:00, 209755.14it/s]
100%|██████████| 101498/101498 [00:00<00:00, 232231.46it/s]
100%|██████████| 104460/104460 [00:00<00:00, 327297.89it/s]
100%|██████████| 126953/126953 [00:00<00:00, 382769.51it/s]


### Remove any review that has less than 4

In [20]:
for frame in app_frames:
    frame.drop(frame[frame['token_length'] < 4].index, inplace=True)

## Removal of very short reviews (maybe the ones that are 20 characters or less)

### Create a colum, which lists the document length in chars

In [21]:
for frame in app_frames:
    frame['doc_length'] = frame['content'].apply(len)

### Remove any review, that is shorter than 20 characters

In [22]:
for frame in app_frames:
    frame.drop(frame[frame['doc_length'] < 20].index, inplace=True)

## Remove every review that has a score of five

In [23]:
for frame in app_frames:
    frame.drop(frame[frame['score'] == 5].index, inplace=True)

## SymSpell with custom dictionary data - accounts for slang words and abbreviations often used in app reviews
(Note that the dictionary itself can contains mistakes. This means that some words won't be corrected because the mistake has a frequency above 620)

Nonetheless, this dictionary is able to improve the poor quality of the reviews to some degree without drastically changing the context.

In [43]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['content_corrected'] = frame['content'].apply(lambda x: enchant(x.lower()))
    del progress_bar

100%|██████████| 100164/100164 [00:05<00:00, 16907.16it/s]
100%|██████████| 119282/119282 [00:07<00:00, 15477.07it/s]
100%|██████████| 74924/74924 [00:04<00:00, 16784.95it/s]
100%|██████████| 45324/45324 [00:02<00:00, 18041.41it/s]
100%|██████████| 53378/53378 [00:03<00:00, 16965.64it/s]


### Here is an example of how it corrects reviews.

#### It only corrects words that have at most one letter to either add, remove or substitute

In [45]:
id_netflix = '3ba0f356-8251-4052-b4ae-39a5ab5454f3'
id_amazon = '1961f2ce-8637-4cbe-a58c-e832ed43a619'
id_youtube = '27c0cf29-f233-4071-84c1-29c7c1857f9d'

test = df_youtube[df_youtube['reviewId'] == id_youtube][['content', 'content_corrected']]

print('Original:')
print(list(test['content']))
print('Corrected')
print(list(test['content_corrected']))

Original:
['Sometimes when using picture in picture mode it gliches out and it draws ontop of everything. (Same problem was with google maps). So you cant close the app or do anything else without restarting the Phone. Gets annoying really son. Fix it please google! (again)']
Corrected
['sometimes when using picture in picture mode it glitches out and it draws onto of everything. (same problem was with google maps). so you cant close the app or do anything else without restarting the phone. gets annoying really son. fix it please google! (again)']


## Remove any non-english text from the content
### (We removed any reviews that contain foreign language - mixed language may still be in there)

### Create a column that groups the reviews into their language

In [46]:
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

In [47]:
for frame in app_frames:
    progress_bar = tqdm(total=len(frame))
    frame['lan_code'] = frame['content'].apply(lambda x: detect_language(x))
    del progress_bar

100%|█████████▉| 100147/100164 [00:02<00:00, 42178.84it/s]
100%|█████████▉| 119281/119282 [00:02<00:00, 48257.56it/s]
100%|█████████▉| 74922/74924 [00:01<00:00, 50440.15it/s]
100%|█████████▉| 45321/45324 [00:01<00:00, 29968.94it/s]
100%|█████████▉| 53372/53378 [00:01<00:00, 52871.41it/s]


In [48]:
for frame in app_frames:
    for idx in range(0, 2):
        frame['lan_code'].apply(lambda x: x.remove('un') if 'un' in x else print(x))

['unknown']
['English', 'Kannada', 'Georgian']
['unknown']
['unknown']
['English', 'Japanese', 'Greek']
['unknown']
['unknown']
['unknown']
['Marathi', 'English', 'French']
['unknown']
['unknown']
['Telugu', 'Slovak', 'English']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['unknown']
['Chinese', 'English', 'Portuguese']
['unknown']
['Hindi', 'English', 'Kinyarwanda']
['Persian', 'Arabic', 'Oromo']
['Hindi', 'English', 'Welsh']
['English', 'xx', 'Greek']
['Telugu', 'Xhosa', 'English']
['unknown']
['Oromo', 'Hindi', 'Tagalog']
['Scottish Gaelic', 'English', 'Bhojpuri']
['unknown']
['Hindi', 'Latin', 'English']
['English', 'Kannada']
['English', 'Lao']
['unknown']
['Spanish', 'English']
['English', 'Georgian']
['Spanish', 'English']
['English', 'Portuguese']
['English', 'Spanish']
['English', 'Ukrainian']
['English', 'Japanese']
['English', 'Kannada', 'Georgian']
['English', 'Kannada']
['English', 'Tamil']
['Dutch', 'English']
['English', 'Urdu']
['English', 'Kannada']
['D

In [49]:
for frame in app_frames:
    frame['contains_only_english'] = frame['lan_code'].apply(lambda x: 'English' in x and len(x) == 1)

In [50]:
df_netflix[df_netflix['contains_only_english'] == True]

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,major_version,token_length,doc_length,content_corrected,lan_code,contains_only_english
0,a528c73e-e15d-4474-8f81-e408acc295b3,Jaden Corpolongo,https://play-lh.googleusercontent.com/a-/ACB-R...,"I love netflix but as of recently, it has been...",1,929,8.63.0 build 16 50390,2023-04-03T22:06:01,,,8,92,478,"i love netflix but as of recently, it has been...",[English],True
1,36310c88-bbaf-48b3-a8dc-8caa98103488,Kristen Trevey,https://play-lh.googleusercontent.com/a-/ACB-R...,Playback is good but almost 100% of the time i...,2,1992,8.63.0 build 16 50390,2023-04-11T06:06:12,,,8,86,457,playback is good but almost 100% of the time i...,[English],True
2,5fc26a26-6693-49a1-bf30-17545a399317,Monika,https://play-lh.googleusercontent.com/a/AGNmyx...,"Whenever I use Netflix in phone app, Videos do...",1,125,8.64.0 build 8 50394,2023-04-13T11:05:02,,,8,79,499,"whenever i use netflix in phone app, videos do...",[English],True
3,af6a476a-695f-4c05-a19d-c9b9e44c11ae,Mike,https://play-lh.googleusercontent.com/a/AGNmyx...,I've just given up. The My List feature is sim...,1,1066,8.62.0 build 7 50386,2023-03-31T05:23:00,,,8,92,490,i've just given up. the my list feature is sim...,[English],True
4,14de4aec-f34f-4e5a-856f-4e0ebddaff49,Leonidas Angelus,https://play-lh.googleusercontent.com/a-/ACB-R...,"It's a pretty decent app overall, but this lat...",3,25,8.64.0 build 8 50394,2023-04-12T09:06:30,,,8,96,499,"it's a pretty decent app overall, but this lat...",[English],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402850,fff437ba-ee80-4140-92e3-bfc1b189b498,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I don't want Netflix on my phone how can I uni...,1,0,7.53.3 build 31 34824,2020-04-23T05:34:12,,,7,11,52,i don't want netflix on my phone how can i uni...,[English],True
402852,b5269e49-4871-430f-b24e-44a98fa43244,Chris Morrison,https://play-lh.googleusercontent.com/a-/ACB-R...,Overall it's a good app,4,409,7.53.3 build 31 34824,2020-05-01T03:02:09,,,7,5,23,overall it's a good app,[English],True
402854,26880623-daac-4143-8992-10ddbfd740dc,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Meh. Netfliggars.... Could be better but isn't.,4,16,7.53.3 build 31 34824,2020-04-22T23:28:14,,,7,7,47,meh. netfliggars.... could be better but isn't.,[English],True
402899,26b11cfe-a3e0-4ebc-8d94-98d3a2c8720a,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Hindi dekhne wale ke liye discription main lik...,3,0,7.53.3 build 31 34824,2020-04-23T12:54:41,,,7,21,123,hindi dekhne wale ke like description main dik...,[English],True


### Remove non-english text

In [51]:
for frame in app_frames:
    frame.drop(frame[frame['contains_only_english'] == False].index, inplace=True)

# Save preprocessed data

In [52]:
df_netflix.to_csv('preprocessed_data/prep_netflix_v4.csv', index=False)
df_youtube.to_csv('preprocessed_data/prep_youtube_v4.csv', index=False)
df_whatsapp.to_csv('preprocessed_data/prep_whatsapp_v4.csv', index=False)
df_paypal.to_csv('preprocessed_data/prep_paypal_v4.csv', index=False)
df_amazon.to_csv('preprocessed_data/prep_amazon_v4.csv', index=False)

# Load preprocessed data

In [None]:
df_netflix = df_netflix.read_csv('preprocessed_data/prep_netflix_v4.csv')
df_youtube = df_youtube.read_csv('preprocessed_data/prep_youtube_v4.csv')
df_whatsapp = df_whatsapp.read_csv('preprocessed_data/prep_whatsapp_v4.csv')
df_paypal = df_paypal.read_csv('preprocessed_data/prep_paypal_v4.csv')
df_amazon = df_amazon.read_csv('preprocessed_data/prep_amazon_v4.csv')