# Libraries

In [10]:
from utils import *
from pprint import pprint
import pandas as pd
from spellchecker import SpellChecker
spell = SpellChecker()

import pickle

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

# Read Input

## Bolt Apple

In [3]:
df = pd.read_csv(config['csv_input_local']['bolt_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 55061 

Total unique users : 50021
Total users who gave multiple reviews: 5040

Average rating for this app based on the textual reviews: 3.91 



# Preprocessing Input Data

## Remove Non-English User Reviews (it's done, p1)

Total reviews (BEFORE): 55061 

Total reviews (AFTER): 40365 

Total Non-English reviews: 14696 

Wall time: 9.94 ms

## Filtering Out Inconsistent User Review (Removed, p2)

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [6]:
# df_p2 = pd.read_csv(config['csv_input_local']['bolt_apple_p2'], index_col=0)
# df_p2 = df_p2.reset_index(drop=True)
# df_p2

## Filtering Out Uninformative Reviews (now using bolt_apple_google_p1.csv)

In [5]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "1_bolt" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\1_bolt.csv
Vocabulary size for 1_bolt : 5770
Training set Size: 1442
Testing set Size: 2067
Unlabeling set Size: 40365




In [10]:
# for i in range(0, len(unlabelSet)):
#     print(unlabelSet[i].id)

### EMNB

In [7]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = False # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

No more progress, stopping EM at iteration 40
Average F-Score for the test data: 0.627536063102609
Number of informative reviews: 15201
Number of uninformative reviews: 25164
Wall time: 3min 3s


### SVM

In [8]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9366149110978086
Number of informative reviews: 26936
Number of uninformative reviews: 13429
Wall time: 620 ms


In [9]:
df_p1 = pd.read_csv(config['csv_input_local']['bolt_apple_google_p1'], index_col=0)
df_p1 = df_p1.reset_index(drop=True)
df_p1

Unnamed: 0,userName,review,rating,date
0,davide masi,The first thing I noticed is that you can't pr...,1,2020-12-20 01:51:18
1,Mymoena Saban,Your GPS setting around Cape Town Internationa...,4,2020-12-09 17:45:31
2,hello 87,I was invited by my friend and was given a £10...,1,2020-12-06 21:13:37
3,Hilary Meyer,Your drivers are great BUT your support is no ...,3,2020-12-06 19:38:13
4,Kenneth B,"Loving the app, but can only give it a medium ...",3,2020-12-11 14:06:15
...,...,...,...,...
40360,RobToo96,"Its the 5th time Im saying, But When Will ther...",1,2018-05-16 08:34:17
40361,Illimar,Just love it!,5,2017-06-23 20:34:04
40362,kerdionu,Great app!,5,2015-08-04 18:18:57
40363,Zakkaz11,Super convenient and fast!,5,2015-03-12 11:47:21


In [12]:
listOfRemovedIndex_p2 = []

for i in range(0, len(uninformRev)):
#     print(uninformRev[i].id)
#     print(uninformRev[i].id - (len(trainSet) + len(testSet)))
#     uninformRev[i].printSelf()
#     print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p2.append(idxToRemove)

In [13]:
listOfRemovedIndex_p2

[86,
 134,
 232,
 241,
 250,
 433,
 507,
 533,
 561,
 570,
 600,
 603,
 612,
 651,
 661,
 666,
 670,
 683,
 717,
 756,
 760,
 781,
 792,
 806,
 829,
 840,
 844,
 851,
 894,
 906,
 919,
 1013,
 1050,
 1057,
 1070,
 1088,
 1108,
 1191,
 1212,
 1213,
 1222,
 1227,
 1240,
 1247,
 1272,
 1287,
 1298,
 1300,
 1325,
 1338,
 1343,
 1379,
 1406,
 1410,
 1474,
 1478,
 1479,
 1483,
 1494,
 1511,
 1514,
 1520,
 1533,
 1557,
 1559,
 1591,
 1598,
 1611,
 1626,
 1646,
 1686,
 1691,
 1725,
 1729,
 1740,
 1762,
 1772,
 1777,
 1790,
 1808,
 1814,
 1822,
 1833,
 1836,
 1838,
 1841,
 1847,
 1865,
 1868,
 1915,
 1924,
 1947,
 1977,
 1986,
 1998,
 1999,
 2000,
 2020,
 2033,
 2064,
 2119,
 2141,
 2158,
 2201,
 2266,
 2269,
 2271,
 2283,
 2363,
 2375,
 2377,
 2389,
 2410,
 2423,
 2452,
 2456,
 2458,
 2461,
 2489,
 2491,
 2526,
 2530,
 2547,
 2579,
 2587,
 2593,
 2597,
 2604,
 2615,
 2648,
 2658,
 2677,
 2687,
 2703,
 2717,
 2735,
 2742,
 2802,
 2822,
 2844,
 2866,
 2929,
 2932,
 2942,
 2955,
 2961,
 2965,
 29

In [15]:
%%time
df_p2 = df_p1.drop(df_p1.index[listOfRemovedIndex_p2])
total_reviews_before = len(df_p1)
total_reviews_after = len(df_p2)
total_removed_reviews = len(listOfRemovedIndex_p2)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 40365 

Total reviews (AFTER): 26936 

Total removed reviews: 13429 

Wall time: 11 ms


In [16]:
df_p2.to_csv(config['csv_input_local']['bolt_apple_google_p2_v2'])

## Building Corpus

In [62]:
df_p5 = df_p4.copy()
df_p5

Unnamed: 0,userName,isEdited,review,rating,title,date
0,Kazem Sharan,False,I’ve been using bolt for a month now. I had a ...,1,Drivers are unprofessional,2019-09-20 09:25:04
1,19RN20,False,I’m so appalled at the attitude of the last dr...,1,Zero Stars.,2020-09-04 19:50:50
2,evolvingihsan,False,I’ve had many problems with the customer suppo...,1,HORRIBLE! HORRIBLE CUSTOMER SERVICE,2020-01-29 16:58:11
3,Sabcha08,False,So far majority of the drivers were very kind ...,3,Needs some updates,2019-07-05 12:17:37
4,Ellenyh79,False,I was super happy when Bolt launch as finally ...,1,Terrible customer service and grossly overchar...,2019-10-03 22:17:44
...,...,...,...,...,...,...
620,Vitaliy.l,False,Great app!,5,Great app,2016-03-22 15:29:49
621,cretchen,False,Great app!!,5,Great,2015-07-14 20:13:38
622,Nick_Name_Why,False,The drivers regularly get the wrong address du...,1,Poor location accuracy,2019-10-08 10:57:34
623,Illimar,False,Just love it!,5,Perfect UX,2017-06-23 20:34:04


In [64]:
cleaned_docs = remove_things(df_p5.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

ngrams = make_bigrams(lists_of_words_no_stops)
# ngrams = make_trigrams(lists_of_words_no_stops)

data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

Making bigrams...
Lemmatizing...


In [65]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [67]:
tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('apology', 0.258303175195304),
  ('appointment', 0.19374341386795016),
  ('car', 0.1660609726432199),
  ('chair', 0.258303175195304),
  ('cross', 0.2304987993967995),
  ('day', 0.12187014507285714),
  ('driver', 0.07236556805706493),
  ('explanation', 0.2304987993967995),
  ('extraction', 0.258303175195304),
  ('incident', 0.2142342821987155),
  ('journey', 0.12323004598478411),
  ('leg', 0.258303175195304),
  ('location', 0.1423610134036225),
  ('man', 0.19374341386795016),
  ('month', 0.19374341386795016),
  ('occasion', 0.20269442359829498),
  ('one', 0.18642990640021098),
  ('pain', 0.258303175195304),
  ('people', 0.15244204814412854),
  ('place', 0.18024642394263304),
  ('refund', 0.14019220224671175),
  ('response', 0.15244204814412854),
  ('swore', 0.258303175195304),
  ('text', 0.258303175195304),
  ('trip', 0.10301677900469748)]]

In [73]:
with open(config['csv_input_local']['bolt_apple_corpus'], 'wb') as f:
    pickle.dump(tf_idf, f)