# Libraries

In [32]:
from utils import *
from pprint import pprint
import pandas as pd

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

# Read Input

## Bolt Apple

In [3]:
df = pd.read_csv(config['csv_input_local']['bolt_apple'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 3154 

Total unique users : 3149
Total users who gave multiple reviews: 5

Average rating for this app based on the textual reviews: 3.02 



# Preprocessing Input Data

## Remove Non-English User Reviews

In [4]:
# %%time
# listOfNonEnglishIndex = []

# for i in range(0, len(df2)):
#     reviewText = df2['review'][i]
    
# #     # for debugging purpose
# #     print(reviewText)
# #     print(isEnglishReview(reviewText))
# #     print('\n')
    
#     isEnglish, listToStr, english_score = isEnglishReview(reviewText)
#     if isEnglish == False:
#         listOfNonEnglishIndex.append(i)
        

In [5]:
# %%time
# df_p1 = df.drop(df2.index[listOfNonEnglishIndex])
# total_reviews_before = len(df)
# total_reviews_after = len(df_p1)
# total_non_english_reviews = len(listOfNonEnglishIndex)

# print(f'Total reviews (BEFORE): {total_reviews_before} \n')
# print(f'Total reviews (AFTER): {total_reviews_after} \n')
# print(f'Total Non-English reviews: {total_non_english_reviews} \n')

In [6]:
# df_p1.to_csv(config['csv_input_local']['bolt_apple_p1'])
df_p1 = pd.read_csv(config['csv_input_local']['bolt_apple_p1'], index_col=0)
df_p1

Unnamed: 0,userName,isEdited,review,rating,title,date
0,Tkay_Browning,False,I love bolt. I don’t use uber often because on...,5,Love it *Not sponsored*,2019-07-20 11:51:59
1,Livvii_xo,False,So annoyed with this app!! Definitely the wors...,1,Fake Arrival Time & Car Tracking on Map,2020-03-05 08:22:05
2,Kazem Sharan,False,I’ve been using bolt for a month now. I had a ...,1,Drivers are unprofessional,2019-09-20 09:25:04
3,Doctor A Theo,False,"To make things clear, I am not a regular revie...",1,Shockingly unreliable drivers and unhelpful se...,2020-02-24 08:36:59
4,Maria348794,False,I have used the app 3 or 4 times and I thought...,1,I feel scammed,2019-11-04 15:03:04
...,...,...,...,...,...,...
3085,RobToo96,False,"Its the 5th time Im saying, But When Will ther...",1,Pre-ordering,2018-05-16 08:34:17
3091,Illimar,False,Just love it!,5,Perfect UX,2017-06-23 20:34:04
3115,kerdionu,False,Great app!,5,Excellent,2015-08-04 18:18:57
3122,Zakkaz11,False,Super convenient and fast!,5,Super convenient.,2015-03-12 11:47:21


## Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [7]:
df_p2 = pd.read_csv(config['csv_input_local']['bolt_apple_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,isEdited,review,rating,title,date
0,Kazem Sharan,False,I’ve been using bolt for a month now. I had a ...,1,Drivers are unprofessional,2019-09-20 09:25:04
1,19RN20,False,I’m so appalled at the attitude of the last dr...,1,Zero Stars.,2020-09-04 19:50:50
2,evolvingihsan,False,I’ve had many problems with the customer suppo...,1,HORRIBLE! HORRIBLE CUSTOMER SERVICE,2020-01-29 16:58:11
3,Sabcha08,False,So far majority of the drivers were very kind ...,3,Needs some updates,2019-07-05 12:17:37
4,Ellenyh79,False,I was super happy when Bolt launch as finally ...,1,Terrible customer service and grossly overchar...,2019-10-03 22:17:44
...,...,...,...,...,...,...
629,Vitaliy.l,False,Great app!,5,Great app,2016-03-22 15:29:49
630,cretchen,False,Great app!!,5,Great,2015-07-14 20:13:38
631,Nick_Name_Why,False,The drivers regularly get the wrong address du...,1,Poor location accuracy,2019-10-08 10:57:34
632,Illimar,False,Just love it!,5,Perfect UX,2017-06-23 20:34:04


## Filtering Out Uninformative Reviews

In [8]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "bolt" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

Vocabulary size for bolt : 1761
Training set Size: 110
Testing set Size: 522
Unlabeling set Size: 634




In [10]:
# for i in range(0, len(unlabelSet)):
#     print(unlabelSet[i].id)

In [11]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = False # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

No more progress, stopping EM at iteration 12
Average F-Score for the test data: 0.5037940063038874
Number of informative reviews: 625
Number of uninformative reviews: 9
Wall time: 3.36 s


In [23]:
df_p2.iloc[[366]]

Unnamed: 0,userName,isEdited,review,rating,title,date
366,Bug Reporter 1978,False,It’s spam on my phone. Don’t do it. How Apple ...,3,Stop Lock Screen Promos!,2019-12-19 11:45:37


In [29]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
#     print(uninformRev[i].id)
#     print(uninformRev[i].id - (len(trainSet) + len(testSet)))
#     uninformRev[i].printSelf()
#     print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

In [30]:
listOfRemovedIndex_p3

[276, 366, 389, 421, 461, 546, 559, 577, 589]

In [31]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 634 

Total reviews (AFTER): 625 

Total removed reviews: 9 

Wall time: 2.55 ms


In [33]:
df_p3.to_csv(config['csv_input_local']['bolt_apple_p3'])

## Correcting Typos

## Processing Natural Language

### Coreference Resolution

### Sentence Annotation

## Building Corpus

# Save pre-processed data into binary Pickle file

In [3]:
import pickle

output_path = 'preprocessed_data/bolt/'

## data_lemmatized

In [27]:
with open(output_path + 'data_lemmatized.pkl', 'wb') as f:
    pickle.dump(data_lemmatized, f)

In [28]:
df = pd.read_pickle('preprocessed_data/data_lemmatized.pkl')
df

[['fare', 'guy'],
 [],
 [],
 ['uber', 'freezing', 'locating', 'form', 'payment', 'day', 'update'],
 ['uber',
  'care_customer',
  'location',
  'always_ask',
  'money',
  'credit',
  'ride',
  'time'],
 ['ride'],
 ['address', 'passenger', 'error', 'way', 'problem', 'problem'],
 [],
 ['facility',
  'cancelling_trip',
  'driver',
  'penalty',
  'money',
  'customer',
  'user',
  'trip',
  'people',
  'name',
  'reality',
  'grade',
  'service',
  'rating',
  'rate'],
 ['problem', 'way', 'comment', 'problem', 'problem', 'rating', 'offer', 'way'],
 ['ride', 'request', 'minute', 'car', 'rate', 'bit', 'lyft', 'second'],
 ['facility'],
 ['taxi',
  'dollar',
  'taxi',
  'cancelling',
  'price',
  'price',
  'pay',
  'dollar',
  'difference'],
 ['book', 'trip', 'function'],
 ['cost', 'ride', 'ride', 'solve', 'bug'],
 ['user', 'venmo', 'credit_card', 'get'],
 ['thank', 'service', 'uber'],
 ['pin', 'app', 'location'],
 ['complaint'],
 ['customer', 'care_customer'],
 ['total', 'paytm', 'use', 'app

## dictionary

In [29]:
with open(output_path + 'dictionary.pkl', 'wb') as f:
    pickle.dump(id2word, f)

In [10]:
import pandas as pd
id2word = pd.read_pickle(output_path + 'dictionary.pkl')

[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:3]]

[[('app', 1),
  ('book', 2),
  ('booking', 1),
  ('catch', 1),
  ('driver', 1),
  ('find', 1),
  ('flight', 1),
  ('get_rid', 1),
  ('pm', 1),
  ('prenook', 1),
  ('service', 1),
  ('thing', 1),
  ('today', 2),
  ('trip', 1),
  ('went', 1)],
 [('app', 1),
  ('airport', 1),
  ('need', 1),
  ('number', 1),
  ('section', 1),
  ('spot', 1)],
 [('contact', 1),
  ('customer', 1),
  ('friend', 1),
  ('journey', 1),
  ('promotion', 1),
  ('reply', 1),
  ('support', 1),
  ('team', 1),
  ('use', 1),
  ('would_allow', 1)]]

## term_doc

In [30]:
with open(output_path + 'term_doc.pkl', 'wb') as f:
    pickle.dump(term_doc, f)

In [6]:
import pandas as pd
term_doc = pd.read_pickle(output_path + 'term_doc.pkl')
term_doc

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 2),
  (13, 1),
  (14, 1)],
 [(0, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(4, 1),
  (26, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 2)],
 [(4, 2), (13, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1)],
 [(25, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1)],
 [(4, 2),
  (24, 1),
  (38, 1),
  (45, 2),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1)],
 [(0, 2),
  (13, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1)],
 [(10, 1), (13, 1), (26, 1), (27, 1), (45, 1), (62, 1), (63, 1), (64, 1)],
 [(4, 1), (42, 1), (65, 1), (66, 1), (67, 1), (68, 1)],
 [(21, 1),
  (36, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 3),
  (75, 1),

## tf_idf

In [None]:
with open(output_path + 'tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

In [11]:
import pandas as pd
tf_idf = pd.read_pickle(output_path + 'tf_idf.pkl')
tf_idf

<gensim.interfaces.TransformedCorpus at 0x12f5a4cf8>

In [12]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('app', 0.0775013248836275),
  ('book', 0.4044360833214392),
  ('booking', 0.22539990188012918),
  ('catch', 0.2511611770012997),
  ('driver', 0.0621868309590647),
  ('find', 0.2478640193086606),
  ('flight', 0.2548470857086759),
  ('get_rid', 0.3199212421204541),
  ('pm', 0.3072325950314696),
  ('prenook', 0.3416126275910468),
  ('service', 0.09562349900473621),
  ('thing', 0.156980919747838),
  ('today', 0.32857432683695253),
  ('trip', 0.11999725492585073),
  ('went', 0.3416126275910468)]]

## Save DF Merged

In [43]:
df_merged.to_csv(output_path + 'bolt_apple_google.csv')