In [57]:
# libraries
from utils import *
from pprint import pprint
import pandas as pd
from spellchecker import SpellChecker
spell = SpellChecker()

import pickle

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

# Bolt

## Read Input

In [2]:
df = pd.read_csv(config['csv_input_local']['bolt_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 55061 

Total unique users : 50021
Total users who gave multiple reviews: 5040

Average rating for this app based on the textual reviews: 3.91 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [33]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 19.2 s


In [4]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 55061 

Total reviews (AFTER): 40365 

Total Non-English reviews: 14696 

Wall time: 9.94 ms


In [6]:
df_p1.to_csv(config['csv_input_local']['bolt_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [166]:
df_p2 = pd.read_csv(config['csv_input_local']['bolt_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Hilary Meyer,Your drivers are great BUT your support is no ...,3,2020-12-06 19:38:13
1,Kenneth B,"Loving the app, but can only give it a medium ...",3,2020-12-11 14:06:15
2,Bonolo Mphahlele,The response time with regards to customer ser...,4,2020-12-07 13:59:18
3,jessfechi egolo,This has been happening for some time now and ...,1,2020-12-13 18:19:06
4,Safeeya Lawal,The drivers almost always never have change an...,3,2020-12-05 15:08:10
...,...,...,...,...
17925,Vitaliy.l,Great app!,5,2016-03-22 15:29:49
17926,cretchen,Great app!!,5,2015-07-14 20:13:38
17927,Nick_Name_Why,The drivers regularly get the wrong address du...,1,2019-10-08 10:57:34
17928,Illimar,Just love it!,5,2017-06-23 20:34:04


### Filtering Out Uninformative Reviews

In [167]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "1_bolt" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\1_bolt.csv
Vocabulary size for 1_bolt : 3666
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 17930




In [168]:
# for i in range(0, len(unlabelSet)):
#     print(unlabelSet[i].id)

In [169]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9372883953217626
Number of informative reviews: 10785
Number of uninformative reviews: 7145
Wall time: 456 ms


In [170]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
#     print(uninformRev[i].id)
#     print(uninformRev[i].id - (len(trainSet) + len(testSet)))
#     uninformRev[i].printSelf()
#     print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

In [171]:
# listOfRemovedIndex_p3

In [172]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 17930 

Total reviews (AFTER): 10785 

Total removed reviews: 7145 

Wall time: 3.99 ms


In [173]:
df_p3.to_csv(config['csv_input_local']['bolt_apple_google_p3'])

### Correcting Typos

In [42]:
%%time
cleaned_docs = remove_things(df_p3.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)
lists_of_words_no_stops

Wall time: 350 ms


[['using',
  'month',
  'good',
  'journeys',
  'horrendous',
  'ones',
  'recently',
  'first',
  'occasion',
  'told',
  'cross',
  'legs',
  'car',
  'might',
  'touch',
  'chair',
  'apparently',
  'encourages',
  'people',
  'sit',
  'still',
  'move',
  'car',
  'response',
  'got',
  'customer',
  'service',
  'refund',
  'second',
  'time',
  'dental',
  'appointment',
  'ordered',
  'cab',
  'driver',
  'came',
  'said',
  'going',
  'soon',
  'found',
  'location',
  'got',
  'frustrated',
  'swore',
  'said',
  'far',
  'man',
  'tired',
  'asked',
  'accepted',
  'first',
  'place',
  'want',
  'go',
  'far',
  'get',
  'proper',
  'explanation',
  'apology',
  'pain',
  'dental',
  'extraction',
  'cancel',
  'end',
  'said',
  'said',
  'would',
  'text',
  'let',
  'know',
  'messaged',
  'days',
  'ago',
  'still',
  'reply',
  'charged',
  'trip',
  'happen',
  'terrible',
  'terrible',
  'drivers',
  'bad',
  'customer',
  'service',
  'incidents',
  'like',
  'happen

In [49]:
list_of_typo_words = []
for sentence in lists_of_words_no_stops:
#     print(sentence)
    for word in sentence:
        if spell.correction(word) != word:
            print(word)
            print(spell.correction(word))
            list_of_typo_words.append(word)

xl
al
beckenham
beckham
cctv
cut
cctv
cut
deivers
drivers
fumingg
fuming
hru
thru
cornerr
corner
faq
far
kapten
kaiten
malta
malt
ffs
ifs
llocks
blocks
linkedin
linkedln
xx
ex
teip
trip
touran
toucan
tunis
tunes
gps
gas
viavan
vivian
ms
is
gps
gas
gps
gas
darkmode
darmody
usability
ability
gps
gas
viavan
vivian
postcodes
postcode
viavan
vivian
kapten
kaiten
gps
gas
slovakia
slovakian
kapten
kaiten
badd
bad
malta
malt
funke
funky
ms
is
funke
funky
meriza
media
londoner
londoners
btw
bow
gps
gas
аа
i
не
i
mr
my
masimba
marimba
pls
plus
befre
before
kapten
kaiten
xl
al
amaizing
amazing
kaunas
saunas
km
am
krakow
kraken
reaaly
really
latvia
latvian
ppl
pal
kapten
kaiten
gps
gas
gatwick
patrick
xs
is
xl
al
covid
couid
drivera
driver
kapten
kaiten
sh
so
ft
it
ft
it
ft
it
bmw
bow
tfl
til
sms
sums
unsubscribed
subscribed
uninstalled
installed
viavan
vivian
bonuss
bonus
nicee
nice
gps
gas
hrs
his
coudnt
count
happned
happened
muqtar
guitar
moalin
moulin
xl
al
xl
al
desaster
disaster
kaunas
saun

In [50]:
list_of_typo_words

['xl',
 'beckenham',
 'cctv',
 'cctv',
 'deivers',
 'fumingg',
 'hru',
 'cornerr',
 'faq',
 'kapten',
 'malta',
 'ffs',
 'llocks',
 'linkedin',
 'xx',
 'teip',
 'touran',
 'tunis',
 'gps',
 'viavan',
 'ms',
 'gps',
 'gps',
 'darkmode',
 'usability',
 'gps',
 'viavan',
 'postcodes',
 'viavan',
 'kapten',
 'gps',
 'slovakia',
 'kapten',
 'badd',
 'malta',
 'funke',
 'ms',
 'funke',
 'meriza',
 'londoner',
 'btw',
 'gps',
 'аа',
 'не',
 'mr',
 'masimba',
 'pls',
 'befre',
 'kapten',
 'xl',
 'amaizing',
 'kaunas',
 'km',
 'krakow',
 'reaaly',
 'latvia',
 'ppl',
 'kapten',
 'gps',
 'gatwick',
 'xs',
 'xl',
 'covid',
 'drivera',
 'kapten',
 'sh',
 'ft',
 'ft',
 'ft',
 'bmw',
 'tfl',
 'sms',
 'unsubscribed',
 'uninstalled',
 'viavan',
 'bonuss',
 'nicee',
 'gps',
 'hrs',
 'coudnt',
 'happned',
 'muqtar',
 'moalin',
 'xl',
 'xl',
 'desaster',
 'kaunas',
 'lyftt',
 'cs',
 'sms',
 'tallinn',
 'jdexgn',
 'cascais',
 'cascais',
 'mobey',
 'optionally',
 'cooll',
 'protus',
 'txfy',
 'huf',
 'huf',

### Processing Natural Language

In [56]:
df_p4 = df_p3.copy()
df_p4 = df_p4.reset_index(drop=True)
df_p4

Unnamed: 0,userName,isEdited,review,rating,title,date
0,Kazem Sharan,False,I’ve been using bolt for a month now. I had a ...,1,Drivers are unprofessional,2019-09-20 09:25:04
1,19RN20,False,I’m so appalled at the attitude of the last dr...,1,Zero Stars.,2020-09-04 19:50:50
2,evolvingihsan,False,I’ve had many problems with the customer suppo...,1,HORRIBLE! HORRIBLE CUSTOMER SERVICE,2020-01-29 16:58:11
3,Sabcha08,False,So far majority of the drivers were very kind ...,3,Needs some updates,2019-07-05 12:17:37
4,Ellenyh79,False,I was super happy when Bolt launch as finally ...,1,Terrible customer service and grossly overchar...,2019-10-03 22:17:44
...,...,...,...,...,...,...
620,Vitaliy.l,False,Great app!,5,Great app,2016-03-22 15:29:49
621,cretchen,False,Great app!!,5,Great,2015-07-14 20:13:38
622,Nick_Name_Why,False,The drivers regularly get the wrong address du...,1,Poor location accuracy,2019-10-08 10:57:34
623,Illimar,False,Just love it!,5,Perfect UX,2017-06-23 20:34:04


#### Coreference Resolution

In [51]:
import logging;
logging.basicConfig(level=logging.INFO)
import neuralcoref

# Load your usual SpaCy model (one of SpaCy English models)
import spacy
nlp = spacy.load('en')
# nlp = spacy.load("en_core_web_sm")

# Add neural coref to SpaCy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

# You're done. You can now use NeuralCoref as you usually manipulate a SpaCy document annotations.
doc = nlp(u'My sister has a dog. She loves him.')
print(doc._.coref_resolved)

My sister has a dog. My sister loves a dog.


In [60]:
df_p4.review[1]

'I’m so appalled at the attitude of the last driver I ordered from you. Three people recommended your services and I trusted those opinions. I’ve only taken four trips with Bolt and not ONE driver has worn a mask including the trips taken with my 7 month old baby. I ordered a XL car to help with moving a mirror and a bassinet to my destination, he took 15 minutes to get here, was on a call via loud speaker with a friend/family member. I asked if I could put my stuff into the boot as I was moving house and collecting the last items I had at my previous home. He declined and the person he was speaking to on the phone also got involved also. He then said it was my fault for ordering the wrong type of car, I told him I ordered the biggest car option on bolt. He said I would have to find another car and then told ME to cancel the drive. I refused so as he drove off he shouted something and left me, my baby and my things in the cold. Then to make matters worse, he cancelled the trip and put 

In [61]:
for i in range(0, len(df_p4)):
    textToCheck = df_p4.review[i]
    doc = nlp(textToCheck)
    hasCoref = doc._.has_coref
    
    print(textToCheck)
    print('\n')
    print(hasCoref)
    
    if(hasCoref == True):
        print(doc._.coref_resolved)
        
    print('\n')
    

I’ve been using bolt for a month now. I had a few good journeys but 2 horrendous ones recently. The first occasion I was told to not cross my legs in the car because it “might” touch his chair. So apparently bolt encourages people to sit still and not move in the car, which was the response I got from customer service and no refund. The second time, I had a dental appointment and ordered a cab, the driver came and then he said where are you going? As soon as he found out what the location was, he got frustrated and swore and said “this is too far man I’m tired”. I asked why he had accepted it in the first place if he didn’t want to go too far. I didn’t get any proper explanation and no apology. I was in pain because of my dental extraction too. So I had to cancel from my end because he said he couldn’t do it. He said he would text bolt and let them know. And I messaged Bolt 3 days ago and still no reply. They have charged me £3 for this trip that didn’t happen. Terrible terrible driver

I was super happy when Bolt launch as finally there’s fair competition for taxi apps in London. After using it a couple of times I was happy. Until I took a ride from Beckenham to Heathrow I was quoted an estimate of £45-50. But after the ride I noticed that Bolt had charged me £87 instead!!! I contacted customer service and they have been absolutely USELESS!! It’s been 2 weeks and I’m going round in circles with them asking me again what my route was, how much I was charged, sending them bank statements. They have refused to acknowledge the overcharge and not giving refund. If I had known they were going to charge such extortionate price I would have hired a local minicab company or gone with Uber instead! If I had raised such issue with Uber there would be an immediate refund. Watch out for these scammers!


True
I was super happy when Bolt launch as finally there’s fair competition for taxi apps in London. After using it a couple of times I was happy. Until I took a ride from Becken

So I’m trying out Bolt as a potential alternative to Uber, given the situation with Uber in London. Prices are good but Bolt needs to do a bit of work on the app!
1. The idea of being able to modify the pick up location should be added. You have to cancel and start agin even just for crossing the road
2. Being able to track the potential route would be useful, especially if you’re familiar with the route and can suggest modifications
Still - great to have an alternative to Uber! And drivers are all good too so far


True
So I’m trying out Bolt as a potential alternative to Uber, given the situation with Uber in London. Prices are good but Bolt needs to do a bit of work on the app!
1. The idea of being able to modify the pick up location should be added. You have to cancel and start agin even just for crossing the road
2. Being able to track the potential route would be useful, especially if you’re familiar with the route and can suggest modifications
Still - great to have an alternativ

Used three times so far after good friend recommended and send me invite that gave me good welcome discount which was plus! Overall professional drivers and friendly drivers. New or very decent cars and very quick pick up which was very great first impression. Keep up the good work!


False


Having used Uber for years and made over 1000 trips, I decided to try Bolt.
Cost wise it was extremely heap the first few rides, but now it is almost always more than Uber for the same route.
The app is very inaccurate and doesn’t recognise places of interest at all. I always have to message the driver to explain exactly where I am.
The app could definitely benefit from more development on the location accuracy.


True
Having used Uber for years and made over 1000 trips, I decided to try Bolt.
Cost wise it was extremely heap the first few rides, but now it is almost always more than Uber for the same route.
The app is very inaccurate and doesn’t recognise places of interest at all. I always have t

Nice service. Getting promotions from time to time. But the only thing that is lacking - two address request. It’s very common service. Instead of minimize communication with drivers, we need to explain that we would change final destination when we come closer to first one. So that driver won’t take next trip. Confusing.


True
Nice service. Getting promotions from time to time. But the only thing that is lacking - two address request. the only thing that is lacking - two address request’s very common service. Instead of minimize communication with drivers, we need to explain that we would change final destination when we come closer to first one. So that driver won’t take next trip. Confusing.


First trip was ok, second time I booked a trip, the app told me 8 min waiting time, the driver called me and said he would be 15 mins not 8 mins and did I want to cancel, I said no I would wait, after 15 mins waiting the driver cancelled the trip, after that I couldn’t not get another bolt ri

Please do not up your prices because they’re so helpful to get around especially if you’re stuck somewhere with little to no money, you’re better than Uber and I’d like to keep it that way


False


All Bolt cars show up as white cars on the app, even when the car is a completely different colour. 

Sort it out, please! 

Got me out here looking for a different Bolt car. 

Oh, and they cancel the ride when your fee is low due to an ongoing promotion.

EDIT: Bolt customer service has been amazing. Absolutely top notch! 

Most of the Bolt drivers are amazing, but there are a few that take the absolute mick. Each time that this has happened, a Bolt customer rep was easily able to rectify the problem. 

Thank you so much, Bolt!


True
All Bolt cars show up as white cars on the app, even when the car is a completely different colour. 

Sort the car out, please! 

Got me out here looking for a different Bolt car. 

Oh, and they cancel the ride when your fee is low due to an ongoing promotion

Bolt may be cheaper than Uber but the app itself is useless. It’s slow, geolocation barely works and address lookup is abysmal. Tried entering an address, couldn’t find the correct one so had to be dropped off somewhere else. Couldn’t track the driver accurately and ETA of driver arrival was extremely delayed. 
Maybe invest some money + time into your backend infrastructure so the app actually works, otherwise I’ll be going back to Uber...


True
Bolt may be cheaper than Uber but the app itself is useless. the app itself’s slow, geolocation barely works and address lookup is abysmal. Tried entering an address, couldn’t find the correct one so had to be dropped off somewhere else. Couldn’t track the driver accurately and ETA of driver arrival was extremely delayed. 
Maybe invest some money + time into your backend infrastructure so the app itself actually works, otherwise I’ll be going back to Uber...


It ms quite often that a driver accepts a ride then within minutes cancels it so it’

The advertised a coupon 10£ off to try Bold on our first ride.
I applied the coupon, they charged me without -10£.. Even if they showed to me that the ride would have -10£ on the price while I called the car.
Then I sent 3 times a question about the event and the overcharging and nobodies ever responded to me...!!..
Tragic customer service at least.
Do not try Bolt. Delete this “funny” app..


True
The advertised a coupon 10£ off to try Bold on our first ride.
I applied a coupon 10£, they charged me without -10£.. Even if they showed to me that our first ride would have -10£ on the price while I called the car.
Then I sent 3 times a question about the event and the overcharging and nobodies ever responded to me...!!..
Tragic customer service at least.
Do not try Bolt. Delete this “funny” app..


The app is okay but the estimated pick up time is not accurate at all and it’s usually almost twice as long than it shows initially. It also happened to notify me that the cab arrived, however 

I’ve used this in Europe before but now it’s in the UK they’re not giving promos to Customers that haven’t used it before in the UK - BAD MISTAKE - the only way to get customers is to give promo codes otherwise it simply won’t work !!! 

Also pricing is higher than uBer and ViaVan- why would we use BOLT ? Sort your pricing out or you’ll be out of here quicker than Theresa May


True
I’ve used this in Europe before but now this’s in the UK the UK’re not giving promos to Customers that haven’t used this before in the UK - BAD MISTAKE - the only way to get customers is to give promo codes otherwise this simply won’t work !!! 

Also pricing is higher than uBer and ViaVan- why would we use BOLT ? Sort your pricing out or you’ll be out of here quicker than Theresa May


The customer service is diabolical, one guy called David absolutely terrible telling what I did & didn’t do & the app itself puts different postcodes in, to lengthen your journey & bump up your price on your ride. I won’t be 

As simple to use as it is efficient. Drivers are friendly & professional. Unlike my Über experiences in London. Great service 😎


False


Recommended to friends and always beating Uber , Óla & Kapten by a few pounds on every journey! Amazing service too from all friendly drivers


False


We waited to long with kids we choose executive car which means bigger still waited 20 min then he reach with small car said this executive means we ask him can we cancel he didn’t said that he goons charge 8 pound for not using it n we took bus reach home we received email that he charge us 8 pound badddd service


True
We waited to long with kids We choose executive car which means bigger still waited 20 min then 20 min reach with small car said this executive means We ask 20 min can We cancel 20 min didn’t said that 20 min goons charge 8 pound for not using it n We took bus reach home We received email that he charge We 8 pound badddd service


Very quick to book and very good price friendly 
 driv

Not unusual to wait 10+ minutes for a car that is meant to arrive in 2 (according the app). If you’re on a time schedule use Uber.


False


Awful customer service. Drivers cancel rides when they arrive to pick you up as it’s not convenient for them, I’ve been waiting 15 mins!!!! Then you tell me this!!!! And then the next rider does exactly the same!!!! Waste of my time, my money!!!! Never again, I will use OLA FROM NOW ON, GOODBYE


True
Awful customer service. Drivers cancel rides when Drivers arrive to pick you up as it’s not convenient for Drivers, I’ve been waiting 15 mins!!!! Then you tell me this!!!! And then the next rider does exactly the same!!!! Waste of my time, my money!!!! Never again, I will use OLA FROM NOW ON, GOODBYE


Brilliant prices but the app is not great, when you request a driver it’s needs to be more clear where they are coming from and how long it’s taking them to come


True
Brilliant prices but the app is not great, when you request a driver a driver’s nee

Never felt so welcome and involved.  Lovely, passionate driver, perfect energy.. imma customer wanting to return:) perfect job for a good person


False


He was really friendly and also had great conversation with him...and he’s funny


True
He was really friendly and also had great conversation with He...and He’s funny


Drivers are breaking traffic rules and there is no way to report 

Driver on the white car with bolt marking car number Аа 6788 не just broke a traffic rules by driving on the street where was a sign that you can’t enter the street


False


The customer support team team takes approximately 3 days to reply, really rude drives, driving whilst on the phone with headphones on which is very unsafe. It’s extremely disgusting costume service and I will not be using this app anymore.


False


It just worked for the first 2 times. I’ve been trying for more than a week now and it doesn’t work. I have added my bank card so many times and it always asks for it again. So annoy

This app is completely rubbish, I paid for a ride which I never took. I contacted to bolt customer service but they didnt even get back to me. Very frustrating and I would never want the same thing happening to anyone else!


True
This app is completely rubbish, I paid for a ride which I never took. I contacted to bolt customer service but customer service didnt even get back to me. Very frustrating and I would never want the same thing happening to anyone else!


Such a good value and amazing service it comes is quick!!!!


False


Waited less than 3 mins for my driver, polite, friendly and very good driver. Very pleased will deffo be using again :)


False


Great , reliable and don’t have to wait long


False


Very prompt service and good value for money


False


He is an amazing driver, and if you are luck get his car on Christmas, there is a suprise waiting you 😍👍🏻😄🤩🎄


True
He is an amazing driver, and if you are luck get He car on Christmas, there is a suprise waiting you 😍👍🏻😄

A very good, efficient service. Nice and polite drivers. Overall a very good value for money.


False


Bolt has helped me a lot it’s so fast and cheap love it!!!


True
Bolt has helped me a lot Bolt’s so fast and cheap love Bolt!!!


Simply do not use this BOLT app. 
In Georgia service was absolutely hopeless, 4 out of 5 rides booked didn’t turn up 
In Poland rides arrived but drivers drove aggressively and far too fast. 2nd ride in Krakow driver was not the one photographed, car was old damaged and filthy. Driver rude, no help with luggage and didn’t want to drop us at the hotel booked. 
Overcharged 40% on BOTH journeys. 
Support hopeless; no response whatsoever after 4 days to messages left. 
Never had such problems with UBER!


False


I really want to try the app, and I’m sure it’s all safe, but going back to sharing card details is so 2015.


True
I really want to try the app, and I’m sure the app’s all safe, but going back to sharing card details is so 2015.


I had £14 credit f

Rude driver to start with. I was picked up from Gatwick and then overcharged twice the airport pickup fee. 

The driver then didn’t end the trip when I left the car, leaving the account running until I called and insisted that he end the trip immediately. 

Customer service will only offer app credit. I’ve removed my card details from the app and will be deleting it altogether.

AVOID AVOID AVOID.


True
Rude driver to start with. I was picked up from Gatwick and then overcharged twice the airport pickup fee. 

Rude driver to start then didn’t end the trip when I left the car, leaving the account running until I called and insisted that Rude driver to start end the trip immediately. 

Customer service will only offer app credit. I’ve removed my card details from app and will be deleting my card details from the app altogether.

AVOID AVOID AVOID.


Amazing app , reliable to use


False


Always very pleasant, drivers are great.


False


Just love it, will promote you all the way, Than

Great driver and extremely friendly!


False


Love the app, love the drivers. Whoooosh! Especially the cheery ones.


False


So far its been great


False


It’s very good and the promos are amazing and about 1/3 the price of Uber


False


I found most drivera super rude , I have been over charged for a trip a lot once , contacted the company And Had zero response ... never again


True
I found most drivera super rude , I have been over charged for a trip a lot once , contacted most drivera And Had zero response ... never again


Very nice service. Excellent 5 star


False


Terrible service cabs said 8-10 pound and when I got home and checked it’s was 17 pound support don’t talk or help u any way at all I would advise people to get this con artists


True
Terrible service cabs said 8-10 pound and when I got home and checked Terrible service cabs’s was 17 pound support don’t talk or help u any way at all I would advise people to get this con artists


Has the best promos which come 

Great service!!!!


False


And charged me 6 pound for them cancelling disgusting service stick to ViaVan or Uber


False


Fantastic driver very polite


False


Fandabedooose absolute legend of a driver! 
Best service in London!😎😎🤩👊🏻🤩🤩☺️🤗🤗


False


Best driver ever he deserves a bonusss!!!


True
Best driver ever Best driver deserves a bonusss!!!


Great app!!!


False


Terrible . Never any cars. Horrendous customer service. Downloading was massive regret


False


Some stupid company and bad drivers


False


Great trip


False


Worst apps ever


False


Perfect app


False


The worst app ever it thinks I’m doing fraud so it blocked my account


True
The worst app ever app thinks I’m doing fraud so app blocked my account


Very nice driver


False


They steal from drivers.


False


The best ♥️good price


False


Perfect for tourists


False


Worst of all time 🤮


False


really good ! very pleased !


False


Very good promotion


False


Very niceee


False


The best Drive

Absolutely brilliant in London. Had a promotion code for a free journey which took us from Kings Cross to our hotel, then I was given a code to give to a friend for another free journey which covered our return to Kings Cross. Drivers where on time and friendly. Would definitely recommend in London


True
Absolutely brilliant in London. Had a promotion code for a free journey which took us from Kings Cross to us hotel, then I was given a code to give to a friend for another free journey which covered us return to Kings Cross. Drivers where on time and friendly. Would definitely recommend in London


Usually I have no problems with bolt however when we ordered our bolt we had to look for our driver for 6 minutes (in high heels) and when we found him he was buying an iPhone charger, we noticed that the timer had been going for 6 minutes and 30 seconds and asked our driver if the timer had charged us extra he assured us that the timer had not started yet however we were estimated £13.50 -

It’s good application.i have been used this app in Kaunas,Lithuania for 2-3 months .some of the driver can speak English and most of them are not speak English.some driver always took a longer route even it shows up on their equipment.i try to explain to them but seem like they don’t care and end up my payment was higher than expected.

I tried to contract bolt application service but it’s so difficult ,the only way for you to contact them is to email them.
No live chat 

the good things that you can shared your journey with another person.


True
It’s good application.i have been used this app in Kaunas,Lithuania for 2-3 months .some of the driver can speak English and most of them are not speak English.some driver always took a longer route even a longer route shows up on them equipment.i try to explain to them but seem like them don’t care and end up my payment was higher than expected.

I tried to contract bolt application service but a longer route’s so difficult ,the only way for

App is terrible. In the login process I'm supposed to receive SMS with login code. I've tried to enter my phone number for a number of times and nothing. I'm not receiving message with login code. Also, even worse... I needed a ride when it was heavy rain outside. Terrible service.


True
App is terrible. In the login process I'm supposed to receive SMS with login code. I've tried to enter my phone number for a number of times and nothing. I'm not receiving message with login code. Also, even worse... I needed a ride when a ride was heavy rain outside. Terrible service.


Worst taxi app in world. I got called dirty words so many times, harassed sexually and now they started to charge my card without even using it. I am getting transactions with “Tallinn/Bolt.eu” and I didn’t use this crap for several days. THIS IS PURE SCAM and must be removed from iTunes


True
Worst taxi app in world. I got called dirty words so many times, harassed sexually and now they started to charge my card wit

I took a ride in for 4070 HUF. The company charged me 407000 HUF. I can’t believe that this was mistake!! I was surprised to see -1333 EUR when checked my bank account. The support not answering to the messages. Not possible to join anyone by phone. The drivers are taking longer routes to charge you more. By far the worst taxi app you can imagine !!! Use Uber and you’d save yourself a lot of trouble. 0 stars for Taxify in every way.
#Update 28/12: Taxify blocked my number after this review. Well done because I don’t want  to hear about you anymore and that confirms the unprofessional methods of this company !!


True
I took a ride in for 4070 HUF. 4070 HUF charged me 407000 HUF. I can’t believe that this was mistake!! I was surprised to see -1333 EUR when checked my bank account. The support not answering to the messages. Not possible to join anyone by phone. The drivers are taking longer routes to charge you more. By far the worst taxi app you can imagine !!! Use Uber and you’d save y

Just signed up, they charged me with 2 euros and the bike didn’t move at all. Maybe the device was broken, I don’t know. But I didn’t use the bike and I had to pay for a trip.


True
Just signed up, they charged me with 2 euros and the bike didn’t move at all. Maybe the device was broken, I don’t know. But I didn’t use the bike and I had to pay for a trip.


BOLT is the worst ride sharing app I have ever tried. It took my money twice and I’ve never had a trip yet, this is unbelievable and has to be stopped. They are thieves and have to be removed from the market.


True
BOLT is the worst ride sharing app I have ever tried. app took my money twice and I’ve never had a trip yet, this is unbelievable and has to be stopped. They are thieves and have to be removed from the market.


It took more than 20 minutes for our ride to get to our destination, once it was close, it stated that it was finishing another ride and was going to take another 10 minutes. After I cancel the ride the app char



True
A very helpful app, but why I can't just watch the list of taxi companies with taxi companies phonenumbers?


Its fine.


False


BEST app ever! So easy to use and very functional!


False


Great app! Works perfectly.


False


Shows 1 min. Taxi arrives in 9min. Shows 3 min. Taxi arrives in 6 min etc.


True
Shows 1 min. Taxi arrives in 9min. Shows 3 min. Taxi arrives in 6 min etc.


Good app. I suggest


False


Very good and helpful app!


False


Great app!


False


Superb app! Thanks devs!!


False


Great app!


False


Great app!!


False


The drivers regularly get the wrong address due to some poor auto-replacement-with-the-local-hotspot function even though to the customer the app displays their correct address. Which leads the customer to blame the unsuspecting driver who can’t do anything about it. Bolt, your customer experience sucks!


True
The drivers regularly get the wrong address due to some poor auto-replacement-with-the-local-hotspot function even though to 

#### Sentence Annotation

In [None]:
# TODO

### Building Corpus

In [62]:
df_p5 = df_p4.copy()
df_p5

Unnamed: 0,userName,isEdited,review,rating,title,date
0,Kazem Sharan,False,I’ve been using bolt for a month now. I had a ...,1,Drivers are unprofessional,2019-09-20 09:25:04
1,19RN20,False,I’m so appalled at the attitude of the last dr...,1,Zero Stars.,2020-09-04 19:50:50
2,evolvingihsan,False,I’ve had many problems with the customer suppo...,1,HORRIBLE! HORRIBLE CUSTOMER SERVICE,2020-01-29 16:58:11
3,Sabcha08,False,So far majority of the drivers were very kind ...,3,Needs some updates,2019-07-05 12:17:37
4,Ellenyh79,False,I was super happy when Bolt launch as finally ...,1,Terrible customer service and grossly overchar...,2019-10-03 22:17:44
...,...,...,...,...,...,...
620,Vitaliy.l,False,Great app!,5,Great app,2016-03-22 15:29:49
621,cretchen,False,Great app!!,5,Great,2015-07-14 20:13:38
622,Nick_Name_Why,False,The drivers regularly get the wrong address du...,1,Poor location accuracy,2019-10-08 10:57:34
623,Illimar,False,Just love it!,5,Perfect UX,2017-06-23 20:34:04


In [64]:
cleaned_docs = remove_things(df_p5.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

ngrams = make_bigrams(lists_of_words_no_stops)
# ngrams = make_trigrams(lists_of_words_no_stops)

data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

Making bigrams...
Lemmatizing...


In [65]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [67]:
tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('apology', 0.258303175195304),
  ('appointment', 0.19374341386795016),
  ('car', 0.1660609726432199),
  ('chair', 0.258303175195304),
  ('cross', 0.2304987993967995),
  ('day', 0.12187014507285714),
  ('driver', 0.07236556805706493),
  ('explanation', 0.2304987993967995),
  ('extraction', 0.258303175195304),
  ('incident', 0.2142342821987155),
  ('journey', 0.12323004598478411),
  ('leg', 0.258303175195304),
  ('location', 0.1423610134036225),
  ('man', 0.19374341386795016),
  ('month', 0.19374341386795016),
  ('occasion', 0.20269442359829498),
  ('one', 0.18642990640021098),
  ('pain', 0.258303175195304),
  ('people', 0.15244204814412854),
  ('place', 0.18024642394263304),
  ('refund', 0.14019220224671175),
  ('response', 0.15244204814412854),
  ('swore', 0.258303175195304),
  ('text', 0.258303175195304),
  ('trip', 0.10301677900469748)]]

In [73]:
with open(config['csv_input_local']['bolt_apple_corpus'], 'wb') as f:
    pickle.dump(tf_idf, f)

# Uber

## Read Input

In [8]:
df = pd.read_csv(config['csv_input_local']['uber_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 20342 

Total unique users : 20225
Total users who gave multiple reviews: 117

Average rating for this app based on the textual reviews: 3.13 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [9]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 1min 53s


In [10]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 20342 

Total reviews (AFTER): 19833 

Total Non-English reviews: 509 

Wall time: 8.98 ms


In [12]:
df_p1.to_csv(config['csv_input_local']['uber_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [58]:
df_p2 = pd.read_csv(config['csv_input_local']['uber_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Vivek Singh,Superb services with Uber cab..I loving it.Tha...,5,2020-10-24 09:24:04
1,Bernardino Espinoza,Always on time. Courteous drivers... Highly re...,5,2020-10-14 08:02:30
2,hassan ijaz,Terrible.. Drivers are rude and Uber don't eve...,1,2020-09-14 17:01:40
3,Terril Ziegler-Carson,Excellent ride. Clean and safe,4,2020-11-14 16:47:14
4,VINAY DUBEY,Very fantastic. I love uber and i trust uber. ...,5,2020-10-30 19:16:06
...,...,...,...,...
8280,Mistry1785,Get £200 credit with this code NW17E.. Before ...,5,2014-02-05 20:42:44
8281,Corrrrrrrr,"I got £40 credit with code EJY28, use that cod...",5,2014-01-29 01:01:58
8282,TPSLBB,Fantastic Service - Great App - though it coul...,5,2013-08-28 21:25:39
8283,avinash2904,The app is stuck on the login screen after the...,3,2013-02-04 10:11:18


### Filtering Out Uninformative Reviews

In [64]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "2_uber" # four apps: facebook, templerun2, swiftkey, tapfish # others: all, 1_bolt, 2_uber
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\2_uber.csv
Vocabulary size for 2_uber : 3580
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 8285




In [67]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9362254867678941
Number of informative reviews: 6763
Number of uninformative reviews: 1522
Wall time: 206 ms


In [72]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3529
15
Review id: 3529 Rating: 5 Content: love user friendli trustworthi rare even comment app favorit carpool app alot Ntokens: 12 TS:  Group:  Prob: 0 label: 0
Raw text: i love it very user friendly and trustworthy it s rare for me to even comment about other apps but this is my favorite carpool app alot


3533
19
Review id: 3533 Rating: 5 Content: scott amaz understand situat help thank scott Ntokens: 7 TS:  Group:  Prob: 0 label: 0
Raw text: scott was amazing and very understanding of the situation i was in and very helpful thank you scott


3536
22
Review id: 3536 Rating: 3 Content: enter payment info dark mode see number Ntokens: 7 TS:  Group:  Prob: 0 label: 0
Raw text: can t enter payment info in dark mode can t see the numbers


3546
32
Review id: 3546 Rating: 5 Content: speed eleg Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: brilliance speed and elegance


3554
40
Review id: 3554 Rating: 5 Content: conveni except Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: conve

Raw text: it s very good and comfortable


6831
3317
Review id: 6831 Rating: 5 Content: great friendli Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: great and friendly


6833
3319
Review id: 6833 Rating: 5 Content: use good Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: useful and good


6834
3320
Review id: 6834 Rating: 5 Content: great experi uber Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: great experience with uber


6835
3321
Review id: 6835 Rating: 5 Content: nice behaviour safe journey Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: nice behaviour safe journey


6838
3324
Review id: 6838 Rating: 5 Content: clean courteou Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: clean and courteous


6843
3329
Review id: 6843 Rating: 5 Content: outstand cashless experi Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: outstanding cashless experience


6845
3331
Review id: 6845 Rating: 5 Content: nice app get vehicl Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: v

11153
7639
Review id: 11153 Rating: 5 Content: realli nice men brilliant get app Ntokens: 6 TS:  Group:  Prob: 0 label: 0
Raw text: really nice men who socialise with you brilliant get the app


11156
7642
Review id: 11156 Rating: 5 Content: noth brilliant love app use london econom easi seamless Ntokens: 9 TS:  Group:  Prob: 0 label: 0
Raw text: nothing but brilliant love the app using it in london is economical easy and seamless


11160
7646
Review id: 11160 Rating: 5 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: good service


11161
7647
Review id: 11161 Rating: 5 Content: jeff good love car boot Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: jeff was very good and had a lovely car to boot


11163
7649
Review id: 11163 Rating: 2 Content: app buggi hell iphon io sort start lose custom Ntokens: 9 TS:  Group:  Prob: 0 label: 0
Raw text: your app is buggy as hell iphone 5 ios 8 0 2 sort it out or start losing customers


11164
7650
Review id: 11164 Rating: 

In [69]:
listOfRemovedIndex_p3

[15,
 19,
 22,
 32,
 40,
 41,
 45,
 49,
 52,
 88,
 106,
 109,
 130,
 159,
 163,
 189,
 208,
 221,
 239,
 245,
 263,
 264,
 282,
 285,
 295,
 304,
 307,
 309,
 311,
 319,
 321,
 324,
 343,
 346,
 348,
 361,
 364,
 375,
 378,
 380,
 392,
 416,
 421,
 424,
 427,
 432,
 438,
 460,
 479,
 483,
 500,
 533,
 537,
 565,
 567,
 575,
 577,
 583,
 585,
 589,
 590,
 594,
 599,
 607,
 615,
 617,
 635,
 640,
 642,
 651,
 670,
 682,
 719,
 740,
 743,
 754,
 759,
 774,
 786,
 789,
 791,
 799,
 801,
 812,
 815,
 823,
 825,
 827,
 845,
 857,
 869,
 875,
 889,
 904,
 905,
 910,
 911,
 915,
 920,
 927,
 935,
 948,
 960,
 969,
 972,
 979,
 986,
 993,
 994,
 1003,
 1008,
 1010,
 1019,
 1024,
 1027,
 1030,
 1039,
 1040,
 1042,
 1044,
 1046,
 1050,
 1052,
 1059,
 1071,
 1083,
 1094,
 1095,
 1104,
 1106,
 1108,
 1111,
 1112,
 1116,
 1125,
 1131,
 1132,
 1149,
 1158,
 1166,
 1168,
 1170,
 1186,
 1192,
 1195,
 1200,
 1210,
 1212,
 1213,
 1224,
 1227,
 1229,
 1236,
 1242,
 1248,
 1257,
 1264,
 1266,
 1276,
 1277,

In [70]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 8285 

Total reviews (AFTER): 6763 

Total removed reviews: 1522 

Wall time: 2.99 ms


In [71]:
df_p3.to_csv(config['csv_input_local']['uber_apple_google_p3'])

### Correcting Typos

In [42]:
%%time
cleaned_docs = remove_things(df_p3.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)
lists_of_words_no_stops

Wall time: 350 ms


[['using',
  'month',
  'good',
  'journeys',
  'horrendous',
  'ones',
  'recently',
  'first',
  'occasion',
  'told',
  'cross',
  'legs',
  'car',
  'might',
  'touch',
  'chair',
  'apparently',
  'encourages',
  'people',
  'sit',
  'still',
  'move',
  'car',
  'response',
  'got',
  'customer',
  'service',
  'refund',
  'second',
  'time',
  'dental',
  'appointment',
  'ordered',
  'cab',
  'driver',
  'came',
  'said',
  'going',
  'soon',
  'found',
  'location',
  'got',
  'frustrated',
  'swore',
  'said',
  'far',
  'man',
  'tired',
  'asked',
  'accepted',
  'first',
  'place',
  'want',
  'go',
  'far',
  'get',
  'proper',
  'explanation',
  'apology',
  'pain',
  'dental',
  'extraction',
  'cancel',
  'end',
  'said',
  'said',
  'would',
  'text',
  'let',
  'know',
  'messaged',
  'days',
  'ago',
  'still',
  'reply',
  'charged',
  'trip',
  'happen',
  'terrible',
  'terrible',
  'drivers',
  'bad',
  'customer',
  'service',
  'incidents',
  'like',
  'happen

In [49]:
list_of_typo_words = []
for sentence in lists_of_words_no_stops:
#     print(sentence)
    for word in sentence:
        if spell.correction(word) != word:
            print(word)
            print(spell.correction(word))
            list_of_typo_words.append(word)

xl
al
beckenham
beckham
cctv
cut
cctv
cut
deivers
drivers
fumingg
fuming
hru
thru
cornerr
corner
faq
far
kapten
kaiten
malta
malt
ffs
ifs
llocks
blocks
linkedin
linkedln
xx
ex
teip
trip
touran
toucan
tunis
tunes
gps
gas
viavan
vivian
ms
is
gps
gas
gps
gas
darkmode
darmody
usability
ability
gps
gas
viavan
vivian
postcodes
postcode
viavan
vivian
kapten
kaiten
gps
gas
slovakia
slovakian
kapten
kaiten
badd
bad
malta
malt
funke
funky
ms
is
funke
funky
meriza
media
londoner
londoners
btw
bow
gps
gas
аа
i
не
i
mr
my
masimba
marimba
pls
plus
befre
before
kapten
kaiten
xl
al
amaizing
amazing
kaunas
saunas
km
am
krakow
kraken
reaaly
really
latvia
latvian
ppl
pal
kapten
kaiten
gps
gas
gatwick
patrick
xs
is
xl
al
covid
couid
drivera
driver
kapten
kaiten
sh
so
ft
it
ft
it
ft
it
bmw
bow
tfl
til
sms
sums
unsubscribed
subscribed
uninstalled
installed
viavan
vivian
bonuss
bonus
nicee
nice
gps
gas
hrs
his
coudnt
count
happned
happened
muqtar
guitar
moalin
moulin
xl
al
xl
al
desaster
disaster
kaunas
saun

In [50]:
list_of_typo_words

['xl',
 'beckenham',
 'cctv',
 'cctv',
 'deivers',
 'fumingg',
 'hru',
 'cornerr',
 'faq',
 'kapten',
 'malta',
 'ffs',
 'llocks',
 'linkedin',
 'xx',
 'teip',
 'touran',
 'tunis',
 'gps',
 'viavan',
 'ms',
 'gps',
 'gps',
 'darkmode',
 'usability',
 'gps',
 'viavan',
 'postcodes',
 'viavan',
 'kapten',
 'gps',
 'slovakia',
 'kapten',
 'badd',
 'malta',
 'funke',
 'ms',
 'funke',
 'meriza',
 'londoner',
 'btw',
 'gps',
 'аа',
 'не',
 'mr',
 'masimba',
 'pls',
 'befre',
 'kapten',
 'xl',
 'amaizing',
 'kaunas',
 'km',
 'krakow',
 'reaaly',
 'latvia',
 'ppl',
 'kapten',
 'gps',
 'gatwick',
 'xs',
 'xl',
 'covid',
 'drivera',
 'kapten',
 'sh',
 'ft',
 'ft',
 'ft',
 'bmw',
 'tfl',
 'sms',
 'unsubscribed',
 'uninstalled',
 'viavan',
 'bonuss',
 'nicee',
 'gps',
 'hrs',
 'coudnt',
 'happned',
 'muqtar',
 'moalin',
 'xl',
 'xl',
 'desaster',
 'kaunas',
 'lyftt',
 'cs',
 'sms',
 'tallinn',
 'jdexgn',
 'cascais',
 'cascais',
 'mobey',
 'optionally',
 'cooll',
 'protus',
 'txfy',
 'huf',
 'huf',

# BlaBlaCar

## Read Input

In [13]:
df = pd.read_csv(config['csv_input_local']['blablacar_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 44480 

Total unique users : 42695
Total users who gave multiple reviews: 1785

Average rating for this app based on the textual reviews: 4.21 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [14]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 5min 8s


In [15]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 44480 

Total reviews (AFTER): 16212 

Total Non-English reviews: 28268 

Wall time: 13.5 ms


In [17]:
df_p1.to_csv(config['csv_input_local']['blablacar_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [141]:
df_p2 = pd.read_csv(config['csv_input_local']['blablacar_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Nagesh Choudhary,It's best facility for both us (car owner & co...,5,2020-12-13 17:20:35
1,Suhel Sheikh,"Great app to find rides and riders. Genuine, e...",5,2020-12-11 05:15:16
2,Nikhil Malhotra,Very useful and helpful app but some of their ...,3,2020-12-18 07:39:51
3,Pankaj Singh,Nice app and great I am always traveling city ...,4,2020-12-18 09:17:32
4,v singh,What an idea to get pooled for sharing travel ...,5,2020-12-13 04:46:56
...,...,...,...,...
8817,1Aleezy,Basically good,4,2013-07-19 11:08:21
8818,juju 0605,amazing way to travel and meet nice people,5,2017-06-14 11:18:55
8819,pbidalot,The best one!,5,2013-03-19 09:53:09
8820,rtherance,Very nice and useful site,4,2014-08-31 17:22:56


### Filtering Out Uninformative Reviews

In [142]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "3_blablacar" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\3_blablacar.csv
Vocabulary size for 3_blablacar : 2634
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 8822




In [143]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9362254867678941
Number of informative reviews: 3156
Number of uninformative reviews: 5666
Wall time: 239 ms


In [144]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
#     print(uninformRev[i].id)
#     print(uninformRev[i].id - (len(trainSet) + len(testSet)))
#     uninformRev[i].printSelf()
#     print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

In [145]:
listOfRemovedIndex_p3

[5,
 7,
 16,
 30,
 32,
 46,
 48,
 49,
 61,
 71,
 72,
 73,
 81,
 82,
 84,
 87,
 89,
 91,
 92,
 93,
 107,
 108,
 109,
 111,
 112,
 113,
 119,
 122,
 125,
 127,
 128,
 130,
 131,
 134,
 135,
 136,
 137,
 139,
 142,
 143,
 146,
 155,
 156,
 157,
 161,
 162,
 163,
 164,
 170,
 173,
 174,
 176,
 177,
 178,
 181,
 182,
 183,
 189,
 190,
 193,
 194,
 195,
 196,
 197,
 203,
 204,
 206,
 210,
 211,
 213,
 214,
 219,
 221,
 228,
 229,
 230,
 241,
 246,
 248,
 249,
 253,
 255,
 256,
 259,
 260,
 262,
 267,
 268,
 270,
 271,
 274,
 275,
 276,
 279,
 282,
 284,
 285,
 286,
 288,
 289,
 292,
 294,
 303,
 306,
 307,
 309,
 313,
 317,
 319,
 320,
 321,
 323,
 326,
 328,
 330,
 331,
 332,
 339,
 342,
 346,
 347,
 352,
 353,
 354,
 358,
 359,
 361,
 362,
 364,
 367,
 369,
 370,
 371,
 376,
 378,
 379,
 384,
 388,
 394,
 396,
 397,
 398,
 399,
 400,
 401,
 406,
 409,
 412,
 413,
 414,
 416,
 418,
 419,
 420,
 421,
 422,
 423,
 425,
 426,
 428,
 430,
 432,
 434,
 436,
 437,
 438,
 439,
 440,
 443,
 444,
 4

In [146]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 8822 

Total reviews (AFTER): 3156 

Total removed reviews: 5666 

Wall time: 5.98 ms


In [147]:
df_p3.to_csv(config['csv_input_local']['blablacar_apple_google_p3'])

### Correcting Typos

In [42]:
%%time
cleaned_docs = remove_things(df_p3.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)
lists_of_words_no_stops

Wall time: 350 ms


[['using',
  'month',
  'good',
  'journeys',
  'horrendous',
  'ones',
  'recently',
  'first',
  'occasion',
  'told',
  'cross',
  'legs',
  'car',
  'might',
  'touch',
  'chair',
  'apparently',
  'encourages',
  'people',
  'sit',
  'still',
  'move',
  'car',
  'response',
  'got',
  'customer',
  'service',
  'refund',
  'second',
  'time',
  'dental',
  'appointment',
  'ordered',
  'cab',
  'driver',
  'came',
  'said',
  'going',
  'soon',
  'found',
  'location',
  'got',
  'frustrated',
  'swore',
  'said',
  'far',
  'man',
  'tired',
  'asked',
  'accepted',
  'first',
  'place',
  'want',
  'go',
  'far',
  'get',
  'proper',
  'explanation',
  'apology',
  'pain',
  'dental',
  'extraction',
  'cancel',
  'end',
  'said',
  'said',
  'would',
  'text',
  'let',
  'know',
  'messaged',
  'days',
  'ago',
  'still',
  'reply',
  'charged',
  'trip',
  'happen',
  'terrible',
  'terrible',
  'drivers',
  'bad',
  'customer',
  'service',
  'incidents',
  'like',
  'happen

In [49]:
list_of_typo_words = []
for sentence in lists_of_words_no_stops:
#     print(sentence)
    for word in sentence:
        if spell.correction(word) != word:
            print(word)
            print(spell.correction(word))
            list_of_typo_words.append(word)

xl
al
beckenham
beckham
cctv
cut
cctv
cut
deivers
drivers
fumingg
fuming
hru
thru
cornerr
corner
faq
far
kapten
kaiten
malta
malt
ffs
ifs
llocks
blocks
linkedin
linkedln
xx
ex
teip
trip
touran
toucan
tunis
tunes
gps
gas
viavan
vivian
ms
is
gps
gas
gps
gas
darkmode
darmody
usability
ability
gps
gas
viavan
vivian
postcodes
postcode
viavan
vivian
kapten
kaiten
gps
gas
slovakia
slovakian
kapten
kaiten
badd
bad
malta
malt
funke
funky
ms
is
funke
funky
meriza
media
londoner
londoners
btw
bow
gps
gas
аа
i
не
i
mr
my
masimba
marimba
pls
plus
befre
before
kapten
kaiten
xl
al
amaizing
amazing
kaunas
saunas
km
am
krakow
kraken
reaaly
really
latvia
latvian
ppl
pal
kapten
kaiten
gps
gas
gatwick
patrick
xs
is
xl
al
covid
couid
drivera
driver
kapten
kaiten
sh
so
ft
it
ft
it
ft
it
bmw
bow
tfl
til
sms
sums
unsubscribed
subscribed
uninstalled
installed
viavan
vivian
bonuss
bonus
nicee
nice
gps
gas
hrs
his
coudnt
count
happned
happened
muqtar
guitar
moalin
moulin
xl
al
xl
al
desaster
disaster
kaunas
saun

In [50]:
list_of_typo_words

['xl',
 'beckenham',
 'cctv',
 'cctv',
 'deivers',
 'fumingg',
 'hru',
 'cornerr',
 'faq',
 'kapten',
 'malta',
 'ffs',
 'llocks',
 'linkedin',
 'xx',
 'teip',
 'touran',
 'tunis',
 'gps',
 'viavan',
 'ms',
 'gps',
 'gps',
 'darkmode',
 'usability',
 'gps',
 'viavan',
 'postcodes',
 'viavan',
 'kapten',
 'gps',
 'slovakia',
 'kapten',
 'badd',
 'malta',
 'funke',
 'ms',
 'funke',
 'meriza',
 'londoner',
 'btw',
 'gps',
 'аа',
 'не',
 'mr',
 'masimba',
 'pls',
 'befre',
 'kapten',
 'xl',
 'amaizing',
 'kaunas',
 'km',
 'krakow',
 'reaaly',
 'latvia',
 'ppl',
 'kapten',
 'gps',
 'gatwick',
 'xs',
 'xl',
 'covid',
 'drivera',
 'kapten',
 'sh',
 'ft',
 'ft',
 'ft',
 'bmw',
 'tfl',
 'sms',
 'unsubscribed',
 'uninstalled',
 'viavan',
 'bonuss',
 'nicee',
 'gps',
 'hrs',
 'coudnt',
 'happned',
 'muqtar',
 'moalin',
 'xl',
 'xl',
 'desaster',
 'kaunas',
 'lyftt',
 'cs',
 'sms',
 'tallinn',
 'jdexgn',
 'cascais',
 'cascais',
 'mobey',
 'optionally',
 'cooll',
 'protus',
 'txfy',
 'huf',
 'huf',

# Cabify

## Read Input

In [19]:
df = pd.read_csv(config['csv_input_local']['cabify_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 10645 

Total unique users : 10420
Total users who gave multiple reviews: 225

Average rating for this app based on the textual reviews: 3.7 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [20]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 1min 18s


In [21]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 10645 

Total reviews (AFTER): 1899 

Total Non-English reviews: 8746 

Wall time: 5.65 ms


In [22]:
df_p1.to_csv(config['csv_input_local']['cabify_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [132]:
df_p2 = pd.read_csv(config['csv_input_local']['cabify_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Danny Storm,"Horrible Service, my account has been blocked ...",1,2020-12-16 14:11:17
1,Candice Williams,Whats the point of reserving a cabify several ...,1,2020-12-11 06:09:15
2,Youngsook Love,I use Cabify since 2016. I had issue with the ...,1,2020-12-17 17:23:26
3,Monica Pinto,App stopped working. Uninstalled it to reinsta...,4,2020-11-19 01:23:14
4,Frombeauty2abeast LH,Don't bother with it ! Terrible service in Ali...,1,2020-10-11 07:44:48
...,...,...,...,...
758,Xkin,Works perfectly. Nice and kind drivers. I tota...,5,2016-12-04 23:54:46
759,grobalino,Great service,5,2016-11-28 08:02:01
760,Jpapo,Perfecto,5,2016-07-21 19:21:47
761,Angell ll,This app is great regarding the service that i...,5,2015-02-03 15:12:03


### Filtering Out Uninformative Reviews

In [74]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "4_cabify" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\4_cabify.csv
Vocabulary size for 4_cabify : 1978
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 763




In [10]:
# for i in range(0, len(unlabelSet)):
#     print(unlabelSet[i].id)

In [76]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9377602715664612
Number of informative reviews: 551
Number of uninformative reviews: 212
Wall time: 56.6 ms


In [80]:
for i in range(0, len(informRev)):
    print(informRev[i].id)
    print(informRev[i].id - (len(trainSet) + len(testSet)))
    informRev[i].printSelf()
    print('\n')

3514
0
Review id: 3514 Rating: 1 Content: horribl servic account block account lost object vehicl never first ride charg account without author contact custom servic horribl talk chat box give answer help situat request fill form lost object told never lost object sent form fill ride miss trip Ntokens: 41 TS:  Group:  Prob: 1.0 label: 1
Raw text: horrible service my account has been blocked on the account of having a lost object in the vehicle i never had my first ride with them then they charge my account without authorization i contact customer service which is horrible because you have to talk to a chat box where they give you answers that don t help your situation requesting to fill out my form on a lost object i told them i never lost an object and they sent me the form to fill out again no rides for me missed my trip


3515
1
Review id: 3515 Rating: 1 Content: what point reserv cabifi sever hour advanc noon show advanc warn noth app kept say driver less 6min away kept put queue h

4173
659
Review id: 4173 Rating: 5 Content: good app cheap use use code obtain euro use app Ntokens: 10 TS:  Group:  Prob: 1.0 label: 1
Raw text: very good app cheap and useful if you use the code antoniap172 you will obtain 6 euro to use on your app


4175
661
Review id: 4175 Rating: 3 Content: app bit slow respond request live least rio de janeiro mani driver uber also complaint system slow cumbersom almost uber find edit destin start trip Ntokens: 25 TS:  Group:  Prob: 1.0 label: 1
Raw text: the app is a bit slow to respond to a request where i live at least rio de janeiro as there aren t as many drivers as uber s also the complaints system is slow and cumbersome not almost instantaneous as uber and i did not find out how to edit the destination of a started trip


4176
662
Review id: 4176 Rating: 5 Content: work perfectli pick us right front airport took us straight airbnb water includ nice clean car Ntokens: 16 TS:  Group:  Prob: 1.0 label: 1
Raw text: all worked out perfectly he 

In [77]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3652
138
Review id: 3652 Rating: 3 Content: recov password chang phone reinstal app Ntokens: 6 TS:  Group:  Prob: 0.9962801632333633 label: 0
Raw text: can t recover my password after changing phone and reinstalling the app


3655
141
Review id: 3655 Rating: 5 Content: realli good servic puntual super kind Ntokens: 6 TS:  Group:  Prob: 0 label: 0
Raw text: really good service puntual and super kind


3659
145
Review id: 3659 Rating: 5 Content: good app book ride madrid Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: very good app for booking rides in madrid and latam


3665
151
Review id: 3665 Rating: 1 Content: app send text verifi detail useless Ntokens: 6 TS:  Group:  Prob: 0.9986591753318542 label: 0
Raw text: app won t send a text to verify my details it s useless


3666
152
Review id: 3666 Rating: 1 Content: useless app doesnt anyth except take space Ntokens: 7 TS:  Group:  Prob: 0.9977338347086872 label: 0
Raw text: uselessness as an app it doesnt do anything except for takin

In [79]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 763 

Total reviews (AFTER): 551 

Total removed reviews: 212 

Wall time: 1.52 ms


In [33]:
df_p3.to_csv(config['csv_input_local']['cabify_apple_google_p3'])

### Correcting Typos

In [42]:
%%time
cleaned_docs = remove_things(df_p3.review)
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)
lists_of_words_no_stops

Wall time: 350 ms


[['using',
  'month',
  'good',
  'journeys',
  'horrendous',
  'ones',
  'recently',
  'first',
  'occasion',
  'told',
  'cross',
  'legs',
  'car',
  'might',
  'touch',
  'chair',
  'apparently',
  'encourages',
  'people',
  'sit',
  'still',
  'move',
  'car',
  'response',
  'got',
  'customer',
  'service',
  'refund',
  'second',
  'time',
  'dental',
  'appointment',
  'ordered',
  'cab',
  'driver',
  'came',
  'said',
  'going',
  'soon',
  'found',
  'location',
  'got',
  'frustrated',
  'swore',
  'said',
  'far',
  'man',
  'tired',
  'asked',
  'accepted',
  'first',
  'place',
  'want',
  'go',
  'far',
  'get',
  'proper',
  'explanation',
  'apology',
  'pain',
  'dental',
  'extraction',
  'cancel',
  'end',
  'said',
  'said',
  'would',
  'text',
  'let',
  'know',
  'messaged',
  'days',
  'ago',
  'still',
  'reply',
  'charged',
  'trip',
  'happen',
  'terrible',
  'terrible',
  'drivers',
  'bad',
  'customer',
  'service',
  'incidents',
  'like',
  'happen

In [49]:
list_of_typo_words = []
for sentence in lists_of_words_no_stops:
#     print(sentence)
    for word in sentence:
        if spell.correction(word) != word:
            print(word)
            print(spell.correction(word))
            list_of_typo_words.append(word)

xl
al
beckenham
beckham
cctv
cut
cctv
cut
deivers
drivers
fumingg
fuming
hru
thru
cornerr
corner
faq
far
kapten
kaiten
malta
malt
ffs
ifs
llocks
blocks
linkedin
linkedln
xx
ex
teip
trip
touran
toucan
tunis
tunes
gps
gas
viavan
vivian
ms
is
gps
gas
gps
gas
darkmode
darmody
usability
ability
gps
gas
viavan
vivian
postcodes
postcode
viavan
vivian
kapten
kaiten
gps
gas
slovakia
slovakian
kapten
kaiten
badd
bad
malta
malt
funke
funky
ms
is
funke
funky
meriza
media
londoner
londoners
btw
bow
gps
gas
аа
i
не
i
mr
my
masimba
marimba
pls
plus
befre
before
kapten
kaiten
xl
al
amaizing
amazing
kaunas
saunas
km
am
krakow
kraken
reaaly
really
latvia
latvian
ppl
pal
kapten
kaiten
gps
gas
gatwick
patrick
xs
is
xl
al
covid
couid
drivera
driver
kapten
kaiten
sh
so
ft
it
ft
it
ft
it
bmw
bow
tfl
til
sms
sums
unsubscribed
subscribed
uninstalled
installed
viavan
vivian
bonuss
bonus
nicee
nice
gps
gas
hrs
his
coudnt
count
happned
happened
muqtar
guitar
moalin
moulin
xl
al
xl
al
desaster
disaster
kaunas
saun

In [50]:
list_of_typo_words

['xl',
 'beckenham',
 'cctv',
 'cctv',
 'deivers',
 'fumingg',
 'hru',
 'cornerr',
 'faq',
 'kapten',
 'malta',
 'ffs',
 'llocks',
 'linkedin',
 'xx',
 'teip',
 'touran',
 'tunis',
 'gps',
 'viavan',
 'ms',
 'gps',
 'gps',
 'darkmode',
 'usability',
 'gps',
 'viavan',
 'postcodes',
 'viavan',
 'kapten',
 'gps',
 'slovakia',
 'kapten',
 'badd',
 'malta',
 'funke',
 'ms',
 'funke',
 'meriza',
 'londoner',
 'btw',
 'gps',
 'аа',
 'не',
 'mr',
 'masimba',
 'pls',
 'befre',
 'kapten',
 'xl',
 'amaizing',
 'kaunas',
 'km',
 'krakow',
 'reaaly',
 'latvia',
 'ppl',
 'kapten',
 'gps',
 'gatwick',
 'xs',
 'xl',
 'covid',
 'drivera',
 'kapten',
 'sh',
 'ft',
 'ft',
 'ft',
 'bmw',
 'tfl',
 'sms',
 'unsubscribed',
 'uninstalled',
 'viavan',
 'bonuss',
 'nicee',
 'gps',
 'hrs',
 'coudnt',
 'happned',
 'muqtar',
 'moalin',
 'xl',
 'xl',
 'desaster',
 'kaunas',
 'lyftt',
 'cs',
 'sms',
 'tallinn',
 'jdexgn',
 'cascais',
 'cascais',
 'mobey',
 'optionally',
 'cooll',
 'protus',
 'txfy',
 'huf',
 'huf',

# Via

## Read Input

In [23]:
df = pd.read_csv(config['csv_input_local']['via_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 4265 

Total unique users : 4145
Total users who gave multiple reviews: 120

Average rating for this app based on the textual reviews: 3.63 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [24]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 25.4 s


In [25]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 4265 

Total reviews (AFTER): 3962 

Total Non-English reviews: 303 

Wall time: 2.96 ms


In [28]:
df_p1.to_csv(config['csv_input_local']['via_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [107]:
df_p2 = pd.read_csv(config['csv_input_local']['via_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Joshua Clark,I don't know what kind of GPS Via uses but it ...,3,2020-11-28 23:42:03
1,Cynthia Parkhouse,This app is getting worse and worse. Today I r...,1,2020-11-25 05:32:23
2,Michael Andrade,"Very reliable, relaxing and safe.. Get to go w...",5,2020-12-03 01:02:01
3,JohnnyWishbone420,"Very affordable and very convenient, except on...",4,2020-11-06 16:55:11
4,David Dooley,Great alternative at the moment. Since I curre...,5,2020-11-09 17:02:18
...,...,...,...,...
1858,Yuzzz12,"I love using via, it is so nice to know that a...",5,2015-07-23 14:21:13
1859,Sabrina1witch,I used Via for the first time a couple of days...,5,2015-04-27 20:15:57
1860,amish25,This is a great service. If you want to test ...,5,2015-04-26 11:02:04
1861,Ab111186,"This is a great service, use it daily. Always ...",5,2015-02-26 12:51:17


### Filtering Out Uninformative Reviews

In [108]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "5_via" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\5_via.csv
Vocabulary size for 5_via : 2278
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 1863




In [109]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9366557192644149
Number of informative reviews: 1417
Number of uninformative reviews: 446
Wall time: 79.8 ms


In [112]:
# for i in range(0, len(informRev)):
#     print(informRev[i].id)
#     print(informRev[i].id - (len(trainSet) + len(testSet)))
#     informRev[i].printSelf()
#     print('\n')

In [113]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
#     print(uninformRev[i].id)
#     print(uninformRev[i].id - (len(trainSet) + len(testSet)))
#     uninformRev[i].printSelf()
#     print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

In [114]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 1863 

Total reviews (AFTER): 1417 

Total removed reviews: 446 

Wall time: 1.37 ms


In [115]:
df_p3.to_csv(config['csv_input_local']['via_apple_google_p3'])

# GetAround

## Read Input

In [36]:
df = pd.read_csv(config['csv_input_local']['getaround_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 3219 

Total unique users : 3183
Total users who gave multiple reviews: 36

Average rating for this app based on the textual reviews: 3.3 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [37]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 21.2 s


In [38]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 3219 

Total reviews (AFTER): 894 

Total Non-English reviews: 2325 

Wall time: 3.99 ms


In [39]:
df_p1.to_csv(config['csv_input_local']['getaround_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [82]:
df_p2 = pd.read_csv(config['csv_input_local']['getaround_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Eugene Attalah,Horrendous service. Had a nightmare with the c...,1,2020-11-16 21:20:53
1,Ludovica Aquino,"Useful and environmental friendly service, eas...",5,2020-10-25 21:19:14
2,Ant Bailey,"Great app, so easy to use and my number one go...",5,2020-10-12 10:51:57
3,Raul Xavier,Often have few options of cars. Very low rate ...,1,2020-09-27 02:19:28
4,Pathik Bhatt,Great concept to reduce car ownership.,5,2020-10-22 19:37:29
...,...,...,...,...
360,gmock78,I can’t believe that nowadays an app has payme...,1,2019-04-12 11:12:06
361,j.rojas79,The first car I rented had a problem with the ...,1,2018-07-01 14:01:56
362,Mr_GVB,The information can be a little confusing the ...,1,2018-04-21 07:19:30
363,Rachael198036,"Love drivy, very secure and safe little money ...",5,2017-02-16 14:23:15


### Filtering Out Uninformative Reviews

In [89]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "6_getaround" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\6_getaround.csv
Vocabulary size for 6_getaround : 1856
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 365




In [10]:
# for i in range(0, len(unlabelSet)):
#     print(unlabelSet[i].id)

In [85]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9366149110978086
Number of informative reviews: 249
Number of uninformative reviews: 116
Wall time: 50.5 ms


In [86]:
for i in range(0, len(informRev)):
    print(informRev[i].id)
    print(informRev[i].id - (len(trainSet) + len(testSet)))
    informRev[i].printSelf()
    print('\n')

3514
0
Review id: 3514 Rating: 1 Content: horrend servic nightmar car book steer clear servic Ntokens: 8 TS:  Group:  Prob: 1.0 label: 1
Raw text: horrendous service had a nightmare with the car i booked steer clear of this service


3515
1
Review id: 3515 Rating: 5 Content: use environment friendli servic easi regist good price Ntokens: 8 TS:  Group:  Prob: 1.0 label: 1
Raw text: useful and environmental friendly service easy to register to good prices


3516
2
Review id: 3516 Rating: 5 Content: great app easi use number one go car hire Ntokens: 9 TS:  Group:  Prob: 1.0 label: 1
Raw text: great app so easy to use and my number one go to for car hire


3517
3
Review id: 3517 Rating: 1 Content: often option car low rate accept awar self servic car sourc mani possibl wors greedi getaround team seem ethic compromis till moment commit client recommend need use app alway ask receipt fuel fill take pictur mileag devic measur occasion discrep overcharg Ntokens: 41 TS:  Group:  Prob: 1.0 label

In [87]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3518
4
Review id: 3518 Rating: 5 Content: great concept reduc car ownership Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: great concept to reduce car ownership


3527
13
Review id: 3527 Rating: 5 Content: great app everyth need search get inform follow progress rental Ntokens: 10 TS:  Group:  Prob: 0 label: 0
Raw text: great app it has everything you need to search get information and follow the progress of your rentals


3537
23
Review id: 3537 Rating: 5 Content: great app recommend use rent car Ntokens: 6 TS:  Group:  Prob: 0 label: 0
Raw text: great app recommend to use for renting cars


3539
25
Review id: 3539 Rating: 5 Content: nice app use lot day trip around Ntokens: 7 TS:  Group:  Prob: 0 label: 0
Raw text: nice app use it a lot for day trips around munich


3546
32
Review id: 3546 Rating: 5 Content: great app good servic offer Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: great app good service offered


3571
57
Review id: 3571 Rating: 5 Content: super easi use app 

In [88]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 365 

Total reviews (AFTER): 249 

Total removed reviews: 116 

Wall time: 894 µs


In [91]:
df_p3.to_csv(config['csv_input_local']['getaround_apple_google_p3'])

# OlaCabs

## Read Input

In [40]:
df = pd.read_csv(config['csv_input_local']['olacabs_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 10922 

Total unique users : 10392
Total users who gave multiple reviews: 530

Average rating for this app based on the textual reviews: 1.6 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [41]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 1min 1s


In [42]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 10922 

Total reviews (AFTER): 10875 

Total Non-English reviews: 47 

Wall time: 3.99 ms


In [43]:
df_p1.to_csv(config['csv_input_local']['olacabs_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [148]:
df_p2 = pd.read_csv(config['csv_input_local']['olacabs_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,ritesh pilare,"This app sucks!!! Don't use Ola, go and use Ub...",1,2019-06-26 16:48:30
1,Swaronika Kar,"They should understand, when people in very ne...",1,2020-06-08 09:13:59
2,Khushi Arora,today I used ola to travel 21km (it showed 354...,1,2019-04-21 19:57:43
3,Mukunda Priyanka,"this one star is also waste for this, I booked...",1,2019-01-23 19:17:39
4,Vatsala Singh,Very horrible service. Payment mode sucks. Cas...,1,2019-11-12 17:47:31
...,...,...,...,...
3761,FMPrakash,Since the latest update at least. Very disappo...,2,2015-03-23 15:15:00
3762,Zenarik,"Bad..! Not showing outstation booking option,",1,2017-10-26 06:27:52
3763,Annaa1234556788,Really good,5,2017-09-29 06:09:19
3764,pńmpńô,"The drivers should be trained , they are misbe...",1,2019-05-22 11:28:32


### Filtering Out Uninformative Reviews

In [149]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "7_olacabs" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\7_olacabs.csv
Vocabulary size for 7_olacabs : 2970
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 3766




In [150]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9356324176029216
Number of informative reviews: 3573
Number of uninformative reviews: 193
Wall time: 189 ms


In [151]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3564
50
Review id: 3564 Rating: 5 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very good service


3571
57
Review id: 3571 Rating: 5 Content: nice easi use Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: nice easy to use


3596
82
Review id: 3596 Rating: 5 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very good service


3616
102
Review id: 3616 Rating: 3 Content: ola auto difficult book cancel Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: ola auto is difficult to book they are cancelling again and again


3626
112
Review id: 3626 Rating: 4 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very good service


3630
116
Review id: 3630 Rating: 5 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very good service


3634
120
Review id: 3634 Rating: 1 Content: worst experi mobil app abl complaint emerg Ntokens: 7 TS:  Group:  Prob: 0 label: 0
Raw text: worst experience with this mobile a

In [152]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 3766 

Total reviews (AFTER): 3573 

Total removed reviews: 193 

Wall time: 2 ms


In [153]:
df_p3.to_csv(config['csv_input_local']['olacabs_apple_google_p3'])

# Taxi.eu

## Read Input

In [44]:
df = pd.read_csv(config['csv_input_local']['taxieu_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 775 

Total unique users : 754
Total users who gave multiple reviews: 21

Average rating for this app based on the textual reviews: 3.32 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [45]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 7.11 s


In [46]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 775 

Total reviews (AFTER): 220 

Total Non-English reviews: 555 

Wall time: 3.99 ms


In [47]:
df_p1.to_csv(config['csv_input_local']['taxieu_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [118]:
df_p2 = pd.read_csv(config['csv_input_local']['taxieu_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Louis-philippe Marier,"The worst app i ever saw. Seriously, between, ...",1,2020-03-07 20:42:30
1,aaron moon,horrible app you cant even tell if the taxi yo...,1,2019-08-21 10:21:08
2,Mark Smith,This is a terrible app! The pick-up point flic...,1,2019-04-10 00:01:50
3,Ashkan Alikhani,This is so stupid. I had a trip from somewhere...,1,2020-04-06 14:50:14
4,AnthonyB87,Useless app doesn't properly work. Unfriendly ...,1,2020-02-14 10:07:37
...,...,...,...,...
67,Andrea__ __ __,Improve the map! The design could be improved,4,2016-12-19 21:19:28
68,_ indiepercui,Working good and realistic about times. Very g...,4,2017-05-28 07:47:48
69,spekulatius123,Unfortunately allows only to order “Business” ...,3,2019-10-13 19:57:15
70,ghasimir,"Terrible app, bad service",1,2018-02-03 00:47:40


### Filtering Out Uninformative Reviews

In [119]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "8_taxieu" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\8_taxieu.csv
Vocabulary size for 8_taxieu : 1744
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 72




In [120]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9366149110978086
Number of informative reviews: 43
Number of uninformative reviews: 29
Wall time: 48.9 ms


In [121]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3526
12
Review id: 3526 Rating: 1 Content: mani bug app terribl Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: there are so many bugs this app ia terrible


3527
13
Review id: 3527 Rating: 3 Content: ok Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: it s ok


3528
14
Review id: 3528 Rating: 4 Content: love Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: loved it


3529
15
Review id: 3529 Rating: 1 Content: sm verif work order taxi app useless Ntokens: 7 TS:  Group:  Prob: 0 label: 0
Raw text: sms verification does not work which is a prerequisite for ordering a taxi so the app is useless


3533
19
Review id: 3533 Rating: 1 Content: rubbish applic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: rubbish application


3537
23
Review id: 3537 Rating: 1 Content: singl worst app encount cannot locat address ui horribl level Ntokens: 10 TS:  Group:  Prob: 0 label: 0
Raw text: this is the single worst app i have encountered cannot locate the address i was at the ui is horrible on

In [98]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 1261 

Total reviews (AFTER): 622 

Total removed reviews: 639 

Wall time: 2 ms


In [122]:
df_p3.to_csv(config['csv_input_local']['taxieu_apple_google_p3'])

# FreeNow

## Read Input

In [48]:
df = pd.read_csv(config['csv_input_local']['freenow_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 25428 

Total unique users : 24564
Total users who gave multiple reviews: 864

Average rating for this app based on the textual reviews: 3.52 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [49]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 3min


In [50]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 25428 

Total reviews (AFTER): 10939 

Total Non-English reviews: 14489 

Wall time: 5.96 ms


In [51]:
df_p1.to_csv(config['csv_input_local']['freenow_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [154]:
df_p2 = pd.read_csv(config['csv_input_local']['freenow_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Danissima,Awful experience... I don't know if there were...,1,2020-12-07 10:39:05
1,Caoimhe Cray,This app is awful. It won't open on my phone a...,1,2020-12-09 16:43:12
2,Darrell Elliott,Updated my review to 1 from 3 stars because it...,1,2020-12-13 12:33:28
3,Charles Bean,"Having terrible problems with the app lately, ...",1,2020-12-20 01:21:40
4,nck,"Useless, I made a booking for the next morning...",1,2020-12-13 05:48:05
...,...,...,...,...
4280,Kinmadrid,Love it! Easy and efficient!,5,2014-01-13 00:20:05
4281,cstockmans,"When it works, it's great, and it usually does...",3,2013-11-14 12:09:19
4282,Cycocarle,Works perfectly ... using this app since 6 mon...,5,2013-07-10 12:48:21
4283,Humanona,Very good app. Easy and useful,5,2013-01-24 22:17:06


### Filtering Out Uninformative Reviews

In [155]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "9_freenow" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\9_freenow.csv
Vocabulary size for 9_freenow : 2491
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 4285




In [156]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.9381928977975749
Number of informative reviews: 2728
Number of uninformative reviews: 1557
Wall time: 113 ms


In [157]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3541
27
Review id: 3541 Rating: 3 Content: pre book problem app allow prebook app work Ntokens: 8 TS:  Group:  Prob: 0 label: 0
Raw text: why can t you pre book i have a problem with the app not allowing me to prebook the app does not work for this


3544
30
Review id: 3544 Rating: 1 Content: stupid app rebook okay otherwis straight forward way prebook make life easier wors Ntokens: 13 TS:  Group:  Prob: 0 label: 0
Raw text: stupid app for rebookings okay otherwise no straight forward way of prebooking should make life easier not worse


3548
34
Review id: 3548 Rating: 1 Content: book cab complet wrong address use postcod thought london idea useless Ntokens: 11 TS:  Group:  Prob: 0 label: 0
Raw text: booked a cab and they had the complete wrong address i had used my postcode so how they thought i was incentral london and not surrey i have no idea useless


3583
69
Review id: 3583 Rating: 4 Content: good moment could user friendli though Ntokens: 6 TS:  Group:  Prob: 0 label: 0
Raw text



5453
1939
Review id: 5453 Rating: 5 Content: good app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: good app


5455
1941
Review id: 5455 Rating: 5 Content: great servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: great service


5456
1942
Review id: 5456 Rating: 5 Content: brilliant Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: brilliant


5457
1943
Review id: 5457 Rating: 4 Content: great Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: great


5458
1944
Review id: 5458 Rating: 5 Content: great Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: great


5460
1946
Review id: 5460 Rating: 5 Content: good Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: very good


5461
1947
Review id: 5461 Rating: 5 Content: fantast app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: fantastic app


5462
1948
Review id: 5462 Rating: 5 Content: great Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: great


5463
1949
Review id: 5463 Rating: 5 Content: great Ntokens: 1 TS:  Group

Review id: 6869 Rating: 5 Content: great Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: great


6870
3356
Review id: 6870 Rating: 5 Content: citi one speak english fantast Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: in a city where no one speaks english this is fantastic


6878
3364
Review id: 6878 Rating: 3 Content: submit order chang detail Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: once you submit an order you can t change any details


6880
3366
Review id: 6880 Rating: 4 Content: good function bit slow Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: good functionality but a bit slow


6885
3371
Review id: 6885 Rating: 5 Content: awesom servic everi german citi Ntokens: 5 TS:  Group:  Prob: 0 label: 0
Raw text: awesome service in every german city


6887
3373
Review id: 6887 Rating: 4 Content: work plu discount good Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: works plus discount is good


6889
3375
Review id: 6889 Rating: 5 Content: great like Ntokens: 2

In [158]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 4285 

Total reviews (AFTER): 2728 

Total removed reviews: 1557 

Wall time: 2.99 ms


In [159]:
df_p3.to_csv(config['csv_input_local']['freenow_apple_google_p3'])

# YandexGo

## Read Input

In [52]:
df = pd.read_csv(config['csv_input_local']['yandexgo_apple_google'], index_col=0)
df = df.reset_index(drop=True)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 7224 

Total unique users : 6797
Total users who gave multiple reviews: 427

Average rating for this app based on the textual reviews: 3.25 



## Preprocessing Input Data

### Remove Non-English User Reviews

In [53]:
%%time
listOfNonEnglishIndex = []

for i in range(0, len(df)):
    reviewText = df['review'][i]
    
#     # for debugging purpose
#     print(reviewText)
#     print(isEnglishReview(reviewText))
#     print('\n')
    
    isEnglish, listToStr, english_score = isEnglishReview(reviewText)
    if isEnglish == False:
        listOfNonEnglishIndex.append(i)

Wall time: 55.4 s


In [54]:
%%time
df_p1 = df.drop(df.index[listOfNonEnglishIndex])
total_reviews_before = len(df)
total_reviews_after = len(df_p1)
total_non_english_reviews = len(listOfNonEnglishIndex)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total Non-English reviews: {total_non_english_reviews} \n')

Total reviews (BEFORE): 7224 

Total reviews (AFTER): 2888 

Total Non-English reviews: 4336 

Wall time: 4.99 ms


In [55]:
df_p1.to_csv(config['csv_input_local']['yandexgo_apple_google_p1'])

### Filtering Out Inconsistent User Review

- Because of an issue in SentiStrength path setting for Windows 10, I can't run it here
- Will able to run it via Linux or macOS

In [124]:
df_p2 = pd.read_csv(config['csv_input_local']['yandexgo_apple_google_p2'], index_col=0)
df_p2 = df_p2.reset_index(drop=True)
df_p2

Unnamed: 0,userName,review,rating,date
0,Dumb Phone,OK in the beginning but now gets worse with ev...,1,2020-12-04 10:38:27
1,69 Paths,"Great app, but there are a few map delays.",4,2020-12-16 04:33:34
2,Alexandra Vigand,All good. One suggestion - to add the option t...,5,2020-10-29 22:02:44
3,Andrei Braghiş,List of partners does not load. After an unple...,1,2020-10-21 21:37:30
4,I Am Me,"Great taxi service app, really simple. Also yo...",5,2020-10-25 22:16:23
...,...,...,...,...
1256,Mihkelp,"It’s just terrible app, doesn’t function prope...",1,2018-05-04 20:40:34
1257,Nikolas4553,Very shameful navigation system.,1,2020-06-12 19:10:34
1258,lalalaaalalaa,taxis are always late without exception. drive...,1,2020-11-07 15:30:33
1259,Av1K,"Cant activate. Only 3 language , not 16 langua...",1,2018-05-02 00:50:25


### Filtering Out Uninformative Reviews

In [125]:
%run ./AR_Miner/AR_util.py
%run ./AR_Miner/AR_reviewInstance.py

# Inputs:
datasetName = "10_yandexgo" # four apps: facebook, templerun2, swiftkey, tapfish:
# datasetName = "templerun2" # four apps: facebook, templerun2, swiftkey, tapfish
rmStopWords = True # Removing stop words lead to information loss and bad f-score
rmRareWords = True # Remove the word with low frequency
skParse = False # set skParse True to directly read of the data that has been filtered out

# Outputs:
if(skParse == False):
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

print('\n')

./datasets/_thesis/trainU\10_yandexgo.csv
Vocabulary size for 10_yandexgo : 1974
Training set Size: 1447
Testing set Size: 2067
Unlabeling set Size: 1261




In [126]:
%%time
%run ./AR_Miner/AR_classifier.py

useSVM = True # SVM is way better than EMNB in the testing
if(skParse == False):
    if(useSVM == False):
#         informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
#         informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
        informRev, uninformRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    # write the result back to the file (optional)
    # AR_writeReviews(informRev, datasetName)
    
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))
print("Number of uninformative reviews: " + str(len(uninformRev)))

Average F-Score for the test data: 0.93768020670961
Number of informative reviews: 622
Number of uninformative reviews: 639
Wall time: 62.8 ms


In [127]:
for i in range(0, len(informRev)):
    print(informRev[i].id)
    print(informRev[i].id - (len(trainSet) + len(testSet)))
    informRev[i].printSelf()
    print('\n')

3514
0
Review id: 3514 Rating: 1 Content: ok begin get wors everi updat use last time final ask want go send taxi instead wast time wast driver time text tri cancel went nowher year old easier carri groceri kilomet supermarket deal app Ntokens: 35 TS:  Group:  Prob: 1.0 label: 1
Raw text: ok in the beginning but now gets worse with every update and use this last time and the final it asks me where i want to go and sends the taxi there instead of to where i am waste of my time waste of driver s time 5 texts trying to cancel went nowhere at 70 years old it s easier to carry my groceries the kilometers from the supermarket than deal with this app


3515
1
Review id: 3515 Rating: 4 Content: great app map delay Ntokens: 4 TS:  Group:  Prob: 1.0 label: 1
Raw text: great app but there are a few map delays


3516
2
Review id: 3516 Rating: 5 Content: good one suggest add option choos femal driver person would will wait longer femal driver feel secur especi night Ntokens: 19 TS:  Group:  Prob: 1

In [128]:
listOfRemovedIndex_p3 = []

for i in range(0, len(uninformRev)):
    print(uninformRev[i].id)
    print(uninformRev[i].id - (len(trainSet) + len(testSet)))
    uninformRev[i].printSelf()
    print('\n')
    
    idxToRemove = uninformRev[i].id - (len(trainSet) + len(testSet))
    listOfRemovedIndex_p3.append(idxToRemove)

3533
19
Review id: 3533 Rating: 5 Content: app amaz navig map accuraci kazakhstan Ntokens: 6 TS:  Group:  Prob: 0 label: 0
Raw text: solid app with amazing navigation and map accuracy in kazakhstan


3548
34
Review id: 3548 Rating: 5 Content: app work pretti good Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: outstandin app it works pretty good


3558
44
Review id: 3558 Rating: 5 Content: good servic Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: it s a good service


3562
48
Review id: 3562 Rating: 5 Content: favourit app awesom Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: this is my favourite app just awesome


3569
55
Review id: 3569 Rating: 5 Content: work fine exactli expect Ntokens: 4 TS:  Group:  Prob: 0 label: 0
Raw text: works just fine exactly as expected


3570
56
Review id: 3570 Rating: 5 Content: good trip Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: it was good trip


3587
73
Review id: 3587 Rating: 5 Content: good app Ntokens: 2 TS:  Group:  Prob: 0 l

4645
1131
Review id: 4645 Rating: 5 Content: amaz taxi servic Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: amazing taxi service


4648
1134
Review id: 4648 Rating: 5 Content: excel app thank Ntokens: 3 TS:  Group:  Prob: 0 label: 0
Raw text: excellent app thanks


4649
1135
Review id: 4649 Rating: 5 Content: nice app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very nice app


4654
1140
Review id: 4654 Rating: 5 Content: best russian Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: best and russian


4656
1142
Review id: 4656 Rating: 5 Content: awesom app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: awesome app


4657
1143
Review id: 4657 Rating: 5 Content: nice app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: very nice app


4658
1144
Review id: 4658 Rating: 5 Content: great app Ntokens: 2 TS:  Group:  Prob: 0 label: 0
Raw text: great app


4659
1145
Review id: 4659 Rating: 5 Content: love Ntokens: 1 TS:  Group:  Prob: 0 label: 0
Raw text: love it


4661
1147

In [129]:
%%time
df_p3 = df_p2.drop(df_p2.index[listOfRemovedIndex_p3])
total_reviews_before = len(df_p2)
total_reviews_after = len(df_p3)
total_removed_reviews = len(listOfRemovedIndex_p3)

print(f'Total reviews (BEFORE): {total_reviews_before} \n')
print(f'Total reviews (AFTER): {total_reviews_after} \n')
print(f'Total removed reviews: {total_removed_reviews} \n')

Total reviews (BEFORE): 1261 

Total reviews (AFTER): 622 

Total removed reviews: 639 

Wall time: 2.12 ms


In [130]:
df_p3.to_csv(config['csv_input_local']['yandexgo_apple_google_p3'])