# References

# Modified from KEFE's preprocess_review.py

In [3]:
from utils import *
from pprint import pprint

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

# Let's Get Started

## Bolt (Both Apple App Store and Google Play Store) after non-english remove

In [7]:
df = pd.read_csv(config['csv_input_local']['bolt_apple_google_p1'], index_col = 0)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
unknown_users = len(df[df['userName']=='A Google user'])
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total unknown users: {unknown_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users - unknown_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 40365 

Total unique users : 37132
Total unknown users: 3076
Total users who gave multiple reviews: 157

Average rating for this app based on the textual reviews: 3.75 



In [12]:
reviews = df.review
reviews

0        The first thing I noticed is that you can't pr...
1        Your GPS setting around Cape Town Internationa...
2        I was invited by my friend and was given a £10...
3        Your drivers are great BUT your support is no ...
4        Loving the app, but can only give it a medium ...
                               ...                        
54992    Its the 5th time Im saying, But When Will ther...
54998                                        Just love it!
55022                                           Great app!
55029                           Super convenient and fast!
55060    Taxify's purpose was that you can CHOOSE your ...
Name: review, Length: 40365, dtype: object

In [14]:
cleaned_docs = remove_things(reviews)

In [16]:
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

In [17]:
ngrams = make_bigrams(lists_of_words_no_stops)

Making bigrams...


In [18]:
data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

Lemmatizing...


In [19]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1)]]


In [20]:
id2word[0]

'app'

In [21]:
[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:1]]


[[('app', 1),
  ('book', 2),
  ('booking', 1),
  ('catch', 1),
  ('driver', 1),
  ('find', 1),
  ('flight', 1),
  ('get_rid', 1),
  ('pm', 1),
  ('prenook', 1),
  ('service', 1),
  ('thing', 1),
  ('today', 2),
  ('trip', 1),
  ('went', 1)]]

In [22]:
tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
tf_idf[0]

[(0, 0.0718599828705696),
 (1, 0.4035405860157316),
 (2, 0.22527784626751088),
 (3, 0.24961393511563068),
 (4, 0.05494939342285676),
 (5, 0.24502964094121052),
 (6, 0.25693320719331314),
 (7, 0.3251404142739193),
 (8, 0.30240467858038395),
 (9, 0.3478761499674547),
 (10, 0.09286896551076561),
 (11, 0.15435519134265113),
 (12, 0.3240264006716498),
 (13, 0.11496300146374226),
 (14, 0.3478761499674547)]

In [23]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('app', 0.0718599828705696),
  ('book', 0.4035405860157316),
  ('booking', 0.22527784626751088),
  ('catch', 0.24961393511563068),
  ('driver', 0.05494939342285676),
  ('find', 0.24502964094121052),
  ('flight', 0.25693320719331314),
  ('get_rid', 0.3251404142739193),
  ('pm', 0.30240467858038395),
  ('prenook', 0.3478761499674547),
  ('service', 0.09286896551076561),
  ('thing', 0.15435519134265113),
  ('today', 0.3240264006716498),
  ('trip', 0.11496300146374226),
  ('went', 0.3478761499674547)]]

# Save pre-processed data into binary Pickle file

In [24]:
import pickle

output_path = 'preprocessed_data/bolt/v2/'

## data_lemmatized

In [25]:
with open(output_path + 'data_lemmatized.pkl', 'wb') as f:
    pickle.dump(data_lemmatized, f)

In [28]:
df = pd.read_pickle(output_path + 'data_lemmatized.pkl')
df

[['thing',
  'prenook',
  'trip',
  'catch',
  'flight',
  'pm',
  'today',
  'booking',
  'went',
  'book',
  'find',
  'driver',
  'book',
  'service',
  'get_rid',
  'app',
  'today'],
 ['need', 'section', 'airport', 'spot', 'number', 'app'],
 ['friend',
  'use',
  'would_allow',
  'journey',
  'support',
  'customer',
  'promotion',
  'contact',
  'team',
  'reply'],
 ['driver',
  'support',
  'computer',
  'response',
  'case',
  'response',
  'support',
  'discount',
  'account',
  'thought',
  'client'],
 ['rating',
  'moment',
  'driver',
  'rating',
  'trip',
  'driver',
  'apartment',
  'ride',
  'drive',
  'destination'],
 ['price', 'try', 'sister', 'invitation', 'code', 'code', 'reply'],
 ['promotion',
  'price',
  'driver',
  'rider',
  'price',
  'change',
  'driver',
  'destination',
  'rider',
  'thank',
  'fare',
  'taxi',
  'pay'],
 ['app',
  'price_range',
  'trip',
  'getting_destination',
  'amount',
  'rate',
  'app',
  'traffic',
  'delay',
  'abeg'],
 ['price',


## dictionary

In [29]:
with open(output_path + 'dictionary.pkl', 'wb') as f:
    pickle.dump(id2word, f)

In [30]:
import pandas as pd
id2word = pd.read_pickle(output_path + 'dictionary.pkl')

[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:3]]

[[('app', 1),
  ('book', 2),
  ('booking', 1),
  ('catch', 1),
  ('driver', 1),
  ('find', 1),
  ('flight', 1),
  ('get_rid', 1),
  ('pm', 1),
  ('prenook', 1),
  ('service', 1),
  ('thing', 1),
  ('today', 2),
  ('trip', 1),
  ('went', 1)],
 [('app', 1),
  ('airport', 1),
  ('need', 1),
  ('number', 1),
  ('section', 1),
  ('spot', 1)],
 [('contact', 1),
  ('customer', 1),
  ('friend', 1),
  ('journey', 1),
  ('promotion', 1),
  ('reply', 1),
  ('support', 1),
  ('team', 1),
  ('use', 1),
  ('would_allow', 1)]]

## term_doc

In [31]:
with open(output_path + 'term_doc.pkl', 'wb') as f:
    pickle.dump(term_doc, f)

In [32]:
import pandas as pd
term_doc = pd.read_pickle(output_path + 'term_doc.pkl')
term_doc

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 2),
  (13, 1),
  (14, 1)],
 [(0, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(4, 1),
  (26, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1)],
 [(4, 2), (13, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1)],
 [(25, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1)],
 [(4, 2),
  (24, 1),
  (38, 1),
  (45, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1)],
 [(0, 2),
  (13, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1)],
 [(10, 1), (13, 1), (26, 1), (27, 1), (45, 1), (61, 1), (62, 1), (63, 1)],
 [(4, 1), (42, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)],
 [(21, 1),
  (35, 1),
  (55, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),

## tf_idf

In [33]:
with open(output_path + 'tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

In [34]:
import pandas as pd
tf_idf = pd.read_pickle(output_path + 'tf_idf.pkl')
tf_idf

<gensim.interfaces.TransformedCorpus at 0x1f103c264c0>

In [35]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('app', 0.0718599828705696),
  ('book', 0.4035405860157316),
  ('booking', 0.22527784626751088),
  ('catch', 0.24961393511563068),
  ('driver', 0.05494939342285676),
  ('find', 0.24502964094121052),
  ('flight', 0.25693320719331314),
  ('get_rid', 0.3251404142739193),
  ('pm', 0.30240467858038395),
  ('prenook', 0.3478761499674547),
  ('service', 0.09286896551076561),
  ('thing', 0.15435519134265113),
  ('today', 0.3240264006716498),
  ('trip', 0.11496300146374226),
  ('went', 0.3478761499674547)]]