# References

# Modified from KEFE's preprocess_review.py

In [1]:
from utils import *
from pprint import pprint

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enlik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Let's Get Started

## Bolt (Both Apple App Store and Google Play Store) after non-english remove

In [2]:
df = pd.read_csv(config['csv_input_local']['bolt_apple_google_p2'], index_col = 0)
total_reviews = len(df)
unique_users  = len(df['userName'].unique())
unknown_users = len(df[df['userName']=='A Google user'])
mean = df['rating'].mean()

print(f'Total English reviews: {total_reviews} \n')
print(f'Total unique users : {unique_users}')
print(f'Total unknown users: {unknown_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users - unknown_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total English reviews: 17930 

Total unique users : 16568
Total unknown users: 1327
Total users who gave multiple reviews: 35

Average rating for this app based on the textual reviews: 4.04 



In [3]:
reviews = df.review
reviews

3        Your drivers are great BUT your support is no ...
4        Loving the app, but can only give it a medium ...
10       The response time with regards to customer ser...
12       This has been happening for some time now and ...
13       The drivers almost always never have change an...
                               ...                        
40354                                           Great app!
40355                                          Great app!!
40359    The drivers regularly get the wrong address du...
40361                                        Just love it!
40362                                           Great app!
Name: review, Length: 17930, dtype: object

In [4]:
cleaned_docs = remove_things(reviews)

In [5]:
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

In [6]:
ngrams = make_bigrams(lists_of_words_no_stops)

Making bigrams...


In [7]:
data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

Lemmatizing...


In [8]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 1)]]


In [9]:
id2word[0]

'account'

In [10]:
[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:1]]


[[('account', 1),
  ('case', 1),
  ('client', 1),
  ('computer', 1),
  ('discount', 1),
  ('driver', 1),
  ('month', 1),
  ('response', 2),
  ('star', 1),
  ('support', 2),
  ('thought', 1)]]

In [11]:
tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
tf_idf[0]

[(0, 0.22378556170984032),
 (1, 0.2628730735955112),
 (2, 0.2379806007567371),
 (3, 0.3924898270511479),
 (4, 0.1895529247825992),
 (5, 0.07683776833093167),
 (6, 0.24720195072228957),
 (7, 0.44134296904185255),
 (8, 0.23749263690618527),
 (9, 0.44544696816567747),
 (10, 0.3429026248682516)]

In [12]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('account', 0.22378556170984032),
  ('case', 0.2628730735955112),
  ('client', 0.2379806007567371),
  ('computer', 0.3924898270511479),
  ('discount', 0.1895529247825992),
  ('driver', 0.07683776833093167),
  ('month', 0.24720195072228957),
  ('response', 0.44134296904185255),
  ('star', 0.23749263690618527),
  ('support', 0.44544696816567747),
  ('thought', 0.3429026248682516)]]

# Save pre-processed data into binary Pickle file

In [13]:
import pickle

output_path = 'preprocessed_data/bolt/p2/'

## data_lemmatized

In [14]:
with open(output_path + 'data_lemmatized.pkl', 'wb') as f:
    pickle.dump(data_lemmatized, f)

In [15]:
df = pd.read_pickle(output_path + 'data_lemmatized.pkl')
df

[['driver',
  'support',
  'computer',
  'response',
  'case',
  'response',
  'star',
  'support',
  'discount',
  'account',
  'month',
  'thought',
  'client'],
 ['rating',
  'moment',
  'driver',
  'rating',
  'trip',
  'driver',
  'apartment',
  'ride',
  'review',
  'drive_destination',
  'rude'],
 ['response',
  'thank',
  'reimbursement',
  'part',
  'customer',
  'promo',
  'part',
  'situation',
  'part',
  'company',
  'extra_money',
  'people',
  'card',
  'dodgy',
  'stick',
  'amount',
  'card'],
 ['time',
  'show',
  'estimate',
  'amount',
  'destination',
  'location',
  'price',
  'way',
  'uninstall',
  'rubbish',
  'app'],
 ['driver',
  'option',
  'change',
  'time',
  'transfer',
  'trust',
  'card',
  'cash',
  'please_fix',
  'problem',
  'way'],
 ['driver',
  'driver',
  'thing',
  'customer',
  'year',
  'service',
  'cause',
  'thing'],
 ['month',
  'unknown_error',
  'message',
  'driver',
  'cancel',
  'place',
  'lift',
  'fix',
  'app',
  'tomorrow'],
 ['

## dictionary

In [16]:
with open(output_path + 'dictionary.pkl', 'wb') as f:
    pickle.dump(id2word, f)

In [17]:
import pandas as pd
id2word = pd.read_pickle(output_path + 'dictionary.pkl')

[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:3]]

[[('account', 1),
  ('case', 1),
  ('client', 1),
  ('computer', 1),
  ('discount', 1),
  ('driver', 1),
  ('month', 1),
  ('response', 2),
  ('star', 1),
  ('support', 2),
  ('thought', 1)],
 [('driver', 2),
  ('apartment', 1),
  ('drive_destination', 1),
  ('moment', 1),
  ('rating', 2),
  ('review', 1),
  ('ride', 1),
  ('rude', 1),
  ('trip', 1)],
 [('response', 1),
  ('amount', 1),
  ('card', 2),
  ('company', 1),
  ('customer', 1),
  ('dodgy', 1),
  ('extra_money', 1),
  ('part', 3),
  ('people', 1),
  ('promo', 1),
  ('reimbursement', 1),
  ('situation', 1),
  ('stick', 1),
  ('thank', 1)]]

## term_doc

In [18]:
with open(output_path + 'term_doc.pkl', 'wb') as f:
    pickle.dump(term_doc, f)

In [19]:
import pandas as pd
term_doc = pd.read_pickle(output_path + 'term_doc.pkl')
term_doc

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 2),
  (10, 1)],
 [(5, 2),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 2),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(7, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 3),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1)],
 [(19, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1)],
 [(5, 1),
  (20, 1),
  (39, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1)],
 [(5, 2), (22, 1), (49, 1), (50, 1), (51, 2), (52, 1)],
 [(5, 1),
  (6, 1),
  (32, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1)],
 [(29, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 1)],
 [(5, 2), (19, 1), (69, 1), (70, 1), (71, 1)],
 [(5, 1),
  (9, 1),
  (16, 2),
  (50, 1),
  (65, 1),
  (72, 1),
  (73

## tf_idf

In [20]:
with open(output_path + 'tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

In [21]:
import pandas as pd
tf_idf = pd.read_pickle(output_path + 'tf_idf.pkl')
tf_idf

<gensim.interfaces.TransformedCorpus at 0x2672e390f70>

In [22]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('account', 0.22378556170984032),
  ('case', 0.2628730735955112),
  ('client', 0.2379806007567371),
  ('computer', 0.3924898270511479),
  ('discount', 0.1895529247825992),
  ('driver', 0.07683776833093167),
  ('month', 0.24720195072228957),
  ('response', 0.44134296904185255),
  ('star', 0.23749263690618527),
  ('support', 0.44544696816567747),
  ('thought', 0.3429026248682516)]]