# References

# Modified from KEFE's preprocess_review.py

In [1]:
!python --version
!pwd

Python 3.7.6
/Users/enlik/GitRepo/master-thesis-2021/notebooks


In [2]:
from utils import *
from pprint import pprint

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

config = get_config('config.yaml')

[nltk_data] Downloading package stopwords to /Users/enlik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Let's Get Started

## Combine All Reviews

### Read from Local Machine

In [3]:
df_merged = pd.Series()

In [4]:
for i in config['csv_input_local']:
    if 'apple' in i:
        print('apple')
        df = pd.read_csv(config['csv_input_local'][i], index_col=0)
        new = df.review.astype(str)
        total_reviews = len(new)
        print(f'Total {i} reviews: {total_reviews} \n')
    elif 'google' in i:
        print('google')
        df = pd.read_csv(config['csv_input_local'][i], index_col=0)
        new = df.content.astype(str)
        total_reviews = len(new)
        print(f'Total {i} reviews: {total_reviews} \n')
    else:
        print("Oops!  That was no valid input.  Try again...")
        
    df_merged = df_merged.append(new, ignore_index=True)

apple
Total bolt_apple reviews: 3154 

google
Total bolt_google reviews: 51907 

apple
Total uber_apple reviews: 10342 

google
Total uber_google reviews: 10000 

apple
Total blablacar_apple reviews: 23308 

google
Total blablacar_google reviews: 21172 

apple
Total cabify_apple reviews: 7384 

google
Total cabify_google reviews: 3261 

apple
Total via_apple reviews: 2392 

google
Total via_google reviews: 1873 

apple
Total getaround_apple reviews: 2488 

google
Total getaround_google reviews: 731 

apple
Total olacabs_apple reviews: 922 

google
Total olacabs_google reviews: 10000 

apple
Total taxieu_apple reviews: 564 

google
Total taxieu_google reviews: 211 

apple
Total freenow_apple reviews: 14350 

google
Total freenow_google reviews: 11078 

apple
Total yandexgo_apple reviews: 171 

google
Total yandexgo_google reviews: 7053 



### Read from Google Drive

- Google Drive has limitation after multiple access in short time that cause **HTTPError: HTTP Error 403: Forbidden**

In [5]:
# for i in config['csv_input']:
#     if 'apple' in i:
#         print('apple')
#         df = read_csv_from_gdrive(config['csv_input'][i])
#         new = df.review.astype(str)
#         total_reviews = len(df_merged)
#         print(f'Total English reviews: {total_reviews} \n')
#     elif 'google' in i:
#         print('google')
#         df = read_csv_from_gdrive(config['csv_input'][i])
#         new = df.content.astype(str)
#     else:
#         print("Oops!  That was no valid input.  Try again...")
        
#     df_merged = df_merged.append(new, ignore_index=True)

In [6]:
df_merged

0         I love bolt. I don’t use uber often because on...
1         So annoyed with this app!! Definitely the wors...
2         I’ve been using bolt for a month now. I had a ...
3         To make things clear, I am not a regular revie...
4         I have used the app 3 or 4 times and I thought...
                                ...                        
182356                                                 good
182357                                                 nice
182358                                                 Good
182359                                            Excellent
182360                                                 Good
Length: 182361, dtype: object

In [7]:
total_reviews = len(df_merged)

print(f'Total English reviews: {total_reviews} \n')

Total English reviews: 182361 



In [8]:
cleaned_docs = remove_things(df_merged)

In [9]:
lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

In [10]:
ngrams = make_bigrams(lists_of_words_no_stops)

Making bigrams...


In [11]:
data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

Lemmatizing...


In [12]:
corpora.Dictionary(data_lemmatized)

<gensim.corpora.dictionary.Dictionary at 0x154c19668>

In [13]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [14]:
id2word[0]

'ambassador'

In [15]:
[[(id2word[id], freq) for id, freq in cp] for cp in term_doc[:1]]


[[('ambassador', 1),
  ('contact', 1),
  ('driver', 2),
  ('first_time', 1),
  ('love', 1),
  ('ride', 1),
  ('throughout_journey', 1),
  ('understanding', 1)]]

In [16]:
tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
tf_idf[0]

[(0, 0.5108908012072018),
 (1, 0.2737937766067359),
 (2, 0.2195610285993868),
 (3, 0.37066305477630296),
 (4, 0.20098426367518943),
 (5, 0.14880566183194763),
 (6, 0.5108908012072018),
 (7, 0.3935507612508511)]

In [17]:
[[(id2word[id], freq) for id, freq in cp] for cp in tf_idf[:1]]

[[('ambassador', 0.5108908012072018),
  ('contact', 0.2737937766067359),
  ('driver', 0.2195610285993868),
  ('first_time', 0.37066305477630296),
  ('love', 0.20098426367518943),
  ('ride', 0.14880566183194763),
  ('throughout_journey', 0.5108908012072018),
  ('understanding', 0.3935507612508511)]]

# Save pre-processed data into binary Pickle file

In [18]:
import pickle

output_path = 'preprocessed_data/all/'

## data_lemmatized

In [19]:
with open(output_path + 'data_lemmatized.pkl', 'wb') as f:
    pickle.dump(data_lemmatized, f)

In [20]:
df = pd.read_pickle('preprocessed_data/data_lemmatized.pkl')
df

[['fare', 'guy'],
 [],
 [],
 ['uber', 'freezing', 'locating', 'form', 'payment', 'day', 'update'],
 ['uber',
  'care_customer',
  'location',
  'always_ask',
  'money',
  'credit',
  'ride',
  'time'],
 ['ride'],
 ['address', 'passenger', 'error', 'way', 'problem', 'problem'],
 [],
 ['facility',
  'cancelling_trip',
  'driver',
  'penalty',
  'money',
  'customer',
  'user',
  'trip',
  'people',
  'name',
  'reality',
  'grade',
  'service',
  'rating',
  'rate'],
 ['problem', 'way', 'comment', 'problem', 'problem', 'rating', 'offer', 'way'],
 ['ride', 'request', 'minute', 'car', 'rate', 'bit', 'lyft', 'second'],
 ['facility'],
 ['taxi',
  'dollar',
  'taxi',
  'cancelling',
  'price',
  'price',
  'pay',
  'dollar',
  'difference'],
 ['book', 'trip', 'function'],
 ['cost', 'ride', 'ride', 'solve', 'bug'],
 ['user', 'venmo', 'credit_card', 'get'],
 ['thank', 'service', 'uber'],
 ['pin', 'app', 'location'],
 ['complaint'],
 ['customer', 'care_customer'],
 ['total', 'paytm', 'use', 'app

## dictionary

In [21]:
with open(output_path + 'dictionary.pkl', 'wb') as f:
    pickle.dump(id2word, f)

## term_doc

In [22]:
with open(output_path + 'term_doc.pkl', 'wb') as f:
    pickle.dump(term_doc, f)

## tf_idf

In [23]:
with open(output_path + 'tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

## Save df_merged into CSV file

In [24]:
df_merged.to_csv(output_path + 'all_10_apps.csv')