In [26]:
import pandas as pd
import gzip
from functools import reduce
import pickle
import re
from joblib import Parallel, delayed
import gc
import numpy as np
from itertools import chain
from hunspell import HunSpell
from gensim.parsing.porter import PorterStemmer
from multiprocessing import Pool
from tqdm import tqdm
import os
import swifter


# Check for misspelled words
spellchecker = HunSpell('dicts_hun/en_US.dic',
                        'dicts_hun/en_US.aff')

# Stem words -> see example below
port_stem = PorterStemmer()



In [27]:
port_stem.stem('ponies')

'poni'

In [2]:
#### Pick up downloaded datasets to process
names = ['Amazon_Instant_Video', 
         'Apps_for_Android', 
         'Automotive', 
         'Baby', 
         'Beauty', 
         'Digital_Music', 
         'Grocery_and_Gourmet_Food', 
         'Health_and_Personal_Care', 
         'Home_and_Kitchen', 
         'Kindle_Store'
        ]

In [14]:
def process_data(name):
    
    ### If dataset has already been processed, just skip it
    if os.path.isfile('pickled_datasets/{}_truncated.pickle'.format(name)):
        print('Dataset file {} exists'.format(name))
        with open('pickled_datasets/{}_truncated.pickle'.format(name), 'rb') as f:
            data = pickle.load(f)
            return data
    
    json_iterator = pd.read_json('data/reviews/reviews_{}.json.gz'.format(name), orient='records', lines=True, compression='gzip')

    print('Finished reading {}'.format(name))
    

    reviews = json_iterator
    
    
    ### We only need reviweText and summary columns
    reviews.drop(reviews.columns.difference(['reviewText','summary']), 1, inplace=True)

    ### Two columns -> one columns
    col_series = reviews.summary.append(reviews.reviewText)
    
    ### Leave only letters and split by ' '
    col_series = col_series.swifter.apply(lambda x: re.sub("[^a-zA-Z ]", ' ', x.lower()))
    col_series = col_series.swifter.apply(lambda x: list(x.replace('/',' ').split(' ')))
    
    complete_list = list(col_series)
    
    del col_series

    ### Good and fast trick for this
    ### [['I', 'love', 'Machine'], ['Learning'], ['TAs', 'are'], ['the', 'best']] ->
    ##  ['I', 'love', 'Machine', 'Learning', 'TAs', 'are', 'the', 'best'] 
    words = pd.Series(list(chain.from_iterable(complete_list)))
    
    ### Leave only words with len(str) > 3, in this way we remove 'the', 'and', 'I', 'my', 'she', 'he' etc
    ind_len_words = words.str.contains('^\w{4,}$')
    words = words[ind_len_words]
    
    ### Spell check
    ind_spell_words = words.swifter.apply(spellchecker.spell)
    words = words[ind_spell_words]
    
    ### Stemming
    words = words.swifter.apply(port_stemp.stem)
        
    ### Unique
    words = words.unique()


    with open('pickled_datasets/{}_truncated.pickle'.format(name), 'wb') as f:
        pickle.dump(words, f)
        
    print('Finished {}, len={}'.format(name, len(words)))

    return words



In [None]:


all_words = []

for name in names:
    words = process_data(name)
    all_words.append(words)



Dataset file Amazon_Instant_Video exists
Dataset file Apps_for_Android exists
Finished reading Automotive


Pandas Apply: 100%|██████████| 2747536/2747536 [00:11<00:00, 240375.03it/s]
Pandas Apply: 100%|██████████| 2747536/2747536 [00:17<00:00, 158794.79it/s]
Pandas Apply: 100%|██████████| 48227146/48227146 [01:19<00:00, 607092.41it/s]
Pandas Apply: 100%|██████████| 46506047/46506047 [05:00<00:00, 154600.21it/s]


Finished Automotive, len=21115
Finished reading Baby


Pandas Apply: 100%|██████████| 1830892/1830892 [00:08<00:00, 209975.26it/s]
Pandas Apply: 100%|██████████| 1830892/1830892 [00:13<00:00, 134914.76it/s]
Pandas Apply: 100%|██████████| 42550709/42550709 [01:08<00:00, 625549.87it/s]
Pandas Apply: 100%|██████████| 41283976/41283976 [04:25<00:00, 155493.83it/s]


Finished Baby, len=19561
Finished reading Beauty


Pandas Apply: 100%|██████████| 4046140/4046140 [00:16<00:00, 242021.14it/s]
Pandas Apply: 100%|██████████| 4046140/4046140 [00:24<00:00, 166783.37it/s]
Pandas Apply: 100%|██████████| 74944267/74944267 [02:00<00:00, 622848.18it/s]
Pandas Apply: 100%|██████████| 72391791/72391791 [07:45<00:00, 155416.92it/s]


Finished Beauty, len=23030
Finished reading Digital_Music


Pandas Apply: 100%|██████████| 1672012/1672012 [00:08<00:00, 196996.30it/s]
Pandas Apply: 100%|██████████| 1672012/1672012 [00:11<00:00, 145555.33it/s]
Pandas Apply: 100%|██████████| 38604642/38604642 [01:04<00:00, 597728.10it/s]
Pandas Apply: 100%|██████████| 35898376/35898376 [03:48<00:00, 156933.85it/s]


Finished Digital_Music, len=24990
Finished reading Grocery_and_Gourmet_Food


Pandas Apply: 100%|██████████| 2594312/2594312 [00:11<00:00, 233664.32it/s]
Pandas Apply: 100%|██████████| 2594312/2594312 [00:13<00:00, 185860.89it/s]
Pandas Apply: 100%|██████████| 47946946/47946946 [01:17<00:00, 615534.98it/s]
Pandas Apply: 100%|██████████| 46019181/46019181 [05:00<00:00, 153205.54it/s]


Finished Grocery_and_Gourmet_Food, len=22570
Finished reading Health_and_Personal_Care


Pandas Apply: 100%|██████████| 5964652/5964652 [00:25<00:00, 234181.75it/s]
Pandas Apply: 100%|██████████| 5964652/5964652 [00:36<00:00, 162029.81it/s]
Pandas Apply: 100%|██████████| 119365739/119365739 [03:13<00:00, 617588.31it/s]
Pandas Apply: 100%|██████████| 115350724/115350724 [12:32<00:00, 153248.34it/s]


Finished Health_and_Personal_Care, len=26046
Finished reading Home_and_Kitchen


Pandas Apply: 100%|██████████| 8507852/8507852 [00:37<00:00, 229849.10it/s]
Pandas Apply: 100%|██████████| 8507852/8507852 [00:58<00:00, 146380.82it/s]
Pandas Apply: 100%|██████████| 175752827/175752827 [04:43<00:00, 619930.50it/s]
Pandas Apply: 100%|██████████| 170850087/170850087 [18:32<00:00, 153513.02it/s]


Finished Home_and_Kitchen, len=26454
Finished reading Kindle_Store


Pandas Apply: 100%|██████████| 6410934/6410934 [00:30<00:00, 208242.05it/s]
Pandas Apply: 100%|██████████| 6410934/6410934 [00:47<00:00, 135010.09it/s]
Pandas Apply: 100%|██████████| 160292138/160292138 [04:29<00:00, 593705.95it/s]
Pandas Apply:  99%|█████████▉| 149947332/151497298 [16:23<00:10, 154767.87it/s]

In [22]:
### Put all words in one set

a = set()

for name in names:
    with open('pickled_datasets/{}_truncated.pickle'.format(name), 'rb') as f:
            data = pickle.load(f)
            a = a.union(data)
            
a = list(a)
print(len(a))

with open('pickled_datasets/all_words_truncated.pickle', 'wb') as f:
    pickle.dump(a, f)


32730


In [21]:
print(a)

