In [29]:
from gensim.test.utils import datapath

In [17]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel
from gensim import corpora
import pandas as pd
import numpy as np
import gzip
import re
import string
from tqdm import tqdm
from time import time
from hunspell import HunSpell
from multiprocessing import pool
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
from gensim.parsing.porter import PorterStemmer
import pickle
import nltk

nltk.download('stopwords')

spellchecker = HunSpell('dicts_hun/en_US.dic',
                        'dicts_hun/en_US.aff')

stemmer = PorterStemmer()

names = ['Amazon_Instant_Video', 
         'Apps_for_Android', 
         'Automotive', 
         'Baby', 
         'Beauty', 
         'Digital_Music', 
         'Grocery_and_Gourmet_Food', 
         'Health_and_Personal_Care', 
         'Home_and_Kitchen', 
         'Kindle_Store'
        ]

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:

ans = ''

for name in tqdm(names):

    df = getDF('data/reviews/reviews_{}.json.gz'.format(name))
    ans += df.reviewText + ' ' + df.summary + ' '

n = 50000
re_punctuation = re.compile('['+string.punctuation+']')
tokenizer = RegexpTokenizer('\w+')
stop = stopwords.words('english')
preprocessed_comments = []
for comment in tqdm(np.random.choice(ans, n)):
    comment = re_punctuation.sub(' ', comment)
    comment = tokenizer.tokenize(comment)
    comment = [x for x in comment if not any(c.isdigit() for c in x)]
    comment = [stemmer.stem(x) for x in comment if spellchecker.spell(x)]
#     comment = [ for x in comment]
    comment = [word for word in comment if word not in stop]
    preprocessed_comments.append(comment)
    
    
wordFrequency = Counter()
for comment in preprocessed_comments:
    wordFrequency.update(comment)                                  # Count overall word frequency
print('Unique Words In Comments: {}'.format(len(wordFrequency)))

minimumWordOccurrences = 5
# Remove rare words
texts = [[word for word in comment if wordFrequency[word] > minimumWordOccurrences] for comment in preprocessed_comments]

dictionary = corpora.Dictionary(texts)                             # Create word dictionary
vocabulary = [dictionary[i] for i in dictionary.keys()]
print('Documents/Comments: {}'.format(len(texts)))

corpus = [dictionary.doc2bow(doc) for doc in preprocessed_comments] # Create corpus

with open('results/corpus.pickle_', 'wb') as f:
        pickle.dump(corpus, f)


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:28<04:12, 28.09s/it][A
 20%|██        | 2/10 [02:28<07:26, 55.87s/it][A
 30%|███       | 3/10 [03:38<07:00, 60.09s/it][A
 40%|████      | 4/10 [04:25<05:35, 55.97s/it][A
 50%|█████     | 5/10 [06:06<05:47, 69.49s/it][A
 60%|██████    | 6/10 [06:50<04:08, 62.02s/it][A
 70%|███████   | 7/10 [07:57<03:09, 63.31s/it][A
 80%|████████  | 8/10 [10:36<03:04, 92.10s/it][A
 90%|█████████ | 9/10 [14:17<02:10, 130.90s/it][A
100%|██████████| 10/10 [17:07<00:00, 142.46s/it][A
  0%|          | 0/50000 [00:00<?, ?it/s][A
  0%|          | 17/50000 [00:00<06:53, 120.83it/s][A
  0%|          | 39/50000 [00:00<06:00, 138.68it/s][A
  0%|          | 55/50000 [00:00<05:46, 144.29it/s][A
  0%|          | 72/50000 [00:00<05:38, 147.51it/s][A
  0%|          | 89/50000 [00:00<05:27, 152.53it/s][A
  0%|          | 107/50000 [00:00<05:14, 158.46it/s][A
  0%|          | 123/50000 [00:00<05:19, 156.21it/s][A
  0%|          | 143/5

 10%|█         | 5185/50000 [00:29<04:11, 178.43it/s][A
 10%|█         | 5205/50000 [00:29<04:03, 184.33it/s][A
 10%|█         | 5224/50000 [00:29<04:03, 184.25it/s][A
 10%|█         | 5244/50000 [00:29<03:59, 186.58it/s][A
 11%|█         | 5263/50000 [00:30<04:08, 179.77it/s][A
 11%|█         | 5282/50000 [00:30<04:19, 172.38it/s][A
 11%|█         | 5300/50000 [00:30<04:25, 168.15it/s][A
 11%|█         | 5320/50000 [00:30<04:16, 174.09it/s][A
 11%|█         | 5338/50000 [00:30<04:19, 172.28it/s][A
 11%|█         | 5357/50000 [00:30<04:14, 175.23it/s][A
 11%|█         | 5375/50000 [00:30<04:17, 173.62it/s][A
 11%|█         | 5395/50000 [00:30<04:09, 179.03it/s][A
 11%|█         | 5413/50000 [00:30<04:08, 179.10it/s][A
 11%|█         | 5432/50000 [00:31<04:08, 179.63it/s][A
 11%|█         | 5451/50000 [00:31<04:10, 177.98it/s][A
 11%|█         | 5470/50000 [00:31<04:07, 180.24it/s][A
 11%|█         | 5489/50000 [00:31<04:04, 181.69it/s][A
 11%|█         | 5508/50000 [00

 21%|██        | 10563/50000 [01:00<03:40, 179.09it/s][A
 21%|██        | 10582/50000 [01:00<03:40, 178.85it/s][A
 21%|██        | 10603/50000 [01:00<03:30, 186.82it/s][A
 21%|██        | 10622/50000 [01:00<03:43, 176.55it/s][A
 21%|██▏       | 10640/50000 [01:00<03:44, 175.47it/s][A
 21%|██▏       | 10658/50000 [01:00<03:46, 173.92it/s][A
 21%|██▏       | 10678/50000 [01:00<03:39, 179.44it/s][A
 21%|██▏       | 10697/50000 [01:00<03:45, 174.51it/s][A
 21%|██▏       | 10715/50000 [01:01<03:44, 174.64it/s][A
 21%|██▏       | 10733/50000 [01:01<03:48, 171.63it/s][A
 22%|██▏       | 10752/50000 [01:01<03:44, 174.71it/s][A
 22%|██▏       | 10770/50000 [01:01<03:48, 171.42it/s][A
 22%|██▏       | 10788/50000 [01:01<03:53, 168.01it/s][A
 22%|██▏       | 10806/50000 [01:01<03:49, 170.84it/s][A
 22%|██▏       | 10824/50000 [01:01<03:47, 172.22it/s][A
 22%|██▏       | 10844/50000 [01:01<03:40, 177.42it/s][A
 22%|██▏       | 10863/50000 [01:01<03:39, 178.57it/s][A
 22%|██▏      

 32%|███▏      | 15881/50000 [01:30<03:07, 182.44it/s][A
 32%|███▏      | 15900/50000 [01:30<03:10, 178.92it/s][A
 32%|███▏      | 15918/50000 [01:30<03:13, 176.53it/s][A
 32%|███▏      | 15938/50000 [01:30<03:08, 180.85it/s][A
 32%|███▏      | 15957/50000 [01:30<03:17, 172.08it/s][A
 32%|███▏      | 15975/50000 [01:30<03:19, 170.32it/s][A
 32%|███▏      | 15993/50000 [01:31<03:26, 164.35it/s][A
 32%|███▏      | 16010/50000 [01:31<03:25, 165.21it/s][A
 32%|███▏      | 16031/50000 [01:31<03:14, 174.88it/s][A
 32%|███▏      | 16050/50000 [01:31<03:10, 177.90it/s][A
 32%|███▏      | 16068/50000 [01:31<03:11, 177.48it/s][A
 32%|███▏      | 16088/50000 [01:31<03:04, 183.44it/s][A
 32%|███▏      | 16107/50000 [01:31<03:09, 179.23it/s][A
 32%|███▏      | 16126/50000 [01:31<03:12, 175.71it/s][A
 32%|███▏      | 16144/50000 [01:31<03:16, 171.90it/s][A
 32%|███▏      | 16162/50000 [01:31<03:26, 163.72it/s][A
 32%|███▏      | 16182/50000 [01:32<03:17, 171.40it/s][A
 32%|███▏     

 42%|████▏     | 21176/50000 [02:00<02:50, 169.48it/s][A
 42%|████▏     | 21194/50000 [02:00<02:48, 170.62it/s][A
 42%|████▏     | 21212/50000 [02:00<02:48, 171.14it/s][A
 42%|████▏     | 21230/50000 [02:00<02:47, 171.71it/s][A
 42%|████▎     | 21250/50000 [02:01<02:41, 177.71it/s][A
 43%|████▎     | 21268/50000 [02:01<02:42, 176.82it/s][A
 43%|████▎     | 21286/50000 [02:01<02:41, 177.32it/s][A
 43%|████▎     | 21304/50000 [02:01<02:41, 177.85it/s][A
 43%|████▎     | 21324/50000 [02:01<02:36, 183.41it/s][A
 43%|████▎     | 21345/50000 [02:01<02:34, 186.02it/s][A
 43%|████▎     | 21365/50000 [02:01<02:32, 187.56it/s][A
 43%|████▎     | 21385/50000 [02:01<02:29, 191.05it/s][A
 43%|████▎     | 21405/50000 [02:01<02:32, 187.23it/s][A
 43%|████▎     | 21424/50000 [02:01<02:39, 178.77it/s][A
 43%|████▎     | 21442/50000 [02:02<02:42, 176.08it/s][A
 43%|████▎     | 21460/50000 [02:02<02:44, 173.82it/s][A
 43%|████▎     | 21478/50000 [02:02<02:47, 169.91it/s][A
 43%|████▎    

 53%|█████▎    | 26496/50000 [02:30<02:11, 179.30it/s][A
 53%|█████▎    | 26516/50000 [02:30<02:08, 182.89it/s][A
 53%|█████▎    | 26535/50000 [02:30<02:15, 173.41it/s][A
 53%|█████▎    | 26553/50000 [02:31<02:14, 174.46it/s][A
 53%|█████▎    | 26575/50000 [02:31<02:06, 185.79it/s][A
 53%|█████▎    | 26594/50000 [02:31<02:08, 182.16it/s][A
 53%|█████▎    | 26613/50000 [02:31<02:11, 177.51it/s][A
 53%|█████▎    | 26631/50000 [02:31<02:12, 177.03it/s][A
 53%|█████▎    | 26650/50000 [02:31<02:09, 180.53it/s][A
 53%|█████▎    | 26669/50000 [02:31<02:13, 174.86it/s][A
 53%|█████▎    | 26688/50000 [02:31<02:10, 178.98it/s][A
 53%|█████▎    | 26707/50000 [02:31<02:10, 179.05it/s][A
 53%|█████▎    | 26725/50000 [02:32<02:16, 170.44it/s][A
 53%|█████▎    | 26743/50000 [02:32<02:21, 164.81it/s][A
 54%|█████▎    | 26761/50000 [02:32<02:17, 168.62it/s][A
 54%|█████▎    | 26781/50000 [02:32<02:13, 173.93it/s][A
 54%|█████▎    | 26800/50000 [02:32<02:11, 176.39it/s][A
 54%|█████▎   

 64%|██████▎   | 31799/50000 [03:01<01:53, 160.84it/s][A
 64%|██████▎   | 31816/50000 [03:01<01:51, 162.41it/s][A
 64%|██████▎   | 31834/50000 [03:01<01:48, 166.78it/s][A
 64%|██████▎   | 31851/50000 [03:01<01:48, 167.56it/s][A
 64%|██████▎   | 31868/50000 [03:01<01:48, 167.26it/s][A
 64%|██████▍   | 31885/50000 [03:01<01:48, 166.75it/s][A
 64%|██████▍   | 31903/50000 [03:01<01:46, 169.89it/s][A
 64%|██████▍   | 31921/50000 [03:01<01:45, 171.75it/s][A
 64%|██████▍   | 31939/50000 [03:01<01:43, 173.72it/s][A
 64%|██████▍   | 31961/50000 [03:01<01:38, 182.54it/s][A
 64%|██████▍   | 31980/50000 [03:02<01:41, 178.09it/s][A
 64%|██████▍   | 31998/50000 [03:02<01:41, 176.54it/s][A
 64%|██████▍   | 32016/50000 [03:02<01:41, 176.97it/s][A
 64%|██████▍   | 32035/50000 [03:02<01:39, 180.36it/s][A
 64%|██████▍   | 32054/50000 [03:02<01:39, 181.08it/s][A
 64%|██████▍   | 32073/50000 [03:02<01:41, 176.87it/s][A
 64%|██████▍   | 32091/50000 [03:02<01:44, 171.25it/s][A
 64%|██████▍  

 74%|███████▍  | 37103/50000 [03:31<01:11, 179.30it/s][A
 74%|███████▍  | 37122/50000 [03:31<01:11, 179.34it/s][A
 74%|███████▍  | 37142/50000 [03:31<01:10, 181.14it/s][A
 74%|███████▍  | 37161/50000 [03:31<01:12, 177.25it/s][A
 74%|███████▍  | 37181/50000 [03:31<01:10, 182.47it/s][A
 74%|███████▍  | 37200/50000 [03:31<01:11, 178.34it/s][A
 74%|███████▍  | 37220/50000 [03:31<01:09, 183.23it/s][A
 74%|███████▍  | 37239/50000 [03:31<01:10, 181.95it/s][A
 75%|███████▍  | 37259/50000 [03:31<01:08, 185.83it/s][A
 75%|███████▍  | 37278/50000 [03:32<01:10, 181.57it/s][A
 75%|███████▍  | 37297/50000 [03:32<01:09, 181.75it/s][A
 75%|███████▍  | 37318/50000 [03:32<01:08, 186.11it/s][A
 75%|███████▍  | 37337/50000 [03:32<01:09, 181.78it/s][A
 75%|███████▍  | 37356/50000 [03:32<01:09, 180.66it/s][A
 75%|███████▍  | 37375/50000 [03:32<01:09, 181.28it/s][A
 75%|███████▍  | 37394/50000 [03:32<01:09, 180.41it/s][A
 75%|███████▍  | 37413/50000 [03:32<01:09, 180.89it/s][A
 75%|███████▍ 

 85%|████████▍ | 42433/50000 [04:01<00:41, 181.16it/s][A
 85%|████████▍ | 42452/50000 [04:01<00:43, 172.81it/s][A
 85%|████████▍ | 42470/50000 [04:01<00:43, 172.88it/s][A
 85%|████████▍ | 42491/50000 [04:01<00:41, 180.84it/s][A
 85%|████████▌ | 42510/50000 [04:01<00:41, 180.57it/s][A
 85%|████████▌ | 42529/50000 [04:01<00:41, 178.32it/s][A
 85%|████████▌ | 42547/50000 [04:01<00:41, 178.01it/s][A
 85%|████████▌ | 42568/50000 [04:02<00:40, 184.36it/s][A
 85%|████████▌ | 42589/50000 [04:02<00:38, 190.18it/s][A
 85%|████████▌ | 42609/50000 [04:02<00:39, 187.43it/s][A
 85%|████████▌ | 42629/50000 [04:02<00:38, 189.35it/s][A
 85%|████████▌ | 42649/50000 [04:02<00:40, 182.19it/s][A
 85%|████████▌ | 42668/50000 [04:02<00:41, 178.02it/s][A
 85%|████████▌ | 42687/50000 [04:02<00:40, 178.50it/s][A
 85%|████████▌ | 42707/50000 [04:02<00:39, 183.07it/s][A
 85%|████████▌ | 42727/50000 [04:02<00:39, 183.34it/s][A
 85%|████████▌ | 42746/50000 [04:02<00:39, 183.57it/s][A
 86%|████████▌

 96%|█████████▌| 47764/50000 [04:31<00:13, 168.40it/s][A
 96%|█████████▌| 47783/50000 [04:31<00:12, 173.02it/s][A
 96%|█████████▌| 47801/50000 [04:31<00:12, 172.66it/s][A
 96%|█████████▌| 47822/50000 [04:31<00:12, 181.14it/s][A
 96%|█████████▌| 47841/50000 [04:31<00:12, 175.38it/s][A
 96%|█████████▌| 47861/50000 [04:32<00:11, 180.06it/s][A
 96%|█████████▌| 47880/50000 [04:32<00:11, 179.64it/s][A
 96%|█████████▌| 47899/50000 [04:32<00:11, 180.04it/s][A
 96%|█████████▌| 47918/50000 [04:32<00:11, 181.15it/s][A
 96%|█████████▌| 47937/50000 [04:32<00:11, 181.64it/s][A
 96%|█████████▌| 47957/50000 [04:32<00:11, 180.98it/s][A
 96%|█████████▌| 47977/50000 [04:32<00:10, 185.74it/s][A
 96%|█████████▌| 47998/50000 [04:32<00:10, 191.94it/s][A
 96%|█████████▌| 48018/50000 [04:32<00:10, 183.31it/s][A
 96%|█████████▌| 48037/50000 [04:33<00:11, 172.13it/s][A
 96%|█████████▌| 48056/50000 [04:33<00:11, 176.48it/s][A
 96%|█████████▌| 48075/50000 [04:33<00:10, 177.62it/s][A
 96%|█████████

Unique Words In Comments: 30588
Documents/Comments: 50000


In [20]:
with open('results/dictionary.pickle_', 'wb') as f:
        pickle.dump(dictionary, f)
with open('results/vocab.pickle_', 'wb') as f:
        pickle.dump(vocabulary, f)

In [23]:
numberTopics = 20   #Number of topics

model_gensim = LdaMulticore(num_topics=numberTopics,
                        id2word=dictionary,
                        iterations=10,
                        passes=1,
                        chunksize=50,
                        eta='auto',
                        workers=14)


perp_gensim = []
times_gensim = []
i=0
max_it = 5
min_prep = np.inf
start = time()

for _ in tqdm(range(100)):
    model_gensim.update(corpus)
    tmp = np.exp(-1 * model_gensim.log_perplexity(corpus))
    perp_gensim.append(tmp)
    times_gensim.append(time() - start)
    if(tmp<min_prep):
        min_prep = tmp;
        i = 0
    else:
        i = i + 1;
        if (i==max_it):
            break                # if prep increase for max_it number it will break the update procedure 
for i, topic in enumerate(model_gensim.get_topics().argsort(axis=1)[:, -10:][:, ::-1], 1):
    print('Topic {}: {}'.format(i, ' '.join([vocabulary[id] for id in topic])))



  0%|          | 0/100 [00:00<?, ?it/s][A[A

  1%|          | 1/100 [01:33<2:33:40, 93.13s/it][A[A

  2%|▏         | 2/100 [03:08<2:33:09, 93.77s/it][A[A

  3%|▎         | 3/100 [04:45<2:33:15, 94.80s/it][A[A

  4%|▍         | 4/100 [06:20<2:31:51, 94.91s/it][A[A

  5%|▌         | 5/100 [07:54<2:29:39, 94.52s/it][A[A

  6%|▌         | 6/100 [09:29<2:28:11, 94.59s/it][A[A

  7%|▋         | 7/100 [11:03<2:26:33, 94.56s/it][A[A

  8%|▊         | 8/100 [12:38<2:25:10, 94.68s/it][A[A

  9%|▉         | 9/100 [14:13<2:23:32, 94.65s/it][A[A

 10%|█         | 10/100 [15:47<2:21:54, 94.61s/it][A[A

 11%|█         | 11/100 [17:21<2:20:05, 94.45s/it][A[A

 12%|█▏        | 12/100 [18:56<2:18:39, 94.54s/it][A[A

 13%|█▎        | 13/100 [20:31<2:17:05, 94.55s/it][A[A

 14%|█▍        | 14/100 [22:06<2:16:00, 94.89s/it][A[A

 15%|█▌        | 15/100 [23:41<2:14:19, 94.82s/it][A[A

 16%|█▌        | 16/100 [25:18<2:13:31, 95.38s/it][A[A

 17%|█▋        | 17/100 [26:56<2

Topic 1: thi us food ar tast product cook like make eat
Topic 2: thi tea love like smell great wa veri good flavor
Topic 3: album thi song music sound ar wa band like record
Topic 4: thi thei ar like great good love wa veri book
Topic 5: thi us skin product oil face wa work cream great
Topic 6: coffe thi cup us water great make pot good like
Topic 7: thi great bottl wa us love work good product time
Topic 8: thi seat wa car us ar fit stroller veri back
Topic 9: hi thi stori wa charact ar ha book thei life
Topic 10: quot thi album ar hi like wa track cd song
Topic 11: thi wa take us dai time great help product ar
Topic 12: thi great love product good wa veri us price like
Topic 13: thi babi great wa love veri us good sleep like
Topic 14: thi us batteri work great unit power wa charg ar
Topic 15: game ar thei thi plai love fun great like color
Topic 16: us thi clean water diaper get ar wa thei bag
Topic 17: wa thi amazon product thei order time replac filter return
Topic 18: hair thi us 

In [37]:
model_gensim.save('results/model_genism/model_genism.model')

In [38]:
lda = LdaModel.load("results/model_genism/model_genism.model", mmap='r')

In [53]:
for i, topic in enumerate(model_gensim.get_topics().argsort(axis=1)[:, -10:][:, ::-1], 1):
    print('Topic {}: {}'.format(i, ' '.join([vocabulary[id] for id in topic])))

Topic 1: thi us food ar tast product cook like make eat
Topic 2: thi tea love like smell great wa veri good flavor
Topic 3: album thi song music sound ar wa band like record
Topic 4: thi thei ar like great good love wa veri book
Topic 5: thi us skin product oil face wa work cream great
Topic 6: coffe thi cup us water great make pot good like
Topic 7: thi great bottl wa us love work good product time
Topic 8: thi seat wa car us ar fit stroller veri back
Topic 9: hi thi stori wa charact ar ha book thei life
Topic 10: quot thi album ar hi like wa track cd song
Topic 11: thi wa take us dai time great help product ar
Topic 12: thi great love product good wa veri us price like
Topic 13: thi babi great wa love veri us good sleep like
Topic 14: thi us batteri work great unit power wa charg ar
Topic 15: game ar thei thi plai love fun great like color
Topic 16: us thi clean water diaper get ar wa thei bag
Topic 17: wa thi amazon product thei order time replac filter return
Topic 18: hair thi us 

In [61]:
lda.show_topics(num_topics=7, num_words=50, log=False, formatted=True)

[(17,
  '0.054*"hair" + 0.032*"thi" + 0.023*"us" + 0.011*"great" + 0.011*"product" + 0.011*"wa" + 0.010*"veri" + 0.009*"brush" + 0.009*"iron" + 0.009*"work" + 0.008*"get" + 0.008*"like" + 0.008*"shave" + 0.008*"time" + 0.008*"good" + 0.007*"dry" + 0.007*"shampoo" + 0.007*"love" + 0.007*"look" + 0.006*"ha" + 0.005*"dryer" + 0.005*"ar" + 0.005*"would" + 0.005*"well" + 0.005*"realli" + 0.005*"condition" + 0.005*"razor" + 0.005*"much" + 0.005*"curl" + 0.004*"make" + 0.004*"onli" + 0.004*"year" + 0.004*"doe" + 0.004*"long" + 0.004*"littl" + 0.004*"color" + 0.004*"head" + 0.004*"becaus" + 0.004*"also" + 0.004*"heat" + 0.004*"leav" + 0.004*"shaver" + 0.003*"thick" + 0.003*"need" + 0.003*"thei" + 0.003*"recommend" + 0.003*"best" + 0.003*"nice" + 0.003*"want" + 0.003*"go"'),
 (18,
  '0.034*"thi" + 0.019*"wa" + 0.018*"us" + 0.011*"get" + 0.009*"time" + 0.009*"product" + 0.008*"thei" + 0.008*"like" + 0.007*"work" + 0.007*"nail" + 0.007*"would" + 0.006*"look" + 0.006*"first" + 0.006*"ar" + 0.005*"

In [60]:
stemmer.stem('are')

'ar'