In [None]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel
from gensim import corpora
import pandas as pd
import numpy as np
import gzip
import re
import string
from tqdm import tqdm
from time import time
# from hunspell import HunSpell
from multiprocessing import pool
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [61]:
df = getDF('data/reviews/reviews_Musical_Instruments.json.gz')

ans = df.reviewText + ' ' + df.summary

n = 10000
re_punctuation = re.compile('['+string.punctuation+']')
tokenizer = RegexpTokenizer('\w+')
stop = stopwords.words('english')
preprocessed_comments = []
for comment in tqdm(np.random.choice(ans, n)):
    comment = re_punctuation.sub(' ', comment)
    comment = tokenizer.tokenize(comment)
    comment = [x for x in comment if not any(c.isdigit() for c in x)]
    ########### HERE You have to insert HunSpell
    comment = [word for word in comment if word not in stop]
    preprocessed_comments.append(comment)
    
    
wordFrequency = Counter()
for comment in preprocessed_comments:
    wordFrequency.update(comment)                                  # Count overall word frequency
print('Unique Words In Comments: {}'.format(len(wordFrequency)))

minimumWordOccurrences = 5
# Remove rare words
texts = [[word for word in comment if wordFrequency[word] > minimumWordOccurrences] for comment in preprocessed_comments]

dictionary = corpora.Dictionary(texts)                             # Create word dictionary
vocabulary = [dictionary[i] for i in dictionary.keys()]
print('Documents/Comments: {}'.format(len(texts)))

corpus = [dictionary.doc2bow(doc) for doc in preprocessed_comments]                # Create corpus


  0%|          | 0/10000 [00:00<?, ?it/s][A
  4%|▍         | 391/10000 [00:00<00:02, 3902.08it/s][A
  9%|▊         | 861/10000 [00:00<00:02, 4111.20it/s][A
 13%|█▎        | 1289/10000 [00:00<00:02, 4158.31it/s][A
 16%|█▋        | 1633/10000 [00:00<00:02, 3912.03it/s][A
 20%|██        | 2031/10000 [00:00<00:02, 3928.85it/s][A
 24%|██▎       | 2374/10000 [00:00<00:02, 3763.90it/s][A
 28%|██▊       | 2772/10000 [00:00<00:01, 3819.59it/s][A
 32%|███▏      | 3154/10000 [00:00<00:01, 3804.23it/s][A
 36%|███▌      | 3552/10000 [00:00<00:01, 3854.24it/s][A
 39%|███▉      | 3924/10000 [00:01<00:01, 3785.32it/s][A
 43%|████▎     | 4345/10000 [00:01<00:01, 3903.12it/s][A
 47%|████▋     | 4730/10000 [00:01<00:01, 3869.94it/s][A
 51%|█████     | 5113/10000 [00:01<00:01, 3831.68it/s][A
 55%|█████▍    | 5494/10000 [00:01<00:01, 3816.31it/s][A
 59%|█████▉    | 5922/10000 [00:01<00:01, 3943.63it/s][A
 63%|██████▎   | 6317/10000 [00:01<00:00, 3917.03it/s][A
 67%|██████▋   | 6718/10000 

Unique Words In Comments: 34550
Documents/Comments: 10000


In [62]:
numberTopics = 20   #Number of topics

model_gensim = LdaModel(num_topics=numberTopics,
                        id2word=dictionary,
                        iterations=10,
                        passes=1,
                        chunksize=50,
                        alpha='auto',
                        eta='auto',
                        update_every=1)


perp_gensim = []
times_gensim = []
i=0
max_it = 5
min_prep = np.inf
start = time()

for _ in tqdm(range(100)):
    model_gensim.update(corpus)
    tmp = np.exp(-1 * model_gensim.log_perplexity(corpus))
    perp_gensim.append(tmp)
    times_gensim.append(time() - start)
    if(tmp<min_prep):
        min_prep = tmp;
    else:
        i = i + 1;
        if (i==max_it):
            break                # if prep increase for max_it number it will break the update procedure 
for i, topic in enumerate(model_gensim.get_topics().argsort(axis=1)[:, -10:][:, ::-1], 1):
    print('Topic {}: {}'.format(i, ' '.join([vocabulary[id] for id in topic])))


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:09<01:24,  9.41s/it][A
 20%|██        | 2/10 [00:18<01:13,  9.21s/it][A
 30%|███       | 3/10 [00:28<01:06,  9.53s/it][A
 40%|████      | 4/10 [00:39<00:59,  9.91s/it][A
 50%|█████     | 5/10 [00:50<00:51, 10.20s/it][A
 60%|██████    | 6/10 [01:00<00:41, 10.36s/it][A
 70%|███████   | 7/10 [01:11<00:31, 10.50s/it][A
 80%|████████  | 8/10 [01:22<00:21, 10.55s/it][A
 90%|█████████ | 9/10 [01:33<00:10, 10.64s/it][A
100%|██████████| 10/10 [01:43<00:00, 10.67s/it][A

Topic 1: perfect hard found always item record others received original collection
Topic 2: Not full hear speaker fun volume performance Love speakers wood
Topic 3: works mic recording cable unit software USB worked perfectly plug
Topic 4: pedal violin clean beginner ones reason rest learn board video
Topic 5: The two work many one There first power less also
Topic 6: sound like really little better much good It sounds even
Topic 7: high low guitars top side solid Fender bridge All finish
Topic 8: keyboard drum Excellent days value change nothing kit next seem
Topic 9: They small These Amazon We tried big system read simple
Topic 10: set live band A built studio room mics times level
Topic 11: great It This price well quality good Great product nice
Topic 12: I one would time get bought case used could got
Topic 13: music piano CD album songs song recording heard track enjoy
Topic 14: guitar amp playing instrument play tone string end neck size
Topic 15: strings love He beautiful playe