# NOTE: BEFORE YOU GET STARTED

PLEASE DOWNLOAD THE MODEL FROM [THIS](https://drive.google.com/file/d/1Tx6stkf_pEP7Mis_B4MiAD0yHZMa7jdJ/view?usp=sharing) LINK (~100MB) AND EXTRACT ALL FILES INTO THE `results` FOLDER.

WE'VE COMMENTED OUT PACKAGES LIKE `hunspell`, WHICH ARE NEEDED ONLY DURING PREPROCESSING! ONCE YOU HAVE DOWNLOADED ALL THE FILES, RUNNING THIS CODE WILL JUST READ THOSE FILES, WHICH ALREADY CONTAIN THE PROCESSED WORDS.

#### 0. Code we used to download dataset.

No need to run this, since the models are already saved :)

In [None]:
# 'Amazon_Instant_Video', 
# 'Apps_for_Android', 
# 'Automotive', 
# 'Baby', 
# 'Beauty', 
# 'Digital_Music', 
# 'Grocery_and_Gourmet_Food', 
# 'Health_and_Personal_Care', 
# 'Home_and_Kitchen', 
# 'Kindle_Store'

# DOWNLOAD DATASETS

## UNCOMMENT to test :)
#!python Download_files.py -f amvi apps auto baby beau dgmu food heal hmkt kind  -dt r

### 1. Import all packages

In [None]:
from gensim.test.utils import common_texts,datapath
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel
from gensim import corpora 
from gensim.matutils import Sparse2Corpus, Scipy2Corpus
import pandas as pd
import numpy as np
import gzip
import re
import string
import os
from tqdm import tqdm_notebook
from time import time
# from hunspell import HunSpell # Installing this caused a lot of hassle
from multiprocessing import pool
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
from gensim.parsing.porter import PorterStemmer
import pickle
import nltk
from joblib import Parallel, delayed
from gensim.corpora import MmCorpus
from gensim.models import TfidfModel
from sklearn.feature_extraction.text import TfidfTransformer
from itertools import chain
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation as LDA_sk
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()


# Fetch the NLTK stop-word corpus read by `stopwords.words('english')` during preprocessing.
nltk.download('stopwords')

# The HunSpell spell checker is disabled here. The preprocessing branch still
# calls `spellchecker.spell(...)`, so it must be re-enabled before rerunning
# preprocessing from scratch.
# spellchecker = HunSpell('dicts_hun/en_US.dic',
#                         'dicts_hun/en_US.aff')

# Porter stemmer applied to the tokens during preprocessing.
stemmer = PorterStemmer()

# Review categories to read; each name is substituted into
# 'data/reviews/reviews_{}.json.gz' by `read_dataset`.
names = ['Amazon_Instant_Video', 
         'Apps_for_Android', 
         'Automotive', 
         'Baby', 
         'Beauty', 
         'Digital_Music', 
         'Grocery_and_Gourmet_Food', 
         'Health_and_Personal_Care', 
         'Home_and_Kitchen', 
         'Kindle_Store'
        ]

def parse(path):
  """Yield one parsed record per non-blank line of a gzip-compressed file.

  Every line must hold a single Python literal (for example a ``dict``).
  Lines are parsed with ``ast.literal_eval`` rather than ``eval`` so that
  code embedded in the data file is never executed.

  Args:
    path: Location of the gzip-compressed file.

  Yields:
    The object described by each line, in file order.

  Raises:
    ValueError, SyntaxError: If a line is not a valid Python literal.
  """
  import ast  # local import: only this function needs it

  # The context manager closes the file even if the consumer stops early
  # or a line fails to parse.
  with gzip.open(path, 'rb') as g:
    for raw_line in g:
      text = raw_line.decode('utf-8').strip()
      if text:  # tolerate blank lines, e.g. a trailing newline at EOF
        yield ast.literal_eval(text)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, so each record
  becomes one row and the positions become the index.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

#### Word preprocessing! It doesn't take a lot of time: if the files in the `results` folder exist, preprocessing won't be run again and the files will be loaded!

1. Datasets are loaded (this is **SLOW**, but parallelized! You can set the number of cores with `paralelize_reading`. TAKES A LOT OF **RAM**)
2. `n` comments are randomly chosen
3. Remove punctuation and digits
4. Remove STOP words 'he', 'she', 'is' etc.
5. Remove misspelled words (we considered correcting their spelling instead, but that slows the process down about 50x!)
6. Stemming! `stemmer.stem('ponies') -> 'poni'`
7. Remove rare words
8. Create dictionary, vocabulary, corpus and SAVE THEM!

In [None]:
force_rerun_preprocessing = 0  # Truthy value: redo preprocessing even if the pickled files already exist
paralelize_reading = 12  # Passed as n_jobs when reading the datasets; set to 1 if you don't want to parallelize

def read_dataset(name):
    """Read one review category and return its combined review texts.

    Loads ``data/reviews/reviews_<name>.json.gz`` through ``getDF``, reports
    completion on stdout, and returns, row by row, ``reviewText`` and
    ``summary`` joined by a space and followed by a trailing space.
    """
    dataset_path = 'data/reviews/reviews_{}.json.gz'.format(name)
    reviews = getDF(dataset_path)
    print('Reading {} finished!'.format(name))
    text_and_summary = reviews.reviewText + ' ' + reviews.summary
    return text_and_summary + ' '
    

# Pickled artefacts produced by preprocessing; all three must exist to skip it.
preprocessed_files = ('results/corpus.pickle',
                      'results/dictionary.pickle',
                      'results/vocab.pickle')

if all(os.path.isfile(path) for path in preprocessed_files) and not force_rerun_preprocessing:
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open('results/corpus.pickle', 'rb') as f:
        corpus = pickle.load(f)
    with open('results/dictionary.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    with open('results/vocab.pickle', 'rb') as f:
        vocabulary = pickle.load(f)

    print('Files already exist! No need to load this again!')
else:
    # The spell checker is needed only on this branch, so it is imported here.
    # Failing now, with a clear message, beats a NameError after the slow read.
    try:
        from hunspell import HunSpell
    except ImportError as err:
        raise ImportError(
            'Preprocessing needs the `hunspell` package and the dictionaries in '
            '`dicts_hun/`. Install it, or place the preprocessed files in `results`.'
        ) from err
    spellchecker = HunSpell('dicts_hun/en_US.dic',
                            'dicts_hun/en_US.aff')

    ## Let's use those cores :p
    print('Reading datasets')
    array_ans = Parallel(n_jobs=paralelize_reading)(delayed(read_dataset)(name) for name in names)

    # Stack the per-category Series end to end. Adding them with `+=` would
    # align on the index and glue together unrelated reviews that share a row
    # number (and give NaN wherever only one Series has that label).
    # Rows with a missing review text or summary are dropped.
    ans = pd.concat(array_ans, ignore_index=True).dropna()

    n = 50000  # number of comments drawn (np.random.choice samples with replacement)

    re_punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
    tokenizer = RegexpTokenizer(r'\w+')
    stop = stopwords.words('english')
    stop_words = set(stop)  # set membership is O(1) per token
    preprocessed_comments = []
    for comment in tqdm_notebook(np.random.choice(ans.values, n)):
        comment = re_punctuation.sub(' ', comment)                          # punctuation -> spaces
        comment = tokenizer.tokenize(comment)                               # split into word tokens
        comment = [x for x in comment if not any(c.isdigit() for c in x)]   # drop tokens containing digits
        comment = [word for word in comment if word not in stop_words]      # drop stop words
        comment = [stemmer.stem(x) for x in comment if spellchecker.spell(x)]  # keep correctly spelled words, stemmed
        comment = [x for x in comment if len(x) > 3]                        # drop stems of 3 characters or fewer
        preprocessed_comments.append(comment)

    wordFrequency = Counter()
    for comment in tqdm_notebook(preprocessed_comments):
        wordFrequency.update(comment)                                  # Count overall word frequency
    print('Unique Words In Comments: {}'.format(len(wordFrequency)))

    # A word is kept only if it occurs MORE than this many times overall.
    minimumWordOccurrences = 5
    # Remove rare words
    print('Removing rare words... ')
    texts = [[word for word in comment if wordFrequency[word] > minimumWordOccurrences]
             for comment in tqdm_notebook(preprocessed_comments)]

    print('Creating vocabulary...')
    dictionary = corpora.Dictionary(texts)                             # Create word dictionary
    vocabulary = [dictionary[i] for i in tqdm_notebook(dictionary.keys())]
    print('Documents/Comments: {}'.format(len(texts)))

    print('Creating corpus...')
    # `doc2bow` ignores words missing from the dictionary, so the filtered
    # `texts` give the same bags of words as the unfiltered comments.
    corpus = [dictionary.doc2bow(doc) for doc in tqdm_notebook(texts)]  # Create corpus

    print('Saving files...')
    os.makedirs('results', exist_ok=True)  # first run: the folder may not exist yet
    with open('results/corpus.pickle', 'wb') as f:
        pickle.dump(corpus, f)
    with open('results/dictionary.pickle', 'wb') as f:
        pickle.dump(dictionary, f)
    with open('results/vocab.pickle', 'wb') as f:
        pickle.dump(vocabulary, f)

    print('WE ARE GOOD TO GO!')

### GENSIM LDA

#### Training is computationally heavy and can take up to 5 hours; it was run on an AWS EC2 cluster.
Note: If the models exist in the `results` folder, training won't run again and the existing model will be loaded.
You can freely run this cell.

In [None]:
force_rerun_lda = 0  # Truthy value: retrain even if a saved model exists

if os.path.isfile('results/model_gensim/model_gensim.model') and not force_rerun_lda:
    # Reuse the saved model together with its perplexity and timing history.
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    model_gensim = LdaMulticore.load("results/model_gensim/model_gensim.model", mmap='r')
    with open('results/model_gensim/perp_gensim.pickle', 'rb') as f:
        perp_gensim = pickle.load(f)
    with open('results/model_gensim/time_gensim.pickle', 'rb') as f:
        times_gensim = pickle.load(f)
else:
    numberTopics = 50  #Number of topics
    model_gensim = LdaMulticore(num_topics=numberTopics,
                                id2word=dictionary,
                                iterations=10,
                                passes=1,
                                chunksize=50,
                                eta='auto',
                                workers=12)

    perp_gensim = []    # exp(-log_perplexity) after each update round
    times_gensim = []   # seconds elapsed since `start` after each update round
    max_it = 5          # stop after this many consecutive rounds without a new minimum
    min_prep = np.inf
    rounds_without_improvement = 0
    start = time()
    for _ in tqdm_notebook(range(100)):
        model_gensim.update(corpus)
        tmp = np.exp(-1 * model_gensim.log_perplexity(corpus))
        perp_gensim.append(tmp)
        times_gensim.append(time() - start)
        if tmp < min_prep:
            min_prep = tmp
            rounds_without_improvement = 0
        else:
            rounds_without_improvement += 1
            if rounds_without_improvement == max_it:
                break                # perplexity failed to improve max_it times in a row

    os.makedirs('results/model_gensim', exist_ok=True)  # first run: the folder may not exist yet
    model_gensim.save('results/model_gensim/model_gensim.model')
    with open('results/model_gensim/perp_gensim.pickle', 'wb') as f:
        pickle.dump(perp_gensim, f)
    with open('results/model_gensim/time_gensim.pickle', 'wb') as f:
        pickle.dump(times_gensim, f)

# The two branches used to bind the timings under different names
# (`time_gensim` when loading, `times_gensim` when training); expose both.
time_gensim = times_gensim
    

In [None]:
# Draw the perplexity recorded after each gensim training round.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(perp_gensim, '-o', label='Perplexity')
ax.set_xlabel('Iterations')
ax.set_ylabel('Perplexity')
ax.legend();

## GENERATED TOPICS

In [None]:
# For every gensim topic, print its 10 highest-weighted words, strongest first.
topic_word_weights = model_gensim.get_topics()
top_word_ids = topic_word_weights.argsort(axis=1)[:, -10:][:, ::-1]
for topic_number, word_ids in enumerate(top_word_ids, start=1):
    top_words = ' '.join(vocabulary[word_id] for word_id in word_ids)
    print('Topic {}: {}'.format(topic_number, top_words))

### SKLEARN LDA

CREATING THE SPARSE CORPUS

In [None]:
force_rerun_sk = 0  # Truthy value: retrain even if saved results exist

sk_model_path = 'results/model_sk/sk_lda.pickle'
sk_times_path = 'results/model_sk/timestamps_sklearn.pickle'
sk_perplexity_path = 'results/model_sk/perplexity_sklearn.pickle'

# This cell used to check for the model under `results/model_sk/` but read and
# write it as `results/sk_lda.pickle`; accept either location when loading.
sk_model_found = next(
    (path for path in (sk_model_path, 'results/sk_lda.pickle') if os.path.isfile(path)),
    None)

if (sk_model_found is not None
        and os.path.isfile(sk_perplexity_path)
        and os.path.isfile(sk_times_path)
        and not force_rerun_sk):
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    # Pickle files must be opened in binary mode.
    with open(sk_model_found, 'rb') as f:
        sk_lda = pickle.load(f)
    with open(sk_times_path, 'rb') as f:
        timestamps_sklearn = pickle.load(f)
    with open(sk_perplexity_path, 'rb') as f:
        perplexity_sklearn = pickle.load(f)

    print('File loaded!')
else:
    # Flatten the bag-of-words corpus (one list of (word id, count) pairs per
    # document) into COO triplets. Plain lists keep this linear in the number
    # of entries and cope with empty documents.
    data, row, column = [], [], []
    for doc_index, bag_of_words in enumerate(tqdm_notebook(corpus)):
        for word_id, count in bag_of_words:
            row.append(doc_index)
            column.append(word_id)
            data.append(count)

    # An explicit shape keeps empty trailing documents as rows and gives one
    # column per vocabulary entry, so `components_` lines up with `vocabulary`.
    sk_corpus = coo_matrix((data, (row, column)),
                           shape=(len(corpus), len(vocabulary)),
                           dtype=float).tocsr()

    # NOTE(review): `total_samples` is documented as the total number of
    # documents for `partial_fit`; 10000 may not match len(corpus) -- confirm.
    sk_lda = LDA_sk(n_components=50,
                    learning_method='online',
                    n_jobs=-1,
                    max_iter=1,
                    total_samples=10000)

    perplexity_sklearn = []   # perplexity on the full corpus after each round
    timestamps_sklearn = []   # seconds elapsed since `start` after each round
    start = time()
    for _ in tqdm_notebook(range(100)):
        sk_lda.partial_fit(sk_corpus)
        perplexity_sklearn.append(sk_lda.perplexity(sk_corpus))        # Append metric
        timestamps_sklearn.append(time() - start)

    print('Saving files...')
    os.makedirs('results/model_sk', exist_ok=True)  # first run: the folder may not exist yet
    with open(sk_model_path, 'wb') as f:
        pickle.dump(sk_lda, f)
    with open(sk_times_path, 'wb') as f:
        pickle.dump(timestamps_sklearn, f)
    with open(sk_perplexity_path, 'wb') as f:
        pickle.dump(perplexity_sklearn, f)


In [None]:

# Tabulate the highest-weighted words of every scikit-learn topic, one row per topic.
n_top_words = 15
n_components = 50

word_columns = ['word_{}'.format(rank) for rank in range(1, n_top_words + 1)]
topic_labels = ['topic_{}'.format(number) for number in range(1, n_components + 1)]

top_words_per_topic = []
for topic in sk_lda.components_:
    strongest_ids = topic.argsort()[::-1][:n_top_words]   # ids by descending weight
    top_words_per_topic.append([vocabulary[word_id] for word_id in strongest_ids])

data = np.array(top_words_per_topic)
topics = pd.DataFrame(data, columns=word_columns, index=topic_labels)
topics.head(15)

### PCA VISUALIZATIONS


In [None]:
import pickle
import numpy as np
from sklearn.decomposition import PCA

# Load the word embeddings (indexed by word further down the notebook).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('Downloads/results.pickle', 'rb') as f:
    embeddings = pickle.load(f)

# One row per embedded word; keep the two leading principal directions.
emb = np.array(list(embeddings.values()))
pca = PCA(n_components=2)
pca.fit(emb)
comp = pca.components_

# Store the directions so they can be reused without refitting.
with open('PCA_Amazon.pickle', 'wb') as f:
    pickle.dump(comp, f)

# This cell used to reload `comp` and `embeddings` straight away from
# '../../PCA_Amazon.pickle' and '../../Downloads/results.pickle'. Those are
# different paths from the ones read and written above, so the cell only ran
# when both layouts existed. The values computed above are used directly.

In [None]:
n_top_topics = [0, 2, 14, 19]   # indices of the gensim topics to visualise
n_top_words = 12

# Word ids of the highest-weighted words of every topic, strongest first.
# `lda` was never defined in this notebook; the gensim model is `model_gensim`.
# NOTE(review): this slice keeps n_top_words + 1 ids per topic -- confirm
# whether the extra word is intended.
top_word_ids = model_gensim.get_topics().argsort(axis=1)[:, -n_top_words-1:][:, ::-1]

# For each selected topic, keep the top words that have an embedding.
topics_pca = []
for i, topic in enumerate(top_word_ids):
    if i in n_top_topics:
        topics_pca.append([vocabulary[word_id] for word_id in topic
                           if vocabulary[word_id] in embeddings])
        print(topics_pca[-1])
# Left as a list of lists: topics can keep different numbers of words, and
# NumPy refuses to build a regular array from ragged rows.

In [None]:
# Project the embedding of each selected topic's top words onto the PCA
# directions in `comp`, giving one row of coordinates per word.
# `lda` was never defined in this notebook; the gensim model is `model_gensim`.
# NOTE(review): this slice keeps n_top_words + 1 ids per topic -- confirm
# whether the extra word is intended.
top_word_ids = model_gensim.get_topics().argsort(axis=1)[:, -n_top_words-1:][:, ::-1]

topics_components = []
for i, topic in enumerate(top_word_ids):
    if i in n_top_topics:
        projected = [np.dot(comp, np.array(embeddings[vocabulary[word_id]]))
                     for word_id in topic if vocabulary[word_id] in embeddings]
        topics_components.append(np.array(projected))
        print(topics_components[-1])
# Left as a list of arrays: topics can keep different numbers of words, so the
# per-topic arrays need not share a shape.

In [None]:
# Scatter each selected topic's words in the plane of the two principal
# components, label every point with its word, and save the figure.
fig, ax = plt.subplots(figsize=(15, 10))
for topic_index, points in enumerate(topics_components):
    xs, ys = points[:, 0], points[:, 1]
    ax.scatter(xs, ys)
    for word, position in zip(topics_pca[topic_index], points):
        ax.annotate(word, position)
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
fig.savefig('pca_topic.png')