# Imports and Settings

In [None]:
# Mount Google Drive at /content/drive so the data files under MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q lda
!pip install -q "tmtoolkit[recommended, lda]"
!pip install -q pyLDAvis

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.8/349.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for globre (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import warnings
import pandas as pd
import numpy as np
import random
import operator
import itertools
import gzip
import json
import pickle

from lda import LDA
import scipy.sparse
from tmtoolkit.topicmod.tm_lda import compute_models_parallel, evaluate_topic_models
from tmtoolkit.topicmod.model_io import ldamodel_top_doc_topics, print_ldamodel_topic_words
from tmtoolkit.topicmod.model_io import save_ldamodel_to_pickle, load_ldamodel_from_pickle
from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words
from tmtoolkit.topicmod.visualize import parameters_for_ldavis

import pyLDAvis
import matplotlib.pyplot as plt

# Reproducibility: one seed shared by Python's and NumPy's global RNGs
# (the LDA model additionally receives it through `random_state`).
SEED = 2004
random.seed(SEED)
np.random.seed(SEED)

# Display / logging preferences.
np.set_printoptions(precision=5)
warnings.filterwarnings("ignore")  # NOTE: silences *all* warnings for the whole session

# Project folders. The path is absolute (under the mount point used by `drive.mount`)
# so it does not depend on the kernel's current working directory.
DIR = '/content/drive/MyDrive/dtsa5799_product_review_topic_model'
DATA_DIR = f'{DIR}/data'

In [None]:
# load
def _load_pickle(path):
    """Return the object unpickled from `path`; the file handle is closed afterwards."""
    # NOTE(review): unpickling can execute arbitrary code, so only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Sparse matrices (`dtm_*`) for the two corpora.
dtm_sm = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_sm.npz')
dtm_bg = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_bg.npz')

# Document labels belonging to each matrix.
doc_labels_sm = _load_pickle(f'{DATA_DIR}/new_doc_labels_sm.p')
doc_labels_bg = _load_pickle(f'{DATA_DIR}/new_doc_labels_bg.p')

# Vocabulary belonging to each matrix.
vocab_sm = _load_pickle(f'{DATA_DIR}/new_vocab_sm.p')
vocab_bg = _load_pickle(f'{DATA_DIR}/new_vocab_bg.p')

# Model_24_1_1 (n_topics = 24, eta = 1, alpha = 1/24)

In [None]:
# Fit one LDA model with a fixed parameter set on the `corp_bg` matrix only
# (`dtm_sm` is not modelled in this run and no parameters are varied).
dtms = dict(corp_bg=dtm_bg)
const_params = dict(
    n_iter=500,
    eta=1,
    n_topics=24,
    alpha=1/24,  # 1 / n_topics
    random_state=SEED,
)
models = compute_models_parallel(dtms, constant_parameters=const_params)
models

defaultdict(list,
            {'corp_bg': [({'n_iter': 500,
                'eta': 1,
                'n_topics': 24,
                'alpha': 0.041666666666666664,
                'random_state': 2004},
               <lda.lda.LDA at 0x7827e4112e90>)]})

In [None]:
# Show the corpus names for which models were computed.
models.keys()

dict_keys(['corp_bg'])

In [None]:
# `models['corp_bg']` is a list of (parameters, fitted model) pairs; a single
# parameter set was used, so the model sits in the first pair.
first_run_bg = models['corp_bg'][0]
model_bg = first_run_bg[1]

In [None]:
# Persist the fitted model together with its vocabulary, document labels and matrix.
model_path = f'{DATA_DIR}/model_24_1_1.p'
save_ldamodel_to_pickle(
    model_path,
    model_bg,
    vocab_bg,
    doc_labels_bg,
    dtm=dtm_bg,
)

In [None]:
# Print the top 5 words of every topic together with their weights from `model_bg.topic_word_`.
print_ldamodel_topic_words(model_bg.topic_word_, vocab_bg, top_n=5)

topic_1
> #1. shoe (0.057573)
> #2. play (0.037029)
> #3. basketball (0.025114)
> #4. good (0.024806)
> #5. great (0.023574)
topic_2
> #1. size (0.124150)
> #2. shoe (0.053820)
> #3. small (0.050069)
> #4. order (0.043170)
> #5. run (0.033122)
topic_3
> #1. sock (0.072928)
> #2. foot (0.031511)
> #3. wear (0.029554)
> #4. sandal (0.022298)
> #5. comfortable (0.021238)
topic_4
> #1. shoe (0.084624)
> #2. great (0.039148)
> #3. comfortable (0.031729)
> #4. run (0.029459)
> #5. support (0.021603)
topic_5
> #1. shoe (0.068535)
> #2. foot (0.048101)
> #3. wear (0.021439)
> #4. support (0.019688)
> #5. arch (0.015212)
topic_6
> #1. shoe (0.045311)
> #2. find (0.037521)
> #3. buy (0.035444)
> #4. store (0.032414)
> #5. pair (0.031895)
topic_7
> #1. shoe (0.059087)
> #2. size (0.047230)
> #3. foot (0.046366)
> #4. fit (0.044011)
> #5. wide (0.030898)
topic_8
> #1. great (0.047499)
> #2. shoe (0.041177)
> #3. fit (0.032073)
> #4. love (0.027353)
> #5. arrive (0.022632)
topic_9
> #1. shoe (0.075

In [None]:
# Build one label per topic from its top words (topic number plus 3 words per label).
doc_lengths_bg = doc_lengths(dtm_bg)
vocab_arr_bg = np.array(vocab_bg)
topic_labels_bg = generate_topic_labels_from_top_words(
    model_bg.topic_word_, model_bg.doc_topic_, doc_lengths_bg, vocab_arr_bg,
    lambda_=0.7, n_words=3,
)
topic_labels_bg

array(['1_play_basketball_cleat', '2_size_small_order',
       '3_sock_sandal_foot', '4_shoe_great_gym', '5_foot_shoe_support',
       '6_store_find_buy', '7_wide_foot_size', '8_great_arrive_thank',
       '9_shoe_feel_walk', '10_shoe_run_free', '11_good_quality_product',
       '12_boot_work_day', '13_run_shoe_mile', '14_shoe_month_squeak',
       '15_muy_que_los', '16_pair_comfortable_wear', '17_air_nike_max',
       '18_return_order_pay', '19_bag_gym_pocket', '20_love_gift_son',
       '21_watch_band_wrist', '22_black_white_color',
       '23_shirt_short_sunglass', '24_color_love_shoe'], dtype='<U24')

In [None]:
# Pickle the generated topic labels into the data directory.
topic_labels_path = f'{DATA_DIR}/topic_labels_24_1_1.p'
with open(topic_labels_path, mode='wb') as labels_file:
    pickle.dump(topic_labels_bg, labels_file)

In [None]:
# For every document, list its two top-ranked topics (by label), then preview the table.
doc_clas_bg = ldamodel_top_doc_topics(
    model_bg.doc_topic_,
    doc_labels_bg,
    top_n=2,
    topic_labels=topic_labels_bg,
)
doc_clas_bg.head()

Unnamed: 0_level_0,rank_1,rank_2
document,Unnamed: 1_level_1,Unnamed: 2_level_1
B0000V9K32.A3BVWMS9I8OH8U,2_size_small_order (0.4583),21_watch_band_wrist (0.4583)
B0000V9K32.ACT5DY536GISV,22_black_white_color (0.5744),20_love_gift_son (0.3601)
B0000V9KRI.A1BEBWGPSB2DLM,23_shirt_short_sunglass (0.7042),10_shoe_run_free (0.2042)
B0000V9KRI.A1EDPEDXSQ78G4,21_watch_band_wrist (0.5868),12_boot_work_day (0.1701)
B0000V9KRI.AR7L2ZP173QEE,11_good_quality_product (0.7724),21_watch_band_wrist (0.1571)


In [None]:
# Load the raw document texts (a JSON object) from the data directory.
texts_bg_path = f'{DATA_DIR}/top_texts_bg.json'
# Decode explicitly as UTF-8 rather than relying on the platform's default encoding.
with open(texts_bg_path, 'r', encoding='utf-8') as f:
    top_texts_bg = json.load(f)

In [None]:
# Collect the texts in the order they are stored in the mapping; preview the first three.
doc_texts_bg = [*top_texts_bg.values()]
doc_texts_bg[:3]

['the colour i received is not blue as shown but yellow.Couldnt change it because it was a birthday present for my daughter and havent got time.She really didn,t like it',
 'Very cute and is really practical. Fits better on smaller wrists which is my case. I wear them everywhere. I really love this watch!',
 'good price, very good material and excellent design, very useful for traveling, totally recomendation this use this product, to buy this']

In [None]:
# Attach the raw text to each classified document.
# NOTE(review): texts are matched to rows by position only; this assumes `top_texts_bg`
# is stored in the same order as `doc_labels_bg`. Confirm against the code that wrote it.
doc_clas_bg = doc_clas_bg.assign(text=doc_texts_bg)
doc_clas_bg.head()

Unnamed: 0_level_0,rank_1,rank_2,text
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0000V9K32.A3BVWMS9I8OH8U,2_size_small_order (0.4583),21_watch_band_wrist (0.4583),the colour i received is not blue as shown but...
B0000V9K32.ACT5DY536GISV,22_black_white_color (0.5744),20_love_gift_son (0.3601),Very cute and is really practical. Fits better...
B0000V9KRI.A1BEBWGPSB2DLM,23_shirt_short_sunglass (0.7042),10_shoe_run_free (0.2042),"good price, very good material and excellent d..."
B0000V9KRI.A1EDPEDXSQ78G4,21_watch_band_wrist (0.5868),12_boot_work_day (0.1701),"I mean, Roxy rocks, but I'm kinda dissapointed..."
B0000V9KRI.AR7L2ZP173QEE,11_good_quality_product (0.7724),21_watch_band_wrist (0.1571),"I love this watch, i use every day, every wher..."


In [None]:
# Export the document/topic table (now including the text column) as CSV.
doc_clas_path = f'{DATA_DIR}/doc_clas_24_1_1.csv'
doc_clas_bg.to_csv(path_or_buf=doc_clas_path)

In [None]:
# Collect the keyword arguments for pyLDAvis from the fitted model, the matrix and the vocabulary.
ldavis_params_bg = parameters_for_ldavis(
    model_bg.topic_word_, model_bg.doc_topic_, dtm_bg, vocab_bg
)

In [None]:
%matplotlib inline
vis = pyLDAvis.prepare(**ldavis_params_bg)
pyLDAvis.enable_notebook(local=True)
pyLDAvis.display(vis)

![LDAvis](pic/pyLDAvis_24_1_1.png)