# Imports and Settings

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# and written through the regular filesystem API.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q lda
!pip install -q "tmtoolkit[recommended, lda]"
!pip install -q pyLDAvis

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.8/349.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for globre (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import warnings
import pandas as pd
import numpy as np
import random
import operator
import itertools
import gzip
import json
import pickle

from lda import LDA
import scipy.sparse
from tmtoolkit.topicmod.tm_lda import compute_models_parallel, evaluate_topic_models
from tmtoolkit.topicmod.model_io import ldamodel_top_doc_topics, print_ldamodel_topic_words
from tmtoolkit.topicmod.model_io import save_ldamodel_to_pickle, load_ldamodel_from_pickle
from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words
from tmtoolkit.topicmod.visualize import parameters_for_ldavis

import pyLDAvis
import matplotlib.pyplot as plt

# --- Notebook-wide configuration -------------------------------------------

# Single seed shared by every source of randomness used in this notebook.
SEED = 2004
random.seed(SEED)
# Seed NumPy's global RNG as well, so any library that falls back to it
# behaves reproducibly across "Restart & Run All".
np.random.seed(SEED)

# Display floats in NumPy arrays with 5 decimals.
np.set_printoptions(precision=5)
# Silence all warnings for the rest of the session.
# NOTE(review): this is a blanket filter and also hides deprecation notices.
warnings.filterwarnings("ignore")

# Project folder and its data sub-folder. The path is relative, so it is
# resolved against the current working directory.
DIR = 'drive/MyDrive/dtsa5799_product_review_topic_model'
DATA_DIR = f'{DIR}/data'

In [None]:
# Load the pre-built artifacts from DATA_DIR: the sparse document-term
# matrices (.npz) plus the pickled document labels and vocabularies for the
# "sm" and "bg" corpora.
def _load_pickle(path):
    """Return the object unpickled from `path`.

    The file is opened in a `with` block so its handle is closed as soon as
    loading finishes, instead of being left to the garbage collector.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


dtm_sm = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_sm.npz')
dtm_bg = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_bg.npz')

doc_labels_sm = _load_pickle(f'{DATA_DIR}/new_doc_labels_sm.p')
doc_labels_bg = _load_pickle(f'{DATA_DIR}/new_doc_labels_bg.p')

vocab_sm = _load_pickle(f'{DATA_DIR}/new_vocab_sm.p')
vocab_bg = _load_pickle(f'{DATA_DIR}/new_vocab_bg.p')

# Model_27_01_1

In [None]:
# Fit a single LDA model on the "bg" document-term matrix using one fixed
# set of hyper-parameters (no parameter sweep).
const_params = dict(
    n_iter=500,
    eta=0.1,
    n_topics=27,
    alpha=1/27,
    random_state=SEED,
)
dtms = {'corp_bg': dtm_bg}

models = compute_models_parallel(dtms, constant_parameters=const_params)
models

defaultdict(list,
            {'corp_bg': [({'n_iter': 500,
                'eta': 0.1,
                'n_topics': 27,
                'alpha': 0.037037037037037035,
                'random_state': 2004},
               <lda.lda.LDA at 0x7b711608beb0>)]})

In [None]:
# Show the keys of the result mapping (one entry per DTM name passed in).
models.keys()

dict_keys(['corp_bg'])

In [None]:
# Every result entry is a (parameters, fitted model) pair; take the model
# from the first entry of the 'corp_bg' results.
_params_bg, model_bg = models['corp_bg'][0]

In [None]:
# Persist the fitted model along with its vocabulary, document labels and DTM.
model_path = f'{DATA_DIR}/model_27_01_1.p'
save_ldamodel_to_pickle(
    model_path,
    model_bg,
    vocab_bg,
    doc_labels_bg,
    dtm=dtm_bg,
)

In [None]:
# Print the top 5 words (with their probabilities) for every topic.
print_ldamodel_topic_words(
    model_bg.topic_word_,
    vocab_bg,
    top_n=5,
)

topic_1
> #1. shoe (0.065519)
> #2. foot (0.048738)
> #3. wear (0.036045)
> #4. get (0.016361)
> #5. buy (0.015931)
topic_2
> #1. shoe (0.070421)
> #2. like (0.028970)
> #3. foot (0.022724)
> #4. look (0.018522)
> #5. lace (0.017614)
topic_3
> #1. shoe (0.100486)
> #2. great (0.057322)
> #3. comfortable (0.056092)
> #4. fit (0.053221)
> #5. light (0.045019)
topic_4
> #1. sock (0.115973)
> #2. fit (0.037133)
> #3. foot (0.026881)
> #4. wear (0.026056)
> #5. like (0.024642)
topic_5
> #1. shoe (0.081743)
> #2. wear (0.050670)
> #3. day (0.048700)
> #4. comfortable (0.046840)
> #5. work (0.046074)
topic_6
> #1. wear (0.057862)
> #2. year (0.042454)
> #3. boot (0.040156)
> #4. son (0.037588)
> #5. love (0.037453)
topic_7
> #1. size (0.152514)
> #2. small (0.061369)
> #3. shoe (0.055054)
> #4. order (0.050104)
> #5. run (0.040461)
topic_8
> #1. love (0.079913)
> #2. buy (0.050274)
> #3. son (0.042247)
> #4. gift (0.040148)
> #5. shoe (0.033603)
topic_9
> #1. good (0.094505)
> #2. product (0.

In [None]:
# Build a short label for every topic out of 3 of its top words.
doc_lengths_bg = doc_lengths(dtm_bg)
vocab_array_bg = np.array(vocab_bg)

topic_labels_bg = generate_topic_labels_from_top_words(
    model_bg.topic_word_,
    model_bg.doc_topic_,
    doc_lengths_bg,
    vocab_array_bg,
    lambda_=0.7,
    n_words=3,
)
topic_labels_bg

array(['1_foot_shoe_wear', '2_shoe_lace_material',
       '3_shoe_light_comfortable', '4_sock_shirt_fit', '5_day_work_shoe',
       '6_boot_year_wear', '7_size_small_order', '8_gift_love_son',
       '9_good_product_quality', '10_support_shoe_arch',
       '11_play_basketball_cleat', '12_nike_shoe_pair', '13_muy_que_los',
       '14_watch_band_wrist', '15_color_black_white', '16_run_shoe_free',
       '17_arrive_fast_great', '18_love_color_fit', '19_shoe_month_nike',
       '20_sandal_slide_flip', '21_air_max_nike', '22_wide_foot_narrow',
       '23_return_order_send', '24_bag_gym_pocket',
       '25_find_store_amazon', '26_shoe_training_gym',
       '27_worth_money_cheap'], dtype='<U24')

In [None]:
# Store the generated topic labels next to the model so they can be reused.
topic_labels_path = f'{DATA_DIR}/topic_labels_27_01_1.p'
with open(topic_labels_path, mode='wb') as fh:
    pickle.dump(topic_labels_bg, fh)

In [None]:
# For every document, list its 2 highest-ranked topics (shown by label).
doc_clas_bg = ldamodel_top_doc_topics(
    model_bg.doc_topic_,
    doc_labels_bg,
    top_n=2,
    topic_labels=topic_labels_bg,
)
doc_clas_bg.head()

Unnamed: 0_level_0,rank_1,rank_2
document,Unnamed: 1_level_1,Unnamed: 2_level_1
B0000V9K32.A3BVWMS9I8OH8U,14_watch_band_wrist (0.5488),4_sock_shirt_fit (0.1852)
B0000V9K32.ACT5DY536GISV,15_color_black_white (0.5031),8_gift_love_son (0.4198)
B0000V9KRI.A1BEBWGPSB2DLM,27_worth_money_cheap (0.9037),13_muy_que_los (0.003704)
B0000V9KRI.A1EDPEDXSQ78G4,24_bag_gym_pocket (0.5031),14_watch_band_wrist (0.1698)
B0000V9KRI.AR7L2ZP173QEE,9_good_product_quality (0.6952),24_bag_gym_pocket (0.2336)


In [None]:
# Load the review texts of the "bg" corpus from JSON.
# The encoding is stated explicitly: without it `open` falls back to the
# platform's locale encoding, which can corrupt or fail on non-ASCII text.
texts_bg_path = f'{DATA_DIR}/top_texts_bg.json'
with open(texts_bg_path, 'r', encoding='utf-8') as f:
    top_texts_bg = json.load(f)

In [None]:
# Collect the text values in the mapping's own order and preview the first three.
doc_texts_bg = [*top_texts_bg.values()]
doc_texts_bg[:3]

['the colour i received is not blue as shown but yellow.Couldnt change it because it was a birthday present for my daughter and havent got time.She really didn,t like it',
 'Very cute and is really practical. Fits better on smaller wrists which is my case. I wear them everywhere. I really love this watch!',
 'good price, very good material and excellent design, very useful for traveling, totally recomendation this use this product, to buy this']

In [None]:
# Add the raw review text as a new column.
# NOTE(review): the texts are attached purely by position, which assumes the
# order of `top_texts_bg` matches the row order of `doc_clas_bg` - confirm.
doc_clas_bg = doc_clas_bg.assign(text=doc_texts_bg)
doc_clas_bg.head()

Unnamed: 0_level_0,rank_1,rank_2,text
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0000V9K32.A3BVWMS9I8OH8U,14_watch_band_wrist (0.5488),4_sock_shirt_fit (0.1852),the colour i received is not blue as shown but...
B0000V9K32.ACT5DY536GISV,15_color_black_white (0.5031),8_gift_love_son (0.4198),Very cute and is really practical. Fits better...
B0000V9KRI.A1BEBWGPSB2DLM,27_worth_money_cheap (0.9037),13_muy_que_los (0.003704),"good price, very good material and excellent d..."
B0000V9KRI.A1EDPEDXSQ78G4,24_bag_gym_pocket (0.5031),14_watch_band_wrist (0.1698),"I mean, Roxy rocks, but I'm kinda dissapointed..."
B0000V9KRI.AR7L2ZP173QEE,9_good_product_quality (0.6952),24_bag_gym_pocket (0.2336),"I love this watch, i use every day, every wher..."


In [None]:
# Export the per-document topic classification to CSV.
# The index is written too: it holds the `document` identifiers, and dropping
# it would leave rows that can no longer be traced back to their review.
doc_clas_path = f'{DATA_DIR}/doc_clas_27_01_1.csv'
doc_clas_bg.to_csv(doc_clas_path, index=True)

In [None]:
# Assemble the keyword arguments that pyLDAvis needs from the fitted model.
ldavis_params_bg = parameters_for_ldavis(
    model_bg.topic_word_,
    model_bg.doc_topic_,
    dtm_bg,
    vocab_bg,
)

In [None]:
%matplotlib inline
vis = pyLDAvis.prepare(**ldavis_params_bg)
pyLDAvis.enable_notebook(local=True)
pyLDAvis.display(vis)

![x](pic/pyLDAvis_27_01_1.png)