# Imports and Settings

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q lda
!pip install -q "tmtoolkit[recommended, lda]"
!pip install -q pyLDAvis

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.8/349.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for globre (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import warnings
import pandas as pd
import numpy as np
import random
import operator
import itertools
import gzip
import json
import pickle

from lda import LDA
import scipy.sparse
from tmtoolkit.topicmod.tm_lda import compute_models_parallel, evaluate_topic_models
from tmtoolkit.topicmod.model_io import ldamodel_top_doc_topics, print_ldamodel_topic_words
from tmtoolkit.topicmod.model_io import save_ldamodel_to_pickle, load_ldamodel_from_pickle
from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words
from tmtoolkit.topicmod.visualize import parameters_for_ldavis

import pyLDAvis
import matplotlib.pyplot as plt

# Single seed used for Python's RNG, NumPy's global RNG and as `random_state` of the LDA model.
SEED = 2004
random.seed(SEED)
np.random.seed(SEED)  # seed NumPy too, so any library drawing from its global RNG is reproducible
np.set_printoptions(precision=5)
# NOTE(review): this silences every warning, including deprecation notices from the libraries used here.
warnings.filterwarnings("ignore")

# Absolute path below the Drive mount point, so it resolves regardless of the current working directory.
DIR = '/content/drive/MyDrive/dtsa5799_product_review_topic_model'
DATA_DIR = f'{DIR}/data'

In [4]:
# Load the pre-built inputs from DATA_DIR: the sparse document-term matrices plus the
# pickled document labels and vocabularies, each in an `sm` and a `bg` variant.
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


dtm_sm = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_sm.npz')
dtm_bg = scipy.sparse.load_npz(f'{DATA_DIR}/new_dtm_bg.npz')

doc_labels_sm = _load_pickle(f'{DATA_DIR}/new_doc_labels_sm.p')
doc_labels_bg = _load_pickle(f'{DATA_DIR}/new_doc_labels_bg.p')

vocab_sm = _load_pickle(f'{DATA_DIR}/new_vocab_sm.p')
vocab_bg = _load_pickle(f'{DATA_DIR}/new_vocab_bg.p')

# Model_22_01_5 (22 topics, eta = 0.1, alpha = 5/22)

In [5]:
# Fit one LDA configuration on the `corp_sm` document-term matrix.
# `dtm_bg` is not included in this run.
dtms = dict(corp_sm=dtm_sm)

# The topic count is named once so that alpha is derived from the same number.
n_topics = 22
const_params = dict(
    n_iter=500,
    eta=.1,
    n_topics=n_topics,
    alpha=5 * (1 / n_topics),
    random_state=SEED,
)

# Only constant parameters are supplied; no varying parameter grid is passed.
models = compute_models_parallel(dtms, constant_parameters=const_params)
models

defaultdict(list,
            {'corp_sm': [({'n_iter': 500,
                'eta': 0.1,
                'n_topics': 22,
                'alpha': 0.2272727272727273,
                'random_state': 2004},
               <lda.lda.LDA at 0x7b4a28f1d7b0>)]})

In [6]:
# Show the keys under which the fitted models are stored.
models.keys()

dict_keys(['corp_sm'])

In [7]:
# The first entry stored under 'corp_sm' is a (parameters, fitted model) pair; keep the model.
_fit_params, model_sm = models['corp_sm'][0]

In [8]:
# Persist the fitted model together with its vocabulary, document labels and document-term matrix.
model_path = f'{DATA_DIR}/model_sm_22_01_5.p'
save_ldamodel_to_pickle(
    model_path,
    model_sm,
    vocab_sm,
    doc_labels_sm,
    dtm=dtm_sm,
)

In [9]:
# Print the five top-ranked words of every topic (top_n=5).
print_ldamodel_topic_words(model_sm.topic_word_, vocab_sm, top_n=5)

topic_1
> #1. comfortable (0.073572)
> #2. sandal (0.058935)
> #3. foot (0.054557)
> #4. wear (0.050929)
> #5. like (0.042672)
topic_2
> #1. shoe (0.136088)
> #2. foot (0.101922)
> #3. support (0.068633)
> #4. arch (0.037777)
> #5. need (0.023760)
topic_3
> #1. great (0.045619)
> #2. order (0.044205)
> #3. arrive (0.043851)
> #4. expect (0.040552)
> #5. time (0.039609)
topic_4
> #1. shoe (0.118165)
> #2. wear (0.029893)
> #3. toe (0.026950)
> #4. like (0.021985)
> #5. month (0.018675)
topic_5
> #1. watch (0.137713)
> #2. time (0.033775)
> #3. band (0.024343)
> #4. easy (0.023872)
> #5. like (0.020571)
topic_6
> #1. shoe (0.055056)
> #2. find (0.043406)
> #3. amazon (0.036416)
> #4. store (0.036319)
> #5. order (0.035154)
topic_7
> #1. buy (0.071793)
> #2. shoe (0.046578)
> #3. worth (0.039991)
> #4. money (0.034993)
> #5. not (0.028860)
topic_8
> #1. sock (0.122625)
> #2. foot (0.094074)
> #3. fit (0.063089)
> #4. wide (0.047596)
> #5. tight (0.045936)
topic_9
> #1. color (0.098296)
> 

In [10]:
# Build one label per topic from three of its top words (n_words=3, lambda_=0.7).
doc_lengths_sm = doc_lengths(dtm_sm)
vocab_sm_arr = np.array(vocab_sm)

topic_labels_sm = generate_topic_labels_from_top_words(
    model_sm.topic_word_,
    model_sm.doc_topic_,
    doc_lengths_sm,
    vocab_sm_arr,
    lambda_=0.7,
    n_words=3,
)
topic_labels_sm

array(['1_sandal_comfortable_slide', '2_foot_support_shoe',
       '3_arrive_fast_expect', '4_shoe_toe_lace', '5_watch_band_wrist',
       '6_amazon_store_find', '7_worth_money_buy', '8_sock_foot_wide',
       '9_color_black_white', '10_look_great_nice', '11_bag_gym_use',
       '12_air_nike_boot', '13_run_shoe_running',
       '14_good_quality_product', '15_pair_year_nike',
       '16_recommend_shoe_comfortable', '17_color_love_style',
       '18_play_basketball_shoe', '19_fit_perfectly_size',
       '20_love_son_say', '21_size_small_order', '22_day_work_wear'],
      dtype='<U29')

In [11]:
# Pickle the generated topic labels into DATA_DIR.
topic_labels_path = f'{DATA_DIR}/topic_labels_sm_22_01_5.p'
with open(topic_labels_path, mode='wb') as fh:
    pickle.dump(topic_labels_sm, fh)

In [12]:
# For every document, tabulate its two top-ranked topics (top_n=2), shown by topic label.
doc_clas_sm = ldamodel_top_doc_topics(
    model_sm.doc_topic_,
    doc_labels_sm,
    top_n=2,
    topic_labels=topic_labels_sm,
)
doc_clas_sm.head()

Unnamed: 0_level_0,rank_1,rank_2
document,Unnamed: 1_level_1,Unnamed: 2_level_1
B0000V9K32.A3BVWMS9I8OH8U,5_watch_band_wrist (0.3019),8_sock_foot_wide (0.3019)
B0000V9K32.ACT5DY536GISV,9_color_black_white (0.4517),20_love_son_say (0.2017)
B0000V9KRI.A1BEBWGPSB2DLM,14_good_quality_product (0.2934),4_shoe_toe_lace (0.1116)
B0000V9KRI.A1EDPEDXSQ78G4,11_bag_gym_use (0.2818),5_watch_band_wrist (0.1485)
B0000V9KRI.AR7L2ZP173QEE,14_good_quality_product (0.5767),11_bag_gym_use (0.1392)


In [13]:
# Load the document texts from JSON.
# The file is opened explicitly as UTF-8 so that non-ASCII characters decode the same
# way on every platform instead of depending on the default locale encoding.
texts_sm_path = f'{DATA_DIR}/top_texts_sm.json'
with open(texts_sm_path, 'r', encoding='utf-8') as f:
    top_texts_sm = json.load(f)

In [14]:
# Collect the texts into a list, in the mapping's own order, and preview the first three.
doc_texts_sm = [text for text in top_texts_sm.values()]
doc_texts_sm[:3]

['the colour i received is not blue as shown but yellow.Couldnt change it because it was a birthday present for my daughter and havent got time.She really didn,t like it',
 'Very cute and is really practical. Fits better on smaller wrists which is my case. I wear them everywhere. I really love this watch!',
 'good price, very good material and excellent design, very useful for traveling, totally recomendation this use this product, to buy this']

In [15]:
# Attach the document texts as a new "text" column.
# NOTE(review): the list is assigned by position, so this presumes the values of top_texts_sm
# are in the same order as the rows of doc_clas_sm — confirm against how the JSON was produced.
doc_clas_sm["text"] = doc_texts_sm
doc_clas_sm.head()

Unnamed: 0_level_0,rank_1,rank_2,text
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0000V9K32.A3BVWMS9I8OH8U,5_watch_band_wrist (0.3019),8_sock_foot_wide (0.3019),the colour i received is not blue as shown but...
B0000V9K32.ACT5DY536GISV,9_color_black_white (0.4517),20_love_son_say (0.2017),Very cute and is really practical. Fits better...
B0000V9KRI.A1BEBWGPSB2DLM,14_good_quality_product (0.2934),4_shoe_toe_lace (0.1116),"good price, very good material and excellent d..."
B0000V9KRI.A1EDPEDXSQ78G4,11_bag_gym_use (0.2818),5_watch_band_wrist (0.1485),"I mean, Roxy rocks, but I'm kinda dissapointed..."
B0000V9KRI.AR7L2ZP173QEE,14_good_quality_product (0.5767),11_bag_gym_use (0.1392),"I love this watch, i use every day, every wher..."


In [16]:
# Export the per-document topic ranking to CSV.
# The frame's index holds the document labels, so it is written as well; without it
# the exported rows would carry no identifier tying them back to a document.
doc_clas_path = f'{DATA_DIR}/doc_clas_sm_22_01_5.csv'
doc_clas_sm.to_csv(doc_clas_path, index=True)

In [17]:
# Derive the pyLDAvis input parameters from the fitted model, the document-term matrix and the vocabulary.
ldavis_params_sm = parameters_for_ldavis(
    model_sm.topic_word_, model_sm.doc_topic_, dtm_sm, vocab_sm
)

In [18]:
# Render the LDAvis view of the model inside the notebook.
%matplotlib inline
# Build the visualization data from the parameters prepared for pyLDAvis.
vis = pyLDAvis.prepare(**ldavis_params_sm)
# NOTE(review): local=True presumably makes pyLDAvis use locally served JS/CSS assets — confirm this works in the target runtime.
pyLDAvis.enable_notebook(local=True)
pyLDAvis.display(vis)

![LDAvis](pic/pyLDAvis_22_01_5.png)