# Imports and Loading the DTMs


In [11]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read the data files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import os

try:
  import tmtoolkit
except:
  !pip install -U "tmtoolkit[recommended]"
  import tmtoolkit

In [13]:
try:
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except:
  !pip install tmtoolkit['lda']
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel

In [14]:
import pickle
import scipy.sparse
import logging
import warnings

In [15]:
try:
  from lda import LDA
except:
  !pip install lda

In [16]:
# Folder on the mounted Google Drive that holds the pickled label/vocabulary
# files and the saved sparse document-term matrices (DTMs).
working_directory = '/content/drive/MyDrive/2work/MSDS_marketing_text_analytics/master_files/2_topic_modeling'


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source (here: earlier steps of this project).
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Pickled document labels for the small ("sm") and big ("bg") datasets.
doc_labels_sm = _load_pickle('%s/doc_labels_sm.p' % working_directory)
doc_labels_bg = _load_pickle('%s/doc_labels_bg.p' % working_directory)

# Sparse DTMs saved in SciPy's .npz format.
dtm_sm = scipy.sparse.load_npz('%s/small_dtm.npz' % working_directory)
dtm_bg = scipy.sparse.load_npz('%s/big_dtm.npz' % working_directory)

# Pickled vocabularies matching the two DTMs.
vocab_bg = _load_pickle('%s/big_vocab.p' % working_directory)
vocab_sm = _load_pickle('%s/small_vocab.p' % working_directory)

# Creating Models

In [17]:
# Quiet the notebook: hide every Python warning, and silence the 'lda' logger
# so its "INFO" progress messages are not printed.
warnings.filterwarnings('ignore')

logger = logging.getLogger('lda')
logger.propagate = False                  # keep records away from the root logger
logger.addHandler(logging.NullHandler())  # discard whatever the logger receives

In [18]:
# Data to model: the two DTMs, keyed by the labels later used to look up the
# fitted models in `models`.
dtms = {
    'bigger': dtm_bg,
    'smaller': dtm_sm
}

# Fixed hyperparameters shared by both models.
# - alpha is the document-topic density: with a higher alpha, documents are
#   made up of more topics; with a lower alpha, documents contain fewer topics.
# - eta (called "beta" in much of the LDA literature, including the link below)
#   is the topic-word density: with a high value, topics are made up of most of
#   the words in the corpus; with a low value they consist of few words.
# https://www.thoughtvector.io/blog/lda-alpha-and-beta-parameters-the-intuition/
N_TOPICS = 16  # single source of truth so alpha cannot drift from n_topics
lda_params = {
    'n_topics': N_TOPICS,
    'eta': .01,
    'n_iter': 500,
    'random_state': 20191122,  # to make results reproducible
    'alpha': 1 / N_TOPICS
}

models = compute_models_parallel(dtms, constant_parameters=lda_params)

In [19]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

# Take element [0][1] of the 'smaller' results -- presumably the model from the
# first (and only) (parameters, model) pair for that DTM; verify against the
# tmtoolkit docs -- then print the five highest-weighted words of each topic.
model_sm = models['smaller'][0][1]
print_ldamodel_topic_words(model_sm.topic_word_, vocab_sm, top_n=5)

topic_1
> #1. size (0.157635)
> #2. shoe (0.112448)
> #3. small (0.093707)
> #4. run (0.070593)
> #5. order (0.062367)
topic_2
> #1. great (0.183331)
> #2. shoe (0.116051)
> #3. fit (0.100661)
> #4. look (0.065929)
> #5. price (0.056258)
topic_3
> #1. shoe (0.128435)
> #2. foot (0.110740)
> #3. wear (0.103459)
> #4. day (0.067901)
> #5. comfortable (0.059773)
topic_4
> #1. sneaker (0.149575)
> #2. pair (0.063499)
> #3. love (0.060476)
> #4. wear (0.052311)
> #5. comfortable (0.049590)
topic_5
> #1. shoe (0.161562)
> #2. great (0.076362)
> #3. comfortable (0.074376)
> #4. love (0.062957)
> #5. run (0.057992)
topic_6
> #1. shoe (0.140876)
> #2. pair (0.105473)
> #3. nike (0.077798)
> #4. wear (0.073456)
> #5. buy (0.071763)
topic_7
> #1. size (0.175838)
> #2. 10 (0.158691)
> #3. shoe (0.071061)
> #4. 1 (0.069248)
> #5. wear (0.055068)
topic_8
> #1. shoe (0.186363)
> #2. run (0.124834)
> #3. 1 (0.066856)
> #4. nike (0.059276)
> #5. foot (0.037014)
topic_9
> #1. shoe (0.146513)
> #2. love 

In [20]:
# Take element [0][1] of the 'bigger' results -- presumably the model from the
# first (and only) (parameters, model) pair for that DTM; verify against the
# tmtoolkit docs -- then print the five highest-weighted words of each topic.
model_bg = models['bigger'][0][1]
print_ldamodel_topic_words(model_bg.topic_word_, vocab_bg, top_n=5)

topic_1
> #1. size (0.103218)
> #2. 10 (0.063432)
> #3. shoe (0.061245)
> #4. fit (0.047294)
> #5. foot (0.040554)
topic_2
> #1. good (0.087190)
> #2. quality (0.060357)
> #3. product (0.059997)
> #4. fit (0.052947)
> #5. great (0.043811)
topic_3
> #1. love (0.108968)
> #2. color (0.067123)
> #3. shoe (0.063103)
> #4. comfortable (0.052687)
> #5. fit (0.048119)
topic_4
> #1. 1 (0.070153)
> #2. shoe (0.068224)
> #3. pair (0.063794)
> #4. wear (0.063325)
> #5. year (0.040445)
topic_5
> #1. size (0.145974)
> #2. 1 (0.085930)
> #3. shoe (0.052945)
> #4. small (0.049646)
> #5. order (0.047745)
topic_6
> #1. shoe (0.056506)
> #2. order (0.045869)
> #3. 1 (0.028653)
> #4. return (0.026993)
> #5. 10 (0.025333)
topic_7
> #1. sock (0.077557)
> #2. wear (0.047642)
> #3. foot (0.047409)
> #4. comfortable (0.034930)
> #5. boot (0.031781)
topic_8
> #1. great (0.064558)
> #2. shoe (0.055013)
> #3. love (0.054243)
> #4. fit (0.042209)
> #5. son (0.031301)
topic_9
> #1. shoe (0.138990)
> #2. great (0.0

When comparing two topic models built from different document-term matrices (DTMs)—one with a 2,500-word vocabulary and another with a 500-word vocabulary—consider several factors to critically evaluate which topic model is better. Here are the main considerations when inspecting the top words for each topic across the two models:

1. **Coherence and Interpretability**:
   - Check if the top words in each topic form a coherent theme that is easy to interpret. A good topic model should produce topics with words that are semantically related.
   - A larger DTM may include more specific or less frequent words that could potentially lead to more nuanced topics, while a smaller DTM may result in broader topics.

2. **Distinctiveness of Topics**:
   - Evaluate whether the topics are distinct from each other or if there is significant overlap in top words across topics.
   - A model with a larger DTM might have more unique words that can help differentiate topics, but it could also introduce noise. Conversely, a smaller DTM might lead to more general topics that could overlap.

3. **Relevance of Top Words**:
   - Consider the relevance of the top words to the topics. Are the top words representative of the topic, or are they too general or too obscure?
   - A larger DTM might include more relevant terms for each topic, but it could also contain irrelevant words that do not contribute to the topic's meaning.