# Part II: Gensim LDA

In [30]:
# Run the shared functions notebook inside this kernel.
# NOTE(review): the helpers called below (generate_random_seeds, load_processed_text,
# tune_lda, prepare_gensim_tests) and the gensim/pandas/tqdm names are not defined in
# this notebook, so they presumably come from this %run — confirm.
%run ../functions/gensim_functions.ipynb

### Seeds Generation

In [31]:
# Draw 5 seeds; every tuning section below runs once per seed in this array.
# NOTE(review): whether generate_random_seeds is itself seeded is not visible here —
# if it is not, the values displayed below change on every fresh run. Confirm.
seeds = generate_random_seeds(5)
seeds

array([1139, 1585, 5442, 8411, 5060])

### Load Processed Texts

In [32]:
# Path to the cleaned training texts, relative to this notebook
file_path = "../../data/train_clean.txt"
processed_ngrams = load_processed_text(file_path)

# Show the first document, then the number of documents loaded (one per line)
print(processed_ngrams[:1], len(processed_ngrams), sep="\n")

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]
24510


### Baseline Model

In [33]:
# Tokenised documents used for both the vocabulary and the corpus
texts = processed_ngrams

# Vocabulary: token <-> integer id mapping built from all documents
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

In [34]:
%%time
# Baseline LDA: 20 topics with a fixed random_state, trained on the full
# bag-of-words corpus. Both priors (alpha, eta) are set to 'auto'.
model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=42,
    chunksize=1000,
    passes=10,
    iterations=100,
    update_every = 1,
    alpha='auto',
    eta='auto',  
    eval_every=None # helps to train faster
)
# NOTE(review): per the gensim docs, log_perplexity returns a per-word likelihood
# bound (hence the negative value), not the perplexity itself; perplexity would be
# 2 ** (-bound). The printed label is therefore loose — confirm before reporting it.
print('\nPerplexity: ', model.log_perplexity(corpus))


Perplexity:  -9.329036690349717
Wall time: 56.3 s


In [47]:
%%time
# c_v coherence of the baseline model, scored against the tokenised training texts
coherence_model_lda = models.CoherenceModel(
    model=model,
    texts=processed_ngrams,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5331070305996004
Wall time: 12.2 s


In [58]:
%%time
# Extract top 50 words for each topic and create texts representing top words for each topic.
# Iterate over the model's own topic count rather than a hard-coded 20, so this
# cell stays correct if the baseline model's num_topics is changed.
top_words_texts = [[word for word, _ in model.show_topic(i, topn=50)] for i in range(model.num_topics)]

# Compute Coherence Score from the explicit word lists instead of the model object.
# NOTE(review): CoherenceModel has its own `topn` argument (default 20 per the gensim
# docs), which likely truncates these 50-word lists to their first 20 entries. That
# would explain why the recorded score is identical to the model-based score above.
# Pass topn=50 here if scoring all 50 words was the intent — confirm.
coherence_model_lda_top_words = models.CoherenceModel(topics=top_words_texts, texts=processed_ngrams, dictionary=id2word, coherence='c_v')
coherence_lda_top_words = coherence_model_lda_top_words.get_coherence()
print('\nCoherence Score (Top Words): ', coherence_lda_top_words)


Coherence Score (Top Words):  0.5331070305996004
Wall time: 12.3 s


### Tune LDA

### K=10

In [None]:
# Number of topics for this tuning section
num_topics = 10

# Search space: the same candidate values are tried for both priors.
# The comprehension builds a separate list per key, as the literal form did.
grid = {prior: ['auto', 0.01, 0.1, 1.0] for prior in ('alpha', 'eta')}

In [None]:
%%time
# For every seed: tune an LDA model with the current num_topics/grid, then build
# the tests from the winning model. Results are keyed by the seed's position.
total_iterations = seeds.shape[0]
intrusion_tests = {}

# The context manager closes the progress bar even when a run raises, so an
# interrupted cell does not leave a dangling bar behind.
with tqdm(total=total_iterations, desc="Running for seed") as progress_bar:
    for i, seed in enumerate(seeds):
        # tune lda (returns the best model with its score and chosen priors)
        best_lda_model, best_score, best_alpha, best_eta = tune_lda(seed, corpus, id2word, num_topics, grid)
        print("Best Model Perplexity:", best_score)

        # make tests
        tests = prepare_gensim_tests(seed, best_lda_model)
        # 'k' is deliberately not stored here: the DataFrame cell below inserts it
        intrusion_tests[i] = {'seed': seed, 'tests': tests,
                              'alpha': best_alpha, 'eta': best_eta}

        progress_bar.update(1)

In [None]:
# Inspect the entry stored for the first seed (key 0); .get yields None if it is absent
intrusion_tests.get(0)

In [None]:
# One row per seed run: the outer dict keys (run index) become the row index
df_gensim = pd.DataFrame(intrusion_tests).transpose()
# Size the k column from the frame instead of hard-coding five entries, so the
# insert does not fail with a length mismatch when the number of seeds changes.
k_values = [10] * len(df_gensim)
df_gensim.insert(loc=0, column='k', value=k_values)

In [None]:
# Display the K=10 results table (one row per seed)
df_gensim

### K=20

In [None]:
# Number of topics for this tuning section
num_topics = 20

# Search space: the same candidate values are tried for both priors.
# The comprehension builds a separate list per key, as the literal form did.
grid = {prior: ['auto', 0.01, 0.1, 1.0] for prior in ('alpha', 'eta')}

In [None]:
# For every seed: tune an LDA model with the current num_topics/grid, then build
# the tests from the winning model. Results are keyed by the seed's position.
total_iterations = seeds.shape[0]
intrusion_tests_20 = {}

# The context manager closes the progress bar even when a run raises, so an
# interrupted cell does not leave a dangling bar behind.
with tqdm(total=total_iterations, desc="Running for seed") as progress_bar:
    for i, seed in enumerate(seeds):
        # tune lda (returns the best model with its score and chosen priors)
        best_lda_model, best_score, best_alpha, best_eta = tune_lda(seed, corpus, id2word, num_topics, grid)
        print("Best Model Perplexity:", best_score)

        # make tests
        tests = prepare_gensim_tests(seed, best_lda_model)
        # this section records the topic count alongside each result
        intrusion_tests_20[i] = {'k': num_topics, 'seed': seed, 'tests': tests,
                                 'alpha': best_alpha, 'eta': best_eta}

        progress_bar.update(1)

### K=50

In [None]:
# Number of topics for this tuning section
num_topics = 50

# Search space: the same candidate values are tried for both priors.
# The comprehension builds a separate list per key, as the literal form did.
grid = {prior: ['auto', 0.01, 0.1, 1.0] for prior in ('alpha', 'eta')}

In [None]:
# For every seed: tune an LDA model with the current num_topics/grid, then build
# the tests from the winning model. Results are keyed by the seed's position.
total_iterations = seeds.shape[0]
intrusion_tests_50 = {}

# The context manager closes the progress bar even when a run raises, so an
# interrupted cell does not leave a dangling bar behind.
with tqdm(total=total_iterations, desc="Running for seed") as progress_bar:
    for i, seed in enumerate(seeds):
        # tune lda (returns the best model with its score and chosen priors)
        best_lda_model, best_score, best_alpha, best_eta = tune_lda(seed, corpus, id2word, num_topics, grid)
        print("Best Model Perplexity:", best_score)

        # make tests
        tests = prepare_gensim_tests(seed, best_lda_model)
        # NOTE(review): unlike the K=20 section, no 'k' key is stored here —
        # confirm that whatever consumes intrusion_tests_50 adds it.
        intrusion_tests_50[i] = {'seed': seed, 'tests': tests,
                                 'alpha': best_alpha, 'eta': best_eta}

        progress_bar.update(1)

### K=100

In [None]:
# Number of topics for this tuning section
num_topics = 100

# Search space: the same candidate values are tried for both priors.
# The comprehension builds a separate list per key, as the literal form did.
grid = {prior: ['auto', 0.01, 0.1, 1.0] for prior in ('alpha', 'eta')}

In [None]:
# For every seed: tune an LDA model with the current num_topics/grid, then build
# the tests from the winning model. Results are keyed by the seed's position.
total_iterations = seeds.shape[0]
intrusion_tests_100 = {}

# The context manager closes the progress bar even when a run raises, so an
# interrupted cell does not leave a dangling bar behind.
with tqdm(total=total_iterations, desc="Running for seed") as progress_bar:
    for i, seed in enumerate(seeds):
        # tune lda (returns the best model with its score and chosen priors)
        best_lda_model, best_score, best_alpha, best_eta = tune_lda(seed, corpus, id2word, num_topics, grid)
        print("Best Model Perplexity:", best_score)

        # make tests
        tests = prepare_gensim_tests(seed, best_lda_model)
        # NOTE(review): unlike the K=20 section, no 'k' key is stored here —
        # confirm that whatever consumes intrusion_tests_100 adds it.
        intrusion_tests_100[i] = {'seed': seed, 'tests': tests,
                                  'alpha': best_alpha, 'eta': best_eta}

        progress_bar.update(1)