# Part II: Gensim LDA

In [1]:
# Run the shared helper notebook so that its definitions are available here.
# NOTE(review): the names used below without a local import or definition
# (corpora, models, load_processed_text, generate_random_seeds, tune_lda_iter,
# df_to_csv) presumably come from this notebook -- confirm.
%run ../functions/gensim_functions.ipynb

### Load Processed Texts

In [2]:
# Path to the pre-processed text file, relative to this notebook.
file_path = "../../data/train_clean.txt"

# Load the texts with the project helper, then print the first entry and the
# total number of entries as a quick sanity check.
processed_ngrams = load_processed_text(file_path)
print(processed_ngrams[:1], len(processed_ngrams), sep="\n")

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]
24510


### Baseline Model

In [3]:
# Documents to model (same object as the loaded, processed texts).
texts = processed_ngrams

# Build the Dictionary from the tokenised documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one doc2bow() result per document.
corpus = list(map(id2word.doc2bow, texts))

In [4]:
%%time
# LDA Model
model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=42,
    chunksize=1000,
    passes=10,
    iterations=50,
    update_every = 1,
    alpha='auto',
    eta='auto',  
    eval_every=None # helps to train faster
)

# Compute Coherence Score
coherence_model = models.CoherenceModel(model=model, texts=processed_ngrams, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('\nCoherence Score:', coherence_score)


Coherence Score: 0.5294372097749029
Wall time: 1min 4s


In [5]:
%%time
# Extract top 50 words for each topic and create texts representing top words for each topic
top_words_texts = [[word for word, _ in model.show_topic(i, topn=50)] for i in range(20)]
top_words_texts

# Compute Coherence Score using top 50 words in each topics
coherence_model_top_words = models.CoherenceModel(topics=top_words_texts, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score_top_words = coherence_model_top_words.get_coherence()
print('\nCoherence Score (Top Words):', coherence_score_top_words)


Coherence Score (Top Words): 0.5294372097749029
Wall time: 12.2 s


### Tune LDA

In [10]:
# Seeds Generation
seeds = generate_random_seeds(3)
seeds

[8558, 3883, 7967]

In [11]:
# Params Grid
num_topics = [10, 20, 50]

grid = {'alpha': ['symmetric','asymmetric','auto', 0.1, 0.01],  
        'eta': ['symmetric', 'auto', 0.1, 0.01]}

In [12]:
# Run the tuning helper over the topic counts, seeds and alpha/eta grid.
# The recorded progress bar shows 180 iterations at roughly 1-1.5 minutes each,
# so expect this cell to take several hours.
df = tune_lda_iter(texts, corpus, id2word, num_topics, seeds, grid)


Total progress:   0%|                                                                          | 0/180 [00:00<?, ?it/s][A
Total progress:   1%|▎                                                               | 1/180 [00:57<2:52:44, 57.90s/it][A
Total progress:   1%|▋                                                               | 2/180 [02:00<2:59:57, 60.66s/it][A
Total progress:   2%|█                                                               | 3/180 [02:59<2:56:52, 59.96s/it][A
Total progress:   2%|█▍                                                              | 4/180 [04:00<2:57:04, 60.37s/it][A
Total progress:   3%|█▊                                                              | 5/180 [04:57<2:52:21, 59.10s/it][A
Total progress:   3%|██▏                                                             | 6/180 [05:59<2:54:42, 60.24s/it][A
Total progress:   4%|██▍                                                             | 7/180 [07:02<2:55:35, 60.90s/it][A
Total progress:

Total progress:  37%|██████████████████████▎                                      | 66/180 [1:06:09<2:05:59, 66.31s/it][A
Total progress:  37%|██████████████████████▋                                      | 67/180 [1:07:11<2:02:26, 65.02s/it][A
Total progress:  38%|███████████████████████                                      | 68/180 [1:08:14<2:00:23, 64.50s/it][A
Total progress:  38%|███████████████████████▍                                     | 69/180 [1:09:17<1:58:35, 64.10s/it][A
Total progress:  39%|███████████████████████▋                                     | 70/180 [1:10:21<1:57:23, 64.03s/it][A
Total progress:  39%|████████████████████████                                     | 71/180 [1:11:20<1:53:50, 62.67s/it][A
Total progress:  40%|████████████████████████▍                                    | 72/180 [1:12:23<1:52:42, 62.62s/it][A
Total progress:  41%|████████████████████████▋                                    | 73/180 [1:13:30<1:53:56, 63.89s/it][A
Total progress: 

Total progress:  73%|████████████████████████████████████████████                | 132/180 [2:22:11<1:07:34, 84.47s/it][A
Total progress:  74%|████████████████████████████████████████████▎               | 133/180 [2:23:35<1:06:05, 84.37s/it][A
Total progress:  74%|████████████████████████████████████████████▋               | 134/180 [2:25:05<1:05:52, 85.93s/it][A
Total progress:  75%|█████████████████████████████████████████████               | 135/180 [2:26:28<1:03:41, 84.93s/it][A
Total progress:  76%|█████████████████████████████████████████████▎              | 136/180 [2:27:49<1:01:31, 83.89s/it][A
Total progress:  76%|███████████████████████████████████████████████▏              | 137/180 [2:29:10<59:35, 83.15s/it][A
Total progress:  77%|███████████████████████████████████████████████▌              | 138/180 [2:30:42<59:51, 85.52s/it][A
Total progress:  77%|███████████████████████████████████████████████▉              | 139/180 [2:32:04<57:45, 84.54s/it][A
Total progress: 

In [13]:
# Display the tuning results returned by tune_lda_iter.
df

Unnamed: 0,num_topics,seed,score,alpha,eta,tests
0,10,8558,0.626402,asymmetric,0.01,"{0: {'top 5 sample': ['problem', 'man', 'body'..."
1,10,3883,0.580637,asymmetric,symmetric,"{0: {'top 5 sample': ['make', 'formula', 'cup'..."
2,10,7967,0.625912,auto,symmetric,"{0: {'top 5 sample': ['wear', 'buy', 'hour', '..."
3,20,8558,0.559939,0.01,0.1,"{0: {'top 5 sample': ['mother', 'live', 'murde..."
4,20,3883,0.569342,0.1,0.01,"{0: {'top 5 sample': ['side', 'weird', 'belly'..."
5,20,7967,0.562992,0.01,0.1,"{0: {'top 5 sample': ['overnight', 'bother', '..."
6,50,8558,0.490047,0.01,0.1,"{0: {'top 5 sample': ['treatment', 'physical',..."
7,50,3883,0.472989,asymmetric,0.1,"{0: {'top 5 sample': ['feel', 'wear', 'pad', '..."
8,50,7967,0.495381,0.01,0.1,"{0: {'top 5 sample': ['push', 'wear', 'start',..."


### Save Results

In [None]:
# Write the tuning results to a CSV file using the project helper.
path = '../../res/gensim_test.csv'
df_to_csv(df, path)