# Part II: Gensim LDA

In [1]:
# run functions notbook
%run ../functions/gensim_functions.ipynb

### Load Processed Texts

In [2]:
file_path = "../../data/train_clean.txt"
        
processed_ngrams = load_processed_text(file_path)
print(processed_ngrams[:1])
print(len(processed_ngrams))

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]
24510


### Baseline Model

In [3]:
# Create Dictionary
id2word = corpora.Dictionary(processed_ngrams)

# Create Corpus
texts = processed_ngrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [4]:
%%time
# LDA Model
model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=42,
    chunksize=1000,
    passes=10,
    iterations=50,
    update_every = 1,
    alpha='auto',
    eta='auto',  
    eval_every=None # helps to train faster
)

# Compute Coherence Score
coherence_model = models.CoherenceModel(model=model, texts=processed_ngrams, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('\nCoherence Score:', coherence_score)


Coherence Score: 0.5294372097749029
Wall time: 1min 7s


In [5]:
%%time
# Extract top 50 words for each topic and create texts representing top words for each topic
top_words_texts = [[word for word, _ in model.show_topic(i, topn=50)] for i in range(20)]
top_words_texts

# Compute Coherence Score using top 50 words in each topics
coherence_model_top_words = models.CoherenceModel(topics=top_words_texts, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score_top_words = coherence_model_top_words.get_coherence()
print('\nCoherence Score (Top Words):', coherence_score_top_words)


Coherence Score (Top Words): 0.5294372097749029
Wall time: 13 s


### Tune LDA

In [6]:
# Seeds Generation
seeds = generate_random_seeds(5)
seeds

[1928, 3328, 1062, 3953, 1172]

In [12]:
# Params Grid
num_topics = [10, 20, 50, 100]

grid = {'alpha': ['symmetric','asymmetric','auto', 10, 0.1, 0.01],  
        'eta': [None, 'symmetric', 'auto', 10, 0.1, 0.01]}

In [13]:
df = tune_lda_grid(texts, corpus, id2word, num_topics, seeds, grid)

Total progress: 100%|█████████████████████████████████████████████████████████████| 720/720 [14:19:26<00:00, 71.62s/it]


In [16]:
df

Unnamed: 0,num_topics,seed,score,alpha,eta,tests
0,10,1928,0.615142,asymmetric,0.01,"{0: {'top 5 sample': ['start', 'time', 'feel',..."
1,10,3328,0.632058,10,0.01,"{0: {'top 5 sample': ['iud', 'month', 'bad', '..."
2,10,1062,0.640627,10,,"{0: {'top 5 sample': ['cup', 'flow', 'partner'..."
3,10,3953,0.633518,10,0.01,"{0: {'top 5 sample': ['pregnancy', 'early', 't..."
4,10,1172,0.648327,10,,"{0: {'top 5 sample': ['control', 'pill', 'take..."
5,20,1928,0.55793,asymmetric,0.1,"{0: {'top 5 sample': ['good', 'time', 'feel', ..."
6,20,3328,0.574971,0.01,,"{0: {'top 5 sample': ['feel', 'time', 'ultraso..."
7,20,1062,0.561865,10,,"{0: {'top 5 sample': ['pad', 'sex', 'clean', '..."
8,20,3953,0.563291,asymmetric,0.01,"{0: {'top 5 sample': ['week', 'doctor', 'day',..."
9,20,1172,0.58556,10,0.1,"{0: {'top 5 sample': ['period', 'pill', 'pack'..."


### Save Results

In [17]:
path ='../../res/gensim_test.csv'

df_to_csv(df, path)