# Part II: Mallet LDA

In [58]:
# run functions notebook
%run ../functions/mallet_functions.ipynb

### Load Processed Texts

In [48]:
# Path to the cleaned training corpus, relative to this notebook.
file_path = "../../data/train_clean.txt"

# load_processed_text is not defined in this notebook; presumably it comes from
# the %run'd functions notebook — TODO confirm.
training_data = load_processed_text(file_path)
# Sanity check: show the first document and the number of documents loaded.
# (The recorded output shows each document as one space-separated token string.)
print(training_data[:1])
print(len(training_data))

['buy guess flow heavy sort thing back tampon']
24510


### Mallet Setup

In [49]:
import os

# Machine-specific locations. Set the MALLET_PATH / MALLET_OUTPUT_DIR environment
# variables to point at your own installation instead of editing this cell; when
# they are unset, the original hard-coded locations are used as defaults.
path_to_mallet = os.environ.get('MALLET_PATH', 'C:/mallet/bin/mallet')  # MALLET launcher
output_directory_path = os.environ.get('MALLET_OUTPUT_DIR', 'C:/mallet/lda-data')  # working directory for LDA files

# Files derived from the output directory. Plain '/' concatenation is kept so the
# resulting strings are identical to the original ones on every platform.
path_to_training_data           = output_directory_path + '/training.txt'
path_to_formatted_training_data = output_directory_path + '/mallet.training'

In [50]:
# Hand the in-memory documents to import_data together with the MALLET launcher
# path, the plain-text training file path and the formatted-output path.
# NOTE(review): import_data is defined outside this notebook; presumably it writes
# training_data to path_to_training_data and converts it to MALLET's format at
# path_to_formatted_training_data — confirm in the functions notebook.
import_data(path_to_mallet,
                path_to_training_data,
                path_to_formatted_training_data,
                training_data)

Importing data...
Complete


### Baseline Model

In [54]:
# Topic count for the baseline run. It is defined here because the output file
# names below embed it; previously this cell read `num_topics`, which is not
# assigned anywhere above this point in the notebook (it is first assigned later,
# as a list, in the tuning section). Keep this equal to the `num_topics` value
# passed to train_topic_model for the baseline.
baseline_num_topics = 20

# Create Corpus: split every document string into its list of tokens
texts = [doc.split() for doc in training_data]
print(texts[:1])

# Create Dictionary
id2word = corpora.Dictionary(texts)

# mallet paths (suffixed with the topic count so runs with different counts
# do not overwrite each other's files)
path_to_model                   = output_directory_path + '/mallet.model.' + str(baseline_num_topics)
path_to_topic_keys              = output_directory_path + '/mallet.topic_keys.' + str(baseline_num_topics) + '.txt'
path_to_topic_distributions     = output_directory_path + '/mallet.topic_distributions.' + str(baseline_num_topics) + '.txt'

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]


In [55]:
%%time
# Train the baseline topic model with 20 topics and a fixed random state.
# NOTE(review): train_topic_model is defined outside this notebook; the meaning of
# `param` is not visible here — confirm in the functions notebook.
train_topic_model(path_to_mallet,
                  path_to_formatted_training_data,
                  path_to_topic_keys,
                  path_to_topic_distributions,
                  num_topics= 20,
                  param = 10,
                  random_state = 42)

# Load the topic words from the topic-keys file passed to the training call above
topic_words = load_topic_words(path_to_topic_keys)

# Compute the c_v coherence score from the top words of each topic
# (presumably 50 words per topic — TODO confirm against load_topic_words)
coherence_model_top_words = models.CoherenceModel(topics=topic_words, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score_top_words = coherence_model_top_words.get_coherence()
print('\nCoherence Score (Top Words):', coherence_score_top_words)

Training topic model...
Complete

Coherence Score (Top Words): 0.6493448917887764
Wall time: 1min 5s


### Tune LDA

In [59]:
# Seeds Generation
seeds = generate_random_seeds(2)
seeds

[836]

In [60]:
# Params Grid
num_topics = [10,20]

grid = {'param': [5, 10, 20]}

In [None]:
def tune_lda_mallet(texts, id2word, num_topics, seeds, grid):
    '''
    Grid-search LDA hyperparameters for every (num_topics, seed) pair.

    For each topic count and each seed, one model is trained per (alpha, eta)
    combination, scored with c_v coherence over its top 50 words per topic, and
    the best-scoring combination is kept.

    NOTE(review): despite the name, this trains gensim's LdaModel rather than
    MALLET - confirm which backend is intended.

    Parameters
    ----------
    texts : list of list of str
        Tokenised documents. Used to build the bag-of-words corpus and as the
        reference texts for the coherence computation.
    id2word : dictionary object providing ``doc2bow`` (e.g. gensim Dictionary)
        Vocabulary mapping passed to the model and the coherence model.
    num_topics : sized iterable of int
        Topic counts to try.
    seeds : sized iterable of int
        Random states to try for every topic count.
    grid : dict
        Must contain the keys 'alpha' and 'eta', each a list of candidate values.

    Returns
    -------
    pandas.DataFrame
        One row per (num_topics, seed) with the columns 'num_topics', 'seed',
        'score', 'alpha', 'eta' and 'tests' (the value returned by
        prepare_gensim_tests for the best model).

    Raises
    ------
    KeyError
        If 'alpha' or 'eta' is missing from ``grid``.
    '''
    # Fail early with a readable message instead of a bare KeyError mid-setup.
    missing_keys = [key for key in ('alpha', 'eta') if key not in grid]
    if missing_keys:
        raise KeyError("grid is missing required key(s): {}".format(missing_keys))

    # Build the bag-of-words corpus here: the function previously relied on a
    # `corpus` variable that was neither a parameter nor defined in the notebook.
    corpus = [id2word.doc2bow(doc) for doc in texts]

    # Same visiting order as nested alpha/eta loops.
    combos = [(alpha, eta) for alpha in grid['alpha'] for eta in grid['eta']]
    total_iter = len(num_topics) * len(seeds) * len(combos)

    results = []

    # The context manager closes the progress bar even if training raises.
    with tqdm(total=total_iter, desc="Total progress") as progress_bar:
        for k in num_topics:
            for seed in seeds:

                # tune lda
                best_model = None
                best_score = float('-inf')
                best_alpha = None
                best_eta = None

                for alpha, eta in combos:
                    model = models.ldamodel.LdaModel(
                                corpus=corpus,
                                id2word=id2word,
                                num_topics=k,
                                random_state=seed,
                                chunksize=1000,
                                passes=10,
                                iterations=50,
                                update_every=1,
                                alpha=alpha,
                                eta=eta,
                                eval_every=None)

                    # Coherence is computed on the 50 highest-weighted words of each topic.
                    top_words = [[word for word, _ in model.show_topic(i, topn=50)] for i in range(k)]
                    coherence_model = models.CoherenceModel(topics=top_words, texts=texts, dictionary=id2word, coherence='c_v')
                    score = coherence_model.get_coherence()

                    # Strict '>' keeps the first combination on ties.
                    if score > best_score:
                        best_score = score
                        best_model = model
                        best_alpha = alpha
                        best_eta = eta

                    progress_bar.update(1)

                # make tests
                tests = prepare_gensim_tests(seed, best_model)

                results.append({
                        'num_topics': k,
                        'seed': seed,
                        'score': best_score,
                        'alpha': best_alpha,
                        'eta': best_eta,
                        'tests': tests})

    df = pd.DataFrame(results)
    return df