# Part II: Mallet LDA

In [1]:
# Run the helper-functions notebook so its definitions are available in this kernel
%run ../functions/mallet_functions.ipynb

### Load Processed Texts

In [2]:
# Processed training texts on disk (path is relative to this notebook).
file_path = "../../data/train_clean.txt"

training_data = load_processed_text(file_path)

# Sanity check: first document, then the number of documents loaded.
print(training_data[:1], len(training_data), sep="\n")

['buy guess flow heavy sort thing back tampon']
24510


### Mallet Setup

In [3]:
# Machine-specific locations: edit both before running on another machine.
path_to_mallet = 'C:/mallet/bin/mallet'  # CHANGE THIS TO YOUR MALLET PATH
output_directory_path = 'C:/mallet/lda-data'  # CHANGE THIS TO YOUR OUTPUT DIRECTORY

# Files derived from the output directory: the raw training text and the
# MALLET-formatted version of it.
path_to_training_data = f'{output_directory_path}/training.txt'
path_to_formatted_training_data = f'{output_directory_path}/mallet.training'

In [4]:
# Hand the documents to MALLET's import step.
# NOTE(review): import_data is defined in the functions notebook; presumably it
# writes training_data to path_to_training_data and produces the formatted file
# at path_to_formatted_training_data -- confirm against the helper.
import_data(
    path_to_mallet,
    path_to_training_data,
    path_to_formatted_training_data,
    training_data,
)

Importing data...
Complete


### Baseline Model

In [5]:
# Tokenise the corpus: every document becomes a list of its
# whitespace-separated words.
texts = list(map(str.split, training_data))
print(texts[:1])

# Word dictionary built over the tokenised documents
# (corpora is presumably gensim.corpora, provided by the functions notebook).
id2word = corpora.Dictionary(texts)

# Output files for the baseline run; the 20 in each file name mirrors the
# topic count passed to the baseline training call.
path_to_topic_keys = f'{output_directory_path}/mallet.topic_keys.{20}.txt'
path_to_topic_distributions = f'{output_directory_path}/mallet.topic_distributions.{20}.txt'

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]


In [6]:
%%time
# Baseline run: 20 topics with a fixed random_state. The topic-keys and
# topic-distribution paths are handed to the helper (presumably as MALLET's
# output locations -- confirm in the functions notebook).
train_topic_model(
    path_to_mallet,
    path_to_formatted_training_data,
    path_to_topic_keys,
    path_to_topic_distributions,
    num_topics=20,
    interval=10,
    burnin=20,
    random_state=42,
)

# Read the per-topic top words back from the topic-keys file.
topic_words = load_topic_words(path_to_topic_keys)

# c_v coherence of each topic's top words, scored against the tokenised corpus.
coherence_model_top_words = models.CoherenceModel(
    topics=topic_words,
    texts=texts,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score_top_words = coherence_model_top_words.get_coherence()
print('\nCoherence Score (Top Words):', coherence_score_top_words)


Coherence Score (Top Words): 0.6430015216940941
Wall time: 1min 1s


### Tune LDA

In [7]:
# Seeds Generation
seeds = generate_random_seeds(5)
seeds

[7097, 5301, 8687, 3433, 511]

In [8]:
# Params Grid
num_topics = [10, 20, 50, 100]

grid = {'interval': [5, 10, 20, 50, 100],
        'burnin': [50, 100, 200, 500]}

In [9]:
# Full tuning sweep over topic counts, seeds and the parameter grid.
# Expensive: the recorded run shows 400 iterations taking about 7h20m.
df = tune_lda_mallet(
    path_to_mallet,
    path_to_formatted_training_data,
    texts,
    id2word,
    num_topics,
    seeds,
    grid,
)

Total progress: 100%|██████████████████████████████████████████████████████████████| 400/400 [7:20:48<00:00, 66.12s/it]


In [10]:
# Display the tuning results (columns in the recorded output: num_topics, seed,
# score, interval, burnin, tests).
df

Unnamed: 0,num_topics,seed,score,interval,burnin,tests
0,10,7097,0.619082,5,100,"{0: {'top 5 sample': ['milk', 'feed', 'month',..."
1,10,5301,0.632168,100,100,"{0: {'top 5 sample': ['pregnant', 'day', 'week..."
2,10,8687,0.638164,100,500,"{0: {'top 5 sample': ['breastfeed', 'milk', 'f..."
3,10,3433,0.6246,100,500,"{0: {'top 5 sample': ['time', 'make', 'orgasm'..."
4,10,511,0.626687,100,200,"{0: {'top 5 sample': ['abortion', 'mother', 'f..."
5,20,7097,0.646652,100,500,"{0: {'top 5 sample': ['day', 'make', 'help', '..."
6,20,5301,0.645397,100,200,"{0: {'top 5 sample': ['time', 'start', 'feel',..."
7,20,8687,0.644444,10,100,"{0: {'top 5 sample': ['baby', 'epidural', 'con..."
8,20,3433,0.632472,100,500,"{0: {'top 5 sample': ['feel', 'make', 'pregnan..."
9,20,511,0.646027,100,500,"{0: {'top 5 sample': ['gain', 'pregnancy', 'we..."


### Save Results

In [11]:
# Write the tuning results to CSV (path is relative to this notebook).
path = '../../res/mallet_test.csv'
df_to_csv(df, path)