# Part II: Mallet LDA

In [1]:
# Run the helper-functions notebook
%run ../functions/mallet_functions.ipynb

### Load Processed Texts

In [2]:
# Relative path of the cleaned training corpus.
file_path = "../../data/train_clean.txt"

# Load the documents through the load_processed_text helper.
training_data = load_processed_text(file_path)

# Sanity check: first document, then the number of documents.
print(training_data[:1], len(training_data), sep="\n")

['buy guess flow heavy sort thing back tampon']
24510


### Mallet Setup

In [3]:
import os

# Location of the MALLET launcher and of the directory holding MALLET's
# input/output files. Either can be overridden without editing the notebook by
# setting the MALLET_PATH / MALLET_OUTPUT_DIR environment variables; when they
# are unset, the original defaults below are used.
path_to_mallet = os.environ.get('MALLET_PATH', 'C:/mallet/bin/mallet')  # CHANGE THIS TO YOUR MALLET PATH
output_directory_path = os.environ.get('MALLET_OUTPUT_DIR', 'C:/mallet/lda-data')  # CHANGE THIS TO YOUR OUTPUT DIRECTORY

# File names inside the output directory for the raw training text and its
# formatted counterpart (both are later passed to import_data).
path_to_training_data = output_directory_path + '/training.txt'
path_to_formatted_training_data = output_directory_path + '/mallet.training'

In [4]:
# Hand the cleaned documents to the import_data helper together with the
# MALLET launcher path and the raw / formatted training-file locations.
import_data(
    path_to_mallet,
    path_to_training_data,
    path_to_formatted_training_data,
    training_data,
)

Importing data...
Complete


### Baseline Model

In [5]:
# Topic count of the baseline run. It tags the output file names below and
# must match the num_topics value passed to train_topic_model for the baseline.
BASELINE_NUM_TOPICS = 20

# Create Corpus: split every document string into its whitespace-separated tokens.
texts = [doc.split() for doc in training_data]
print(texts[:1])

# Create Dictionary over the tokenised corpus.
# NOTE(review): `corpora` is not imported in this notebook; presumably it is
# brought into scope by the helper notebook executed in the first cell.
id2word = corpora.Dictionary(texts)

# MALLET output files for the baseline run, tagged with the topic count.
path_to_topic_keys = (
    output_directory_path + '/mallet.topic_keys.' + str(BASELINE_NUM_TOPICS) + '.txt'
)
path_to_topic_distributions = (
    output_directory_path + '/mallet.topic_distributions.' + str(BASELINE_NUM_TOPICS) + '.txt'
)

[['buy', 'guess', 'flow', 'heavy', 'sort', 'thing', 'back', 'tampon']]


In [6]:
%%time
# Fit the baseline model with the train_topic_model helper; the topic-keys and
# topic-distribution paths are passed in (presumably as output locations).
train_topic_model(
    path_to_mallet,
    path_to_formatted_training_data,
    path_to_topic_keys,
    path_to_topic_distributions,
    num_topics=20,
    interval=10,
    burnin=20,
    random_state=42,
)

# Read the per-topic word lists back from the topic-keys file.
topic_words = load_topic_words(path_to_topic_keys)

# Score the topics with c_v coherence against the tokenised training texts.
# NOTE(review): the original note says the top 50 words of each topic are used;
# that depends on what load_topic_words returns - confirm.
coherence_model_top_words = models.CoherenceModel(
    topics=topic_words,
    texts=texts,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score_top_words = coherence_model_top_words.get_coherence()
print('\nCoherence Score (Top Words):', coherence_score_top_words)


Coherence Score (Top Words): 0.6430015216940941
Wall time: 1min 6s


### Tune LDA

In [7]:
# Seeds Generation
# Ask the generate_random_seeds helper for 3 seeds; the resulting list is later
# passed to tune_lda_mallet.
# NOTE(review): if generate_random_seeds is not itself seeded, re-running this
# cell yields different seeds than the ones recorded in the output - confirm.
seeds = generate_random_seeds(3)
seeds

[1897, 8025, 6026]

In [8]:
# Search space for the tuning sweep: candidate topic counts, plus the
# interval / burn-in values tried for each of them.
num_topics = [10, 20, 50]
grid = dict(
    interval=[5, 10, 20, 50],
    burnin=[100, 200, 300],
)

In [9]:
# Run the tuning sweep over the topic counts, seeds and parameter grid.
# The recorded run took 108 iterations and roughly 1h52m.
df = tune_lda_mallet(
    path_to_mallet,
    path_to_formatted_training_data,
    texts,
    id2word,
    num_topics,
    seeds,
    grid,
)

Total progress: 100%|██████████████████████████████████████████████████████████████| 108/108 [1:51:32<00:00, 61.97s/it]


In [10]:
# Display the tuning results returned by tune_lda_mallet.
df

Unnamed: 0,num_topics,seed,score,interval,burnin,tests
0,10,1897,0.618147,20,200,"{0: {'top 5 sample': ['day', 'pregnant', 'baby..."
1,10,8025,0.633682,50,200,"{0: {'top 5 sample': ['test', 'pap', 'pregnanc..."
2,10,6026,0.616574,5,100,"{0: {'top 5 sample': ['pad', 'leak', 'feel', '..."
3,20,1897,0.614488,5,300,"{0: {'top 5 sample': ['midwife', 'home', 'baby..."
4,20,8025,0.633334,50,100,"{0: {'top 5 sample': ['week', 'boy', 'girl', '..."
5,20,6026,0.640129,50,300,"{0: {'top 5 sample': ['test', 'result', 'docto..."
6,50,1897,0.635813,50,300,"{0: {'top 5 sample': ['mother', 'make', 'life'..."
7,50,8025,0.639538,50,100,"{0: {'top 5 sample': ['yeast_infection', 'wate..."
8,50,6026,0.64243,50,300,"{0: {'top 5 sample': ['penis', 'foreskin', 'cu..."


### Save Results

In [11]:
# Destination CSV for the tuning results, written via the df_to_csv helper.
path = '../../res/mallet_test.csv'
df_to_csv(df, path)