In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import artm
import copy
import re


# Build the batch vectorizer from the Vowpal Wabbit file and keep a handle on
# the dictionary it exposes; both are reused by the cells below.
batch_vectorizer = artm.BatchVectorizer(
    data_path='vw.tags.100.txt',
    data_format='vowpal_wabbit',
    target_folder='posts-tags-100'
)
dictionary = batch_vectorizer.dictionary

In [2]:
# Hyper-parameter grids for the grid search, listed outermost loop first.

# Number of topics in each candidate model.
num_topics_grid = [1000, 1500, 2000]

# tau values passed to the SmoothSparsePhiRegularizer.
sparsity_phi_grid = [-5, -10, -15]

# tau values passed to the DecorrelatorPhiRegularizer.
decorrelation_phi_grid = [50000, 100000, 500000]

In [None]:
# Exhaustive grid search over (num_topics, sparsity_phi, decorrelation_phi).
#
# For every combination an ARTM model is fitted for 15 collection passes, its
# final scores are written to "<num_topics>_<sparsity_phi>_<decorrelation_phi>.txt"
# (one value per line: Phi sparsity, Theta sparsity, kernel contrast, kernel
# purity, perplexity), and the best parameter triple is tracked per criterion.

# Best values seen so far. Perplexity is minimised, contrast and purity are
# maximised. Infinite sentinels guarantee that the first fitted model always
# becomes the initial "best"; a finite starting value would silently leave the
# corresponding best_*_model list empty whenever no model happened to beat it.
perplexity_tracker = float('inf')
contrast_tracker = float('-inf')
purity_tracker = float('-inf')

# [num_topics, sparsity_phi, decorrelation_phi] of the best model per criterion.
best_perplexity_model = []
best_contrast_model = []
best_purity_model = []

for num_topics in num_topics_grid:
    # Topic names depend only on the topic count, so build them once per count.
    topic_names = ['topic_{}'.format(i) for i in range(num_topics)]
    for sparsity_phi in sparsity_phi_grid:
        for decorrelation_phi in decorrelation_phi_grid:
            model_artm = artm.ARTM(
                topic_names=topic_names,
                num_processors=4,
                scores=[
                    artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary),
                    artm.SparsityPhiScore(name='SparsityPhiScore'),
                    artm.SparsityThetaScore(name='SparsityThetaScore'),
                    artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.1),
                    artm.TopTokensScore(name='TopTokensScore', num_tokens=20),
                ],
                regularizers=[
                    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',
                                                    tau=decorrelation_phi),
                    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                                    tau=sparsity_phi),
                ],
                seed=62,  # fixed seed so that runs are comparable
                show_progress_bars=False,
            )
            model_artm.initialize(dictionary=dictionary)
            model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

            # Fetch every score exactly once and reuse the values below for
            # both the on-disk report and the best-model bookkeeping.
            current_sparsity_phi = model_artm.get_score('SparsityPhiScore').value
            current_sparsity_theta = model_artm.get_score('SparsityThetaScore').value
            kernel_score = model_artm.get_score('TopicKernelScore')
            current_contrast = kernel_score.average_kernel_contrast
            current_purity = kernel_score.average_kernel_purity
            current_perplexity = model_artm.get_score('PerplexityScore').value

            # The order of the entries defines the line order of the report file.
            metrics = [
                current_sparsity_phi,
                current_sparsity_theta,
                current_contrast,
                current_purity,
                current_perplexity,
            ]
            filename = str(num_topics) + "_" + str(sparsity_phi) + "_" + str(decorrelation_phi) + ".txt"
            with open(filename, 'w') as f:
                f.writelines("%s\n" % item for item in metrics)

            current_params = [num_topics, sparsity_phi, decorrelation_phi]
            if current_perplexity < perplexity_tracker:
                perplexity_tracker = current_perplexity
                best_perplexity_model = list(current_params)
            if current_purity > purity_tracker:
                purity_tracker = current_purity
                best_purity_model = list(current_params)
            if current_contrast > contrast_tracker:
                contrast_tracker = current_contrast
                best_contrast_model = list(current_params)