In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import artm
import copy
import re


def print_and_gather_measures(model, metrics):
    """Print the model's quality scores and append them to ``metrics``.

    Five values are reported, always in this order: Phi sparsity, Theta
    sparsity, average kernel contrast, average kernel purity, perplexity.
    Each score object is requested from the model exactly once, so the value
    that is printed is guaranteed to be the value that is stored.

    Args:
        model: object exposing ``get_score(name)`` for the score names
            'SparsityPhiScore', 'SparsityThetaScore', 'TopicKernelScore'
            and 'PerplexityScore'.
        metrics: list that is extended in place with the five values.

    Returns:
        None. Results are delivered through stdout and ``metrics``.
    """
    sparsity_phi = model.get_score('SparsityPhiScore').value
    print('Sparsity Phi: ', sparsity_phi)
    metrics.append(sparsity_phi)

    sparsity_theta = model.get_score('SparsityThetaScore').value
    print('Sparsity Theta: ', sparsity_theta)
    metrics.append(sparsity_theta)

    # Contrast and purity live on the same score object: fetch it once.
    kernel_score = model.get_score('TopicKernelScore')

    kernel_contrast = kernel_score.average_kernel_contrast
    print('Kernel contrast: ', kernel_contrast)
    metrics.append(kernel_contrast)

    kernel_purity = kernel_score.average_kernel_purity
    print('Kernel purity: ', kernel_purity)
    metrics.append(kernel_purity)

    perplexity = model.get_score('PerplexityScore').value
    print('PerplexityScore: ', perplexity)
    metrics.append(perplexity)

    
def _token_from_phi_label(label):
    """Return the bare token for one row label of the Phi matrix.

    Tuple labels such as ``('@default_class', 'tag')`` yield their last
    element directly, so tokens containing quotes or backslashes come through
    unchanged. Any other label is matched against the textual
    ``('@default_class', 'tag')`` form; if that does not match, the label's
    string form is returned instead of raising.
    """
    if isinstance(label, tuple):
        return label[-1]
    found = re.search(r"\('@default_class', '(.*?)'\)", str(label))
    return found.group(1) if found else str(label)


def get_clustering(model, topics):
    """Group tokens by topic: every token with positive weight joins the cluster.

    Args:
        model: object whose ``phi_`` attribute is a DataFrame-like matrix that
            can be indexed by topic name (column) and whose row labels
            identify tokens.
        topics: sequence of topic (column) names to build clusters for.

    Returns:
        A list with one entry per topic, in the order of ``topics``. Each
        entry is the index of tokens whose weight in that topic is > 0,
        ordered by decreasing weight.
    """
    # Work on a copy: the row labels are rewritten below and the caller's
    # matrix must not be affected.
    phi_matrix = copy.deepcopy(model.phi_)
    phi_matrix.index = [_token_from_phi_label(label)
                        for label in phi_matrix.index.tolist()]

    clustering = []
    for topic in topics:
        weights = phi_matrix[topic]
        # Sort only the positive part of this one column instead of
        # re-sorting the whole matrix in place for every topic.
        positive = weights[weights > 0].sort_values(ascending=False)
        clustering.append(positive.index)
    return clustering

def print_clustering(clustering, topics):
    """Print, for each topic in turn: its name, its cluster, the cluster size.

    ``clustering[k]`` is taken to be the cluster that belongs to ``topics[k]``.
    """
    for position, topic_name in enumerate(topics):
        print(topic_name)
        cluster = clustering[position]
        print(cluster)
        print(len(cluster))
        
def save_clustering(clustering, topics, filename):
    """Write every topic's cluster to a text file.

    For each topic the file receives the topic name, a line
    `` Amount of tags in cluster: N`` and then one line with all tags of the
    cluster, each followed by ``", "``.

    Args:
        clustering: sequence where ``clustering[k]`` holds the tags (strings)
            of ``topics[k]``.
        topics: sequence of topic names (strings).
        filename: path of the file to create or overwrite.
    """
    # `with` closes the file even if a write raises part-way through.
    # UTF-8 is set explicitly so non-ASCII tags do not depend on the locale.
    with open(filename, "w", encoding="utf-8") as out:
        for position, topic_name in enumerate(topics):
            cluster = clustering[position]
            out.write(topic_name)
            out.write("\n Amount of tags in cluster: ")
            out.write(str(len(cluster)))
            out.write("\n")
            for tag in cluster:
                out.write(tag)
                out.write(", ")
            out.write("\n")
    
def print_top_tags(score_tracker):
    """Print the top tags grouped under their topic names.

    ``score_tracker`` must expose three parallel sequences: ``topic_name``,
    ``token`` and ``weight``. Entry ``k`` of each describes one (topic, tag,
    weight) triple. A topic name is printed once, before the first tag of
    each run of equal topic names; every tag is printed as ``tag: weight, ``.

    All entries are printed, starting with entry 0, and the loop never
    indexes past the end of the sequences.
    """
    topics = score_tracker.topic_name
    tags = score_tracker.token
    weights = score_tracker.weight
    for position, topic in enumerate(topics):
        # A header is due at the very first entry and whenever the topic changes.
        if position == 0 or topic != topics[position - 1]:
            print(topic)
        print('{}: {}, '.format(tags[position], weights[position]))
        
def save_top_tags(score_tracker, filename):
    """Write the top tags, grouped under their topic names, to a text file.

    ``score_tracker`` must expose three parallel sequences: ``topic_name``,
    ``token`` and ``weight``. A topic name (followed by a newline) is written
    once before the first tag of each run of equal topic names; every tag is
    written with the format ``'{}: {}; \\n '``.

    All entries are written, including entry 0.

    Args:
        score_tracker: object with ``topic_name``, ``token``, ``weight``.
        filename: path of the file to create or overwrite.
    """
    topics = score_tracker.topic_name
    tags = score_tracker.token
    weights = score_tracker.weight
    # `with` closes the file even if a write raises part-way through.
    # UTF-8 is set explicitly so non-ASCII tags do not depend on the locale.
    with open(filename, "w", encoding="utf-8") as out:
        for position, topic in enumerate(topics):
            # A header is due at the very first entry and whenever the topic changes.
            if position == 0 or topic != topics[position - 1]:
                out.write(topic)
                out.write("\n")
            out.write('{}: {}; \n '.format(tags[position], weights[position]))

# Model PLSA, num_topics = 500

In [2]:
# Build the batch source from the Vowpal Wabbit file; 'posts-tags-100' is passed
# as the target folder (presumably where the converted batches are written).
# `batch_vectorizer` and `dictionary` are reused by the later training cells.
batch_vectorizer = artm.BatchVectorizer(data_path='vw.tags.100.txt', data_format='vowpal_wabbit',target_folder='posts-tags-100')
dictionary = batch_vectorizer.dictionary

# 500 topics named topic_0 .. topic_499. No regularizers are added here, which
# is what the heading above calls PLSA.
topic_names = ['topic_{}'.format(i) for i in range(500)]
# The score names registered below are the ones looked up later through
# get_score() by print_and_gather_measures and the top-tags helpers.
model_plsa_500 = artm.ARTM(topic_names = topic_names, num_processors = 4,
                       scores = [artm.PerplexityScore(name = 'PerplexityScore', dictionary = dictionary), 
                                artm.SparsityPhiScore(name='SparsityPhiScore'),
                                artm.SparsityThetaScore(name='SparsityThetaScore'), 
                                artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.1),
                                artm.TopTokensScore(name='TopTokensScore', num_tokens = 20)],
                       seed = 62, show_progress_bars = True)


# Initialize from the dictionary, train with one fit_online call over the
# batches, then dump the trained model under the name "model_plsa_500".
model_plsa_500.initialize(dictionary=dictionary)
model_plsa_500.fit_online(batch_vectorizer = batch_vectorizer, asynchronous = True)
model_plsa_500.dump_artm_model("model_plsa_500")

HBox(children=(IntProgress(value=0, description='Batch', max=18125, style=ProgressStyle(description_width='ini…



In [3]:
# Collect the 500-topic PLSA scores; the list is filled in the order
# [sparsity phi, sparsity theta, kernel contrast, kernel purity, perplexity].
metrics_baseline = []
print_and_gather_measures(model_plsa_500, metrics_baseline)

Sparsity Phi:  0.9696954488754272
Sparsity Theta:  0.9932314157485962
Kernel contrast:  0.9974137544631958
Kernel purity:  0.9999590516090393
PerplexityScore:  12.941472053527832


In [5]:
# Export the 500-topic model: per-topic clusters and the top tags per topic.
# `topic_names` must still be the 500-name list here; a later cell rebinds it.
clustering = get_clustering(model_plsa_500, topic_names)
#print_clustering(clustering, topic_names)
save_clustering(clustering, topic_names, "clustering_500.txt")
#print_top_tags(model_plsa_500.get_score('TopTokensScore'))
save_top_tags(model_plsa_500.get_score('TopTokensScore'), "top_tags_500.txt")

# Model 01 (PLSA, num_topics = 1000). TODO: I need a coherence score for this model

In [None]:
# `batch_vectorizer` and `dictionary` are reused from the 500-topic cell above.
# The two commented lines are an alternative that points at the 'posts-tags-100'
# folder with data_format='batches' (presumably for a fresh kernel).
#batch_vectorizer = artm.BatchVectorizer(data_path='posts-tags-100',data_format='batches')     
#dictionary = batch_vectorizer.dictionary

# NOTE: this rebinds `topic_names` to 1000 names; every later cell that passes
# `topic_names` therefore refers to the 1000-topic models.
topic_names = ['topic_{}'.format(i) for i in range(1000)]
# Same scores and seed as the 500-topic model, only the number of topics differs.
model_plsa = artm.ARTM(topic_names = topic_names, num_processors = 4,
                       scores = [artm.PerplexityScore(name = 'PerplexityScore', dictionary = dictionary), 
                                artm.SparsityPhiScore(name='SparsityPhiScore'),
                                artm.SparsityThetaScore(name='SparsityThetaScore'), 
                                artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold = 0.1),
                                artm.TopTokensScore(name='TopTokensScore', num_tokens = 20)],
                       seed = 62, show_progress_bars = True)


model_plsa.initialize(dictionary=dictionary)

# Train with one fit_online call, then dump the model under "model_plsa_01".
model_plsa.fit_online(batch_vectorizer = batch_vectorizer, asynchronous = True)
model_plsa.dump_artm_model("model_plsa_01")

HBox(children=(IntProgress(value=0, description='Batch', max=18125, style=ProgressStyle(description_width='ini…

In [None]:
# Collect the 1000-topic PLSA scores; the list is filled in the order
# [sparsity phi, sparsity theta, kernel contrast, kernel purity, perplexity].
metrics_01 = []
print_and_gather_measures(model_plsa, metrics_01)

In [None]:
# Export the 1000-topic PLSA model: per-topic clusters and top tags per topic.
# `clustering` is rebound here, replacing the result of the 500-topic export.
clustering = get_clustering(model_plsa, topic_names)
#print_clustering(clustering, topic_names)
save_clustering(clustering, topic_names, "clustering_01.txt")
#print_top_tags(model_plsa.get_score('TopTokensScore'))
save_top_tags(model_plsa.get_score('TopTokensScore'), "top_tags_01.txt")

# Model 02 (+ Decorrelation Regularizer)

In [None]:
# Start Model 02 from a copy of the trained PLSA model; the regularizer in the
# next cell is added to the copy (presumably so `model_plsa` stays available
# as the unregularized baseline).
model_artm = copy.deepcopy(model_plsa)

In [None]:
# Add a Phi decorrelation regularizer, set its tau coefficient to 100, run one
# more fit_online call over the same batches and dump the result.
# NOTE: re-running this cell on the same `model_artm` calls add() again with the
# same regularizer name and trains the model further; re-run the deepcopy cell first.
model_artm.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))
model_artm.regularizers['decorrelator_phi_regularizer'].tau = 100
model_artm.fit_online(batch_vectorizer = batch_vectorizer, asynchronous = True)
model_artm.dump_artm_model("model_artm_02_decorrelator")

In [None]:
# Collect the scores of the decorrelated model; the list is filled in the order
# [sparsity phi, sparsity theta, kernel contrast, kernel purity, perplexity].
metrics_02 = []
print_and_gather_measures(model_artm, metrics_02)

In [None]:
# Export the decorrelated model: per-topic clusters and top tags per topic.
# `topic_names` is the 1000-name list bound in the Model 01 training cell.
clustering = get_clustering(model_artm, topic_names)
#print_clustering(clustering, topic_names)
save_clustering(clustering, topic_names, "clustering_02.txt")
#print_top_tags(model_artm.get_score('TopTokensScore'))
save_top_tags(model_artm.get_score('TopTokensScore'), "top_tags_02.txt")