In [1]:

import artm
import glob
import os

import artm
import glob  # finds paths matching a shell-style wildcard pattern (not a regex)
import os

In [2]:
def get_batch_vectorizer(target_batches_folder, data_path):
    """Return an ``artm.BatchVectorizer`` for the collection.

    If ``target_batches_folder`` already contains at least one entry matching
    ``*``, the vectorizer is built from that folder with
    ``data_format='batches'``. Otherwise it is built from the Vowpal Wabbit
    file at ``data_path`` with ``target_folder=target_batches_folder``.

    Parameters
    ----------
    target_batches_folder : str
        Folder checked for existing entries and used as the batch folder.
    data_path : str
        Path of the Vowpal Wabbit source file; also used to derive the
        collection name (``data_path + '_collection'``).
    """
    existing_entries = glob.glob(os.path.join(target_batches_folder, "*"))

    if existing_entries:
        # Something is already in the folder: treat it as ready-made batches.
        return artm.BatchVectorizer(
            data_path=target_batches_folder,
            data_format='batches',
        )

    # Nothing there yet: build the vectorizer from the raw collection.
    return artm.BatchVectorizer(
        data_path=data_path,
        data_format='vowpal_wabbit',
        collection_name=data_path + '_collection',
        target_folder=target_batches_folder,
    )


In [3]:
def get_dict(target_batches_folder):
    """Return an ``artm.Dictionary`` named ``"dictionary"`` for the batches.

    The text form of the dictionary is cached at
    ``<target_batches_folder>/dict.txt``: if that file exists it is loaded,
    otherwise the dictionary is gathered from ``target_batches_folder`` and
    then saved to that path.
    """
    cache_path = os.path.join(target_batches_folder, "dict.txt")
    dictionary = artm.Dictionary(name="dictionary")

    if os.path.exists(cache_path):
        dictionary.load_text(cache_path)
        return dictionary

    # No cached file yet: gather from the batches and persist the result.
    dictionary.gather(target_batches_folder)
    dictionary.save_text(cache_path)
    return dictionary


In [4]:
def print_top_words(model, tracker_name="top_tokens"):
    """Print the most recent top tokens of every topic of ``model``.

    For each name in ``model.topic_names`` a ``"<topic>: "`` header line is
    printed, followed by one token per line taken from
    ``model.score_tracker[tracker_name].last_tokens[topic]``. Topics with no
    entry in ``last_tokens`` are reported as ``"free topic"``. A blank line
    separates topics.

    Parameters
    ----------
    model
        Object exposing ``topic_names`` and ``score_tracker``.
    tracker_name : str
        Key of the top-tokens score inside ``model.score_tracker``.
    """
    # Look the tracker up once; the original referenced the undefined names
    # `topic_named` and `last_tokens` and raised NameError on every call.
    last_tokens = model.score_tracker[tracker_name].last_tokens

    for topic_name in model.topic_names:
        print(topic_name + ': ')
        if topic_name in last_tokens:
            for word in last_tokens[topic_name]:
                print(word)
        else:
            print("free topic")
        print()


In [5]:

def generate_topic_names(topic_count, background_topic_count):
    """Build the topic name lists for a model with background topics.

    The first ``background_topic_count`` of the ``topic_count`` topics are
    named ``background_topic_<i>``; the remaining ones are named
    ``objective_topic_<j>`` with ``j`` counted from zero.

    Returns
    -------
    tuple of (list, list, list)
        ``(all_topics, objective_topics, background_topics)``; ``all_topics``
        lists the background names first, then the objective names.
    """
    indices = range(topic_count)

    background_topics = [
        "background_topic_" + str(idx)
        for idx in indices
        if idx < background_topic_count
    ]
    objective_topics = [
        "objective_topic_" + str(idx - background_topic_count)
        for idx in indices
        if not idx < background_topic_count
    ]

    # Background indices always precede objective ones, so concatenation
    # reproduces the per-index ordering.
    all_topics = background_topics + objective_topics
    return all_topics, objective_topics, background_topics


In [6]:
def set_regularizers(model, devided, topic_names,  **regs):
    """Add the regularizers requested in ``regs`` to ``model``.

    Every regularizer is added through
    ``model.regularizers.add(..., overwrite=True)`` under the same name as its
    key in ``regs``; the value stored under that key is passed as ``tau``.

    Parameters
    ----------
    model
        Object exposing ``model.regularizers.add(regularizer, overwrite=...)``.
    devided : bool
        If true, the ``objective_*`` / ``background_*`` keys are honoured and
        each regularizer is restricted to the matching topic list. If false,
        only ``sparse_phi`` / ``sparse_theta`` are honoured and no
        ``topic_names`` argument is passed.
    topic_names
        3-sequence ``(all_topics, objective_topics, background_topics)``.
    **regs
        Mapping of regularizer name to ``tau``. ``decorrelator_phi`` is
        honoured in both modes (limited to the objective topics when
        ``devided`` is true).
    """
    _, objective_topics, background_topics = topic_names

    # (key in `regs`, artm class name, topic list used when `devided`)
    if devided:
        plan = [
            ('objective_sparse_phi', 'SmoothSparsePhiRegularizer', objective_topics),
            ('objective_sparse_theta', 'SmoothSparseThetaRegularizer', objective_topics),
            ('background_sparse_phi', 'SmoothSparsePhiRegularizer', background_topics),
            ('background_sparse_theta', 'SmoothSparseThetaRegularizer', background_topics),
        ]
    else:
        plan = [
            ('sparse_phi', 'SmoothSparsePhiRegularizer', None),
            ('sparse_theta', 'SmoothSparseThetaRegularizer', None),
        ]
    # The decorrelator is always considered last.
    plan.append(('decorrelator_phi', 'DecorrelatorPhiRegularizer', objective_topics))

    for reg_name, class_name, topics in plan:
        if reg_name not in regs:
            continue
        params = {'name': reg_name, 'tau': regs[reg_name]}
        if devided:
            params['topic_names'] = topics
        regularizer = getattr(artm, class_name)(**params)
        model.regularizers.add(regularizer, overwrite=True)



In [7]:
def set_scores(model, topic_names, devided=True,  **scores):
    """Add the scores requested in ``scores`` to ``model``.

    Every score is added through ``model.scores.add(..., overwrite=True)``
    under the same name as its key. For the sparsity scores only the presence
    of the key matters; for the top-token scores the value is passed as
    ``num_tokens``.

    Parameters
    ----------
    model
        Object exposing ``model.scores.add(score, overwrite=...)``.
    topic_names
        3-sequence ``(all_topics, objective_topics, background_topics)``.
    devided : bool
        If true, the ``objective_sparsity_*`` / ``background_sparsity_*`` keys
        are honoured and each score is restricted to the matching topic list.
        If false, only ``sparsity_phi`` / ``sparsity_theta`` are honoured.
    **scores
        Recognised keys: ``top_tokens``, ``top_tokens_extended`` (both modes),
        plus the sparsity keys described above.
    """
    _, objective_topics, background_topics = topic_names

    for key in ('top_tokens', 'top_tokens_extended'):
        if key in scores:
            model.scores.add(
                artm.TopTokensScore(name=key, num_tokens=scores[key]),
                overwrite=True)

    if devided:
        if 'objective_sparsity_phi' in scores:
            model.scores.add(
                artm.SparsityPhiScore(
                    name='objective_sparsity_phi',
                    topic_names=objective_topics),
                overwrite=True)
        if 'objective_sparsity_theta' in scores:
            model.scores.add(
                artm.SparsityThetaScore(
                    name='objective_sparsity_theta',
                    topic_names=objective_topics),
                overwrite=True)
        if 'background_sparsity_phi' in scores:
            model.scores.add(
                artm.SparsityPhiScore(
                    name='background_sparsity_phi',
                    topic_names=background_topics),
                overwrite=True)
        # The key used to be tested as 'background_sparsityity_theta', so the
        # correctly spelled key was silently ignored. The misspelling is still
        # accepted so that existing callers keep working.
        if ('background_sparsity_theta' in scores
                or 'background_sparsityity_theta' in scores):
            model.scores.add(
                artm.SparsityThetaScore(
                    name='background_sparsity_theta',
                    topic_names=background_topics),
                overwrite=True)
    else:
        if 'sparsity_phi' in scores:
            model.scores.add(
                artm.SparsityPhiScore(name='sparsity_phi'),
                overwrite=True)
        if 'sparsity_theta' in scores:
            model.scores.add(
                artm.SparsityThetaScore(name='sparsity_theta'),
                overwrite=True)



In [8]:

def get_scores(topic_names):
    """Build the default list of score objects for a split-topic model.

    Prints the three topic lists, then returns seven ``artm`` score objects:
    perplexity, phi sparsity and theta sparsity for the objective topics,
    perplexity over all topics, theta and phi sparsity for the background
    topics, and a 10-token ``"top_words"`` score for the objective topics.

    Parameters
    ----------
    topic_names
        3-sequence ``(all_topics, objective_topics, background_topics)``.

    Returns
    -------
    list
        The score objects, in the order described above.
    """
    all_topics, objective_topics, background_topics = topic_names

    print("get_scores", all_topics)
    print("get scores : ", background_topics)
    print("get_scores : ", objective_topics)

    return [
        artm.PerplexityScore(name='objective_perplexity_score',
                             topic_names=objective_topics),
        artm.SparsityPhiScore(name='objective_sparsity_phi',
                              topic_names=objective_topics),
        artm.SparsityThetaScore(name='objective_sparsity_theta',
                                topic_names=objective_topics),
        artm.PerplexityScore(name='perplexity_score',
                             topic_names=all_topics),
        artm.SparsityThetaScore(name='background_sparsity_theta',
                                topic_names=background_topics),
        artm.SparsityPhiScore(name='background_sparsity_phi',
                              topic_names=background_topics),
        artm.TopTokensScore(name="top_words",
                            num_tokens=10,
                            topic_names=objective_topics),
    ]



## Get Batch vectorizer

In [9]:
# Reuse the batches in "bel_sites_batches" if any exist, else build them
# from the Vowpal Wabbit file "bel_sites.txt".
batch_vectorizer = get_batch_vectorizer(
    target_batches_folder="bel_sites_batches",
    data_path="bel_sites.txt",
)

In [10]:
# Load the cached dictionary from the batches folder, or gather and cache it.
dictionary = get_dict(target_batches_folder="bel_sites_batches")

In [11]:
# T topics in total; the first 10 of them are background topics.
T = 50
topic_names = generate_topic_names(topic_count=T, background_topic_count=10)
(all_topics, objective_topics, background_topics) = topic_names

In [12]:
# Number of tokens requested per topic by the 'top_tokens' score below.
_num_tokens = 15

In [13]:
# Build the topic model. `all_topics` is the same list as `topic_names[0]`.
# `class_ids` maps each modality name to the value given for it here.
devided_model = artm.ARTM(
    num_topics=T,
    topic_names=all_topics,
    cache_theta=True,
    reuse_theta=False,
    theta_columns_naming="title",
    seed=42,
    num_document_passes=20,
    num_processors=10,
    class_ids={"@content": 1, "@title": 5},
)

# Initialize the model from the dictionary obtained above.
devided_model.initialize(dictionary)


## Add some scores

In [14]:
# Overall perplexity score. `overwrite=True` matches the other score cells
# and lets this cell be re-run against the same model without failing on the
# already-registered name.
devided_model.scores.add(
    artm.PerplexityScore(name='perplexity_score'),
    overwrite=True)

### top tokens score

In [15]:
# Top `_num_tokens` tokens of the "@content" modality for every topic.
top_tokens_score = artm.TopTokensScore(
    name='top_tokens',
    num_tokens=_num_tokens,
    class_id="@content",
    topic_names=all_topics,
)
devided_model.scores.add(top_tokens_score, overwrite=True)


### objective_sparsity scores

In [16]:
# Phi and theta sparsity restricted to the objective topics.
# `overwrite=True` keeps the cell re-runnable on the same model, consistent
# with the 'top_tokens' score cell.
devided_model.scores.add(
    artm.SparsityPhiScore(
        name='objective_sparsity_phi',
        topic_names=objective_topics),
    overwrite=True)
devided_model.scores.add(
    artm.SparsityThetaScore(
        name='objective_sparsity_theta',
        topic_names=objective_topics),
    overwrite=True)

### background_sparsity scores

In [17]:
# Phi and theta sparsity restricted to the background topics.
# `overwrite=True` keeps the cell re-runnable on the same model, consistent
# with the 'top_tokens' score cell.
devided_model.scores.add(
    artm.SparsityPhiScore(
        name='background_sparsity_phi',
        topic_names=background_topics),
    overwrite=True)
devided_model.scores.add(
    artm.SparsityThetaScore(
        name='background_sparsity_theta',
        topic_names=background_topics),
    overwrite=True)

## Add regularizers

In [18]:
# Regularization coefficients (`tau`), keyed by regularizer name.
regs = dict(
    objective_sparse_phi=20,
    objective_sparse_theta=-4.24,
    background_sparse_phi=10,
    background_sparse_theta=4.24,
    decorrelator_phi=100,
)

### Add objective regularizers

In [19]:
# Phi and theta smooth/sparse regularizers limited to the objective topics,
# with `tau` taken from `regs`. `overwrite=True` keeps the cell re-runnable
# on the same model instead of failing on the already-registered names.
devided_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(
        name='objective_sparse_phi',
        topic_names=objective_topics,
        tau=regs['objective_sparse_phi']),
    overwrite=True)
devided_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='objective_sparse_theta',
        topic_names=objective_topics,
        tau=regs['objective_sparse_theta']),
    overwrite=True)

### Add background regularizers

In [20]:
# Phi and theta smooth/sparse regularizers limited to the background topics,
# with `tau` taken from `regs`. `overwrite=True` keeps the cell re-runnable
# on the same model instead of failing on the already-registered names.
devided_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(
        name='background_sparse_phi',
        topic_names=background_topics,
        tau=regs['background_sparse_phi']),
    overwrite=True)
devided_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='background_sparse_theta',
        topic_names=background_topics,
        tau=regs['background_sparse_theta']),
    overwrite=True)

### add decorrelator

In [21]:
# Phi decorrelator limited to the objective topics, with `tau` taken from
# `regs`. `overwrite=True` keeps the cell re-runnable on the same model
# instead of failing on the already-registered name.
devided_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(
        name='decorrelator_phi',
        topic_names=objective_topics,
        tau=regs['decorrelator_phi']),
    overwrite=True)

### fit offline

In [22]:
# Train the model offline: three passes over the whole collection.
devided_model.fit_offline(
    batch_vectorizer=batch_vectorizer,
    num_collection_passes=3,
)

In [23]:
# Show which topic names have an entry in the latest 'top_tokens' result.
devided_model.score_tracker['top_tokens'].last_tokens.keys()

dict_keys(['background_topic_0', 'objective_topic_6', 'objective_topic_15', 'objective_topic_7', 'objective_topic_12', 'background_topic_5', 'background_topic_7', 'background_topic_9', 'background_topic_2', 'objective_topic_5', 'objective_topic_0', 'objective_topic_34', 'objective_topic_32', 'objective_topic_18', 'objective_topic_11', 'objective_topic_2', 'objective_topic_4', 'objective_topic_39', 'objective_topic_28', 'objective_topic_14', 'objective_topic_21', 'objective_topic_10', 'objective_topic_20', 'objective_topic_22', 'objective_topic_31', 'objective_topic_1', 'background_topic_3', 'background_topic_6', 'objective_topic_16', 'objective_topic_17', 'objective_topic_33', 'objective_topic_30', 'objective_topic_35', 'objective_topic_19', 'objective_topic_24', 'objective_topic_36', 'objective_topic_26', 'background_topic_8', 'background_topic_4', 'objective_topic_37', 'objective_topic_3', 'objective_topic_27', 'objective_topic_38', 'objective_topic_29', 'objective_topic_9', 'objecti

In [24]:
# Print the latest top tokens of every topic that has an entry, one topic
# per line, then the latest perplexity and the phi and theta matrices.
top_tokens = devided_model.score_tracker['top_tokens'].last_tokens
for topic_name in devided_model.topic_names:
    if topic_name not in top_tokens:
        continue
    print(topic_name + ': ' + ' '.join(top_tokens[topic_name]))

print("Perplexity:", devided_model.score_tracker["perplexity_score"].last_value)
print(devided_model.get_phi())
print(devided_model.get_theta())

background_topic_0: быть это который один такой тот страна цена день самый квартира минск так два свой
background_topic_1: год что беларусь тысяча рубль тот как такой мочь уже быть сообщить какой самый два
background_topic_2: это который что также можно такой для получить самый мочь беларусь уже тот как сегодня
background_topic_3: что это быть год мочь такой весь один большой он здесь также так для тот
background_topic_4: быть что беларусь для один это год также гэта получить белорусский слово этот два свой
background_topic_5: быть год один рубль что он для который можно уже свой как самый водитель миллион
background_topic_6: мочь год быть тот который рубль он свой день при другой также это один этот
background_topic_7: год быть один также беларусь свой версия миллион новый группа страна получить что тот белорусский
background_topic_8: быть это что который как весь для год он человек тот этот свой один время
background_topic_9: что год это быть беларусь весь который один день самый как

                        7001      7002      7003      7004      7005  \
background_topic_0  0.024825  0.061886  0.051506  0.049496  0.065560   
background_topic_1  0.087350  0.060581  0.053022  0.064639  0.084331   
background_topic_2  0.012405  0.089422  0.127682  0.073588  0.095214   
background_topic_3  0.054871  0.132103  0.074225  0.225626  0.086392   
background_topic_4  0.743264  0.064900  0.092746  0.106420  0.083611   
background_topic_5  0.030922  0.104275  0.054085  0.050171  0.100383   
background_topic_6  0.009083  0.076844  0.111197  0.063893  0.083338   
background_topic_7  0.013166  0.055559  0.045309  0.061345  0.056926   
background_topic_8  0.009090  0.268084  0.292328  0.146747  0.270087   
background_topic_9  0.015024  0.086347  0.097900  0.158075  0.074160   
objective_topic_0   0.000000  0.000000  0.000000  0.000000  0.000000   
objective_topic_1   0.000000  0.000000  0.000000  0.000000  0.000000   
objective_topic_2   0.000000  0.000000  0.000000  0.000000  0.00