In [2]:
from nltk.corpus import stopwords

In [3]:
import sys

In [4]:
import artm
import glob
import os
import artm
import glob #module gives an opp to search for a file with a particular regex
import os

In [5]:
from matplotlib import pyplot as plt
%matplotlib inline


In [6]:
def get_batch_vectorizer(target_batches_folder, data_path):
    """Return a BatchVectorizer, reusing batches already present on disk.

    Args:
        target_batches_folder: folder that holds (or will receive) the batches.
        data_path: path to the Vowpal Wabbit source file; only used when
            ``target_batches_folder`` contains no entries yet.

    Returns:
        ``artm.BatchVectorizer`` built from the existing batches when the
        folder has at least one entry, otherwise one built from ``data_path``
        with ``target_batches_folder`` as its target folder.
    """
    existing_entries = glob.glob(os.path.join(target_batches_folder, "*"))
    if existing_entries:
        # Something is already in the folder: load it as ready-made batches.
        return artm.BatchVectorizer(
            data_path=target_batches_folder,
            data_format='batches')

    # Nothing there yet: build from the Vowpal Wabbit file.
    return artm.BatchVectorizer(
        data_path=data_path,
        data_format='vowpal_wabbit',
        collection_name=data_path + '_collection',
        target_folder=target_batches_folder)


In [7]:
def get_dict(target_batches_folder):
    """Return the ARTM dictionary for a batches folder, caching it as text.

    The dictionary is stored in ``<target_batches_folder>/dict.txt``. When
    that file exists it is loaded; otherwise the dictionary is gathered from
    the folder and written to that file.

    Args:
        target_batches_folder: folder containing the batches.

    Returns:
        An ``artm.Dictionary`` created with ``name="dictionary"``.
    """
    cached_path = os.path.join(target_batches_folder, "dict.txt")
    dictionary = artm.Dictionary(name="dictionary")

    if os.path.exists(cached_path):
        dictionary.load_text(cached_path)
        return dictionary

    # No cached copy yet: gather from the batches, then persist for next time.
    dictionary.gather(target_batches_folder)
    dictionary.save_text(cached_path)
    return dictionary


In [8]:
def print_top_words(model, tracker_name="top_tokens"):
    """Print the tracked top tokens of every topic in ``model``.

    For each name in ``model.topic_names`` a ``"<topic>: "`` header is
    printed, followed by one token per line, then an empty line. Topics that
    are missing from the tracker's ``last_tokens`` are reported as
    ``"free topic"``.

    Args:
        model: object exposing ``topic_names`` and ``score_tracker``.
        tracker_name: key of the top-tokens score in ``model.score_tracker``.
    """
    # The original tested the undefined names `topic_named` / `last_tokens`,
    # which raised NameError on the first topic.
    last_tokens = model.score_tracker[tracker_name].last_tokens
    for topic_name in model.topic_names:
        print(topic_name + ': ')
        if topic_name in last_tokens:
            for word in last_tokens[topic_name]:
                print(word)
        else:
            print("free topic")
        print()


In [9]:

def generate_topic_names(topic_count, background_topic_count):
    """Generate names for background and objective topics.

    The first ``background_topic_count`` positions are named
    ``background_topic_<i>``; the remaining positions are named
    ``objective_topic_<j>`` with ``j`` counted from zero.

    Args:
        topic_count: total number of topic names to generate.
        background_topic_count: how many leading topics are background ones.

    Returns:
        Tuple ``(all_topics, objective_topics, background_topics)`` of lists.
    """
    all_topics = []
    objective_topics = []
    background_topics = []

    for position in range(topic_count):
        if position < background_topic_count:
            name = "background_topic_{}".format(position)
            background_topics.append(name)
        else:
            name = "objective_topic_{}".format(position - background_topic_count)
            objective_topics.append(name)
        all_topics.append(name)

    return all_topics, objective_topics, background_topics


In [10]:
def set_regularizers(model, devided, topic_names, **regs):
    """Add the requested regularizers to ``model``.

    A regularizer is added only when its key is present in ``regs``; it is
    registered under that key with ``tau=regs[key]`` and ``overwrite=True``.

    Args:
        model: object exposing ``regularizers.add(regularizer, overwrite=...)``.
        devided: when truthy, the ``objective_*`` / ``background_*`` keys are
            honoured and bound to the matching topic group; otherwise the
            ``sparse_phi`` / ``sparse_theta`` keys are honoured without a
            topic restriction.
        topic_names: 3-tuple ``(all_topics, objective_topics,
            background_topics)``.
        **regs: mapping of regularizer key to tau. ``decorrelator_phi`` is
            honoured in both modes and restricted to the objective topics
            only when ``devided`` is truthy.
    """
    _, objective_topics, background_topics = topic_names

    if devided:
        grouped_specs = (
            ('objective_sparse_phi',
             artm.SmoothSparsePhiRegularizer, objective_topics),
            ('objective_sparse_theta',
             artm.SmoothSparseThetaRegularizer, objective_topics),
            ('background_sparse_phi',
             artm.SmoothSparsePhiRegularizer, background_topics),
            ('background_sparse_theta',
             artm.SmoothSparseThetaRegularizer, background_topics),
        )
        for key, regularizer_class, topics in grouped_specs:
            if key in regs:
                model.regularizers.add(
                    regularizer_class(
                        name=key, topic_names=topics, tau=regs[key]),
                    overwrite=True)
    else:
        global_specs = (
            ('sparse_phi', artm.SmoothSparsePhiRegularizer),
            ('sparse_theta', artm.SmoothSparseThetaRegularizer),
        )
        for key, regularizer_class in global_specs:
            if key in regs:
                model.regularizers.add(
                    regularizer_class(name=key, tau=regs[key]),
                    overwrite=True)

    if 'decorrelator_phi' in regs:
        decorrelator_kwargs = {
            'name': 'decorrelator_phi',
            'tau': regs['decorrelator_phi'],
        }
        if devided:
            decorrelator_kwargs['topic_names'] = objective_topics
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(**decorrelator_kwargs),
            overwrite=True)



In [11]:
def set_scores(model, topic_names, devided=True, **scores):
    """Add the requested scores to ``model``.

    A score is added (with ``overwrite=True``) only when its key is present
    in ``scores``. No perplexity score is added here.

    Args:
        model: object exposing ``scores.add(score, overwrite=...)``.
        topic_names: 3-tuple ``(all_topics, objective_topics,
            background_topics)``.
        devided: when truthy, the ``objective_sparsity_*`` /
            ``background_sparsity_*`` keys are honoured and bound to the
            matching topic group; otherwise ``sparsity_phi`` /
            ``sparsity_theta`` are honoured without a topic restriction.
        **scores: for ``top_tokens`` and ``top_tokens_extended`` the value is
            used as ``num_tokens``; for the sparsity keys only the presence
            of the key matters.
    """
    _, objective_topics, background_topics = topic_names

    for key in ('top_tokens', 'top_tokens_extended'):
        if key in scores:
            model.scores.add(
                artm.TopTokensScore(name=key, num_tokens=scores[key]),
                overwrite=True)

    if devided:
        if 'objective_sparsity_phi' in scores:
            model.scores.add(
                artm.SparsityPhiScore(
                    name='objective_sparsity_phi',
                    topic_names=objective_topics),
                overwrite=True)
        if 'objective_sparsity_theta' in scores:
            model.scores.add(
                artm.SparsityThetaScore(
                    name='objective_sparsity_theta',
                    topic_names=objective_topics),
                overwrite=True)
        if 'background_sparsity_phi' in scores:
            model.scores.add(
                artm.SparsityPhiScore(
                    name='background_sparsity_phi',
                    topic_names=background_topics),
                overwrite=True)
        # The original only tested the misspelled key
        # 'background_sparsityity_theta', so the correctly spelled key was
        # silently ignored. The misspelling is still accepted so existing
        # callers keep working.
        if ('background_sparsity_theta' in scores
                or 'background_sparsityity_theta' in scores):
            model.scores.add(
                artm.SparsityThetaScore(
                    name='background_sparsity_theta',
                    topic_names=background_topics),
                overwrite=True)
    else:
        if 'sparsity_phi' in scores:
            print ('if sparsity_phi in scores:')
            model.scores.add(
                artm.SparsityPhiScore(
                    name='sparsity_phi'),
                overwrite=True)
        if 'sparsity_theta' in scores:
            print ('sparsity_theta  in scores')
            model.scores.add(
                artm.SparsityThetaScore(
                    name='sparsity_theta'),
                overwrite=True)



In [12]:

def get_scores(topic_names):
    """Build the list of score objects for a model with split topics.

    The three topic groups are printed first, then the scores are created.

    Args:
        topic_names: 3-tuple ``(all_topics, objective_topics,
            background_topics)``.

    Returns:
        List of seven score objects in this order: objective perplexity,
        objective phi sparsity, objective theta sparsity, overall perplexity,
        background theta sparsity, background phi sparsity, and a
        ``"top_words"`` top-tokens score (10 tokens, objective topics).
    """
    all_topics, objective_topics, background_topics = topic_names

    print("get_scores", all_topics)
    print("get scores : ", background_topics)
    print("get_scores : ", objective_topics)

    return [
        artm.PerplexityScore(
            name='objective_perplexity_score', topic_names=objective_topics),
        artm.SparsityPhiScore(
            name='objective_sparsity_phi', topic_names=objective_topics),
        artm.SparsityThetaScore(
            name='objective_sparsity_theta', topic_names=objective_topics),
        artm.PerplexityScore(
            name='perplexity_score', topic_names=all_topics),
        artm.SparsityThetaScore(
            name='background_sparsity_theta', topic_names=background_topics),
        artm.SparsityPhiScore(
            name='background_sparsity_phi', topic_names=background_topics),
        artm.TopTokensScore(
            name="top_words", num_tokens=10, topic_names=objective_topics),
    ]



## Get Batch vectorizer

In [13]:
# Reuse the batches in "contents_batches" when present, else build them
# from the Vowpal Wabbit file "contents_wv.txt".
batch_vectorizer = get_batch_vectorizer(
    target_batches_folder="contents_batches",
    data_path="contents_wv.txt")

In [14]:
# Load (or gather and cache) the dictionary that belongs to the batches folder.
dictionary = get_dict(target_batches_folder="contents_batches")

In [15]:
# 27 topics in total; the first 3 are background topics, the rest objective.
T = 27
topic_names = generate_topic_names(topic_count=T, background_topic_count=3)
all_topics, objective_topics, background_topics = topic_names

In [16]:
# Model with separate background/objective topics; topic_names[0] is the
# list of all topic names. class_ids gives a weight to each modality.
devided_model = artm.ARTM(
    num_topics=T,
    topic_names=topic_names[0],
    cache_theta=True,
    reuse_theta=True,
    theta_columns_naming="title",
    seed=4242,
    num_document_passes=20,
    num_processors=10,
    class_ids={"@content": 1, "@title": 25, "@category_id": 200},
)

devided_model.initialize(dictionary)

In [17]:
# Number of tokens kept per topic by the 'top_tokens' score.
_num_tokens = 20

## Add some scores

In [18]:
# overwrite=True keeps this cell re-runnable: without it, adding a score
# whose name is already registered fails on the second run.
devided_model.scores.add(
    artm.PerplexityScore(name='perplexity_score'),
    overwrite=True)

### top tokens score

In [19]:
# Track the top `_num_tokens` "@content" tokens for every topic.
top_tokens_score = artm.TopTokensScore(
    name='top_tokens',
    num_tokens=_num_tokens,
    class_id="@content",
    topic_names=all_topics,
)
devided_model.scores.add(top_tokens_score, overwrite=True)


### objective_sparsity scores

In [20]:
# Phi and theta sparsity, measured over the objective topics only.
devided_model.scores.add(
    artm.SparsityPhiScore(
        name='objective_sparsity_phi',
        topic_names=objective_topics,
    ),
    overwrite=True,
)
devided_model.scores.add(
    artm.SparsityThetaScore(
        name='objective_sparsity_theta',
        topic_names=objective_topics,
    ),
    overwrite=True,
)

### background_sparsity scores

In [21]:
# Phi and theta sparsity, measured over the background topics only.
# Both calls pass overwrite=True so the cell can be re-run; the phi score
# previously lacked it, unlike the theta score next to it.
devided_model.scores.add(
    artm.SparsityPhiScore(
        name='background_sparsity_phi',
        topic_names=background_topics),
    overwrite=True)
devided_model.scores.add(
    artm.SparsityThetaScore(
        name='background_sparsity_theta',
        topic_names=background_topics),
    overwrite=True)

## Add regularizers

In [32]:
# tau coefficient for each regularizer, keyed by regularizer name.
regs = dict([
    ("objective_smooth_sparse_phi", -10),
    ("objective_smooth_sparse_theta", -10),
    ("background_smooth_sparse_phi", 1),
    ("background_smooth_sparse_theta", 1),
    ("decorrelator_phi", 30000),
])

### Add objective regularizers

In [33]:
# Regularizers for the objective topics. Each one is registered under the
# same name as its key in `regs`; the phi regularizer was previously named
# 'objective_sparse_phi', which did not match its 'objective_smooth_sparse_phi' key.
devided_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(
        name='objective_smooth_sparse_phi',
        topic_names=objective_topics,
        tau=regs['objective_smooth_sparse_phi']),
    overwrite=True)
devided_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(
        name='objective_smooth_sparse_theta',
        topic_names=objective_topics,
        tau=regs['objective_smooth_sparse_theta']),
    overwrite=True)

### Add background regularizers

In [34]:
# Regularizers for the background topics, with tau taken from `regs`.
background_phi_regularizer = artm.SmoothSparsePhiRegularizer(
    name='background_smooth_sparse_phi',
    topic_names=background_topics,
    tau=regs['background_smooth_sparse_phi'],
)
devided_model.regularizers.add(background_phi_regularizer, overwrite=True)

background_theta_regularizer = artm.SmoothSparseThetaRegularizer(
    name='background_smooth_sparse_theta',
    topic_names=background_topics,
    tau=regs['background_smooth_sparse_theta'],
)
devided_model.regularizers.add(background_theta_regularizer, overwrite=True)

### add decorrelator

In [35]:
# Decorrelate all topics across the three modalities.
devided_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(
        name='decorrelator_phi',
        topic_names=all_topics,
        tau=regs['decorrelator_phi'],
        class_ids=["@category_id", "@content", "@title"],
    ),
    overwrite=True,
)

## reinitialize

In [36]:
# Initialize the model from the dictionary again, now that the scores and
# regularizers are configured.
devided_model.initialize(dictionary=dictionary)

### fit offline

In [37]:
# Offline fit: 20 passes over the whole collection.
devided_model.fit_offline(
    batch_vectorizer=batch_vectorizer,
    num_collection_passes=20,
)

In [38]:
# List the topic names present in the 'top_tokens' tracker's last_tokens.
devided_model.score_tracker['top_tokens'].last_tokens.keys()

dict_keys(['objective_topic_4', 'objective_topic_13', 'objective_topic_3', 'objective_topic_11', 'objective_topic_7', 'objective_topic_19', 'objective_topic_16', 'background_topic_1', 'background_topic_2', 'objective_topic_5', 'objective_topic_14', 'objective_topic_2', 'objective_topic_23', 'objective_topic_17', 'objective_topic_1', 'objective_topic_15', 'objective_topic_6', 'objective_topic_21', 'objective_topic_22', 'objective_topic_10', 'objective_topic_12', 'objective_topic_18', 'objective_topic_20', 'objective_topic_8', 'background_topic_0', 'objective_topic_9'])

In [39]:
# Print up to ten top tokens for every topic the 'top_tokens' tracker reports;
# topics missing from the tracker are skipped.
for topic_name in devided_model.topic_names:
    if topic_name not in devided_model.score_tracker['top_tokens'].last_tokens:
        continue
    print('{}: {}'.format(
        topic_name,
        ' '.join(
            devided_model.score_tracker['top_tokens'].last_tokens[topic_name][:10])))

# Last perplexity value, followed by the phi and theta matrices.
print("Perplexity:", devided_model.score_tracker["perplexity_score"].last_value)
print(devided_model.get_phi())
print(devided_model.get_theta())

background_topic_0: имя_муж фамиия_муж имя_жен становиться говорить тема беларусь делать работа новый
background_topic_1: имя_муж фамиия_муж имя_жен тема говорить становиться беларусь делать знать ребенок
background_topic_2: имя_муж фамиия_муж имя_жен тема минск город место говорить становиться беларусь
objective_topic_1: цветок сад деревня
objective_topic_2: платье футбол костюм игра церковь сословие играть код ходить клуб
objective_topic_3: з што имя_жен гэт як па пра ад мян яя
objective_topic_4: психолог мотоцикл домен медицинский сеть писать базовый индия газета трафик
objective_topic_5: ресторан блюдо кухня заведение соус паста повар еда вкус кафе
objective_topic_6: структуризация стрит арт р линди препарат граффити хоп танцевать художественный
objective_topic_7: женщина мужчина имя_жен секс девушка тема становиться сексуальный говорить друг
objective_topic_8: беларус европеец магазин генерал цвет дэ стиль граница официант вильнюс
objective_topic_9: манифест исторический националь

In [43]:
# Display the last rows of the phi matrix.
devided_model.get_phi().tail()

Unnamed: 0,background_topic_0,background_topic_1,background_topic_2,objective_topic_0,objective_topic_1,objective_topic_2,objective_topic_3,objective_topic_4,objective_topic_5,objective_topic_6,...,objective_topic_14,objective_topic_15,objective_topic_16,objective_topic_17,objective_topic_18,objective_topic_19,objective_topic_20,objective_topic_21,objective_topic_22,objective_topic_23
тушь,9e-06,5.867241e-06,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.014351,0.0,0.0,0.0,0.0,0.0,0.0,0.0
травматичность,1e-06,9.832995e-07,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
воск,9e-06,5.867241e-06,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.014351,0.0,0.0,0.0,0.0,0.0,0.0,0.0
грукай,3e-06,9.841007e-07,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
помада,9e-06,0.0001486347,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
import copy
import datetime
import pickle

import numpy
import numpy as np
import pandas as pd

class Model:
    """Container for a fitted topic model's matrices and training settings.

    ``phi_matrix`` is assumed to be a pandas DataFrame with one row per token
    and one column per topic: the topic count is taken from ``shape[1]`` and
    tokens are looked up by row label. TODO confirm against the caller.
    """

    def __init__(
            self, phi_matrix, theta_matrix,
            process_number, iterations_via_document, iterations_via_collection,
            regulirizers, name, background_topic_number=0):
        """Store the matrices and settings.

        Args:
            phi_matrix: token-by-topic matrix (see class docstring).
            theta_matrix: stored as given and returned by ``get_theta``.
            process_number: int.
            iterations_via_document: int.
            iterations_via_collection: int.
            regulirizers: dict.
            name: str.
            background_topic_number: stored as given; defaults to 0.
        """
        self.__phi_matrix = phi_matrix
        # Topics are the columns of phi (shape[1] is the topic count below).
        # The original used the index, i.e. the token labels.
        self.__topic_names = tuple(phi_matrix.columns)
        self.__topic_matrix = phi_matrix.transpose()
        self.__theta_matrix = theta_matrix

        self.__topic_number = phi_matrix.shape[1]
        self.__process_number = process_number
        self.__iterations_via_document = iterations_via_document
        self.__iterations_via_collection = iterations_via_collection
        self.__regulirizers = regulirizers
        self.__name = name
        self.__background_topic_number = background_topic_number
        # The original called TopicDestribution() with no arguments, but every
        # definition in this notebook requires at least a filter, so the call
        # could not succeed. The attribute is kept as an empty slot.
        self.__topic_destribution = None

    def get_phi(self):
        """Return the phi matrix passed to the constructor."""
        return self.__phi_matrix

    def get_topic_names(self):
        """Return the topic names (phi column labels) as a tuple."""
        return self.__topic_names

    def get_topic_namses(self):
        """Misspelled alias of ``get_topic_names``, kept for existing callers."""
        return self.get_topic_names()

    def get_topic_matrix(self):
        """Return the transposed phi matrix (one row per topic)."""
        return self.__topic_matrix

    def get_theta(self):
        """Return the theta matrix passed to the constructor."""
        return self.__theta_matrix

    def get_topic_number(self):
        """Return the number of topics (phi columns)."""
        return copy.copy(self.__topic_number)

    def get_process_number(self):
        """Return a shallow copy of the stored process number."""
        return copy.copy(self.__process_number)

    def get_iterations_via_document(self):
        """Return a shallow copy of the stored per-document iteration count."""
        return copy.copy(self.__iterations_via_document)

    def get_iterations_via_collection(self):
        """Return a shallow copy of the stored per-collection iteration count."""
        return copy.copy(self.__iterations_via_collection)

    def get_regulirizers(self):
        """Return a shallow copy of the stored regularizer settings."""
        return copy.copy(self.__regulirizers)

    def get_name(self):
        """Return a shallow copy of the stored model name."""
        return copy.copy(self.__name)

    def get_token_distribution(self, token, modality_num=0):
        """Return the phi row of ``token`` as a Series indexed by topic.

        Args:
            token: row label to look up in phi.
            modality_num: which row to take when the label occurs more than
                once in the phi index. Presumably one row per modality;
                verify against the phi layout.

        Returns:
            The matching phi row, or an all-zero Series over the topic names
            when ``token`` is not in phi.
        """
        try:
            token_rows = self.__phi_matrix.loc[token]
        except KeyError:
            # Unknown token: it contributes nothing to any topic.
            return pd.Series(
                np.zeros(len(self.__topic_names)),
                index=self.__topic_names)
        if isinstance(token_rows, pd.DataFrame):
            # A duplicated label yields several rows; pick the requested one.
            return token_rows.iloc[modality_num]
        return token_rows

    def save(self, file_to_be_saved_in):
        """Pickle this object into the file at ``file_to_be_saved_in``."""
        with open(file_to_be_saved_in, "wb") as file_to_be_pickled_in:
            pickle.dump(self, file_to_be_pickled_in)

In [None]:
from scipy.spatial.distance import cosine

class TopicDestribution:
    """Turn a text into a topic vector and compare it with the model's topics.

    Collaborators (duck-typed):
        _filter: object with ``get_all_tokens(text)`` returning the tokens.
        model: object with ``get_token_distribution(token)``,
            ``get_topic_names()`` and ``get_topic_matrix()``.
    """

    def __init__(self, _filter, model):
        # The original stored the builtin `filter` instead of the argument.
        self.__filter = _filter
        self.__model = model

    def get_distribution(self, text):
        """Return the normalized sum of the token distributions of ``text``.

        Raises:
            ValueError: if the filter yields no tokens for ``text``.
        """
        tokens = self.__filter.get_all_tokens(text)
        token_vectors = [
            self.__model.get_token_distribution(token) for token in tokens]
        if not token_vectors:
            raise ValueError(
                "text produced no tokens; cannot build a topic distribution")
        result_topic_vector = sum(token_vectors)
        total = sum(result_topic_vector)
        if total == 0:
            # Nothing to normalize by: return the raw (all-zero) vector.
            return result_topic_vector
        return result_topic_vector / total

    def get_similarity(self, distribution_first, distribution_second):
        """Return ``cosine(distribution_first, distribution_second)``.

        ``cosine`` here is scipy's cosine *distance*, so smaller values mean
        more similar vectors.
        """
        return cosine(distribution_first, distribution_second)

    def get_most_similar_distribution(self, text):
        """Return the topic-matrix row with the smallest distance to ``text``.

        Falls back to the first row when no candidate compares below infinity.

        NOTE(review): the text vector is built from
        ``get_token_distribution`` while candidates are rows of
        ``get_topic_matrix()``; ``cosine`` needs both to have the same
        length — confirm that the two live in the same space.
        """
        text_distribution = self.get_distribution(text)
        topic_matrix = self.__model.get_topic_matrix()
        most_similar_distribution = topic_matrix.iloc[0]
        smallest_distance = np.inf
        for topic_name in self.__model.get_topic_names():
            candidate = topic_matrix.loc[topic_name]
            distance = self.get_similarity(candidate, text_distribution)
            if distance < smallest_distance:
                most_similar_distribution = candidate
                smallest_distance = distance
        return most_similar_distribution
        

In [527]:
class TopicDestribution:
    """Turn a text into a normalized topic vector.

    NOTE(review): this cell redefines ``TopicDestribution``; once it runs it
    replaces any definition made by an earlier cell in the same kernel.

    Collaborators (duck-typed):
        _filter: object with ``get_tokens(text)`` returning the tokens.
        model: optional object with ``get_token_distribution(token)``. When
            omitted, the notebook-level variable ``model`` is used, which is
            what the original code referenced.
    """

    def __init__(self, _filter, model=None):
        # The original assigned the undefined name `__filter`.
        self.filter = _filter
        self.model = model

    def get_similarity(self, distribution_first, distribution_second):
        """Return ``cosine(distribution_first, distribution_second)``.

        The original cell only contained the bare line ``def get_similarity``;
        the signature and body follow the other ``TopicDestribution``
        definition in this notebook. ``cosine`` is scipy's cosine distance,
        so smaller values mean more similar vectors.
        """
        return cosine(distribution_first, distribution_second)

    def get_distribution(self, text):
        """Return the normalized sum of the token distributions of ``text``.

        Raises:
            ValueError: if the filter yields no tokens for ``text``.
        """
        # Prefer the model given to the constructor; otherwise fall back to
        # the notebook-level `model` variable.
        source_model = self.model if self.model is not None else model
        tokens = self.filter.get_tokens(text)
        token_vectors = [
            source_model.get_token_distribution(token) for token in tokens]
        if not token_vectors:
            raise ValueError(
                "text produced no tokens; cannot build a topic distribution")
        result_topic_vector = sum(token_vectors)
        total = sum(result_topic_vector)
        if total == 0:
            # Nothing to normalize by: return the raw (all-zero) vector.
            return result_topic_vector
        return result_topic_vector / total
    
    
    