In [None]:
import numpy as np
import pandas as pd
import pickle

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import os
import random
from tqdm import tqdm
import logging

# Configure the root logger (getLogger() without a name returns it) so that
# records of level DEBUG and above are let through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Smoke-test record to confirm that debug logging is active.
# NOTE(review): DEBUG on the root logger also lets through debug records of
# imported libraries - confirm this verbosity is wanted for long training runs.
logging.debug("test")


### Setting paths

In [None]:
# Move to the project root (two levels above the notebook) only once per kernel
# session. An unguarded os.chdir("../..") climbs two more directories every time
# this cell is re-run, which silently changes every path derived below.
if "PROJECT_ROOT" not in globals():
    os.chdir("../..")
    PROJECT_ROOT = os.path.abspath(os.curdir)

# Input directory with the preprocessed corpus files and output directory for
# everything produced by the LDA experiments, both as absolute paths.
data_path = os.path.join(PROJECT_ROOT, 'corpus', 'preprocessed')
result_path = os.path.join(PROJECT_ROOT, 'models', 'LDA')

### Loading preprocessed files

In [None]:
# The two pickled artefacts share the <data_path>/<folder>/<file> layout, so
# they are read through one generator and unpacked in order.
corpus, texts = (
    pd.read_pickle(os.path.join(data_path, folder, file_name))
    for folder, file_name in (
        ('corpus', 'corpus_preprocessed.pkl'),
        ('lemmas', 'lemmatized_preprocessed.pkl'),
    )
)

# The dictionary is loaded from a text file rather than from a pickle.
dictionary = corpora.Dictionary.load_from_text(
    os.path.join(data_path, 'dictionary', 'dictionary_preprocessed.txt')
)

### Modeling


In [None]:
def topic_search(result_path,
                 min_topics,
                 max_topics,
                 step_size_topics,
                 min_random,
                 max_random,
                 step_size_random,
                 chunksize,
                 passes):
    '''
    Grid-search the number of topics and the random seed of an LDA model.

    One ``LdaMulticore`` model is trained for every (random_state, num_topics)
    pair and scored with ``c_v`` coherence. The training data is read from the
    notebook-level variables ``corpus``, ``dictionary`` and ``texts``, which have
    to be defined before the function is called. The result table is also
    pickled to ``<result_path>/topic_search_results/topic_search_results.pkl``.

    :param result_path: directory the search results are saved to
    :param min_topics: min number of topics in the grid
    :param max_topics: upper bound of the topic grid (exclusive, as in ``range``)
    :param step_size_topics: step size between topics in the grid
    :param min_random: lowest random value in the grid
    :param max_random: upper bound of the random grid (exclusive, as in ``range``)
    :param step_size_random: steps between random values considered
    :param chunksize: number of documents to be used in each training chunk
    :param passes: number of passes through the corpus during training
    :return: a dataframe with the topic coherence for all models that are trained
             based on the grids for num_topics and random_state
    '''

    # Both grids are built with range(), so their upper bounds are exclusive.
    random_range = range(min_random, max_random, step_size_random)
    topics_range = range(min_topics, max_topics, step_size_topics)

    model_results = {'Random_State': [],
                     'Topics': [],
                     'Coherence': []
                     }

    # The context manager closes the progress bar when the search finishes and
    # also when training or scoring raises part-way through the grid.
    with tqdm(total=len(topics_range) * len(random_range)) as pbar:
        for r in random_range:
            for k in topics_range:
                model = gensim.models.LdaMulticore(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=k,
                                                   random_state=r,
                                                   chunksize=chunksize,
                                                   passes=passes,
                                                   eval_every=None)
                coherencemodel = CoherenceModel(model=model,
                                                texts=texts,
                                                dictionary=dictionary,
                                                coherence='c_v')
                model_results['Topics'].append(k)
                model_results['Random_State'].append(r)
                model_results['Coherence'].append(coherencemodel.get_coherence())

                pbar.update(1)

    # One row per trained model, one column per key of the results dictionary.
    model_results = pd.DataFrame.from_dict(model_results, orient='index').T

    path = os.path.join(result_path, "topic_search_results")
    os.makedirs(path, exist_ok=True)

    file_name = os.path.join(path, "topic_search_results.pkl")
    with open(file_name, 'wb') as handle:
        pickle.dump(model_results, handle)

    return model_results

In [None]:
# Run the topic/seed grid search; the returned table is shown as the cell output.
topic_search(
    result_path=result_path,
    min_topics=1,
    max_topics=222,
    step_size_topics=10,
    min_random=1,
    max_random=100,
    step_size_random=20,
    chunksize=20000,
    passes=15,
)

In [None]:
def alpha_eta_search(result_path,
                     num_topics,
                     min_alpha,
                     max_alpha,
                     num_steps_alpha,
                     min_beta,
                     max_beta,
                     num_steps_beta,
                     random_state,
                     chunksize,
                     passes):
    '''
    Grid-search the alpha and beta (called ``eta`` by gensim) priors of an LDA model.

    One ``LdaMulticore`` model is trained for every (alpha, beta) pair and scored
    with ``c_v`` coherence. The training data is read from the notebook-level
    variables ``corpus``, ``dictionary`` and ``texts``, which have to be defined
    before the function is called. The result table is also pickled to
    ``<result_path>/alpha_beta_search_results/alpha_beta_search_results.pkl``.

    :param result_path: directory the search results are saved to
    :param num_topics: number of topics
    :param min_alpha: lowest alpha value in the grid
    :param max_alpha: highest alpha value in the grid
    :param num_steps_alpha: number of evenly spaced alpha values in the grid
    :param min_beta: lowest beta value in the grid
    :param max_beta: highest beta value in the grid
    :param num_steps_beta: number of evenly spaced beta values in the grid
    :param random_state: random state of the model
    :param chunksize: number of documents to be used in each training chunk
    :param passes: number of passes through the corpus during training
    :return: a dataframe with the columns ``Alpha``, ``Eta`` and ``Coherence``
             holding the topic coherence of every model trained on the grid
    '''

    # define alpha grid (both end points included, values rounded to 5 decimals)
    alpha = [round(elem, 5) for elem in np.linspace(min_alpha, max_alpha, num_steps_alpha)]

    # define beta grid (both end points included, values rounded to 5 decimals)
    beta = [round(elem, 5) for elem in np.linspace(min_beta, max_beta, num_steps_beta)]

    # instantiate model results dictionary
    model_results = {'Alpha': [],
                     'Eta': [],
                     'Coherence': []
                     }

    # The context manager closes the progress bar when the search finishes and
    # also when training or scoring raises part-way through the grid.
    with tqdm(total=(len(beta) * len(alpha))) as pbar:
        for a in alpha:
            for b in beta:
                # The beta prior is passed under the keyword ``eta``; the model
                # constructor has no ``beta`` argument.
                model = gensim.models.LdaMulticore(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=num_topics,
                                                   alpha=a,
                                                   eta=b,
                                                   random_state=random_state,
                                                   chunksize=chunksize,
                                                   passes=passes,
                                                   eval_every=None)
                cv = CoherenceModel(model=model,
                                    texts=texts,
                                    dictionary=dictionary,
                                    coherence='c_v')
                # Save the model results; the keys must match the ones the
                # dictionary was created with above.
                model_results['Alpha'].append(a)
                model_results['Eta'].append(b)
                model_results['Coherence'].append(cv.get_coherence())

                pbar.update(1)

    # One row per trained model, one column per key of the results dictionary.
    model_results = pd.DataFrame.from_dict(model_results, orient='index').T

    path = os.path.join(result_path, "alpha_beta_search_results")
    os.makedirs(path, exist_ok=True)

    file_name = os.path.join(path, "alpha_beta_search_results.pkl")
    with open(file_name, 'wb') as handle:
        pickle.dump(model_results, handle)

    return model_results

###  Train and save final model

In [None]:
def train_lda(corpus, id2word, num_topics, alpha, eta, chunksize, passes, random_state):
    '''
    Fit a fresh multicore LDA model with the given hyperparameters.

    :param corpus: training corpus handed to the model
    :param id2word: id-to-word mapping handed to the model
    :param num_topics: number of topics
    :param alpha: alpha prior of the model
    :param eta: eta prior of the model
    :param chunksize: number of documents to be used in each training chunk
    :param passes: number of passes through the corpus during training
    :param random_state: random state of the model
    :return: the newly trained ``gensim.models.LdaMulticore`` model
    '''
    # Collect the settings first so the constructor call stays on one line.
    settings = {
        'corpus': corpus,
        'id2word': id2word,
        'num_topics': num_topics,
        'alpha': alpha,
        'eta': eta,
        'chunksize': chunksize,
        'passes': passes,
        'random_state': random_state,
    }
    return gensim.models.LdaMulticore(**settings)

# Train the final model with fixed hyperparameters.
# NOTE(review): the values are presumably taken from the grid searches above -
# confirm against the saved search results.
lda = train_lda(corpus=corpus,id2word=dictionary,num_topics=31,random_state=41,alpha=0.45,eta=0.89,chunksize=20000,passes=15)

# Make sure the target directory exists before saving, the same way the search
# functions create their own output folders; otherwise saving fails on a fresh
# checkout where 'model_results' has not been created yet.
model_dir = os.path.join(result_path, 'model_results')
os.makedirs(model_dir, exist_ok=True)
lda.save(os.path.join(model_dir, 'lda.model'))


