In [None]:
import os
import gensim
from gensim import corpora
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.corpora import Dictionary
from gensim.corpora import textcorpus

import pandas as pd
import pickle

import logging

# Configure the root logger so that records from DEBUG level upwards pass through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emit one test record. NOTE(review): per the `logging` documentation, the
# module-level logging.debug() calls basicConfig() when the root logger has no
# handlers, so this line may also be what attaches the default handler —
# confirm before removing or replacing it with logger.debug().
logging.debug("test")

### Setting paths

In [None]:
# Move the working directory two levels up exactly once per kernel.
# Re-running this cell must not climb another two levels, so the resolved
# root is remembered in `project_root` and the chdir is skipped afterwards.
if "project_root" not in globals():
    os.chdir("../..")
    project_root = os.path.abspath(os.curdir)

# Input and output folders, both anchored at the remembered project root.
data_path = os.path.join(project_root, 'corpus', 'preprocessed')
result_path = os.path.join(project_root, 'models', 'DTM')

# Path handed to DtmModel as `dtm_path`. Set the DTM_PATH environment variable
# to point at a different location; the previous hard-coded value is kept as
# the fallback so existing setups keep working.
dtm_path = os.environ.get("DTM_PATH", "/Users/florianlorisch/Downloads/dtm-master/dtm/dtm")

### Loading preprocessed data

In [None]:
def _preprocessed(*parts):
    """Return the path of a file stored below ``data_path``."""
    return os.path.join(data_path, *parts)


# Load the four preprocessed inputs: three pickles via pandas and the
# dictionary from its text representation.
corpus = pd.read_pickle(_preprocessed('corpus', 'corpus_preprocessed.pkl'))
dictionary = corpora.Dictionary.load_from_text(
    _preprocessed('dictionary', 'dictionary_preprocessed.txt')
)
texts = pd.read_pickle(_preprocessed('lemmas', 'lemmatized_preprocessed.pkl'))
electoralTerms_info = pd.read_pickle(
    _preprocessed("electoralTerms", "electoralTerms_count.pkl")
)

### Creating list with number of speeches per time step as input for DTM

In [None]:
def get_time_slices(data):
    '''
    Build the list of document counts per time step, ordered by index.

    :param data: Series of speech/document counts (its name is irrelevant), or a
                 DataFrame holding those counts in an 'electoralTerm' column (or in
                 its only column). The index is used for ordering.
    :return: list with number of speeches per term, sorted by ascending index,
             as input for dtm
    :raises KeyError: if a DataFrame has several columns and none is 'electoralTerm'
    '''
    if isinstance(data, pd.DataFrame):
        if 'electoralTerm' in data.columns:
            counts = data['electoralTerm']
        elif data.shape[1] == 1:
            # A single-column frame is unambiguous whatever the column is called.
            counts = data.iloc[:, 0]
        else:
            raise KeyError("'electoralTerm'")
    else:
        # A Series is used directly, so the result no longer depends on the
        # Series being named 'electoralTerm'.
        counts = data

    # sort_index returns a new object; the caller's data is left untouched.
    return counts.sort_index(ascending=True).tolist()

# Per-term speech counts, used as the time-slice input of the DTM.
time_slices = get_time_slices(electoralTerms_info)

### Instantiating and training the DTM model

In [None]:
def train_dtm(dtm_path, corpus, id2word, time_slices, alpha, num_topics, top_chain_var, rng_seed, initialize_lda, name,
              output_dir=None):
    '''
    Train a DtmModel with the given settings, save it to disk and return it.

    :param dtm_path: Path passed to DtmModel as `dtm_path`
    :param corpus: Corpus passed to DtmModel
    :param id2word: Dictionary passed to DtmModel as `id2word`
    :param time_slices: list with number of documents per time step
    :param alpha: passed through to DtmModel
    :param num_topics: passed through to DtmModel
    :param top_chain_var: passed through to DtmModel
    :param rng_seed: passed through to DtmModel
    :param initialize_lda: passed through to DtmModel
    :param name: File name under which the model is saved inside `output_dir`
    :param output_dir: Path to store trained model; defaults to
                       <result_path>/model_results (module-level `result_path`)
    :return: the trained DtmModel instance
    '''
    if output_dir is None:
        output_dir = os.path.join(result_path, 'model_results')

    # Create the target folder before the model is built, so a missing or
    # unwritable directory is detected up front instead of at save time.
    os.makedirs(output_dir, exist_ok=True)

    dtm = DtmModel(dtm_path=dtm_path, corpus=corpus, id2word=id2word, time_slices=time_slices, alpha=alpha,
                   num_topics=num_topics, top_chain_var=top_chain_var, rng_seed=rng_seed,
                   initialize_lda=initialize_lda)
    dtm.save(os.path.join(output_dir, name))
    return dtm


# Settings of this training run, grouped so they can be tuned in one place.
dtm_hyperparams = dict(
    alpha=0.45,
    num_topics=31,
    top_chain_var=0.005,
    rng_seed=41,
    initialize_lda=True,
)

train_dtm(
    dtm_path=dtm_path,
    corpus=corpus,
    id2word=dictionary,
    time_slices=time_slices,
    name='dtm.model',
    **dtm_hyperparams
)