# Topic Modeling

In [None]:
import psycopg2

from gensim.models import Phrases, LdaModel
from gensim import corpora
from gensim.corpora import Dictionary

import logging
from itertools import repeat, chain

import tables

import numpy as np

In [None]:
import multiprocessing
from multiprocessing import Pool
# macOS does not default to forking worker processes, so request 'fork' explicitly.
# force=True keeps this cell re-runnable: without it, a second call raises
# RuntimeError because the start method has already been fixed for the interpreter.
multiprocessing.set_start_method('fork', force=True)

## Create bag of words

Load words from SQL database, construct bigrams/trigrams, build Gensim dictionary, build Gensim bag of words

In [None]:
# Load bag of words for just the training data

conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # The cursor context manager only releases the cursor; the connection itself
    # is closed in the finally clause so it is not leaked if the query fails.
    with conn.cursor() as cursor:
        cursor.execute('''SELECT wordbags.id, wordbags.bag
                        FROM wordbags
                        LEFT JOIN traintest
                        ON wordbags.id = traintest.id
                        WHERE traintest.split = 'train';''')
        # One (id, bag) tuple per training article.
        article_bow = cursor.fetchall()
finally:
    conn.close()

In [None]:
# Learn phrase (collocation) models from the training documents. Each row of
# article_bow is (id, bag); only the bag is streamed into Phrases.
bigrams = Phrases((doc for _, doc in article_bow), min_count = 20) # Default threshold is 10
# Second model is trained on the bigram-transformed documents, so already-merged
# tokens can be merged again.
trigrams = Phrases(bigrams[(doc for _, doc in article_bow)], min_count = 50) # And some 4-grams

# Persist both models; later sections of this notebook reload them with Phrases.load.
bigrams.save('GensimModels/train_bigrams')
trigrams.save('GensimModels/train_trigrams')

In [None]:
# May be memory-intensive depending on corpus size.
# Would be better to stream it into a file with smart_open

# Apply the bigram model and then the trigram model to every training document,
# materializing the result as a list because it is iterated more than once below
# (dictionary construction and bag-of-words conversion).
# NOTE(review): the inner `(bg for bg in ...)` wrapper looks redundant, but it hands
# `trigrams[...]` a plain one-shot iterator instead of the bigram-transformed corpus
# object. Confirm how gensim peeks at its input before simplifying this expression.
grams_list = list(trigrams[(bg for bg in bigrams[(doc for _, doc in article_bow)])])

In [None]:
# Build the token <-> id vocabulary from the n-gram documents.
dictionary = Dictionary(documents=grams_list)
# Prune rare and very common tokens (thresholds as defined by gensim's
# filter_extremes: minimum document count and maximum document fraction).
dictionary.filter_extremes(no_above=0.5, no_below=20)
# Saved so later sections can reload the exact same vocabulary.
dictionary.save('GensimModels/train_dictionary')

In [None]:
# Convert every n-gram document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, grams_list))
# Write the corpus to disk so it can be reopened later with corpora.MmCorpus.
corpora.MmCorpus.serialize('GensimModels/train_corpus.mm', corpus)

## Define LDA model

Gensim uses a variational Bayes approach described in Hoffman *et al*. Online Learning for Latent Dirichlet Allocation, *NIPS* (2010). The posterior distribution is approximated by a trial distribution, and ELBO is maximized with respect to variational parameters using coordinate ascent.

The variational parameters are:
- $\mathbf{\phi}$, parameters for multinomial distributions for topics per word
- $\mathbf{\gamma}$, parameters for Dirichlet distributions for the topic distribution per document
- $\mathbf{\lambda}$, parameters for Dirichlet distributions for the word distribution per topic

For reasons I don't understand, the coordinate ascent is split into two steps that are analogous to the EM algorithm. In the E-step, $\mathbf{\gamma}$ and $\mathbf{\phi}$ are iteratively updated with $\mathbf{\lambda}$ fixed. In the M-step, $\mathbf{\lambda}$ is updated.

In [None]:
# Touch the dictionary once before reading id2token; the order of these two lines matters.
# NOTE(review): id2token appears to be populated lazily on first item access — confirm in gensim's Dictionary.
dictionary[0] # Weirdly, this is necessary or else id2word will be empty
id2word = dictionary.id2token # Mapping from indexes to words

In [None]:
# Keyword arguments shared by every LdaModel trained below; the topic count is
# merged in separately for each run.
lda_arguments = {
    'corpus': corpus,
    'id2word': id2word,
    'chunksize': 2000,
    'passes': 50,          # Full sweeps over the entire corpus
    'iterations': 500,     # Upper bound on iterations in the E-step
    'eval_every': None,    # Skip perplexity evaluation
    'alpha': 'auto',       # Learn the Dirichlet prior of the per-document topic distribution
    'eta': 'auto',         # Learn the Dirichlet prior of the per-topic word distribution
    'update_every': 1,     # Number of chunks processed in the E-step
}

In [None]:
def train_lda_model(num_topics, lda_parameters):
    """Train one LDA model, log the run to its own file, and save the model.

    Args:
        num_topics: Number of topics; also embedded in the log and model file names.
        lda_parameters: Mapping of further keyword arguments for ``LdaModel``.
            A ``num_topics`` key in this mapping takes precedence over the
            positional value, because it is merged in last.

    Returns:
        None. The trained model is written to ``GensimModels/lda_model_{num_topics}``.
    """
    import os  # function-scope import: only needed to create the log directory

    log_path = f'GensimModels/logs/lda_nTopics={num_topics}.log'
    # FileHandler does not create missing directories.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    logger = logging.getLogger()
    loghandle = logging.FileHandler(log_path)
    logformat = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    loghandle.setFormatter(logformat)
    logger.addHandler(loghandle)
    logger.setLevel(logging.INFO)
    try:
        params = {'num_topics' : num_topics, **lda_parameters}
        model = LdaModel(**params)
        model.save(f'GensimModels/lda_model_{num_topics}')
    finally:
        # Pool workers are reused for several topic counts. Detach and close this
        # run's handler (even on failure) so the next run in the same process does
        # not keep writing to this file and the file descriptor is released,
        # instead of removing whichever handler happens to be first on the root logger.
        logger.removeHandler(loghandle)
        loghandle.close()

## Train LDA models

In [None]:
# Topic counts to sweep: 5 to 25 in steps of 5, then 30 to 200 in steps of 10.
nTopics = chain(range(5, 30, 5), range(30, 200 + 1, 10))

# Train with four worker processes; every task receives (num_topics, lda_arguments).
with Pool(processes=4) as workers:
    workers.starmap(
        train_lda_model,
        zip(nTopics, repeat(lda_arguments)),
    )

## Generate topic vectors

In [None]:
# Topic counts to generate vectors for: 5 to 25 in steps of 5, then 30 to 200 in steps of 10.
# Kept as a list rather than a one-shot itertools.chain, so re-running the cell
# that loops over it does not silently iterate an exhausted iterator.
nTopics = [*range(5, 30, 5), *range(30, 200 + 1, 10)]

In [None]:
# Reload the vocabulary and phrase models saved during training, so this section
# does not depend on the in-memory objects from the cells above.
dictionary = Dictionary.load('GensimModels/train_dictionary')
trigrams = Phrases.load('GensimModels/train_trigrams')
bigrams = Phrases.load('GensimModels/train_bigrams')

In [None]:
# Load the word bag and train/test label of every article.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # The cursor context manager only releases the cursor; the connection itself
    # is closed in the finally clause so it is not leaked if the query fails.
    with conn.cursor() as cursor:
        cursor.execute('''SELECT wordbags.id, wordbags.bag, traintest.split
                        FROM wordbags
                        LEFT JOIN traintest
                        ON wordbags.id = traintest.id;''')
        # One (id, bag, split) tuple per article.
        article_bow = cursor.fetchall()
finally:
    conn.close()

In [None]:
# For every trained model, infer a topic vector for each article and store the
# vectors in one HDF5 table per topic count.
for num_topics in nTopics:
    # The vector width depends on num_topics, so the row description is rebuilt per model.
    class TopicVector(tables.IsDescription):
        id = tables.Int64Col()                                 # article id
        traintest = tables.StringCol(5)                        # split label, up to 5 bytes
        topicvector = tables.Float32Col(shape=(num_topics,))   # one weight per topic
    model = LdaModel.load(f'GensimModels/lda_model_{num_topics}')
    with tables.open_file('GensimModels/article_data.h5', mode = 'a') as f:
        # Reuse the /topic group if an earlier iteration (or run) already created it.
        try:
            group = f.root.topic
        except tables.NoSuchNodeError:
            group = f.create_group('/', 'topic')
        table = f.create_table(group, f'ntopics{num_topics}', TopicVector)
        entry = table.row
        for article_id, words, split in article_bow:
            bow = dictionary.doc2bow(trigrams[bigrams[words]])
            # Place each probability at its topic id instead of slicing column 1 of
            # the returned pairs: that slice is only a valid dense vector when every
            # topic is returned, in order.
            # NOTE(review): gensim appears to floor minimum_probability at a small
            # positive value, so near-zero topics may be omitted — confirm.
            topic_vector = np.zeros(num_topics, dtype=np.float32)
            for topic_id, probability in model.get_document_topics(bow, minimum_probability = -1):
                topic_vector[topic_id] = probability
            entry['id'] = article_id
            # NOTE(review): the LEFT JOIN can yield a NULL split for articles missing
            # from traintest — verify that cannot happen for this data.
            entry['traintest'] = split
            entry['topicvector'] = topic_vector
            entry.append()
        # Write out rows still held in the row buffer, matching the symmetric-beta
        # version of this loop further down.
        table.flush()

## Symmetric Beta

Intuitively, a symmetric beta (eta in Gensim) makes more sense, so trying that here.

Everything below is code duplication of earlier sections of this notebook. It would be better to refactor.

In [None]:
# Topic counts to sweep: 5 to 25 in steps of 5, then 30 to 200 in steps of 10.
# This must be a list, not a one-shot itertools.chain: the training cell below
# consumes it once, and the topic-vector loop after that iterates it a second
# time. With a chain that second loop would see an exhausted iterator and write nothing.
nTopics = [*range(5, 30, 5), *range(30, 200 + 1, 10)]

# Reload the vocabulary and phrase models saved during training, so this section
# does not depend on the in-memory objects from the cells above.
dictionary = Dictionary.load('GensimModels/train_dictionary')
trigrams = Phrases.load('GensimModels/train_trigrams')
bigrams = Phrases.load('GensimModels/train_bigrams')

In [None]:
# Touch the dictionary once before reading id2token; the order of these two lines matters.
# NOTE(review): id2token appears to be populated lazily on first item access — confirm in gensim's Dictionary.
dictionary[0] # Weirdly, this is necessary or else id2word will be empty
id2word = dictionary.id2token # Mapping from indexes to words

In [None]:
# Reopen the training bag-of-words corpus that was serialized to this path earlier in the notebook.
corpus = corpora.MmCorpus('GensimModels/train_corpus.mm')

In [None]:
# Same training configuration as the first sweep, except that eta is fixed to a
# symmetric prior instead of being learned.
lda_arguments = {
    'corpus': corpus,
    'id2word': id2word,
    'chunksize': 2000,
    'passes': 50,          # Full sweeps over the entire corpus
    'iterations': 500,     # Upper bound on iterations in the E-step
    'eval_every': None,    # Skip perplexity evaluation
    'alpha': 'auto',       # Learn the Dirichlet prior of the per-document topic distribution
    'eta': 'symmetric',    # SYMMETRIC BETA
    'update_every': 1,     # Number of chunks processed in the E-step
}

In [None]:
def train_lda_model_symmetricbeta(num_topics, lda_parameters):
    """Train one symmetric-eta LDA model, log the run to its own file, and save it.

    Args:
        num_topics: Number of topics; also embedded in the log and model file names.
        lda_parameters: Mapping of further keyword arguments for ``LdaModel``.
            A ``num_topics`` key in this mapping takes precedence over the
            positional value, because it is merged in last.

    Returns:
        None. The trained model is written to
        ``GensimModels/lda_model_{num_topics}_symmetricbeta``.
    """
    import os  # function-scope import: only needed to create the log directory

    log_path = f'GensimModels/logs/lda_nTopics={num_topics}_symmetricbeta.log'
    # FileHandler does not create missing directories.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    logger = logging.getLogger()
    loghandle = logging.FileHandler(log_path)
    logformat = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    loghandle.setFormatter(logformat)
    logger.addHandler(loghandle)
    logger.setLevel(logging.INFO)
    try:
        params = {'num_topics' : num_topics, **lda_parameters}
        model = LdaModel(**params)
        model.save(f'GensimModels/lda_model_{num_topics}_symmetricbeta')
    finally:
        # Pool workers are reused for several topic counts. Detach and close this
        # run's handler (even on failure) so the next run in the same process does
        # not keep writing to this file and the file descriptor is released,
        # instead of removing whichever handler happens to be first on the root logger.
        logger.removeHandler(loghandle)
        loghandle.close()

In [None]:
# Train with four worker processes; every task receives (num_topics, lda_arguments).
with Pool(processes=4) as workers:
    workers.starmap(
        train_lda_model_symmetricbeta,
        zip(nTopics, repeat(lda_arguments)),
    )

In [None]:
# Load the word bag and train/test label of every article.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # The cursor context manager only releases the cursor; the connection itself
    # is closed in the finally clause so it is not leaked if the query fails.
    with conn.cursor() as cursor:
        cursor.execute('''SELECT wordbags.id, wordbags.bag, traintest.split
                        FROM wordbags
                        LEFT JOIN traintest
                        ON wordbags.id = traintest.id;''')
        # One (id, bag, split) tuple per article.
        article_bow = cursor.fetchall()
finally:
    conn.close()

In [None]:
# For every symmetric-eta model, infer a topic vector for each article and store
# the vectors in one HDF5 table per topic count.
for num_topics in nTopics:
    # The vector width depends on num_topics, so the row description is rebuilt per model.
    class TopicVector(tables.IsDescription):
        id = tables.Int64Col()                                 # article id
        traintest = tables.StringCol(5)                        # split label, up to 5 bytes
        topicvector = tables.Float32Col(shape=(num_topics,))   # one weight per topic
    model = LdaModel.load(f'GensimModels/lda_model_{num_topics}_symmetricbeta')
    with tables.open_file('GensimModels/article_data_symmetricbeta.h5', mode = 'a') as f:
        # Reuse the /topic group if an earlier iteration (or run) already created it.
        try:
            group = f.root.topic
        except tables.NoSuchNodeError:
            group = f.create_group('/', 'topic')
        table = f.create_table(group, f'ntopics{num_topics}', TopicVector)
        entry = table.row
        for article_id, words, split in article_bow:
            bow = dictionary.doc2bow(trigrams[bigrams[words]])
            # Place each probability at its topic id instead of slicing column 1 of
            # the returned pairs: that slice is only a valid dense vector when every
            # topic is returned, in order.
            # NOTE(review): gensim appears to floor minimum_probability at a small
            # positive value, so near-zero topics may be omitted — confirm.
            topic_vector = np.zeros(num_topics, dtype=np.float32)
            for topic_id, probability in model.get_document_topics(bow, minimum_probability = -1):
                topic_vector[topic_id] = probability
            entry['id'] = article_id
            # NOTE(review): the LEFT JOIN can yield a NULL split for articles missing
            # from traintest — verify that cannot happen for this data.
            entry['traintest'] = split
            entry['topicvector'] = topic_vector
            entry.append()
        # Write out rows still held in the row buffer before the file is closed.
        table.flush()