In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

In [2]:
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [3]:
import os
import sys

if os.path.abspath('../..') not in sys.path:
    sys.path.append(os.path.abspath('../..'))

In [66]:
import json
import re
from collections import defaultdict

import tqdm

import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    lower_to_unicode,
    strip_short,
    remove_stopwords,
)
from gensim.corpora.dictionary import Dictionary
# from gensim.models.ldamodel import LdaModel as _LdaModel
from gensim.models import AuthorTopicModel as GensimAuthorTopicModel
from gensim.test.utils import temporary_file

from src.dataset import load_tweets, load_availability
from src.config import config

In [None]:
def load_keywords(path=None):
    if path is None:
        path = '../../data/external/keywords.v3.2.csv'
    df = pd.read_csv(path)
    output = defaultdict(set)
    for row in df.itertuples():
        phrase, topic = row.raw_keyword, row.label
        for token in phrase.split(' '):
            if len(token) > 2:
                output[topic].add(token)
    return {k: list(v) for k, v in output.items()}

keywords = load_keywords()

In [7]:
class Document:
    def __init__(self, text, author=None):
        self.text = text
        self.author = author
        self.tokens = None
        
    def set_tokens(self, tokens):
        self.tokens = tokens
        return self

def load_documents(path=None):
    if path is None:
        path = '../../data/interim/models/tweets_intra_subject_analysis.jsonl'
    with open(path, 'r') as fp:
        for line in tqdm.tqdm(fp.readlines(), desc='Loading Documents'):
            record = json.loads(line)
            text = record['tweet']['text']
            author = record['subject_id']
            # author = '{}-{}-{}'.format(record['subject_id'], record['event_id'], record['period'])
            d = Document(text=text, author=author)
            yield d

url_pattern = re.compile('http[s]?://\S+')

results = []

for doc in load_documents():
    tokens = preprocess_string(doc.text, filters=[
        lower_to_unicode,
        lambda x: url_pattern.sub(' ', x),
        strip_tags,
        strip_punctuation,
        strip_numeric, 
        remove_stopwords,
        strip_short,
        strip_multiple_whitespaces, 
    ])
    doc.set_tokens(tokens)
    if len(doc.tokens) > 0:
        results.append(doc)

Loading Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 820202/820202 [00:51<00:00, 15845.21it/s]


In [None]:
if os.path.exists('../../data/interim/models/author2doc.json'):
    with open('../../data/interim/models/author2doc.json', 'r', encoding='utf-8') as fp:
        author2doc = json.load(fp)
else:
    author2doc = defaultdict(list)
    tokenized_docs = []
    for i, doc in enumerate(tqdm.tqdm(results, desc='Extracting Tokens')):
        author2doc[doc.author].append(i)
        tokenized_docs.append(doc.tokens)
    author2doc = dict(author2doc)
    with open('../../data/interim/models/author2doc.json', 'w', encoding='utf-8') as fp:
        json.dump(author2doc, fp)
        
if os.path.exists('../../data/interim/models/dictionary.pk'):
    dictionary = Dictionary.load('../../data/interim/models/dictionary.pk')
else:
    dictionary = Dictionary(tokenized_docs)
    dictionary.save('../../data/interim/models/dictionary.pk')

corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tqdm.tqdm(tokenized_docs)]

In [57]:
def create_eta(keywords, vocab, num_topics, pseudo_count=1e7):
    # create a (ntopics, nterms) matrix and fill with 1
    eta = np.full(shape=(num_topics, len(vocab)), fill_value=1)
    # for each topic in the seed dict
    for topic, tokens in keywords.items(): 
        # for each seed token that is in vocab
        for token in tokens:
            if token in vocab:
                eta[topic, vocab.token2id[token]] = pseudo_count
    return np.divide(eta, eta.sum(axis=0))

eta = create_eta(keywords, dictionary, len(keywords) + 1, len(corpus) // 100)

eta.shape

(6, 318470)

In [61]:
with temporary_file('serialized') as s_path:
    model = GensimAuthorTopicModel(
        corpus,
        author2doc=author2doc, 
        id2word=dictionary, 
        num_topics=eta.shape[0],
        eta=eta,
        serialized=True, 
        serialization_path=s_path,
    )

In [63]:
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

In [65]:
# Train the model on the corpus.
for topic_id in range(model.num_topics):
    for term_id, p in model.get_topic_terms(topic_id):
        print('{:>02} {:<30s}{:0.3f}'.format(topic_id, dictionary[term_id], p))

00 trump                         0.006
00 wlrn                          0.005
00 nahj                          0.003
00 journalists                   0.003
00 highcountrynews               0.003
00 city                          0.003
00 journalism                    0.003
00 stories                       0.003
00 craigtimes                    0.003
00 proactive                     0.002
01 broward                       0.009
01 federal                       0.008
01 hospital                      0.005
01 fort                          0.005
01 staff                         0.005
01 conference                    0.005
01 communities                   0.005
01 tested                        0.004
01 editor                        0.004
01 chief                         0.004
02 que                           0.008
02 puerto                        0.006
02 los                           0.005
02 politeicecream                0.005
02 rico                          0.004
02 jgbm                  

In [68]:
model.save('../../data/interim/models/models/v1.pt')