In [2]:
# OPTIONAL: Load the "autoreload" extension so that edited modules can be reloaded without restarting the kernel
%load_ext autoreload

In [3]:
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [4]:
import os
import sys

# Make the directory three levels up importable (presumably the repository root
# that contains `src/`) so the `src.*` imports in this notebook resolve.
# The relative path is resolved against the kernel's current working directory.
PROJECT_ROOT = os.path.abspath('../../..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [5]:
import json
import re
from collections import defaultdict

import tqdm

import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    lower_to_unicode,
    strip_short,
    remove_stopwords,
)
import tomotopy as tp

from src.dataset import load_tweets, load_availability
from src.utils.text import Document, Dictionary
from src.config import config

In [6]:
# Report which SIMD instruction set the installed tomotopy build is using
# (the recorded run printed `avx2`).
print(tp.isa)

avx2


In [7]:
def load_keywords(path=None):
    """Load seed keywords grouped by topic label.

    Reads a CSV that has a ``raw_keyword`` and a ``label`` column. Every keyword
    phrase is split on whitespace and each token longer than two characters is
    collected under the phrase's label.

    Args:
        path: Location of the keywords CSV. When ``None`` the default
            ``keywords.v3.2.csv`` file under ``data/external`` is used.

    Returns:
        dict: ``{label: [token, ...]}``. Labels appear in order of first
        occurrence in the CSV; each token list is de-duplicated and sorted so
        the result is identical from run to run.
    """
    if path is None:
        path = '../../../data/external/keywords.v3.2.csv'
    df = pd.read_csv(path)
    output = defaultdict(set)
    for row in df.itertuples(index=False):
        phrase, topic = row.raw_keyword, row.label
        # Empty CSV cells are read as NaN: there is nothing to tokenize for a
        # missing phrase, and a missing label must not become a `nan` topic.
        if not isinstance(phrase, str) or pd.isna(topic):
            continue
        for token in phrase.split():
            if len(token) > 2:
                # The topic entry is only created once a token qualifies, so a
                # label never ends up with an empty keyword list.
                output[topic].add(token)
    # Sets iterate in hash order, which varies between interpreter runs.
    return {topic: sorted(tokens) for topic, tokens in output.items()}

# Seed keywords per topic label, read from the default keywords CSV.
keywords = load_keywords()

In [8]:
def load_documents(path=None):
    """Yield one ``Document`` per record of a JSON-lines tweet file.

    Each non-blank line is parsed as a JSON object. The document text is taken
    from ``record['tweet']['text']`` and the author from ``record['subject_id']``.

    Args:
        path: Location of the ``.jsonl`` file. When ``None`` the default
            ``tweets_intra_subject_analysis.jsonl`` file under
            ``data/interim/models`` is used.

    Yields:
        Document: constructed with ``text`` and ``author`` only; tokens are not
        set here.
    """
    if path is None:
        path = '../../../data/interim/models/tweets_intra_subject_analysis.jsonl'
    # Decode explicitly as UTF-8 instead of the platform default, matching how
    # this notebook reads and writes its other JSON file.
    with open(path, 'r', encoding='utf-8') as fp:
        # All lines are read up front so the progress bar knows its total.
        for line in tqdm.tqdm(fp.readlines(), desc='Loading Documents'):
            if not line.strip():
                # A blank line is not a record and would make json.loads fail.
                continue
            record = json.loads(line)
            text = record['tweet']['text']
            # Alternative, finer-grained author key kept for reference:
            # '{}-{}-{}'.format(record['subject_id'], record['event_id'], record['period'])
            author = record['subject_id']
            yield Document(text=text, author=author)

# Raw string so that `\S` reaches the regex engine as written instead of being
# treated as an (invalid) string escape sequence.
url_pattern = re.compile(r'http[s]?://\S+')

# Preprocessing steps, applied in the listed order. Built once rather than for
# every document. URLs are blanked out before punctuation is stripped, while
# they are still intact enough for the pattern to match.
token_filters = [
    lower_to_unicode,
    lambda x: url_pattern.sub(' ', x),
    strip_tags,
    strip_punctuation,
    strip_numeric,
    remove_stopwords,
    strip_short,
    strip_multiple_whitespaces,
]

corpus = []

for doc in load_documents():
    tokens = preprocess_string(doc.text, filters=token_filters)
    doc.set_tokens(tokens)
    # Documents left with no tokens after preprocessing are dropped, so
    # positions in `corpus` differ from line numbers in the source file.
    if len(doc.tokens) > 0:
        corpus.append(doc)

vocab = Dictionary(corpus)

Loading Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 820202/820202 [00:45<00:00, 18160.77it/s]


In [9]:
# author -> list of positions in `corpus`, cached on disk as JSON.
# NOTE: the cached positions are only valid for a `corpus` built exactly as in
# this notebook; delete the file after changing the loading or preprocessing.
author2doc_path = '../../../data/interim/models/author2doc.json'

if os.path.exists(author2doc_path):
    with open(author2doc_path, 'r', encoding='utf-8') as fp:
        author2doc = json.load(fp)
else:
    author2doc = defaultdict(list)
    # Iterate over `corpus` (the documents built in this notebook); the name
    # previously used here was never defined.
    for i, doc in enumerate(tqdm.tqdm(corpus, desc='Extracting Tokens')):
        # JSON object keys are always strings, so key by str(author) to get the
        # same mapping whether it is built here or loaded from the cache.
        author2doc[str(doc.author)].append(i)
    author2doc = dict(author2doc)
    with open(author2doc_path, 'w', encoding='utf-8') as fp:
        json.dump(author2doc, fp)

In [10]:
def create_topic_word_mat(keywords, vocab, num_topics, pseudo_count=1e7):
    """Build a ``(num_topics, len(vocab))`` matrix of seed-word pseudo-counts.

    Every cell starts at 1. The i-th topic of ``keywords`` (in dict order) owns
    row ``i``; for each of its seed tokens that is present in ``vocab`` the
    matching cell is set to ``pseudo_count``. Rows beyond ``len(keywords)``
    receive no seed words and stay at 1.

    Args:
        keywords: Mapping of topic label to an iterable of seed tokens.
        vocab: Vocabulary supporting ``len()``, ``token in vocab`` and a
            ``token2id`` mapping from token to column index.
        num_topics: Number of rows; should be at least ``len(keywords)``.
        pseudo_count: Value written for seed words (may be fractional).

    Returns:
        numpy.ndarray: float matrix of shape ``(num_topics, len(vocab))``.

    Raises:
        IndexError: if a seed token found in ``vocab`` belongs to a topic whose
            row index is not below ``num_topics``.
    """
    # Float fill value: an integer matrix would silently truncate a fractional
    # pseudo_count on assignment (e.g. 0.5 -> 0).
    topic_word_mat = np.full(shape=(num_topics, len(vocab)), fill_value=1.0)
    for topic_id, tokens in enumerate(keywords.values()):
        for token in tokens:
            # Seed tokens that never occur in the corpus have no column.
            if token in vocab:
                topic_word_mat[topic_id, vocab.token2id[token]] = pseudo_count
    return topic_word_mat

# One row per keyword topic plus one extra row; seed words receive a
# pseudo-count of one hundredth of the corpus size (integer division).
topic_word_mat = create_topic_word_mat(
    keywords=keywords,
    vocab=vocab,
    num_topics=len(keywords) + 1,
    pseudo_count=len(corpus) // 100,
)

topic_word_mat.shape

(6, 318470)

In [18]:
import inspect

# Inspect the method on the class rather than on a `model` instance, so this
# cell does not depend on a variable that is only created further down.
inspect.signature(tp.GDMRModel.add_doc)

<Signature (self, words, numeric_metadata=[], metadata='', multi_metadata=[])>

In [19]:
# Number of topics = number of rows of the seed prior matrix.
k = topic_word_mat.shape[0]
model = tp.GDMRModel(k=k, seed=42, degrees=[1])

# Register every document, passing the author (as a string) via `multi_metadata`.
# NOTE(review): the recorded run of this cell failed on the first document with
# "TypeError: function takes at most 3 arguments (4 given)", so no documents
# were added. The arguments actually accepted appear to differ from the
# signature printed earlier — confirm against the installed tomotopy version.
# NOTE(review): `degrees=[1]` presumably declares one numeric metadata variable
# while `numeric_metadata=[]` supplies none — TODO confirm this is intended.
for doc in tqdm.tqdm(corpus, desc='Adding Documents'):
    model.add_doc(doc.tokens, numeric_metadata=[], metadata='', multi_metadata=[str(doc.author)])

Adding Documents:   0%|                                                                                                                          | 0/812951 [00:00<?, ?it/s]


TypeError: function takes at most 3 arguments (4 given)

In [None]:
# Give every word in the model's vocabulary a per-topic prior: its column of
# the seed matrix shifted by the model's `eta`.
eta = model.eta
for token in model.vocabs:
    # `vocab.token2id` maps the token to its column in `topic_word_mat`.
    model.set_word_prior(token, topic_word_mat[:, vocab.token2id[token]] + eta)

In [None]:
# Train in chunks so the progress bar can report the log-likelihood as it goes.
desc_format = 'Training Model - Iteration: {:>3.0f}, Log-likelihood: {:>3.4f}'
iterations = 100
chunksize = 10
pbar = tqdm.tqdm(range(0, iterations, chunksize), desc=desc_format.format(0, 0))
for i in pbar:
    model.train(chunksize)
    # `i` iterations were done before this chunk, so `i + chunksize` are done
    # now. set_description keeps the bar's separator and refreshes the display.
    pbar.set_description(desc_format.format(i + chunksize, model.ll_per_word))

In [None]:
# Print the top 10 words reported by the model for each topic.
# A dedicated loop name keeps the notebook-level `k` (the topic count) from
# being overwritten by the loop.
for topic_id in range(model.k):
    print('Top 10 words of topic #{}'.format(topic_id))
    print(model.get_topic_words(topic_id, top_n=10))

In [None]:
# Wrap the tokens of every corpus document with `model.make_doc`; the results
# are passed to `model.infer` in the next cell.
# NOTE(review): no metadata is passed here, unlike the `add_doc` call — confirm
# that this is intended.
docs = [model.make_doc(doc.tokens) for doc in corpus]

In [15]:
# Infer topic distributions for the first ten documents only.
# NOTE(review): the recorded output carries execution count 15, i.e. it predates
# the current model cell (count 19), which failed — re-run before relying on it.
model.infer(docs[:10])

([array([4.5481458e-04, 3.0764390e-04, 2.8886209e-04, 6.0548410e-03,
         1.6251026e-02, 9.7664279e-01], dtype=float32),
  array([6.7029160e-04, 4.5339603e-04, 4.2571602e-04, 8.9234365e-03,
         2.3950258e-02, 9.6557689e-01], dtype=float32),
  array([1.9896778e-04, 1.3458502e-04, 1.2636854e-04, 2.6488120e-03,
         9.9153662e-01, 5.3546205e-03], dtype=float32),
  array([1.7443637e-04, 1.1799156e-04, 1.1078813e-04, 2.3222310e-03,
         9.9258012e-01, 4.6944311e-03], dtype=float32),
  array([2.3152816e-04, 1.5660937e-04, 1.4704831e-04, 9.8496121e-01,
         8.2727568e-03, 6.2308852e-03], dtype=float32),
  array([1.0791211e-04, 7.2993484e-05, 6.8537200e-05, 1.4366088e-03,
         9.9540985e-01, 2.9041301e-03], dtype=float32),
  array([1.7443637e-04, 1.1799156e-04, 1.1078813e-04, 2.3222310e-03,
         9.9258012e-01, 4.6944311e-03], dtype=float32),
  array([1.9896778e-04, 1.3458502e-04, 1.2636854e-04, 2.6488120e-03,
         9.9153662e-01, 5.3546205e-03], dtype=float32),
