In [1]:
# OPTIONAL: Load the IPython "autoreload" extension so that edits to imported
# modules can be picked up without restarting the kernel.
%load_ext autoreload

In [2]:
# OPTIONAL: always reload modules before running a cell, so that changes made
# to code under src are picked up as you edit.
%autoreload 2

In [3]:
import os
import sys

# Put the directory three levels up on the import path (once), presumably so
# that the project-local `src` package can be imported from this notebook.
_project_root = os.path.abspath('../../..')
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [4]:
import json
import re
from collections import defaultdict

import tqdm

import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    lower_to_unicode,
    strip_short,
    remove_stopwords,
)
import tomotopy as tp

from src.dataset import load_tweets, load_availability
from src.utils.text import Document, Dictionary
from src.config import config

In [5]:
# Report the instruction set tomotopy is running with (the recorded run shows "avx2").
print(tp.isa)

avx2


In [6]:
def load_keywords(path=None):
    """Read the seed-keyword CSV and group its tokens by topic label.

    The CSV must provide a ``raw_keyword`` column (a phrase) and a ``label``
    column (the topic). Each phrase is split on single spaces and only tokens
    longer than two characters are kept.

    Args:
        path: CSV location; when ``None`` the bundled
            ``keywords.v3.2.csv`` file is used.

    Returns:
        dict mapping each label to a list of its unique tokens. Labels whose
        phrases yield no token longer than two characters are absent.
    """
    csv_path = '../../../data/external/keywords.v3.2.csv' if path is None else path
    frame = pd.read_csv(csv_path)

    tokens_by_topic = defaultdict(set)
    for phrase, topic in zip(frame['raw_keyword'], frame['label']):
        kept = [piece for piece in phrase.split(' ') if len(piece) > 2]
        # Only touch the defaultdict when there is something to add, so that
        # topics without any qualifying token never get an (empty) entry.
        if kept:
            tokens_by_topic[topic].update(kept)

    return {topic: list(tokens) for topic, tokens in tokens_by_topic.items()}

# Seed keywords per topic label, read from the default keywords CSV.
keywords = load_keywords()

In [17]:
def load_documents(path=None):
    """Yield one ``Document`` per record of a JSON-lines tweets file.

    Args:
        path: Location of the JSON-lines file; when ``None`` the interim
            ``tweets_intra_subject_analysis.jsonl`` file is used.

    Yields:
        Document: built from ``record['tweet']['text']`` as text and
        ``record['subject_id']`` as author.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``tweet``/``text`` or ``subject_id``.
    """
    if path is None:
        path = '../../../data/interim/models/tweets_intra_subject_analysis.jsonl'
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (the author2doc cache in this notebook is also UTF-8).
    with open(path, 'r', encoding='utf-8') as fp:
        # readlines() materialises the file so tqdm knows the total up front.
        for line in tqdm.tqdm(fp.readlines(), desc='Loading Documents'):
            if not line.strip():
                # Tolerate blank lines (e.g. an extra newline at end of file).
                continue
            record = json.loads(line)
            text = record['tweet']['text']
            author = record['subject_id']
            # Alternative, finer-grained author key that was considered:
            # author = '{}-{}-{}'.format(record['subject_id'], record['event_id'], record['period'])
            yield Document(text=text, author=author)

# Raw string literal: '\S' inside a plain string is an invalid escape sequence
# (warned about by recent Python versions). The compiled pattern is unchanged.
url_pattern = re.compile(r'http[s]?://\S+')

# Preprocessing filters, applied in this order. Built once here rather than
# re-created (including the lambda) for every document in the loop below.
token_filters = [
    lower_to_unicode,
    lambda x: url_pattern.sub(' ', x),
    strip_tags,
    strip_punctuation,
    strip_numeric,
    remove_stopwords,
    strip_short,
    strip_multiple_whitespaces,
]

corpus = []

for doc in load_documents():
    tokens = preprocess_string(doc.text, filters=token_filters)
    doc.set_tokens(tokens)
    # Drop documents that end up with no tokens after preprocessing.
    if len(doc.tokens) > 0:
        corpus.append(doc)

# Vocabulary built over the retained documents.
vocab = Dictionary(corpus)

Loading Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 820202/820202 [00:49<00:00, 16429.74it/s]


In [20]:
# Mapping author -> list of indices into `corpus`, cached on disk as JSON.
author2doc_path = '../../../data/interim/models/author2doc.json'

if os.path.exists(author2doc_path):
    # NOTE(review): JSON object keys are always strings, so a mapping loaded
    # from the cache has str keys even if doc.author is not a str. The cache is
    # also reused as-is if `corpus` changes; delete the file to rebuild it.
    with open(author2doc_path, 'r', encoding='utf-8') as fp:
        author2doc = json.load(fp)
else:
    author2doc = defaultdict(list)
    # Iterate over `corpus`: the previously referenced name `results` is not
    # defined anywhere in this notebook and fails on a fresh kernel.
    for i, doc in enumerate(tqdm.tqdm(corpus, desc='Extracting Tokens')):
        author2doc[doc.author].append(i)
    author2doc = dict(author2doc)
    with open(author2doc_path, 'w', encoding='utf-8') as fp:
        json.dump(author2doc, fp)

In [87]:
def create_topic_word_mat(keywords, vocab, num_topics, pseudo_count=1e7):
    """Build a ``(num_topics, len(vocab))`` matrix of seeded topic-word weights.

    Every entry starts at 1. For each topic in ``keywords`` and each of its
    seed tokens that is present in ``vocab``, the entry at
    ``(topic row, token column)`` is set to ``pseudo_count``. The matrix is
    returned unnormalised.

    Args:
        keywords: mapping of topic label -> iterable of seed tokens. Topics
            get row indices in the mapping's iteration order.
        vocab: object supporting ``len()``, ``token in vocab`` and a
            ``token2id`` mapping from token to column index.
        num_topics: number of rows. Rows beyond ``len(keywords)`` keep the
            baseline weight of 1 everywhere.
        pseudo_count: weight written for in-vocabulary seed tokens.

    Returns:
        numpy.ndarray of floats with shape ``(num_topics, len(vocab))``.

    Raises:
        IndexError: if a topic with an in-vocabulary seed token has a row
            index that is not smaller than ``num_topics``.
    """
    # Explicit float dtype: with fill_value=1 alone numpy creates an integer
    # matrix, which silently truncates a fractional pseudo_count on assignment.
    topic_word_mat = np.full(shape=(num_topics, len(vocab)), fill_value=1, dtype=float)
    for row, (topic, tokens) in enumerate(keywords.items()):
        for token in tokens:
            # Seed tokens that never occur in the corpus vocabulary are ignored.
            if token in vocab:
                topic_word_mat[row, vocab.token2id[token]] = pseudo_count
    return topic_word_mat

# One row per keyword label plus one extra row that receives no seed tokens;
# seeded entries get a pseudo-count of len(corpus) // 100.
topic_word_mat = create_topic_word_mat(keywords, vocab, len(keywords) + 1, len(corpus) // 100)

# Display (number of topics, vocabulary size).
topic_word_mat.shape

(6, 318470)

In [88]:
# One LDA topic per row of the seed matrix; the seed is fixed at 42.
k = topic_word_mat.shape[0]
model = tp.LDAModel(k=k, seed=42)

# Feed the token list of every retained document to the model.
for doc in tqdm.tqdm(corpus, desc='Adding Documents'):
    model.add_doc(doc.tokens)

Adding Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 812951/812951 [00:06<00:00, 131081.29it/s]


In [89]:
# Give every word in the model's vocabulary a per-topic prior: its column of
# the seed matrix plus the model's eta hyperparameter.
eta = model.eta
for token in model.vocabs:
    model.set_word_prior(token, topic_word_mat[:, vocab.token2id[token]] + eta)

In [90]:
# Train in chunks so the progress bar can show the log-likelihood as it evolves.
desc_format = 'Traing Model - Iteration: {:>3.0f}, Log-likelihood: {:>3.4f}'
iterations = 100
chunksize = 10
pbar = tqdm.tqdm(range(0, iterations, chunksize), desc=desc_format.format(0, 0))
for i in pbar:
    # Never train past `iterations`, even when it is not a multiple of chunksize.
    step = min(chunksize, iterations - i)
    model.train(step)
    # Report the number of iterations completed so far (i + step), not i + 1,
    # so the final label reads `iterations` rather than e.g. 91 of 100.
    pbar.set_description(desc_format.format(i + step, model.ll_per_word))

Traing Model - Iteration:  91, Log-likelihood: -9.2750: 100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [02:51<00:00, 17.16s/it]


In [91]:
# Print the ten highest-probability words of every topic.
# The loop variable is deliberately not named `k`: that global holds the number
# of topics used to build the model and must not be overwritten here.
for topic_id in range(model.k):
    print('Top 10 words of topic #{}'.format(topic_id))
    print(model.get_topic_words(topic_id, top_n=10))

Top 10 words of topic #0
[('gas', 0.03282236307859421), ('emissions', 0.027694014832377434), ('fossil', 0.024523762986063957), ('greenhouse', 0.02424403466284275), ('coal', 0.0201413556933403), ('carbon', 0.018649471923708916), ('fuel', 0.015013006515800953), ('fuels', 0.014733278192579746), ('plants', 0.01408057939261198), ('power', 0.01408057939261198)]
Top 10 words of topic #1
Top 10 words of topic #2
[('energy', 0.03311057761311531), ('solar', 0.022377027198672295), ('carbon', 0.020436499267816544), ('wind', 0.01722249761223793), ('power', 0.016130950301885605), ('climate', 0.01603998802602291), ('global', 0.014311703853309155), ('change', 0.014099459163844585), ('clean', 0.01337176002562046), ('renewable', 0.01276534516364336)]
Top 10 words of topic #3
[('new', 0.0058397939428687096), ('people', 0.004690014757215977), ('amp', 0.004454811103641987), ('like', 0.003920449875295162), ('today', 0.0038117500953376293), ('covid', 0.0034364552702754736), ('story', 0.0034170113503932953), 

In [93]:
# Print tomotopy's summary of the model (basic info, training info, parameters).
model.summary()

<Basic Info>
| LDAModel (current version: 0.12.2)
| 812951 docs, 8736916 words
| Total Vocabs: 318470, Used Vocabs: 318470
| Entropy of words: 9.15080
| Entropy of term-weighted words: 9.15080
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -9.27501
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 6 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 42 (random seed)
| trained in version 0.12.2
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.00024405 