In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

In [2]:
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [3]:
import os
import sys

if os.path.abspath('../..') not in sys.path:
    sys.path.append(os.path.abspath('../..'))

In [4]:
import json
import re
import logging

from collections import defaultdict

import tqdm

import numpy as np
import pandas as pd

from flashtext import KeywordProcessor

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    lower_to_unicode,
    strip_short,
    remove_stopwords,
)
from gensim.corpora.dictionary import Dictionary
from gensim.models import AuthorTopicModel as GensimAuthorTopicModel
from gensim.test.utils import temporary_file
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

from src.dataset import load_tweets, load_availability
from src.config import config

In [5]:
logging.basicConfig(
    format='%(asctime)s:%(levelname)s:%(message)s', 
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ]
)

In [6]:
def load_keywords(path=None):
    if path is None:
        path = '../../data/external/keywords.v3.2.csv'
    df = pd.read_csv(path)
    output = defaultdict(set)
    for row in df.itertuples():
        phrase, topic = row.raw_keyword, row.label
        for token in phrase.split(' '):
            if len(token) > 2:
                output[topic].add(token)
    return {k: list(v) for k, v in output.items()}

keywords = load_keywords()

In [7]:
keyword_processor = KeywordProcessor()

keyword_processor.add_keywords_from_dict(keywords)

In [8]:
url_pattern = re.compile('\Bhttp[s]?://\S+')
hashtag_pattern = re.compile('\B\#[a-zA-Z0-9_]+')
mention_pattern = re.compile('\B\@[a-zA-Z0-9_]+')

In [9]:
class Document:
    def __init__(self, text, author=None):
        self.text = text
        self.author = author
        self.tokens = None
        
    def set_tokens(self, tokens):
        self.tokens = tokens
        return self
    
def load_documents(path=None):
    if path is None:
        path = '../../data/interim/models/tweets_intra_subject_analysis.jsonl'
    with open(path, 'r') as fp:
        for line in tqdm.tqdm(fp.readlines(), desc='Loading Documents'):
            record = json.loads(line)
            text = record['tweet']['text']
            author = record['subject_id']
            d = Document(text=text, author=author)
            yield d

In [10]:
results = []

for doc in load_documents():
    keywords_found = keyword_processor.extract_keywords(doc.text)
    if len(keywords_found) == 0:
        continue
    hashtags = hashtag_pattern.findall(doc.text)
    tokens = preprocess_string(doc.text, filters=[
        lower_to_unicode,
        lambda x: url_pattern.sub(' ', x),
        lambda x: hashtag_pattern.sub(' ', x),
        lambda x: mention_pattern.sub(' ', x),
        strip_tags,
        strip_punctuation,
        strip_numeric,
        lambda x: x + ' '.join(hashtags),
        remove_stopwords,
        strip_short,
        strip_multiple_whitespaces, 
    ])
    doc.set_tokens(tokens)
    if len(doc.tokens) > 5:
        results.append(doc)

Loading Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 820202/820202 [01:08<00:00, 11996.85it/s]


In [None]:
tokenized_docs = []
for i, doc in enumerate(tqdm.tqdm(results, desc='Extracting Tokens')):
    tokenized_docs.append(doc.tokens)

author2doc = defaultdict(list)
for i, doc in enumerate(tqdm.tqdm(results, desc='Extracting Author Docs')):
    author2doc[doc.author].append(i)
author2doc = dict(author2doc)
        
dictionary = Dictionary(tokenized_docs)

corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tqdm.tqdm(tokenized_docs)]

In [12]:
def create_eta(keywords, vocab, num_topics, pseudo_count=1e7, normalize=True):
    # create a (ntopics, nterms) matrix and fill with 1
    eta = np.full(shape=(num_topics, len(vocab)), fill_value=1)
    # for each topic in the seed dict
    for topic, tokens in keywords.items(): 
        # for each seed token that is in vocab
        for token in tokens:
            if token in vocab:
                eta[topic, vocab.token2id[token]] = pseudo_count
    if normalize:
        eta = np.divide(eta, eta.sum(axis=0))
    return eta

eta = create_eta(keywords, dictionary, len(keywords) + 1, len(corpus) // 100, normalize=False)

eta.shape

(6, 427395)

In [None]:
# Set training parameters.
kwargs = {
    'passes': 20,
    'iterations': 50,
}

# training_corpus = corpus
# # define perplexity callback for hold_out and test corpus
# pl_holdout = PerplexityMetric(corpus=holdout_corpus, logger="visdom", title="Perplexity (hold_out)")
# pl_test = PerplexityMetric(corpus=test_corpus, logger="visdom", title="Perplexity (test)")
# # define other remaining metrics available
# ch_umass = CoherenceMetric(corpus=training_corpus, coherence="u_mass", logger="visdom", title="Coherence (u_mass)")
# ch_cv = CoherenceMetric(corpus=training_corpus, texts=training_texts, coherence="c_v", logger="visdom", title="Coherence (c_v)")
# diff_kl = DiffMetric(distance="kullback_leibler", logger="visdom", title="Diff (kullback_leibler)")
# convergence_kl = ConvergenceMetric(distance="jaccard", logger="visdom", title="Convergence (jaccard)")

# callbacks = [ch_umass, diff_kl, convergence_kl]

with temporary_file('serialized') as s_path:
    model = GensimAuthorTopicModel(
        corpus,
        author2doc=author2doc, 
        passes=kwargs.get('passes', 1),
        iterations=kwargs.get('iterations', 50),
        id2word=dictionary, 
        num_topics=eta.shape[0],
        eta=eta,
        serialized=True, 
        serialization_path=s_path,
        eval_every=None,
    )

2022-04-19 18:43:17,285:DEBUG:{'uri': '/var/folders/j6/3yj400mn1k1c_5czn3j_1mm00000gp/T/tmpcyq8n7g1/serialized', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2022-04-19 18:43:17,289:DEBUG:{'uri': '/var/folders/j6/3yj400mn1k1c_5czn3j_1mm00000gp/T/tmpcyq8n7g1/serialized', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2022-04-19 18:43:17,294:DEBUG:{'uri': '/var/folders/j6/3yj400mn1k1c_5czn3j_1mm00000gp/T/tmpcyq8n7g1/serialized', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
2022-04-19 18:43:17,299:DEBUG:{'uri': '/var/folders/j6/3yj400mn1k1c_5czn3j_1mm00000gp/T/tmpcyq8n7g1/serialized',

In [None]:
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

In [None]:
# Train the model on the corpus.
for topic_id in range(model.num_topics):
    for term_id, p in model.get_topic_terms(topic_id):
        print('{:>02} {:<30s}{:0.3f}'.format(topic_id, dictionary[term_id], p))

In [None]:
model.save('../../data/interim/models/models/v3.pt')