In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

In [2]:
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [3]:
import os
import sys

if os.path.abspath('../..') not in sys.path:
    sys.path.append(os.path.abspath('../..'))

In [4]:
import json
import re
from collections import defaultdict

import tqdm

import numpy as np
import pandas as pd

from flashtext import KeywordProcessor

from gensim.parsing.preprocessing import (
    preprocess_string, 
    strip_tags,
    strip_punctuation, 
    strip_multiple_whitespaces, 
    strip_numeric, 
    lower_to_uniDictionary,
    strip_short,
    remove_stopwords,
)
from gensim.corpora.dictionary import Dictionary
from gensim.models import AuthorTopicModel as GensimAuthorTopicModel
from gensim.test.utils import temporary_file

from src.dataset import load_tweets, load_availability
from src.config import config

In [5]:
def load_keywords(path=None):
    if path is None:
        path = '../../data/external/keywords.v3.2.csv'
    df = pd.read_csv(path)
    output = defaultdict(set)
    for row in df.itertuples():
        phrase, topic = row.raw_keyword, row.label
        for token in phrase.split(' '):
            if len(token) > 2:
                output[topic].add(token)
    return {k: list(v) for k, v in output.items()}

keywords = load_keywords()

In [None]:
keyword_processor = KeywordProcessor()

keyword_processor.add_keywords_from_dict(keywords)

{'analysis', 'causes', 'description', 'problem', 'solution'}

In [13]:
class Document:
    def __init__(self, text, author=None):
        self.text = text
        self.author = author
        self.tokens = None
        
    def set_tokens(self, tokens):
        self.tokens = tokens
        return self

def load_documents(path=None):
    if path is None:
        path = '../../data/interim/models/tweets_intra_subject_analysis.jsonl'
    with open(path, 'r') as fp:
        for line in tqdm.tqdm(fp.readlines(), desc='Loading Documents'):
            record = json.loads(line)
            text = record['tweet']['text']
            author = record['subject_id']
            d = Document(text=text, author=author)
            yield d

url_pattern = re.compile('http[s]?://\S+')

results = []

for doc in load_documents():
    keywords_found = keyword_processor.extract_keywords(doc.text)
    if len(keywords_found) == 0:
        continue
    tokens = preprocess_string(doc.text, filters=[
        lower_to_unicode,
        lambda x: url_pattern.sub(' ', x),
        strip_tags,
        strip_punctuation,
        strip_numeric, 
        remove_stopwords,
        strip_short,
        strip_multiple_whitespaces, 
    ])
    doc.set_tokens(tokens)
    if len(doc.tokens) > 0:
        results.append(doc)

Loading Documents: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 820202/820202 [01:10<00:00, 11600.77it/s]


In [14]:
tokenized_docs = []
for i, doc in enumerate(tqdm.tqdm(results, desc='Extracting Tokens')):
    tokenized_docs.append(doc.tokens)

author2doc = defaultdict(list)
for i, doc in enumerate(tqdm.tqdm(results, desc='Extracting Author Docs')):
    author2doc[doc.author].append(i)
author2doc = dict(author2doc)
        
dictionary = Dictionary(tokenized_docs)

corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tqdm.tqdm(tokenized_docs)]

Extracting Tokens: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 478927/478927 [00:00<00:00, 1704669.45it/s]
Extracting Author Docs: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 478927/478927 [00:00<00:00, 1728318.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 478927/478927 [00:08<00:00, 58655.08it/s]


In [15]:
def create_eta(keywords, vocab, num_topics, pseudo_count=1e7):
    # create a (ntopics, nterms) matrix and fill with 1
    eta = np.full(shape=(num_topics, len(vocab)), fill_value=1)
    # for each topic in the seed dict
    for topic, tokens in keywords.items(): 
        # for each seed token that is in vocab
        for token in tokens:
            if token in vocab:
                eta[topic, vocab.token2id[token]] = pseudo_count
    return np.divide(eta, eta.sum(axis=0))

eta = create_eta(keywords, dictionary, len(keywords) + 1, len(corpus) // 100)

eta.shape

(6, 211574)

In [16]:
with temporary_file('serialized') as s_path:
    model = GensimAuthorTopicModel(
        corpus,
        author2doc=author2doc, 
        id2word=dictionary, 
        num_topics=eta.shape[0],
        eta=eta,
        serialized=True, 
        serialization_path=s_path,
    )

In [17]:
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

In [18]:
# Train the model on the corpus.
for topic_id in range(model.num_topics):
    for term_id, p in model.get_topic_terms(topic_id):
        print('{:>02} {:<30s}{:0.3f}'.format(topic_id, dictionary[term_id], p))

00 que                           0.002
00 floridamuseum                 0.001
00 russcontreras                 0.000
00 maryjowebster                 0.000
00 rebeccaaguilar                0.000
00 seashells                     0.000
00 ensiamedia                    0.000
00 jretis                        0.000
00 por                           0.000
00 arelisrhdz                    0.000
01 civilbeat                     0.002
01 hinews                        0.001
01 ndn                           0.001
01 nmpol                         0.000
01 wfdd                          0.000
01 thenewspress                  0.000
01 nathaneagle                   0.000
01 nickgrube                     0.000
01 chadgillisnp                  0.000
01 patjriley                     0.000
02 storm                         0.025
02 tropical                      0.015
02 potential                     0.008
02 nhc                           0.007
02 track                         0.007
02 hurricane             

In [19]:
model.save('../../data/interim/models/models/v2.pt')