In [2]:
# [TODO] create conda virtual environment for package management
import os
os.chdir(os.environ['PROJECT_DIR'])

import pandas as pd
import pickle
import numpy as np
from datasets import load_dataset # hugging face datasets

from tqdm import tqdm
from p_tqdm import p_map # multiprocessing tqdm
from multiprocessing import Pool

import matplotlib.pyplot as plt
from graphviz import Source # graphing dependency tree

# [TODO] determine what is causing the kernel restart from gensim
import gensim.parsing.preprocessing as gsp
import gensim.corpora as corpora
from gensim import utils
from gensim.models.coherencemodel import CoherenceModel

import stanza # stanford corenlp
# Processors requested from the stanza pipeline (shared by both construction attempts).
STANZA_PROCESSORS = "tokenize,mwt,pos,lemma,depparse"

try:
    corenlp = stanza.Pipeline('en', processors=STANZA_PROCESSORS, verbose=False, use_gpu=False)
except Exception:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt / SystemExit
    # propagate instead of silently kicking off a model download.
    stanza.download('en') # download corenlp neural model
    corenlp = stanza.Pipeline('en', processors=STANZA_PROCESSORS, verbose=False, use_gpu=False)

import nltk
from nltk import tokenize
# Probe the English stop-word list; a LookupError triggers a one-time download of the corpus.
try:
    nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords') # download nltk stopwords

from lda.LDAMallet import LdaMallet # gensim LDA (gibbs sampling) mallet wrapper 

In [3]:
# preprocess dataset using gensim filters
# NOTE(review): FILTERS is not referenced by any code visible in this part of the
# notebook — confirm whether it is still needed.
FILTERS = [
    gsp.strip_tags,
    gsp.strip_punctuation, 
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.strip_short
]

# use nltk stopwords
# This list is the default `stop_words` argument of _get_word_reln_pairs and _process_pairs.
STOP_WORDS = nltk.corpus.stopwords.words('english')

In [17]:
def _get_word_reln_pairs(doc: any, stop_words: list[str] = STOP_WORDS) -> list[tuple]:
    '''
    Description
        Walk every sentence of a parsed document and emit two (text, relation) pairs
        for each dependency edge: one for the dependent and one for its governor.

    Params
        doc: parsed document exposing `.sentences`; each sentence exposes `.words`,
             and each word exposes `.id`, `.text`, `.head` and `.deprel`
        stop_words: words to ignore when they appear as the dependent

    Returns
        list of tuples, in document order:
            (dependent_text_lowercased, '<deprel>.dep')
            (governor_text_as_parsed,   '<deprel>.gov')
        The governor text is returned exactly as it appears on the head word; it is
        neither lower-cased nor filtered here.
    '''

    # Build the lookup set once: membership tests become O(1) instead of scanning the list per word.
    stop_set = set(stop_words)

    def valid_word(word: str) -> bool:
        return word not in stop_set and word.isalpha() and len(word) > 2

    pairs = []
    for sent in doc.sentences:
        # index the sentence's words by stringified id so a head id can be resolved to its word
        words_by_id = {str(word.id): word for word in sent.words}

        for word in sent.words:
            word_text = utils.to_unicode(word.text.lower())

            # skip words with a falsy head (the sentence root) and words that fail validation
            if not word.head or not valid_word(word_text):
                continue

            # record both ends of the dependency relationship: dependent first, then governor
            head = words_by_id[str(word.head)]
            pairs.append((word_text, f'{word.deprel}.dep'))
            pairs.append((head.text, f'{word.deprel}.gov'))

    return pairs
            

def _process_pairs(pairs: list[tuple], stop_words: list[str] = STOP_WORDS) -> list[tuple]:
    '''
    Description
        Normalise the word of each (word, relation) pair and drop pairs whose word is a
        stop word, is non-alphabetic, or is shorter than 3 characters.

    Params
        pairs: (word, relation) tuples
        stop_words: words to discard (compared against the lower-cased word)

    Returns
        list of (lowercased_word, relation) tuples that passed the filter, in input order
    '''

    # Build the lookup set once: membership tests become O(1) instead of scanning the list per word.
    stop_set = set(stop_words)

    processed_pairs = []
    for word, reln in pairs:
        # [TODO] convert to unicode and lower-case whole documents instead of each word, for efficiency
        processed_word = utils.to_unicode(word.lower())
        if processed_word not in stop_set and processed_word.isalpha() and len(processed_word) > 2:
            processed_pairs.append((processed_word, reln))

    return processed_pairs

def _concatenate_pairs(pairs: list[tuple], sep: str = "%") -> list[str]:
    '''Join dependency relational pairs into single strings using seperator'''
    
    join_tuple = lambda pair: sep.join(pair)
    strs = list(map(join_tuple, pairs))
    
    return strs

def coherence_optimization(tokens: list[any], id2word: dict, corpus: list[any], topics_range: iter) -> tuple[list[any], list[any]]:
    '''
    Description
        Train one LDA Mallet model per candidate topic count and score each with the
        'c_npmi' coherence measure, so the caller can pick the topic count that gives
        the most coherent model for the corpus.

    Params
        tokens: tokenized documents
        id2word: a Gensim dictionary mapping of id to word.
        corpus: list of documents in bag of word (BoW) format
        topics_range: iterable of topic counts to try, one model is trained per value

    Returns
        model_list -> list[LdaMallet]
        coherence_values -> list[float]
        (both in the order the topic counts were supplied)
    '''

    trained_models = []
    scores = []

    for topic_count in tqdm(topics_range):
        # the Mallet location comes from the MALLET_DIR environment variable
        lda = LdaMallet(
            os.environ['MALLET_DIR'],
            corpus=corpus,
            num_topics=topic_count,
            id2word=id2word,
        )
        scorer = CoherenceModel(model=lda, texts=tokens, coherence='c_npmi')

        trained_models.append(lda)
        scores.append(scorer.get_coherence())

    return trained_models, scores

def get_tokens(text: str) -> list[str]:
    '''
    Parse `text` with the corenlp pipeline and return its dependency relational pairs,
    filtered and joined into "word?relation" strings
    '''

    parsed = corenlp(text)
    filtered_pairs = _process_pairs(_get_word_reln_pairs(parsed))

    return _concatenate_pairs(filtered_pairs, sep="?")

def get_topics(model: any, n_topics: int) -> dict:
    '''Map each topic id (as int) to the description returned by model.print_topics'''

    raw_topics = dict(model.print_topics(num_topics=n_topics))

    # re-key with integer topic ids
    topics = {}
    for topic_id, description in raw_topics.items():
        topics[int(topic_id)] = description

    return topics

def get_doc_top_matrix(model: any, n_topics: int) -> list[any]:
    '''
    Build the document-topic matrix from model.load_document_topics(): every topic id in
    range(n_topics) missing from a document gets probability 0, and each document's
    (topic, probability) entries are returned sorted
    '''

    matrix = []
    for doc_topics in model.load_document_topics():
        probs = dict(doc_topics)

        # fill in the topics this document has no entry for
        for topic_id in range(n_topics):
            probs.setdefault(topic_id, 0)

        matrix.append(sorted(probs.items()))

    return matrix

In [11]:
# # load dataset
# newsgroup_dataset = load_dataset('newsgroup', '18828_alt.atheism')
# atheism_texts = newsgroup_dataset['train']['text']

# https://huggingface.co/datasets/cnn_dailymail/viewer/2.0.0/
# load dataset
N_ARTICLES = 2000  # number of training articles used for topic modelling

dailymail = load_dataset('cnn_dailymail', '2.0.0')
# Slice the rows first and select the column second, so only the first N_ARTICLES
# rows are read rather than pulling the full 'article' column before slicing.
texts = dailymail['train'][:N_ARTICLES]['article']

Found cached dataset cnn_dailymail (/Users/chasemattingly/.cache/huggingface/datasets/cnn_dailymail/2.0.0/2.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
# process texts 
# get_tokens is applied to every article via p_map (multiprocessing tqdm);
# presumably the results come back in the same order as `texts` — TODO confirm
tokens = p_map(get_tokens, texts)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [23]:
# vocabulary over all tokenized documents, then one bag-of-words encoding per document
id2word = corpora.Dictionary(tokens)
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in tokens]

In [24]:
# train a 5-topic LdaMallet model; the Mallet location is read from the MALLET_DIR environment variable
model = LdaMallet(os.environ['MALLET_DIR'], corpus=corpus, num_topics=5, id2word=id2word)

Coded LDA: 5 topics, 3 topic bits, 111 topic mask
max tokens: 1754
total tokens: 1170980
<10> LL/token: -12.37567
<20> LL/token: -12.00686
<30> LL/token: -11.81946
<40> LL/token: -11.72619

0	10	would?aux.dep said?ccomp.gov said?nsubj.gov new?amod.dep obama?nsubj.dep president?flat.gov also?advmod.dep democratic?amod.dep national?amod.dep president?nsubj.dep obama?flat.dep presidential?amod.dep going?xcomp.gov political?amod.dep white?amod.dep former?amod.dep john?flat.gov american?amod.dep could?aux.dep 
1	10	said?nsubj.gov said?ccomp.gov united?amod.dep two?nummod.dep according?case.dep officials?nsubj.dep said?obl.gov president?flat.gov also?advmod.dep said?parataxis.dep international?amod.dep states?amod.gov minister?amod.gov minister?flat.gov military?amod.dep security?compound.dep iraqi?amod.dep government?amod.gov report?obl.dep 
2	10	says?ccomp.gov says?nsubj.gov people?nsubj.dep also?advmod.dep years?obl.dep would?aux.dep one?nummod.dep one?nmod.gov say?ccomp.gov could?aux.dep

<260> LL/token: -11.51798
<270> LL/token: -11.51625
<280> LL/token: -11.51479
<290> LL/token: -11.51556

0	10	said?ccomp.gov said?nsubj.gov would?aux.dep president?flat.gov obama?nsubj.dep new?amod.dep democratic?amod.dep president?nsubj.dep former?amod.dep also?advmod.dep political?amod.dep american?amod.dep obama?flat.dep presidential?amod.dep said?obl.gov john?flat.gov white?amod.dep national?amod.dep president?amod.gov 
1	10	said?ccomp.gov said?nsubj.gov united?amod.dep two?nummod.dep minister?amod.gov according?case.dep minister?flat.gov military?amod.dep also?advmod.dep president?flat.gov security?compound.dep iraqi?amod.dep international?amod.dep officials?nsubj.dep states?amod.gov government?amod.gov said?parataxis.dep last?amod.dep government?nsubj.dep 
2	10	says?ccomp.gov says?nsubj.gov new?amod.dep million?nummod.dep would?aux.dep like?case.dep could?aux.dep even?advmod.dep one?nummod.dep may?aux.dep many?amod.dep years?obl.dep people?nsubj.dep one?nmod.gov year?obl:tmod.dep

<510> LL/token: -11.50928
<520> LL/token: -11.51031
<530> LL/token: -11.50893
<540> LL/token: -11.50951

0	10	said?ccomp.gov said?nsubj.gov would?aux.dep president?flat.gov obama?nsubj.dep new?amod.dep president?nsubj.dep democratic?amod.dep american?amod.dep said?obl.gov former?amod.dep obama?flat.dep presidential?amod.dep also?advmod.dep political?amod.dep national?amod.dep white?amod.dep john?flat.gov president?amod.gov 
1	10	said?nsubj.gov said?ccomp.gov united?amod.dep two?nummod.dep minister?amod.gov according?case.dep minister?flat.gov military?amod.dep also?advmod.dep security?compound.dep president?flat.gov international?amod.dep iraqi?amod.dep states?amod.gov government?amod.gov last?amod.dep officials?nsubj.dep said?obl.gov government?nsubj.dep 
2	10	says?ccomp.gov says?nsubj.gov new?amod.dep million?nummod.dep also?advmod.dep like?case.dep would?aux.dep may?aux.dep one?nummod.dep could?aux.dep even?advmod.dep people?nsubj.dep many?amod.dep years?obl.dep one?nmod.gov make?ob

<760> LL/token: -11.51207
<770> LL/token: -11.51209
<780> LL/token: -11.51061
<790> LL/token: -11.51248

0	10	said?ccomp.gov said?nsubj.gov would?aux.dep president?flat.gov obama?nsubj.dep new?amod.dep democratic?amod.dep former?amod.dep also?advmod.dep president?nsubj.dep american?amod.dep said?obl.gov obama?flat.dep presidential?amod.dep white?amod.dep john?flat.gov political?amod.dep watch?obj.gov national?amod.dep 
1	10	said?nsubj.gov said?ccomp.gov united?amod.dep two?nummod.dep minister?amod.gov also?advmod.dep minister?flat.gov military?amod.dep states?amod.gov president?flat.gov international?amod.dep security?compound.dep iraqi?amod.dep last?amod.dep government?amod.gov said?obl.gov according?case.dep officials?nsubj.dep government?nsubj.dep 
2	10	new?amod.dep million?nummod.dep says?ccomp.gov says?nsubj.gov said?ccomp.gov also?advmod.dep would?aux.dep could?aux.dep may?aux.dep like?case.dep even?advmod.dep one?nummod.dep one?nmod.gov people?nsubj.dep make?obj.gov many?amod.de

In [29]:
# for coherence optimization
# plt.plot(range(5,31, 5), coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.show()

In [25]:
# display the 5 topics of the trained model as {topic_id: description}
get_topics(model, 5)

{0: '0.009*"said?ccomp.gov" + 0.008*"said?nsubj.gov" + 0.006*"would?aux.dep" + 0.003*"president?flat.gov" + 0.003*"obama?nsubj.dep" + 0.002*"new?amod.dep" + 0.002*"democratic?amod.dep" + 0.002*"president?nsubj.dep" + 0.002*"former?amod.dep" + 0.002*"also?advmod.dep"',
 1: '0.013*"said?ccomp.gov" + 0.012*"said?nsubj.gov" + 0.003*"united?amod.dep" + 0.003*"two?nummod.dep" + 0.002*"also?advmod.dep" + 0.002*"minister?amod.gov" + 0.002*"military?amod.dep" + 0.002*"minister?flat.gov" + 0.002*"states?amod.gov" + 0.002*"president?flat.gov"',
 2: '0.003*"new?amod.dep" + 0.003*"million?nummod.dep" + 0.002*"says?nsubj.gov" + 0.002*"said?ccomp.gov" + 0.002*"also?advmod.dep" + 0.002*"says?ccomp.gov" + 0.002*"would?aux.dep" + 0.002*"may?aux.dep" + 0.002*"could?aux.dep" + 0.001*"like?case.dep"',
 3: '0.019*"said?nsubj.gov" + 0.018*"said?ccomp.gov" + 0.004*"said?parataxis.dep" + 0.003*"told?nsubj.gov" + 0.003*"told?ccomp.gov" + 0.003*"told?obj.gov" + 0.003*"police?nsubj.dep" + 0.003*"two?nummod.dep" +

In [41]:
# Persist the trained model. The context manager closes (and flushes) the file
# handle even if pickling raises, instead of leaving it to garbage collection.
with open('models/naive_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)