In [183]:
import os
os.chdir(os.environ['PROJECT_DIR'])

from graphviz import Source
from datasets import load_dataset # hugging face datasets
from tqdm import tqdm
import pickle

import nltk
# Probe for the English stopword corpus; the lookup result itself is discarded.
# A LookupError from the probe triggers a download of the corpus.
try:
    nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords') # download nltk stopwords
    
import gensim.parsing.preprocessing as gsp
from gensim import utils
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
    
# from nltk.parse.malt import MaltParser 
from malt.malt import MaltParser # source code from nltk library
from lda.LDAMallet import LdaMallet # gensim LDA (gibbs sampling) mallet wrapper 

In [98]:
# Load the project stop-word list: every line of the file becomes one entry.
# The encoding is pinned so decoding does not depend on the platform default.
with open("data/utils/stopwords.txt", encoding="utf-8") as f:
    stop_words = f.read().splitlines()

In [176]:
def reduce_parser(parser: any) -> list[any]:
    '''Collapse an iterable of iterators into a flat list.

    The first item of every inner iterator is taken (via ``next``) and the
    items are returned in the order the inner iterators were produced.
    '''
    first_items = []
    for inner_it in parser:
        first_items.append(next(inner_it))
    return first_items

def tokenize_doc(doc: str) -> list[list[str]]:
    '''Split a document into sentences, each returned as a list of word tokens.'''
    tokenized_sents = []
    for sentence in nltk.sent_tokenize(doc):
        tokenized_sents.append(nltk.word_tokenize(sentence))
    return tokenized_sents

def tag_sents(sents: list[list[str]]) -> list[list[tuple[str, str]]]:
    '''POS tag sentences.

    Args:
        sents: sentences, each given as a list of word tokens.

    Returns:
        One list per input sentence holding the (token, tag) pairs produced
        by ``nltk.pos_tag`` for that sentence.
    '''
    # The notebook only imports the `nltk` package, never a bare `pos_tag`,
    # so the tagger has to be reached through the package namespace.
    return [nltk.pos_tag(sent) for sent in sents]

def get_dependency_trees(docs: list[str], malt_parser_version='maltparser-1.7.2', model_version='engmalt.linear-1.7.mco', verbose: bool = True) -> tuple[tuple[int, ...], list[any]]:
    ''' Calculate dependency relation trees using malt parser.

    Args:
        docs: iterable of raw document strings.
        malt_parser_version: first positional argument handed to MaltParser.
        model_version: second positional argument handed to MaltParser.
        verbose: forwarded to ``MaltParser.parse_sents`` (defaults to the
            previously hard-coded ``True``).

    Returns:
        ``(doc_idxs, dependency_trees)``: two parallel sequences with one
        entry per sentence. ``doc_idxs[k]`` is the position in ``docs`` of the
        document sentence ``k`` came from; ``dependency_trees[k]`` is the first
        parse the parser yields for that sentence. Returns ``((), [])`` when
        ``docs`` produces no sentences.
    '''
    # create <doc_idx, tokenized_sent> list of sents
    # NOTE(review): documents are lowercased before sentence splitting and POS
    # tagging; this may affect tokenizer/tagger accuracy — confirm it is intended.
    indexed_sents = [
        (i, nltk.word_tokenize(sent))
        for i, doc in enumerate(docs)
        for sent in nltk.sent_tokenize(utils.to_unicode(doc.lower()))
    ]

    # Nothing to parse: `zip(*[])` yields nothing, so the two-name unpack below
    # would raise ValueError. Return empty results without starting the parser.
    if not indexed_sents:
        return (), []

    # unzip list of tuples
    doc_idxs, sents = zip(*indexed_sents)

    # initialize malt parser model
    mp = MaltParser(malt_parser_version, model_version, tagger=nltk.pos_tag)

    # create parser <generator> and loop through parser to produce dependency tree for each sentence
    parser = mp.parse_sents(sents, verbose=verbose)
    dependency_trees = reduce_parser(parser)

    return doc_idxs, dependency_trees

def parse_dependency_trees(idxs, trees, sep="%", stopwords=None):
    ''' Convert nltk DependencyGraphs to <dep, reln> pairs.

    Args:
        idxs: document index for each entry of ``trees`` (must be re-iterable).
        trees: dependency graphs exposing ``tree()`` and ``triples()``.
        sep: separator placed between the word and the relation label.
        stopwords: optional collection of words to exclude; when omitted the
            notebook-level ``stop_words`` list is used.

    Returns:
        Dict mapping each document index found in ``idxs`` to a flat list of
        ``"<word><sep><reln>.gov"`` / ``"<word><sep><reln>.dep"`` tokens, in
        first-appearance order of the indices. Indices that never occur in
        ``idxs`` get no key.
    '''
    # A set gives constant-time membership tests instead of scanning a list per word.
    excluded = set(stop_words if stopwords is None else stopwords)

    def valid_word(word):
        # keep alphabetic, non-stop words longer than two characters
        return word not in excluded and word.isalpha() and len(word) > 2

    # initialize document hashmap; dict.fromkeys de-duplicates while keeping a
    # deterministic (first-appearance) key order, unlike iterating a set
    doc_reln_pairs = {i: [] for i in dict.fromkeys(idxs)}

    # parse trees
    for i, tree in zip(idxs, trees):
        # tree() is called only as a validity probe: graphs for which it
        # raises are skipped. Catch Exception rather than everything so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            tree.tree()
        except Exception:
            continue

        for gov, reln, dep in tree.triples():
            if valid_word(gov[0]) and valid_word(dep[0]):
                doc_reln_pairs[i].extend([f"{gov[0]}{sep}{reln}.gov", f"{dep[0]}{sep}{reln}.dep"])

    return doc_reln_pairs

def get_topics(model: any, n_topics: int) -> dict:
    '''Build a dictionary of the model's topics keyed by integer topic id.'''
    raw_topics = dict(model.print_topics(num_topics=n_topics))

    topics_by_id = {}
    for topic_id, topic_repr in raw_topics.items():
        topics_by_id[int(topic_id)] = topic_repr

    return topics_by_id

In [24]:
# https://huggingface.co/datasets/cnn_dailymail/viewer/2.0.0/
N_DOCS = 2000  # number of training articles used for the topic model
dailymail = load_dataset('cnn_dailymail', '2.0.0')
# Slice the rows first so only N_DOCS articles are materialized, instead of
# pulling the whole 'article' column into memory and then truncating it.
texts = dailymail['train'][:N_DOCS]['article']

Found cached dataset cnn_dailymail (/Users/chasemattingly/.cache/huggingface/datasets/cnn_dailymail/2.0.0/2.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [177]:
# Dependency-parse every sentence of `texts`; doc_idxs[k] is the index in
# `texts` of the document that produced dependency_trees[k].
doc_idxs, dependency_trees = get_dependency_trees(texts)

-----------------------------------------------------------------------------
                          MaltParser 1.7.2                             
-----------------------------------------------------------------------------
         MALT (Models and Algorithms for Language Technology) Group          
             Vaxjo University and Uppsala University                         
                             Sweden                                          
-----------------------------------------------------------------------------

Started: Mon Feb 20 22:34:46 EST 2023
  Transition system    : Projective
  Parser configuration : Stack
  Feature model        : eng-liblinear.xml
  Classifier           : liblinear
  Data Format          : /engmalt.linear-1.7/conllx.xml
.          	      1	      3s	    356MB
.          	     10	      3s	    361MB
.          	    100	      3s	    320MB
..........	   1000	      5s	    526MB
..........	   2000	      6s	    469MB
..........	   3000	      8s

In [178]:
# Gather the word/relation tokens per document, then keep only the token lists.
doc_reln_pairs = parse_dependency_trees(idxs=doc_idxs, trees=dependency_trees)
tokens = [doc_tokens for doc_tokens in doc_reln_pairs.values()]

In [179]:
# Vocabulary over all relation tokens, then one bag-of-words vector per document.
id2word = corpora.Dictionary(tokens)
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in tokens]

In [186]:
# Fit a 5-topic LDA model through the Mallet wrapper; MALLET_DIR must be set in the environment.
# NOTE(review): no random seed is passed here — check whether the LdaMallet wrapper
# exposes one so that repeated runs give the same topics.
model = LdaMallet(os.environ['MALLET_DIR'], corpus=corpus, num_topics=5, id2word=id2word)

Coded LDA: 5 topics, 3 topic bits, 111 topic mask
max tokens: 1308
total tokens: 824530
<10> LL/token: -12.48341
<20> LL/token: -12.17335
<30> LL/token: -12.00986
<40> LL/token: -11.92467

0	10	his%poss.dep not%neg.dep new%amod.dep she%nsubj.dep democratic%amod.dep former%amod.dep her%poss.dep first%amod.dep obama%nn.gov john%nn.dep presidential%amod.dep obama%nsubj.dep told%dobj.gov national%amod.dep political%amod.dep white%amod.dep watch%nn.dep barack%nn.dep told%nsubj.gov 
1	10	his%poss.dep she%nsubj.dep her%poss.dep very%advmod.dep really%advmod.dep not%neg.dep new%amod.dep first%amod.dep him%dobj.dep want%xcomp.gov even%advmod.dep get%dobj.gov never%neg.dep good%amod.dep back%advmod.dep still%advmod.dep well%advmod.dep best%amod.dep here%advmod.dep 
2	10	her%poss.dep she%nsubj.dep told%dobj.gov told%nsubj.gov not%neg.dep people%nsubj.dep she%nsubjpass.dep according%prep.dep center%nn.gov found%dobj.gov his%poss.dep watch%nn.dep across%pobj.gov across%prep.dep found%nsubjpass.gov 

<270> LL/token: -11.6808
<280> LL/token: -11.68097
<290> LL/token: -11.68006

0	10	his%poss.dep not%neg.dep new%amod.dep she%nsubj.dep her%poss.dep former%amod.dep democratic%amod.dep president%nn.dep john%nn.dep obama%nn.gov white%amod.dep presidential%amod.dep obama%nsubj.dep political%amod.dep american%amod.dep first%amod.dep national%amod.dep watch%nn.dep bush%nn.gov 
1	10	his%poss.dep she%nsubj.dep very%advmod.dep her%poss.dep first%amod.dep really%advmod.dep new%amod.dep good%amod.dep back%advmod.dep not%neg.dep get%dobj.gov well%advmod.dep even%advmod.dep best%amod.dep never%neg.dep still%advmod.dep off%prt.dep him%dobj.dep great%amod.dep 
2	10	her%poss.dep his%poss.dep she%nsubj.dep told%dobj.gov told%nsubj.gov not%neg.dep according%prep.dep police%nn.dep center%nn.gov told%ccomp.gov found%dobj.gov watch%nn.dep police%nsubj.dep she%nsubjpass.dep her%dobj.dep died%nsubj.gov found%nsubjpass.gov authorities%nsubj.dep reported%nsubj.gov 
3	10	united%amod.dep states%amod.gov not%neg

<560> LL/token: -11.67222
<570> LL/token: -11.67197
<580> LL/token: -11.67348
<590> LL/token: -11.67185

0	10	his%poss.dep not%neg.dep new%amod.dep her%poss.dep former%amod.dep she%nsubj.dep democratic%amod.dep president%nn.dep john%nn.dep obama%nn.gov white%amod.dep presidential%amod.dep obama%nsubj.dep political%amod.dep american%amod.dep national%amod.dep bush%nn.gov first%amod.dep watch%nn.dep 
1	10	his%poss.dep she%nsubj.dep her%poss.dep very%advmod.dep first%amod.dep new%amod.dep back%advmod.dep really%advmod.dep not%neg.dep good%amod.dep best%amod.dep get%dobj.gov him%dobj.dep well%advmod.dep never%neg.dep even%advmod.dep still%advmod.dep here%advmod.dep little%amod.dep 
2	10	her%poss.dep his%poss.dep she%nsubj.dep told%dobj.gov told%nsubj.gov not%neg.dep according%prep.dep police%nn.dep police%nsubj.dep told%ccomp.gov watch%nn.dep she%nsubjpass.dep found%dobj.gov her%dobj.dep died%nsubj.gov authorities%nsubj.dep before%prep.dep center%nn.gov found%nsubjpass.gov 
3	10	united%amo

<860> LL/token: -11.6678
<870> LL/token: -11.66873
<880> LL/token: -11.66941
<890> LL/token: -11.66971

0	10	his%poss.dep not%neg.dep new%amod.dep former%amod.dep democratic%amod.dep her%poss.dep president%nn.dep she%nsubj.dep john%nn.dep obama%nn.gov white%amod.dep presidential%amod.dep obama%nsubj.dep american%amod.dep political%amod.dep national%amod.dep bush%nn.gov barack%nn.dep watch%nn.dep 
1	10	his%poss.dep she%nsubj.dep her%poss.dep very%advmod.dep first%amod.dep really%advmod.dep back%advmod.dep not%neg.dep new%amod.dep good%amod.dep him%dobj.dep never%neg.dep well%advmod.dep best%amod.dep last%amod.dep still%advmod.dep get%dobj.gov second%amod.dep here%advmod.dep 
2	10	her%poss.dep his%poss.dep told%dobj.gov she%nsubj.dep not%neg.dep told%nsubj.gov according%prep.dep police%nn.dep told%ccomp.gov watch%nn.dep she%nsubjpass.dep found%dobj.gov police%nsubj.dep authorities%nsubj.dep before%prep.dep her%dobj.dep cnn%dobj.dep contributed%nsubj.gov died%nsubj.gov 
3	10	united%amod.d

In [188]:
# Display the topics; 5 matches the num_topics the model was built with.
get_topics(model, 5)

{0: '0.012*"his%poss.dep" + 0.008*"not%neg.dep" + 0.005*"new%amod.dep" + 0.003*"she%nsubj.dep" + 0.003*"democratic%amod.dep" + 0.003*"her%poss.dep" + 0.003*"former%amod.dep" + 0.003*"president%nn.dep" + 0.003*"obama%nn.gov" + 0.003*"white%amod.dep"',
 1: '0.016*"his%poss.dep" + 0.005*"her%poss.dep" + 0.005*"she%nsubj.dep" + 0.003*"first%amod.dep" + 0.003*"very%advmod.dep" + 0.002*"really%advmod.dep" + 0.002*"not%neg.dep" + 0.002*"back%advmod.dep" + 0.002*"new%amod.dep" + 0.002*"good%amod.dep"',
 2: '0.008*"her%poss.dep" + 0.006*"his%poss.dep" + 0.006*"told%dobj.gov" + 0.005*"she%nsubj.dep" + 0.005*"told%nsubj.gov" + 0.004*"not%neg.dep" + 0.003*"according%prep.dep" + 0.002*"police%nn.dep" + 0.002*"she%nsubjpass.dep" + 0.002*"watch%nn.dep"',
 3: '0.005*"united%amod.dep" + 0.003*"states%amod.gov" + 0.003*"not%neg.dep" + 0.003*"military%amod.dep" + 0.003*"international%amod.dep" + 0.003*"last%amod.dep" + 0.003*"security%nn.dep" + 0.003*"government%amod.gov" + 0.002*"cnn%dep.dep" + 0.002*"a

In [184]:
# Persist the trained model; the context manager closes the file handle even
# if pickling fails (the bare open() call previously left it unclosed).
# NOTE: only unpickle this file from a trusted location — pickle.load can execute arbitrary code.
with open('models/naive_malet_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)