In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
import sqlite3
import sqllite_handler
import copy

  from .autonotebook import tqdm as notebook_tqdm


### TOPIC MODELING w/ LDA and Dynamic LDA

#### a) preprocessing

In [2]:
# LDA topic modeling

import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldaseqmodel import LdaSeqModel
import spacy
import pickle
import math
import ray
# Start Ray with num_cpus=12. ignore_reinit_error=True makes this cell safe to
# re-run in a live kernel: a second ray.init() call would otherwise raise
# because the runtime is already initialised.
ray.init(num_cpus=12, ignore_reinit_error=True)

# Route pyLDAvis output to the notebook and load the small English spaCy pipeline
# that the preprocessing cells reference through the global `nlp`.
pyLDAvis.enable_notebook()
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the document table and discard rows whose subtype is 'editorial-note'.
doc_df = pd.read_csv('tables/tables_69_76/doc.csv')
doc_df = doc_df.loc[doc_df['subtype'] != 'editorial-note']

# Replace falsy values and float NaN in the text column with a single space,
# so every entry handed to the preprocessing step is a non-empty value.
free_text_list = [
    ' ' if (not x or (isinstance(x, float) and math.isnan(x))) else x
    for x in doc_df['text'].values
]

In [4]:
@ray.remote
def preprocess(row):
    """Run one text through the global spaCy pipeline and return its filtered lemmas.

    A token is kept only if it is not punctuation, not a stop word, is tagged
    as outside any named entity (``ent_iob_ == 'O'``) and its surface text is
    at least three characters long. Kept tokens are returned as lower-cased
    lemmas, in document order.
    """
    kept_lemmas = []
    for tok in nlp(row):
        if tok.is_punct or tok.is_stop:
            continue
        if tok.ent_iob_ != 'O' or len(tok.text) < 3:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return kept_lemmas
# Optional extra condition, currently disabled: and t.pos_ in ['PROPN','NOUN']

# Submit one remote task per document, then block until all results are back.
futures = list(map(preprocess.remote, free_text_list))
processed_free_text_list = ray.get(futures)



In [None]:
# Disabled snippet: the pickle dump of processed_free_text_list below is wrapped
# in a string literal, so nothing is written to disk when this cell runs.
'''with open("tables/tables_69_76/lda_processed_entremoved_onlynoun_free_text_list", "wb") as fp:
    pickle.dump(processed_free_text_list, fp)'''

In [5]:
# for dynamic lda: put the preprocessed documents in chronological order

year_list = [int(year) for year in doc_df['year']]

# Pair each document with its year and sort on the year only. sorted() is
# stable, so documents from the same year keep their original relative order.
year_text_tuple = sorted(zip(year_list, processed_free_text_list), key=lambda pair: pair[0])

# Split the sorted pairs back into two parallel lists.
sorted_year_list = [year for year, _ in year_text_tuple]
sorted_text_list = [tokens for _, tokens in year_text_tuple]

#### b.1) gensim

In [6]:
# LDA (gensim): build the vocabulary, vectorise the corpus, fit, visualise.
dictionary = Dictionary(processed_free_text_list)

# Prune the vocabulary in place (keep_n=10000; other thresholds are gensim defaults).
dictionary.filter_extremes(keep_n=10000)

# Bag-of-words representation of every document.
doc_term_matrix = list(map(dictionary.doc2bow, processed_free_text_list))

lda = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=30,
)

# Last expression of the cell, so its return value is what the notebook displays.
gensimvis.prepare(lda, doc_term_matrix, dictionary)

In [None]:
# dynamic LDA (Gensim's dynamic lda is too slow. this code below will not terminate)
# NOTE: this cell rebinds `dictionary` and `doc_term_matrix` from the plain-LDA cell.
dictionary = Dictionary(sorted_text_list)

dictionary.filter_extremes(keep_n=100)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in sorted_text_list]

# Number of documents per year, in chronological order. sort_index() makes the
# ordering explicit instead of relying on the row order that
# value_counts(sort=False) happens to return, which may not be chronological
# on every pandas version. The corpus itself is already sorted by year.
time_slice = pd.Series(sorted_year_list).value_counts(sort=False).sort_index().values

# The slices are passed alongside the corpus, so together they must account
# for every document exactly once.
assert time_slice.sum() == len(doc_term_matrix), "time_slice does not cover the corpus"

ldaseq = LdaSeqModel(corpus = doc_term_matrix, time_slice=list(time_slice), id2word=dictionary, num_topics=5)

#### b.2) tomotopy

In [38]:
# LDA
import tomotopy as tp
import sys

# Build a 30-topic tomotopy LDA model with TermWeight.ONE term weighting.
mdl = tp.LDAModel(k=30, tw=tp.TermWeight.ONE)

# Feed the preprocessed token lists to the model, one document at a time.
for txt in processed_free_text_list:
    mdl.add_doc(txt)

# Train for 100 iterations.
mdl.train(100)

# Assemble the five inputs that pyLDAvis.prepare expects.
topic_term_dists = np.stack(list(map(mdl.get_topic_word_dist, range(mdl.k))))
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
# Renormalise each document's topic distribution so that every row sums to 1.
doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

# start_index=0 and sort_topics=False are passed so the panel is not renumbered or reordered.
pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    start_index=0,
    sort_topics=False,
)

In [40]:
# Keep a handle on the first document stored in the model for inspection.
doc_inst = mdl.docs[0]

In [46]:
# Inspect topic 17; the displayed result is a list of (word, probability) pairs.
mdl.get_topic_words(17)

[('say', 0.1309700310230255),
 ('president', 0.04592324048280716),
 ('ask', 0.03636263683438301),
 ('note', 0.016635632142424583),
 ('point', 0.015993118286132812),
 ('problem', 0.015185386873781681),
 ('reply', 0.015020170249044895),
 ('want', 0.012909053824841976),
 ('tell', 0.012497845105826855),
 ('think', 0.011403736658394337)]

In [43]:
# Topic mixture of the inspected document; the displayed result is a list of (topic id, weight) pairs.
doc_inst.get_topics()

[(13, 0.16549718379974365),
 (0, 0.15226240456104279),
 (19, 0.15165044367313385),
 (10, 0.1446617841720581),
 (24, 0.12272237241268158),
 (15, 0.09451723098754883),
 (11, 0.06528868526220322),
 (8, 0.029181264340877533),
 (12, 0.02236236073076725),
 (17, 0.022185254842042923)]

In [14]:
# dynamic LDA (very fast)

# Map every distinct year to its rank (0, 1, 2, ...). The model is created with
# one timepoint per distinct year, so using `year - min(year)` as the timepoint
# would overshoot that range whenever a year is missing from the corpus. The
# lookup table also avoids recomputing min() for every document.
distinct_years = sorted(set(sorted_year_list))
year_to_timepoint = {year: rank for rank, year in enumerate(distinct_years)}

# define model
# NOTE: this rebinds `mdl`, replacing the plain LDA model defined earlier.
mdl = tp.DTModel(k=30, t=len(distinct_years), tw=tp.TermWeight.ONE)

# add documents, each tagged with the timepoint of its year
for idx,txt in enumerate(sorted_text_list):
    time_pt = year_to_timepoint[sorted_year_list[idx]]
    mdl.add_doc(words=txt, timepoint=time_pt)

# train model
mdl.train(100)



In [36]:
# Follow a single topic (id 15) through time: print its top 5 words at every timepoint.
for t in range(mdl.num_timepoints):
    results = mdl.get_topic_words(topic_id=15,timepoint=t,top_n=5)
    # x[0] keeps only the first element of each result entry (the word, as the printed output shows).
    print(f't:{t}, top words: {list(map(lambda x: x[0],results))}')

t:0, top words: ['time', 'standard', 'ask', 'give', 'feel']
t:1, top words: ['give', 'ask', 'feel', 'conversation', 'chance']
t:2, top words: ['ask', 'give', 'feel', 'wish', 'say']
t:3, top words: ['say', 'ask', 'government', 'feel', 'war']
t:4, top words: ['ask', 'decision', 'feel', 'result', 'accept']
t:5, top words: ['agreement', 'agree', 'position', 'ask', 'accept']
t:6, top words: ['ask', 'agree', 'position', 'agreement', 'secretary']
t:7, top words: ['secretary', 'ask', 'agree', 'position', 'agreement']
t:8, top words: ['secretary', 'state', 'ask', 'negotiation', 'agree']
t:9, top words: ['secretary', 'state', 'know', 'right', 'intervene']
t:10, top words: ['state', 'secretary', 'talk', 'nature', 'salt']
t:11, top words: ['state', 'oil', 'instruction', 'multiple', 'provide']
t:12, top words: ['oil', 'state', 'subject', 'statement', 'company']
t:13, top words: ['oil', 'market', 'supply', 'world', 'state']
t:14, top words: ['oil', 'concern', 'price', 'time', 'believe']


### Parallel removal of named entities from free texts (save in original code)

In [1]:
import pandas as pd
import numpy as np
import spacy
import pickle
import math
import ray
# Start Ray with num_cpus=12. ignore_reinit_error=True makes this cell safe to
# re-run in a live kernel: a second ray.init() call would otherwise raise
# because the runtime is already initialised.
ray.init(num_cpus=12, ignore_reinit_error=True)

# spaCy pipeline referenced as the global `nlp` by the entity-removal task.
nlp = spacy.load('en_core_web_sm')

# Suffix appended to the output pickle's file name.
name_extension = '_52_88_entremoved'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@ray.remote
def remove_entities(txt):
    """Return ``txt`` with every token that is part of a named entity dropped.

    The text is parsed with the global spaCy pipeline ``nlp``. Only tokens
    tagged ``ent_iob_ == 'O'`` (outside any entity) are kept. A single space
    is appended after a kept token when the token reports trailing whitespace;
    whitespace that followed a dropped token is dropped with it.
    """
    pieces = []
    for tok in nlp(txt):
        if tok.ent_iob_ != 'O':
            continue
        pieces.append((tok.text + ' ') if tok.whitespace_ else tok.text)
    return "".join(pieces)

In [3]:
# Load the document table and replace falsy / float-NaN texts with a single space.
doc_df = pd.read_csv('tables/tables_52_88/doc.csv')
free_text_list = [
    ' ' if (not x or (isinstance(x, float) and math.isnan(x))) else x
    for x in doc_df['text'].values
]

# Submit one remote task per text, then block until all results are back.
# NOTE: `free_text_list` is rebound here from the raw texts to the entity-free texts.
futures = list(map(remove_entities.remote, free_text_list))
free_text_list = ray.get(futures)
print('entities removed from free texts.')

# Persist the cleaned texts; the file name carries the `name_extension` suffix.
with open("tables/tables_52_88/free_text_list"+name_extension, "wb") as fp:
    pickle.dump(free_text_list, fp)



entities removed from free texts.


### INSTITUTION EXTRACTION