In [6]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
import sqlite3
import sqllite_handler
import copy

### Transfer the block below into analysis.py (ALREADY DONE — delete the block below)

In [None]:
# Export BERTopic assignments: read every transcript text from the SQLite
# database, look up the topic each text is assigned by the saved BERTopic
# model, and write two CSV tables (topic descriptions, document -> topic).
conn = sqlite3.connect('tables/texts_69_76.db')
try:
    cur = conn.cursor()
    res = cur.execute("SELECT TEXT FROM transcript")
    fetched = res.fetchall()
finally:
    # All rows are already in memory, so release the database handle even if
    # the query failed (the original cell left the connection open).
    conn.close()

# fetchall() yields one-element tuples; keep only the text column.
free_text_list = [row[0] for row in fetched]

topic_model = BERTopic.load("plots/topic_model_69_76")

doc_df = pd.read_csv('tables/doc_69_76.csv')

messy_doc_topic_df = topic_model.get_document_info(free_text_list)

# One row per distinct (topic name, top words) pair.
topic_desc_df = messy_doc_topic_df[['Name','Top_n_words']].drop_duplicates(ignore_index=True)
# NOTE(review): the two columns are paired by index label, which presumes the
# rows of doc_df and of the transcript table are in the same order — confirm.
doc_topic_df = pd.DataFrame({'id_to_text':doc_df['id_to_text'],'assigned_topic':messy_doc_topic_df['Name']})

topic_desc_df.to_csv('tables/topic_descp_69_76.csv')
doc_topic_df.to_csv('tables/doc_topic_69_76.csv')

### Under construction below

### LDA and Dynamic LDA

In [3]:
# LDA topic modeling

import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldaseqmodel import LdaSeqModel
import spacy
import pickle
import math
import ray
# Start the local Ray runtime with 12 CPUs. ignore_reinit_error=True makes the
# cell safe to re-run in a live kernel: without it a second ray.init() call
# raises because Ray is already initialised.
ray.init(num_cpus=12, ignore_reinit_error=True)

# Render pyLDAvis visualisations inline in the notebook.
pyLDAvis.enable_notebook()
# Small English spaCy pipeline used by the preprocessing cells.
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm
  import imp


In [7]:
# Load the document table and drop the rows whose subtype is 'editorial-note'.
all_docs_df = pd.read_csv('tables/tables_69_76/doc_69_76v30.csv')
doc_df = all_docs_df[all_docs_df['subtype']!='editorial-note']

# Replace empty or NaN texts with a single space so that every entry handed to
# the later text-processing steps is a string.
free_text_list = [
    ' ' if not text or (isinstance(text, float) and math.isnan(text)) else text
    for text in doc_df['text'].values
]

In [8]:
@ray.remote
def preprocess(row):
    """Run spaCy over one document and return its filtered, lower-cased lemmas.

    A token is kept only when it is not punctuation, is not a stop word, lies
    outside any named entity (``ent_iob_ == 'O'``) and its text has at least
    three characters.
    """
    lemmas = []
    for token in nlp(row):
        if token.is_punct or token.is_stop:
            continue
        if token.ent_iob_ != 'O' or len(token.text) < 3:
            continue
        # Optional extra filter (currently disabled): token.pos_ in ['PROPN','NOUN']
        lemmas.append(token.lemma_.lower())
    return lemmas


# Submit one Ray task per document, then wait for all of the results.
futures = [preprocess.remote(doc_text) for doc_text in free_text_list]
processed_free_text_list = ray.get(futures)



In [None]:
# Disabled: the pickle dump below is wrapped in a bare string literal, so this
# cell only evaluates to that string and nothing is written to disk. Remove
# the surrounding triple quotes to save processed_free_text_list.
'''with open("tables/tables_69_76/lda_processed_entremoved_onlynoun_free_text_list", "wb") as fp:
    pickle.dump(processed_free_text_list, fp)'''

In [8]:
# LDA (gensim): fit a 30-topic model on the preprocessed documents and show it
# with pyLDAvis.

# Fixed seed so that re-running the cell reproduces the same topics; the
# original fit was unseeded and changed from run to run.
LDA_RANDOM_STATE = 42

dictionary = Dictionary(processed_free_text_list)

# Cap the vocabulary at 10,000 tokens; the other filter_extremes thresholds
# are left at their gensim defaults.
dictionary.filter_extremes(keep_n=10000)

# Bag-of-words representation of every document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_free_text_list]

lda = LdaModel(corpus = doc_term_matrix, id2word=dictionary, num_topics=30, random_state=LDA_RANDOM_STATE)

# Last expression of the cell: the prepared visualisation is displayed inline.
gensimvis.prepare(lda, doc_term_matrix, dictionary)

In [14]:
# DYNAMIC LDA: put the documents in chronological order

# Pair every document's year with its token list, then sort the pairs by year
# only (ties keep their original relative order).
year_list = [int(year) for year in doc_df['year']]
year_text_tuple = sorted(
    zip(year_list, processed_free_text_list),
    key=lambda pair: pair[0],
)
sorted_year_list = [year for year, _tokens in year_text_tuple]
sorted_text_list = [tokens for _year, tokens in year_text_tuple]

In [None]:
# Gensim's dynamic LDA (LdaSeqModel). Note: it is too slow in practice.
dictionary = Dictionary(sorted_text_list)

# Keep a very small vocabulary (100 tokens) for this model.
dictionary.filter_extremes(keep_n=100)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in sorted_text_list]

# Number of documents per year, in ascending year order. The corpus is sorted
# by year, so the slice sizes must be chronological too; sort_index() enforces
# that explicitly instead of relying on the row order value_counts(sort=False)
# happens to return.
time_slice = pd.Series(sorted_year_list).value_counts(sort=False).sort_index().values

ldaseq = LdaSeqModel(corpus = doc_term_matrix, time_slice=list(time_slice), id2word=dictionary, num_topics=5)

In [27]:
# Static LDA fitted with tomotopy, visualised with pyLDAvis

import tomotopy as tp
import sys

# Five-topic model.
mdl = tp.LDAModel(k=5)

# Register every preprocessed document with the model.
for txt in processed_free_text_list:
    mdl.add_doc(txt)

# 100 rounds of 10 iterations each (1000 iterations in total).
for _round in range(100):
    mdl.train(10)

# Collect the arrays that pyLDAvis.prepare takes as input.
topic_term_dists = np.stack([mdl.get_topic_word_dist(topic) for topic in range(mdl.k)])
doc_topic_dists = np.stack([document.get_topic_dist() for document in mdl.docs])
# Rescale each row so that it sums to one.
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(document.words) for document in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

# Last expression of the cell: the prepared visualisation is displayed inline.
pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    start_index=0,
    sort_topics=False,
)

In [28]:
# Dynamic topic model (DTModel) fitted with tomotopy

import tomotopy as tp
import sys

# Map every distinct year to a dense 0-based timepoint index. The model is
# created with t = number of distinct years, so valid timepoints are
# 0..t-1. Using `year - min(year)` would produce an out-of-range timepoint as
# soon as a year is missing from the data; the mapping gives the same result
# for contiguous years and stays valid when there are gaps. It also avoids
# recomputing min() for every document.
unique_years = sorted(set(sorted_year_list))
year_to_timepoint = {year: index for index, year in enumerate(unique_years)}

# define model
mdl = tp.DTModel(k=5, t=len(unique_years), tw=tp.TermWeight.ONE)

# add documents, each tagged with the timepoint of its year
for year, txt in zip(sorted_year_list, sorted_text_list):
    mdl.add_doc(words=txt, timepoint=year_to_timepoint[year])

# train model
mdl.train(1000)

In [32]:
# For topic 3, print the 10 words returned by get_topic_words at every
# timepoint of the model, with a '+++' separator after each timepoint.
for timepoint in range(mdl.num_timepoints):
    topic_words = mdl.get_topic_words(topic_id=3, timepoint=timepoint, top_n=10)
    print(topic_words)
    print('+++')

[('community', 0.029121089726686478), ('island', 0.0221136212348938), ('violence', 0.01251459401100874), ('cyprus', 0.009521933272480965), ('cypriot', 0.009519968181848526), ('intercommunal', 0.009180083870887756), ('cypriots', 0.007131969556212425), ('independence', 0.004716104827821255), ('police', 0.0037465342320501804), ('nation', 0.0037308295723050833)]
+++
[('rough', 0.012905229814350605), ('incidental', 0.010170548222959042), ('intelligence', 0.010099736973643303), ('wh43529', 0.009641645476222038), ('intra', 0.009039051830768585), ('necessitate', 0.00892449077218771), ('durable', 0.008824620395898819), ('yak', 0.008353336714208126), ('primin', 0.008077332749962807), ('party', 0.008073585107922554)]
+++
+++
[('minister', 0.053487710654735565), ('foreign', 0.045222993940114975), ('secretary', 0.030277006328105927), ('alexandraki', 0.017301175743341446), ('tzounis', 0.01692361757159233), ('question', 0.005949827842414379), ('greeks', 0.005328564904630184), ('know', 0.0047304276376

### Below: parallel removal of named entities from the free texts (save in original code)

In [1]:
import pandas as pd
import numpy as np
import spacy
import pickle
import math
import ray
# Start the local Ray runtime with 12 CPUs. ignore_reinit_error=True makes the
# cell safe to re-run in a live kernel: without it a second ray.init() call
# raises because Ray is already initialised.
ray.init(num_cpus=12, ignore_reinit_error=True)

# Small English spaCy pipeline used for named-entity detection.
nlp = spacy.load('en_core_web_sm')

# Suffix appended to the name of the pickle file written further down.
name_extension = '_52_88_entremoved'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@ray.remote
def remove_entities(txt):
    """Return ``txt`` with every token that belongs to a named entity removed.

    The text is parsed with spaCy; tokens outside any entity
    (``ent_iob_ == 'O'``) are kept, each followed by a single space when the
    token has trailing whitespace in the original text.
    """
    kept_pieces = []
    for token in nlp(txt):
        if token.ent_iob_ != 'O':
            continue
        trailing = ' ' if token.whitespace_ else ''
        kept_pieces.append(token.text + trailing)
    return ''.join(kept_pieces)

In [3]:
# Load the document table; empty or NaN texts become a single space so that
# every entry passed to remove_entities is a string.
doc_df = pd.read_csv('tables/tables_52_88/doc.csv')
free_text_list = [
    ' ' if not text or (isinstance(text, float) and math.isnan(text)) else text
    for text in doc_df['text'].values
]

# One Ray task per document; free_text_list is then replaced by the
# entity-stripped texts.
futures = [remove_entities.remote(text) for text in free_text_list]
free_text_list = ray.get(futures)
print('entities removed from free texts.')

# Persist the cleaned texts as a pickle next to the source table.
output_path = "tables/tables_52_88/free_text_list"+name_extension
with open(output_path, "wb") as out_file:
    pickle.dump(free_text_list, out_file)



entities removed from free texts.


### INSTITUTION EXTRACTION