In [1]:
# Catherine Christenson 
# Code utilized to pre-process and run LDAseq model for Christenson, Cardiff 2024 published in Hydrogeology Journal
# Final product of this code is the completed model and matrix of topic-document probabilities

In [1]:
# import models 
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaSeqModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
import pickle as pkl

from wordcloud import WordCloud 
import pingouin as pg
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(dask_version) > LooseVersion("2.9.0"):
  if LooseVersion(dask_version) > LooseVersion("2.9.0"):
  if LooseVersion(pd.__version__) < "0.25.0":
  other = LooseVersion(other)
  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)
  **kwargs


In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'copyright','all','rights','reserved','blackwell','wiley','american','geophysical','union'])

In [4]:
# Raw Data File
GWarticles = 'GW_Hydro_Master_1.csv'
# read csv into pandas dataframe
df = pd.read_csv(GWarticles)
df.head()

# read abstract field only into a dataframe 
df = pd.read_csv(GWarticles, usecols=['Authors','Title','Year','Journal','Abstract'])
len(df)

39850

In [6]:
#order dataframe according to timestamps
df.sort_values(by=['Year'], inplace=True, ascending=True)

#remove NaN abstracts
df = df[df.Abstract != '[No abstract available]']
len(df)

37705

In [7]:
#resetting index; drop empty rows
df2 = df.reset_index()
len(df2)

37705

In [23]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [24]:
# for lda seq, corpus must be saved in the order of the time slices 
df3 = df2.sort_values(by=['Year'])
len(df3)

37547

In [31]:
df3

Unnamed: 0,index,Authors,Title,Year,Journal,Abstract
0,235,"Klaer F.H., Jr.",Bacteriological and Chemical Factors in Induce...,1963,GW,The lowering of ground?water levels by pumping...
23,221,"Sampayo F.F., Wilke H.R.",Temperature and Phosphates as Ground?Water Tra...,1963,GW,This study was undertaken to determine the eff...
24,222,Hancock J.C.,Public Health Aspects of Individual Water Wells,1963,GW,"Local government in Michigan, and perhaps in o..."
25,223,"Harshbarger J.W., Ferris J.G.",Interdisciplinary Training Program in Scientif...,1963,GW,Hydrology is the science underlying the develo...
26,224,Belter W.G.,Waste Management Activities in the Atomic Ener...,1963,GW,The technical and administrative aspects of ra...
...,...,...,...,...,...,...
36638,33477,Sha B.; Johansson J.H.; Tunved P.; Bohlin-Nizz...,Sea Spray Aerosol (SSA) as a Source of Perfluo...,2022,ES&T,The effective enrichment of perfluoroalkyl aci...
36637,33478,Zhou R.; Zhan H.; Wang Y.,On the role of rock matrix to heat transfer in...,2022,JoCH,"In this study, a fully coupled analytical mode..."
36636,33479,Van Thang N.; Thu H.N.P.; Hao L.C.,Uranium isotopes in groundwater in Ho Chi Minh...,2022,JoCH,Groundwater is regularly used for many purpose...
36634,33481,Zhou Z.; Fu Q.-L.; Fujii M.; Waite T.D.,Complementary Elucidation of the Molecular Cha...,2022,ES&T,The formula assignment of the Fourier transfor...


In [26]:
# Determine lengths for Timeslices
df_ts = df3.groupby('Year').count()
print(df_ts)

time_slice = (405,1305,2592,5425,8966,13939,4915)
print(time_slice)

      index  Authors  Title  Journal  Abstract
Year                                          
1963     41       41     41       41        41
1964     35       35     35       35        35
1965     49       49     49       49        49
1966     57       57     57       57        57
1967     62       62     62       62        62
1968     80       80     80       80        80
1969     81       81     81       81        81
1970    105      105    105      105       105
1971    108      108    108      108       108
1972    125      125    125      125       125
1973    111      111    111      111       111
1974    119      118    119      119       119
1975    132      132    132      132       132
1976    133      132    133      133       133
1977    135      135    135      135       135
1978    144      144    144      144       144
1979    193      193    193      193       193
1980    158      158    158      158       158
1981    197      197    197      197       197
1982    199  

In [27]:
print(time_slice)

(405, 1305, 2592, 5425, 8966, 13939, 4915)


In [28]:
from gensim.test.utils import datapath
import os

In [39]:
#convert to string
df3['Abstract']=df3['Abstract'].apply(str)

#Convert to List
data = df3['Abstract'].values.tolist()
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove single quotes
data = [re.sub("\'", "", sent) for sent in data]

#pprint(data[:1])
    
data_words = list(sent_to_words(data))

print(data_words[:1])

[['the', 'lowering', 'of', 'ground', 'water', 'levels', 'by', 'pumping', 'from', 'horizontal', 'or', 'vertical', 'wells', 'near', 'surface', 'stream', 'may', 'cause', 'water', 'to', 'move', 'from', 'the', 'stream', 'into', 'the', 'water', 'bearing', 'materials', 'by', 'the', 'process', 'known', 'as', 'induced', 'infiltration', 'in', 'such', 'cases', 'the', 'natural', 'deposits', 'of', 'sand', 'and', 'gravel', 'serve', 'as', 'large', 'natural', 'filter', 'beds', 'effectively', 'removing', 'or', 'reducing', 'turbidity', 'organic', 'matter', 'and', 'pathogenic', 'bacteria', 'this', 'paper', 'discusses', 'the', 'general', 'processes', 'by', 'which', 'such', 'removal', 'is', 'accomplished', 'as', 'well', 'as', 'the', 'significance', 'of', 'certain', 'changes', 'in', 'chemical', 'characteristics', 'of', 'the', 'water', 'as', 'it', 'passes', 'from', 'surface', 'source', 'to', 'an', 'underground', 'point', 'of', 'collection', 'by', 'better', 'understanding', 'of', 'the', 'processes', 'involved

In [40]:
# remove stopwords prior to 
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [41]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [42]:
len(data_words_nostops)

37547

In [43]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words_nostops, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words_nostops], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [44]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# make sure the spacy english dataset is downloaded to system (python3 -m spacy download en)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# complete lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[['lower', 'ground', 'water', 'level', 'pump', 'horizontal', 'vertical', 'well', 'surface', 'stream', 'cause', 'water', 'move', 'stream', 'water', 'bearing', 'material', 'process', 'know', 'induce', 'infiltration', 'case', 'natural', 'deposit', 'serve', 'large', 'natural', 'filter', 'bed', 'effectively', 'remove', 'reduce', 'turbidity', 'paper_discusse', 'general', 'process', 'removal', 'accomplish', 'well', 'significance', 'certain', 'change', 'chemical', 'characteristic', 'water', 'pass', 'surface', 'source', 'underground', 'point', 'collection', 'well', 'understanding', 'process', 'involve', 'bacteriological', 'chemical', 'quantity', 'infiltrate', 'water', 'supply', 'improve']]
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 3), (33, 1), (34, 1), (35

[[('accomplish', 1),
  ('bacteriological', 1),
  ('bearing', 1),
  ('bed', 1),
  ('case', 1),
  ('cause', 1),
  ('certain', 1),
  ('change', 1),
  ('characteristic', 1),
  ('chemical', 2),
  ('collection', 1),
  ('deposit', 1),
  ('effectively', 1),
  ('filter', 1),
  ('general', 1),
  ('ground', 1),
  ('horizontal', 1),
  ('improve', 1),
  ('induce', 1),
  ('infiltrate', 1),
  ('infiltration', 1),
  ('involve', 1),
  ('know', 1),
  ('large', 1),
  ('level', 1),
  ('lower', 1),
  ('material', 1),
  ('move', 1),
  ('natural', 2),
  ('paper_discusse', 1),
  ('pass', 1),
  ('point', 1),
  ('process', 3),
  ('pump', 1),
  ('quantity', 1),
  ('reduce', 1),
  ('removal', 1),
  ('remove', 1),
  ('serve', 1),
  ('significance', 1),
  ('source', 1),
  ('stream', 2),
  ('supply', 1),
  ('surface', 2),
  ('turbidity', 1),
  ('underground', 1),
  ('understanding', 1),
  ('vertical', 1),
  ('water', 5),
  ('well', 3)]]

In [99]:
import pickle

# save term document frequency
with open("data/id2word_full.pkl", 'wb') as f:
      pickle.dump(id2word, f)
        
# save corpus
with open("data/cleaned_corpus_full.pkl", 'wb') as f:
      pickle.dump(corpus, f)


In [34]:
num_topics = 30

#run lda seq 
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, 
                                 id2word=id2word, 
                                 time_slice=time_slice, 
                                 num_topics=num_topics)

Number of Topics:  18


  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  convergence = np.fabs((bound - old_bound) / old_bound)
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old 

In [261]:
#save results
lda_model.save(f"trained_models/trained_lda_model_search_broad_{num_topics}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
from gensim.models import LdaSeqModel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Load your LDAseq model
lda_seq_model = LdaSeqModel.load(f"trained_models/trained_lda_model_search_broad_{num_topics}")

In [9]:
# Get the topics at a specific time slice (e.g., at time 0)
topics_at_time_6 = lda_seq_model.print_topics(time=6)
topics_at_time_6

[[('method', 0.04253720744448855),
  ('estimate', 0.03071736355004643),
  ('datum', 0.030054105004887255),
  ('use', 0.02871803544050183),
  ('parameter', 0.016825508781782024),
  ('base', 0.015983608497957182),
  ('uncertainty', 0.01523905383427519),
  ('approach', 0.01444132157031081),
  ('study', 0.012891786285284161),
  ('error', 0.011032182341607676),
  ('analysis', 0.010709912553560491),
  ('estimation', 0.010709589650874929),
  ('result', 0.010534742523175315),
  ('information', 0.008103625356790288),
  ('apply', 0.007639053705841975),
  ('value', 0.007595795698803409),
  ('obtain', 0.006256518529909115),
  ('propose', 0.006085613679159823),
  ('provide', 0.00607801703720052),
  ('technique', 0.00586044025047609)],
 [('solution', 0.036780626824410946),
  ('flow', 0.03128995017483652),
  ('equation', 0.02570147634204754),
  ('method', 0.019331259075562678),
  ('numerical', 0.016176219555421364),
  ('analytical', 0.013883027726836342),
  ('use', 0.012906950895981727),
  ('boundary

In [5]:
# print topics by time slice
lda_seq_model.print_topics()
#num_topics = 45

[[('parameter', 0.05117686510762252),
  ('method', 0.04186290321778942),
  ('estimate', 0.029651607060827742),
  ('datum', 0.027490347712265165),
  ('use', 0.02630106146198818),
  ('base', 0.0130575204491179),
  ('uncertainty', 0.01290188822378545),
  ('approach', 0.011852308815696088),
  ('estimation', 0.010913151601543549),
  ('error', 0.010856810371293869),
  ('analysis', 0.01041490998683603),
  ('result', 0.00922141914822811),
  ('measurement', 0.008866192756074594),
  ('technique', 0.008420752307661697),
  ('information', 0.007967706715804492),
  ('apply', 0.007425323650984531),
  ('set', 0.007409654736463187),
  ('value', 0.00720526229358254),
  ('study', 0.006599405963864493),
  ('provide', 0.005997506804259293)],
 [('solution', 0.03218311835246815),
  ('flow', 0.02787649437541428),
  ('equation', 0.0255906129086066),
  ('problem', 0.02090293181806645),
  ('method', 0.018286540230348893),
  ('use', 0.014026174116145668),
  ('numerical', 0.012271915947649443),
  ('dimensional', 0

In [6]:
import gc
gc.collect()

46

In [7]:
df_full = pd.DataFrame()

# create matrix of topic probabilities for every document (df_full)
for i in range(0,37547):
    #doc_topics - checks the topic proportions of documents 
    doc = lda_seq_model.doc_topics(i)
    dataframe = pd.DataFrame(doc)
    df10 = dataframe.T
    df_full = df_full.append(df10, ignore_index=True,sort=False)

In [8]:
type(lda_seq_model)

gensim.models.ldaseqmodel.LdaSeqModel

In [9]:
df_full.head

<bound method NDFrame.head of              0         1         2         3         4         5         6   \
0      0.000158  0.000158  0.224843  0.087036  0.000158  0.000158  0.000158   
1      0.000076  0.000076  0.000076  0.000076  0.119629  0.000076  0.092476   
2      0.000175  0.000175  0.740496  0.075089  0.000175  0.000175  0.000175   
3      0.000088  0.026215  0.857437  0.000088  0.000088  0.000088  0.000088   
4      0.000060  0.000060  0.654061  0.036312  0.017148  0.000060  0.000060   
...         ...       ...       ...       ...       ...       ...       ...   
37542  0.000101  0.000101  0.000101  0.000101  0.000101  0.000101  0.101279   
37543  0.000069  0.086232  0.000069  0.000069  0.222444  0.000069  0.000069   
37544  0.000140  0.000140  0.000140  0.060503  0.000140  0.019207  0.206539   
37545  0.246117  0.045967  0.000095  0.000095  0.000095  0.000095  0.000095   
37546  0.000078  0.076817  0.071262  0.447977  0.018151  0.000078  0.027360   

             7       