In [1]:
# Goal: Data preprocessing, NLP, topic models with HDP (Gensim), visualization with pyLDAvis (preliminary)
# Result: implementations of topic models with high coherence scores from each library
# Output: pyLDAvis, wordcloud, csvs (document-topic, topic-document...)

# Load helper libraries
import re
from pprint import pprint
import pandas as pd
import numpy as np

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Configure logging (ERROR level and above) and suppress DeprecationWarnings
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Text pre-processing model (lemmatization) from spacy (en) and nltk
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn
import wordcloud
#import sklearn
#import bokeh

In [2]:
# DATA Read in docs (corpus) from csv
docs = pd.read_csv("data/ERI-combined-2009-2019.csv")

# Concatenate title and abstract to new column for topic model.
# astype(str) keeps the concatenation from failing on a missing value
# (a missing field becomes the literal text 'nan').
docs['combined'] = docs['title'].astype(str) + ' ' + docs['abstract'].astype(str)

# Character lengths of title, abstract and combined text, stored as new columns.
# The vectorized .str.len() yields NaN for a missing value, whereas .apply(len)
# raises TypeError on it; for all-string columns the results are the same.
docs['title_len'] = docs['title'].str.len()
docs['abstract_len'] = docs['abstract'].str.len()
docs['combined_len'] = docs['combined'].str.len()

# Summary statistics for combined title and abstract lengths to determine text suitability.
# Kept as the last expression so it is rendered as the cell output.
docs.combined_len.describe()

count    3770.000000
mean     1678.432626
std       687.609548
min       128.000000
25%      1247.000000
50%      1630.000000
75%      1993.000000
max      7083.000000
Name: combined_len, dtype: float64

In [3]:
# DATA Title and abstract to list, collapsing whitespace and removing single quotes
# The whitespace pattern is a raw string: '\s' inside a plain string literal is an
# invalid escape sequence, which Python reports with a warning.
data = [
    # 1) collapse every whitespace run (including new lines) to one space
    # 2) drop single-quote characters
    re.sub(r'\s+', ' ', text).replace("'", "")
    for text in docs['combined'].values.tolist()
]
pprint(data[:1])

  data = [re.sub('\s+', ' ', sent) for sent in data] #remove new line characters


['Streams and Urbanization Urbanization encompasses a diverse array of '
 'watershed alterations that influence the physical, chemical, and biological '
 'characteristics of streams. In this chapter, we summarize lessons learned '
 'from the last half century of research on urban streams and provide a '
 'critique of various mitigation strategies, including recent approaches that '
 'explicitly address geomorphic processes. We focus first on the abiotic '
 'conditions (primarily hydrologic and geomorphic) and their changes in '
 'streams that accompany urbanization, recognizing that these changes may vary '
 'with geomorphic context and climatic region. We then discuss technical '
 'approaches and limitations to (1) mitigating water-quantity and '
 'water-quality degradation through site design, riparian protection, and '
 'structural stormwater-management strategies; and (2) restoring urban streams '
 'in those watersheds where the economic, social, and political contexts can '
 'supp

In [4]:
# NLP Build the stopword list: NLTK's English defaults plus custom additions
from nltk.corpus import stopwords

# Custom terms appended to the default list.
# NOTE(review): stopwords are removed before lemmatization in this notebook, so
# inflected forms (e.g. 'studies') may survive and be lemmatized back to a
# stopword such as 'study' — confirm this is intended.
custom_stop_words = ['data', 'study', 'project', 'research', 'collaborative']
stop_words = stopwords.words('english') + custom_stop_words

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


In [5]:
# NLP Tokenize each sentence into a list of lowercase words
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per input item.

    Each item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: Iterable of documents to tokenize.

    Yields:
        The token list returned by ``simple_preprocess`` for each document,
        in input order.
    """
    for raw_text in sentences:
        # deacc=True asks simple_preprocess to strip accent marks from tokens
        # (per the gensim documentation); punctuation is dropped by the
        # tokenizer itself.
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Tokenize the whole cleaned corpus up front; later cells index into this
# list and iterate over it more than once, so it must not stay a generator.
data_words = list(sent_to_words(data))

# Spot-check: tokens of the first document only.
first_doc_tokens = data_words[:1]
print(first_doc_tokens)

[['streams', 'and', 'urbanization', 'urbanization', 'encompasses', 'diverse', 'array', 'of', 'watershed', 'alterations', 'that', 'influence', 'the', 'physical', 'chemical', 'and', 'biological', 'characteristics', 'of', 'streams', 'in', 'this', 'chapter', 'we', 'summarize', 'lessons', 'learned', 'from', 'the', 'last', 'half', 'century', 'of', 'research', 'on', 'urban', 'streams', 'and', 'provide', 'critique', 'of', 'various', 'mitigation', 'strategies', 'including', 'recent', 'approaches', 'that', 'explicitly', 'address', 'geomorphic', 'processes', 'we', 'focus', 'first', 'on', 'the', 'abiotic', 'conditions', 'primarily', 'hydrologic', 'and', 'geomorphic', 'and', 'their', 'changes', 'in', 'streams', 'that', 'accompany', 'urbanization', 'recognizing', 'that', 'these', 'changes', 'may', 'vary', 'with', 'geomorphic', 'context', 'and', 'climatic', 'region', 'we', 'then', 'discuss', 'technical', 'approaches', 'and', 'limitations', 'to', 'mitigating', 'water', 'quantity', 'and', 'water', 'qua

In [7]:
# NLP Phrase detection: bigram and trigram models (words frequently occurring together)
# A higher threshold produces fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=80)

# The trigram model is trained on the bigram-transformed corpus; it passes no
# min_count, so the library default applies there.
bigrammed_docs = bigram[data_words]
trigram = gensim.models.Phrases(bigrammed_docs, threshold=80)

# Phraser wrappers: a faster way to apply the trained phrase models to a document
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: bigram then trigram transform applied to one specific record
sample_tokens = data_words[1]
print(trigram_mod[bigram_mod[sample_tokens]])

['evidence', 'for', 'widespread', 'creep', 'on', 'the', 'flanks', 'of', 'the', 'sea', 'of', 'marmara', 'transform', 'basin', 'from', 'marine', 'geophysical', 'data', 'wave', 'fields', 'have', 'long', 'been', 'recognized', 'in', 'marine', 'sediments', 'on', 'the', 'flanks', 'of', 'basins', 'and', 'oceans', 'in', 'both', 'tectonically_active', 'and', 'inactive', 'environments', 'the', 'origin', 'of', 'waves', 'hereafter', 'called', 'undulations', 'is', 'controversial', 'competing', 'models', 'ascribe', 'them', 'to', 'depositional', 'processes', 'gravity', 'driven', 'downslope', 'creep', 'or', 'collapse', 'and', 'or', 'tectonic', 'shortening', 'here', 'we', 'analyze', 'pervasive', 'undulation', 'fields', 'identified', 'in', 'swath', 'bathymetry', 'and', 'new', 'high', 'resolution', 'multichannel_seismic', 'mcs', 'reflection', 'data', 'from', 'the', 'sea', 'of', 'marmara', 'turkey', 'although', 'they', 'exhibit', 'some', 'of', 'the', 'classical', 'features', 'of', 'sediment', 'waves', 'the

In [8]:
# NLP Define functions for stopwords, bigrams, trigrams, and lemmatization
def remove_stopwords(texts):
    """Return each document re-tokenized with stopwords filtered out.

    Every item of ``texts`` is converted with ``str()`` and tokenized by
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    collection are dropped.

    Args:
        texts: Iterable of documents (token lists or strings).

    Returns:
        A list with one list of kept tokens per input document, in input order.
    """
    # Build the lookup once per call: set membership is O(1), whereas scanning
    # the stopword list would be repeated for every token of every document.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Both phrase models are looked up as module-level names.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
        in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    Each document's tokens are joined with single spaces and run through the
    module-level ``nlp`` pipeline; the ``lemma_`` of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.
    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists, one per document.
        allowed_postags: POS tags to retain (the default list is only read,
            never modified).

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]

In [10]:
# NLP Pre-processing pipeline: stopword removal -> bigrams -> trigrams -> lemmatization

# Load the spaCy 'en' model with the parser and NER components disabled (for efficiency).
# lemmatization() looks up this module-level `nlp` when it is called below.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Each stage consumes the output of the previous one.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_words_trigrams = make_trigrams(data_words_bigrams)

# Keep only nouns, adjectives, verbs and adverbs when lemmatizing.
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=kept_postags)

# Spot-check: lemmas of the first document only.
print(data_lemmatized[:1])

[['stream', 'urbanization', 'urbanization', 'encompass', 'diverse', 'array', 'watershed', 'alteration', 'influence', 'physical', 'chemical', 'biological', 'characteristic', 'stream', 'chapter', 'summarize', 'lessons_learn', 'last', 'half', 'century', 'urban', 'stream', 'provide', 'critique', 'various', 'mitigation', 'strategy', 'include', 'recent', 'approach', 'explicitly', 'address', 'geomorphic', 'process', 'focus', 'first', 'abiotic', 'condition', 'primarily', 'hydrologic', 'geomorphic', 'change', 'stream', 'accompany', 'urbanization', 'recognize', 'change', 'may', 'vary', 'geomorphic', 'context', 'climatic', 'region', 'discuss', 'technical', 'approach', 'limitation', 'mitigate', 'water', 'quantity', 'water', 'quality', 'degradation', 'site', 'design', 'riparian', 'protection', 'structural', 'stormwater', 'management', 'strategy', 'restore', 'urban', 'stream', 'watershed', 'economic', 'social', 'political', 'context', 'support', 'activity']]


In [11]:
# NLP Build the Gensim dictionary and bag-of-words corpus for topic modeling

# Tokenized documents used for the corpus (and for coherence scoring in later cells)
texts = data_lemmatized

# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(texts)

# Term Document Frequency: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

# Save dictionary and corpus for future use - optional
#import pickle
#pickle.dump(corpus, open('corpus.pkl', 'wb'))
#id2word.save('dictionary.gensim')

# Readable (term, frequency) view of the first document; the raw id form is corpus[:1].
# Kept as the last expression so it is rendered as the cell output.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('abiotic', 1),
  ('accompany', 1),
  ('activity', 1),
  ('address', 1),
  ('alteration', 1),
  ('approach', 2),
  ('array', 1),
  ('biological', 1),
  ('century', 1),
  ('change', 2),
  ('chapter', 1),
  ('characteristic', 1),
  ('chemical', 1),
  ('climatic', 1),
  ('condition', 1),
  ('context', 2),
  ('critique', 1),
  ('degradation', 1),
  ('design', 1),
  ('discuss', 1),
  ('diverse', 1),
  ('economic', 1),
  ('encompass', 1),
  ('explicitly', 1),
  ('first', 1),
  ('focus', 1),
  ('geomorphic', 3),
  ('half', 1),
  ('hydrologic', 1),
  ('include', 1),
  ('influence', 1),
  ('last', 1),
  ('lessons_learn', 1),
  ('limitation', 1),
  ('management', 1),
  ('may', 1),
  ('mitigate', 1),
  ('mitigation', 1),
  ('physical', 1),
  ('political', 1),
  ('primarily', 1),
  ('process', 1),
  ('protection', 1),
  ('provide', 1),
  ('quality', 1),
  ('quantity', 1),
  ('recent', 1),
  ('recognize', 1),
  ('region', 1),
  ('restore', 1),
  ('riparian', 1),
  ('site', 1),
  ('social', 1),
  

In [12]:
# TOPIC MODEL (HDP - Hierarchical Dirichlet Process)
# Fully unsupervised; infers the number of topics through "posterior inference"
# hdp_to_lda(): get alpha and beta values of an LDA almost equivalent to the current HDP.
# suggested_lda_model(): get a trained LDA model that is closest to the current HDP model.

# Fixed seed so the fitted topics and the coherence score can be reproduced
# after a kernel restart; HDP training involves random initialization.
HDP_RANDOM_STATE = 42

hdpmodel = gensim.models.HdpModel(corpus=corpus, id2word=id2word, random_state=HDP_RANDOM_STATE)

# Term-topic matrix learned during inference (the original note records 150 topics).
# To inspect the most probable words per topic: hdpmodel.show_topics(num_topics=150)
hdp_topics = hdpmodel.get_topics()

# LDA model closest to the HDP model; the pyLDAvis cell visualizes this object.
hdp_lda = hdpmodel.suggested_lda_model()

# c_v coherence of the HDP topics, scored against the lemmatized texts.
hdp_coherence = CoherenceModel(model=hdpmodel, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()
print(hdp_coherence)

  start_time = time.clock()


0.4983231369481387


In [16]:
# TOPIC MODEL (LSI - Latent Semantic Indexing)
# LSI ranks topics by itself and outputs topics in ranked order; requires num_topics to be defined
LSI_NUM_TOPICS = 5  # number of topics requested from LSI (tune here)

lsimodel = gensim.models.LsiModel(corpus=corpus, num_topics=LSI_NUM_TOPICS, id2word=id2word)

# Topics as (topic_id, weighted-terms string) pairs, pretty-printed for inspection
lsitopics = lsimodel.print_topics()
pprint(lsitopics)

# c_v coherence of the LSI topics, scored against the same texts and dictionary as the HDP model
lsi_coherence = CoherenceModel(model=lsimodel, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()
print(lsi_coherence)
# NOTE(review): suggested_lda_model() is only called on the HDP model in this notebook;
# LsiModel does not appear to offer an equivalent — confirm before re-enabling the line below.
#lsi_lda = lsimodel.suggested_lda_model() #Get a trained ldamodel object which is closest to the current lsi model

[(0,
  '0.343*"model" + 0.314*"use" + 0.198*"water" + 0.159*"high" + 0.157*"change" '
  '+ 0.141*"result" + 0.126*"increase" + 0.112*"specie" + 0.109*"base" + '
  '0.105*"scale"'),
 (1,
  '-0.739*"model" + 0.223*"water" + 0.196*"soil" + 0.161*"increase" + '
  '0.155*"plant" + 0.136*"change" + 0.135*"specie" + 0.126*"ecosystem" + '
  '0.103*"high" + 0.094*"effect"'),
 (2,
  '0.708*"water" + -0.333*"specie" + 0.169*"snow" + -0.137*"effect" + '
  '0.136*"use" + 0.122*"surface" + -0.122*"plant" + -0.115*"population" + '
  '-0.114*"model" + -0.106*"change"'),
 (3,
  '-0.379*"water" + -0.246*"model" + 0.203*"fault" + 0.197*"high" + '
  '-0.173*"specie" + 0.165*"sediment" + -0.159*"soil" + -0.144*"ecosystem" + '
  '0.144*"rate" + 0.141*"sample"'),
 (4,
  '0.485*"use" + 0.268*"specie" + -0.255*"model" + -0.250*"soil" + '
  '-0.215*"change" + -0.192*"climate" + -0.135*"rate" + 0.130*"method" + '
  '0.110*"base" + -0.108*"increase"')]
0.3733688463572714


In [14]:
# VISUALIZATION (HDP --> LDA), Gensim - Show model in pyLDAvis (150 topics)
# Visualizes hdp_lda (the LDA model obtained from hdpmodel.suggested_lda_model())
# together with the bag-of-words corpus and the id2word dictionary.
# NOTE(review): this cell's recorded execution count (14) is lower than that of the
# LSI cell placed above it (16); it only needs hdp_lda, corpus and id2word, so it
# should still run in top-to-bottom order — confirm with Restart & Run All.
# enable_notebook() is called first; presumably it is what turns on inline rendering — confirm.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(hdp_lda, corpus, id2word)
# Optional: uncomment to save the visualization as a standalone webpage.
#pyLDAvis.save_html(vis, 'pyLDAvis/hdp-gensim-150.html') #saves pyLDAvis graphs as standalone webpage
# Bare last expression so the visualization is rendered as the cell output.
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
