# Latent Dirichlet Allocation

## Packages

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

# Formatting
import ast

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# LDA
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models

  from imp import reload


In [2]:
# Load NLTK tools
# 'stopwords' backs the stop-word set below. 'wordnet' is the corpus that
# WordNetLemmatizer reads when lemmatize() is first called; without it the
# lemmatization cell raises LookupError on a machine that has never fetched it.
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')  # a token is any run of word characters
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/cindy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Read the scraped dataset from the data folder and keep a
# single-column frame that holds only the transcripts.
data_folder = '../data/'
df = pd.read_csv(filepath_or_buffer=data_folder + 'scraped_data.csv')
transcript_df = df['transcript'].to_frame()

## Data Formatting

In [4]:
# Format to list of string
# The CSV stores each locations value as the text of a Python list literal;
# ast.literal_eval turns that text back into a real list.
# Values that are already lists are passed through untouched so this cell can
# be re-run without reloading the CSV (literal_eval raises ValueError when it
# is handed a list instead of a string).
df['locations'] = df['locations'].apply(
    lambda locs: locs if isinstance(locs, list) else ast.literal_eval(locs))

In [5]:
# Keep only the rows whose parsed location list contains 'Los Alamos',
# then rebuild the transcript-only frame from that subset.
los_alamos_df = df.loc[df['locations'].map(lambda places: 'Los Alamos' in places)]
# oak_ridge_df = df[df['locations'].apply(lambda locs: 'Oak Ridge' in locs)]
transcript_df = pd.DataFrame(data=los_alamos_df['transcript'])

## Text Processing: NLP pipeline

In [6]:
# Tokenization: split every transcript into tokens with the regexp tokenizer
transcript_df['tokenized'] = transcript_df['transcript'].map(tokenizer.tokenize)

In [7]:
# Removing stopwords and Casefolding
def _casefold_without_stopwords(tokens):
    """Return the casefolded tokens, skipping any token that is a stop word.

    A token is skipped when either its casefolded form or its original form
    is found in ``stop_words``.
    """
    kept = []
    for token in tokens:
        folded = token.casefold()
        if folded in stop_words or token in stop_words:
            continue
        kept.append(folded)
    return kept


transcript_df['no_stopwords'] = transcript_df['tokenized'].apply(_casefold_without_stopwords)

In [8]:
# Lemmatization: run each remaining token through the WordNet lemmatizer
transcript_df['lemmatized'] = transcript_df['no_stopwords'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

In [9]:
# Preview the first rows to compare the output of each pipeline stage
transcript_df.head(n=5)

Unnamed: 0,transcript,tokenized,no_stopwords,lemmatized
0,"Cindy Kelly: This is Wednesday, March 20. Inge...","[Cindy, Kelly, This, is, Wednesday, March, 20,...","[cindy, kelly, wednesday, march, 20, inge, jul...","[cindy, kelly, wednesday, march, 20, inge, jul..."
5,[Many thanks to Jonathan Sheline for donating ...,"[Many, thanks, to, Jonathan, Sheline, for, don...","[many, thanks, jonathan, sheline, donating, re...","[many, thanks, jonathan, sheline, donating, re..."
15,"Cindy Kelly: It is Monday, May 14. I am in Pal...","[Cindy, Kelly, It, is, Monday, May, 14, I, am,...","[cindy, kelly, monday, may, 14, palo, alto, st...","[cindy, kelly, monday, may, 14, palo, alto, st..."
21,"Cindy Kelly: I’m Cindy Kelly, Atomic Heritage ...","[Cindy, Kelly, I, m, Cindy, Kelly, Atomic, Her...","[cindy, kelly, cindy, kelly, atomic, heritage,...","[cindy, kelly, cindy, kelly, atomic, heritage,..."
43,[Many thanks to Jonathan Sheline for donating ...,"[Many, thanks, to, Jonathan, Sheline, for, don...","[many, thanks, jonathan, sheline, donating, vi...","[many, thanks, jonathan, sheline, donating, vi..."


---

## Bag of Words (BoW)

In [10]:
# Build the gensim token <-> id mapping from the lemmatized documents
dictionary = Dictionary(documents=transcript_df['lemmatized'])

In [11]:
# Filter out tokens that appear in fewer than 30 documents (no_below) or in more than 50% of
# documents (no_above), then keep only the 100000 most frequent remaining tokens (keep_n).
# Note: filter_extremes mutates `dictionary` in place, so re-running this cell filters again.
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=100000)

In [12]:
# Convert every lemmatized document into its bag-of-words form
# (as produced by dictionary.doc2bow) using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, transcript_df['lemmatized']))

## Topic Modelling: LDA

In [13]:
# Train an LDA model on the corpus
# LDA training is stochastic: without a fixed random_state every run yields
# different topics, so the saved visualisation could not be reproduced.
# The seed value itself is arbitrary; it only needs to stay constant.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    random_state=42,
)

In [14]:
# Prepare the pyLDAvis view of the trained model and write it out
# as a standalone HTML page in the results folder.
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.save_html(vis_data, '../results/lda_los_alamos_100.html')

  default_term_info = default_term_info.sort_values(
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = Loo

---