# Topic Modelling

## Packages

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models

  from imp import reload


In [2]:
# Load NLTK tools
# Fetch every NLTK resource this notebook relies on, not only the stopword
# list: later cells use WordNetLemmatizer, nltk.pos_tag and nltk.ne_chunk,
# which look up their own corpora/models and raise LookupError when those
# are missing from a fresh environment.
# NOTE(review): newer NLTK releases are believed to ship the tagger/chunker
# under '..._eng' / '..._tab' names - confirm against the installed version
# and adjust the list if a LookupError persists.
for nltk_resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger',
                      'maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/cindy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Load data
# Read the scraped CSV and keep only its 'transcript' column in a working frame.
data_folder = '../data/'
df = pd.read_csv(f'{data_folder}scraped_data.csv')
transcript_df = df['transcript'].to_frame()

## Text Processing: NLP pipeline

In [4]:
# Tokenization
# Run the shared RegexpTokenizer over every transcript; one token list per row.
transcript_df['tokenized'] = df['transcript'].map(tokenizer.tokenize)

In [5]:
# POS + NER
def extract_ne(token_list: list, entity_list=("PERSON",)) -> set:
    """Return the distinct named entities found in a list of tokens.

    The tokens are tagged with ``nltk.pos_tag`` and chunked with
    ``nltk.ne_chunk(..., binary=False)``. Every chunk whose label is in
    ``entity_list`` is emitted as its tokens joined by single spaces.

    Args:
        token_list: Word tokens of one document, in order.
        entity_list: Entity labels to keep; any iterable of label strings,
            or a single label string. Defaults to ``("PERSON",)`` (an
            immutable default, so it cannot be altered between calls).

    Returns:
        A set of entity strings (duplicates within the document collapse).
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(entity_list, str):
        entity_list = (entity_list,)
    entity_set = set(entity_list)
    tags = nltk.pos_tag(token_list)
    tree = nltk.ne_chunk(tags, binary=False)
    return {
        " ".join(leaf[0] for leaf in subtree)
        for subtree in tree
        # Only chunk subtrees expose .label(); plain (word, tag) leaves do not.
        if hasattr(subtree, "label") and subtree.label() in entity_set
    }


transcript_df['NER'] = transcript_df['tokenized'].apply(extract_ne)

In [6]:
# Removing stopwords and Casefolding
def _casefold_without_stopwords(tokens):
    """Casefold each token, dropping it if either form is in ``stop_words``."""
    kept = []
    for token in tokens:
        folded = token.casefold()
        if folded in stop_words or token in stop_words:
            continue
        kept.append(folded)
    return kept


transcript_df['no_stopwords'] = transcript_df['tokenized'].apply(_casefold_without_stopwords)

In [7]:
# Lemmatization
# Pass every remaining token through the shared lemmatizer (default arguments).
transcript_df['lemmatized'] = transcript_df['no_stopwords'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

In [8]:
# Preview the first five rows of the processed frame.
transcript_df.head(n=5)

Unnamed: 0,transcript,tokenized,NER,no_stopwords,lemmatized
0,"Cindy Kelly: This is Wednesday, March 20. Inge...","[Cindy, Kelly, This, is, Wednesday, March, 20,...","{Kip Thorne, Willy, Sackmann Christy, Robert, ...","[cindy, kelly, wednesday, march, 20, inge, jul...","[cindy, kelly, wednesday, march, 20, inge, jul..."
1,"Trisha Pritikin: Okay. It is January 15th, 20...","[Trisha, Pritikin, Okay, It, is, January, 15th...","{Columbia River, Foulds Yes Dorn Steele, Ellis...","[trisha, pritikin, okay, january, 15th, 2019, ...","[trisha, pritikin, okay, january, 15th, 2019, ..."
2,Karen Dorn Steele: Our second interview is wit...,"[Karen, Dorn, Steele, Our, second, interview, ...","{Dusty Washington, Tom, Ellis, William Etter, ...","[karen, dorn, steele, second, interview, richa...","[karen, dorn, steele, second, interview, richa..."
3,"Karen Dorn Steele: It’s April 29, 2019. Our fi...","[Karen, Dorn, Steele, It, s, April, 29, 2019, ...","{Cook Well, Cook Pigford, Robert, Hazel R, Ph,...","[karen, dorn, steele, april, 29, 2019, first, ...","[karen, dorn, steele, april, 29, 2019, first, ..."
4,[The Atomic Heritage Foundation is very gratef...,"[The, Atomic, Heritage, Foundation, is, very, ...","{Guise PH, Mother George Oh, Mark Yeah, Mark, ...","[atomic, heritage, foundation, grateful, mark,...","[atomic, heritage, foundation, grateful, mark,..."


In [13]:
# Export the processed transcripts to a csv file
processed_csv_path = f'{data_folder}processed_transcripts.csv'
transcript_df.to_csv(processed_csv_path)

## Bag of Words (BoW)

In [9]:
# Create a gensim dictionary
# Built from the lemmatized token lists, one list per transcript.
dictionary = Dictionary(documents=transcript_df['lemmatized'])

In [10]:
# Create a bag-of-words representation of the documents
# One doc2bow result per transcript, in the frame's row order.
corpus = list(map(dictionary.doc2bow, transcript_df['lemmatized']))

## Topic Modelling: LDA

In [11]:
# Train an LDA model on the corpus
# LDA training is stochastic: without a fixed seed the topics change on every
# run, so the saved visualisation could not be reproduced after a kernel
# restart. Pin the seed and name the tunable values.
NUM_TOPICS = 10
LDA_RANDOM_STATE = 42
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=LDA_RANDOM_STATE,
)

In [12]:
# Visualize the results using pyLDAvis
pyLDAvis.enable_notebook()

# Prepare the visualisation data from the trained model, then write it out as HTML.
topic_vis_html_path = '../results/topic_modeling.html'
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis_data, topic_vis_html_path)

  default_term_info = default_term_info.sort_values(


---