# Latent Dirichlet Allocation

## Packages

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# LDA
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models

from gensim.models.coherencemodel import CoherenceModel

  from imp import reload


In [2]:
# Load NLTK tools
# Download every corpus the pipeline relies on so the notebook runs on a fresh
# environment: 'stopwords' for the stop-word set and 'wordnet' for the
# WordNetLemmatizer (without it, the first lemmatize() call raises LookupError).
# NOTE: some NLTK releases additionally require nltk.download('omw-1.4').
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')  # tokens are maximal runs of word characters
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/cindy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Transcripts: read the scraped interviews and keep only the transcript text
data_folder = '../data/'
transcripts_path = data_folder + 'scraped_data_no_names.csv'
df = pd.read_csv(transcripts_path)
transcript_df = df['transcript'].to_frame()

In [4]:
# Paragraphs: load the dataset and discard rows containing missing values
paragraph_df = pd.read_csv(data_folder + 'paragraphs_dataset.csv').dropna()

## Text Processing: NLP pipeline

In [5]:
def nlp_processing(df, column_name):
    """Tokenize, clean and lemmatize a text column of ``df``.

    The DataFrame is modified in place and nothing is returned. Three columns
    are added (or overwritten if they already exist):

    - ``tokenized``: tokens produced by the notebook-level ``tokenizer``.
    - ``no_stopwords``: case-folded tokens that are not in ``stop_words``.
    - ``lemmatized``: the ``no_stopwords`` tokens passed through ``lemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text; mutated in place.
    column_name : str
        Name of the column of ``df`` that contains the text to process.
    """
    # Tokenization
    df['tokenized'] = df[column_name].apply(tokenizer.tokenize)

    # Casefolding and stopword removal. Each token is case-folded once and the
    # folded form is tested against the stop-word set; NLTK's English stop-word
    # list is lowercase, so a separate check on the raw token is unnecessary.
    df['no_stopwords'] = df['tokenized'].apply(
        lambda tokens: [folded for folded in map(str.casefold, tokens)
                        if folded not in stop_words])

    # Lemmatization
    df['lemmatized'] = df['no_stopwords'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [6]:
# Add the 'tokenized', 'no_stopwords' and 'lemmatized' columns to transcript_df (in place)
nlp_processing(transcript_df, 'transcript')

In [7]:
# Add the 'tokenized', 'no_stopwords' and 'lemmatized' columns to paragraph_df (in place)
nlp_processing(paragraph_df, 'paragraph')

---

## Bag of Words (BoW)

In [8]:
# Create a gensim dictionary (token <-> integer id mapping) from the lemmatized transcripts
dictionary = Dictionary(transcript_df['lemmatized'])

In [9]:
# Drop tokens that appear in fewer than 30 documents or in more than 50% of the documents,
# then keep at most the 100000 most frequent remaining tokens.
# Note: this modifies `dictionary` in place.
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=100000)

In [10]:
# Bag-of-words corpus: one list of (token_id, count) pairs per transcript
corpus = list(map(dictionary.doc2bow, transcript_df['lemmatized']))

---

## Topic Modelling: LDA

### Training

In [11]:
# Candidate numbers of topics to evaluate
num_topics_range = range(2, 12)

# Train one LDA model per candidate and record its c_v coherence score
coherence_scores = []
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha='auto',
        eta='auto',
        passes=10,
        iterations=500,
        random_state=42,
    )

    coherence_model = CoherenceModel(
        model=lda_model,
        texts=transcript_df['lemmatized'],
        corpus=corpus,
        coherence='c_v',
        topn=20,
    )
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)
    print(f"Number of topics: {num_topics}. Coherence score: {coherence_score}")

Number of topics: 2. Coherence score: 0.37923667672558947
Number of topics: 3. Coherence score: 0.4473216941747626
Number of topics: 4. Coherence score: 0.46385772490314464
Number of topics: 5. Coherence score: 0.4341760000865092
Number of topics: 6. Coherence score: 0.48432344950920414
Number of topics: 7. Coherence score: 0.5397399082777165
Number of topics: 8. Coherence score: 0.5112633546266613
Number of topics: 9. Coherence score: 0.5058716022974443
Number of topics: 10. Coherence score: 0.530195506167002
Number of topics: 11. Coherence score: 0.5210125550025098


In [12]:
# Retrain an LDA model using the number of topics with the highest coherence score
best_index = int(np.argmax(coherence_scores))
num_topics = num_topics_range[best_index]
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [13]:
# Build the pyLDAvis visualisation for the selected model and save it as a standalone HTML page
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis_output_path = f'../results/lda_{num_topics}.html'
pyLDAvis.save_html(vis_data, vis_output_path)

  default_term_info = default_term_info.sort_values(


Following several qualitative and quantitative evaluations, we chose to train three LDA models with 7, 30 and 90 topics.

In [11]:
# 7-topic LDA model
lda_model_7 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=7,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [12]:
# 30-topic LDA model
lda_model_30 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=30,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [13]:
# 90-topic LDA model
lda_model_90 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=90,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [16]:
# Create a dataframe storing the top 20 words for each topic of these models
models = {
    7: lda_model_7,
    30: lda_model_30,
    90: lda_model_90
}

top_n_words = 20
word_columns = [f'word_{i}' for i in range(1, top_n_words + 1)]

# Collect all rows first and build the DataFrame once: appending with
# `.loc[len(df)]` re-allocates the frame on every row. The loop also avoids
# assigning to the notebook-level `num_topics`, so the value selected by the
# coherence search is not silently overwritten.
rows = []
for model_idx, model in models.items():
    for topic in range(model.num_topics):
        # show_topic returns (word, probability) pairs; keep only the words
        word_list = [word for word, _ in model.show_topic(topic, top_n_words)]

        # Topic ids are reported 1-based
        rows.append([f'lda_model_{model_idx}', topic + 1] + word_list)

lda_words = pd.DataFrame(rows, columns=['model', 'topic'] + word_columns)

In [19]:
# Display the table of top words per topic and model
lda_words

Unnamed: 0,model,topic,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,...,word_11,word_12,word_13,word_14,word_15,word_16,word_17,word_18,word_19,word_20
0,lda_model_7,1,father,mother,child,santa,married,fe,town,dad,...,kid,husband,bus,road,barrack,wonderful,camp,hall,sister,hill
1,lda_model_7,2,grove,barrier,engineering,equipment,diffusion,metal,operation,president,...,committee,decision,construction,york,dupont,keith,lawrence,chemical,columbia,colonel
2,lda_model_7,3,fermi,szilard,reactor,chemistry,plutonium,student,wigner,experiment,...,science,professor,neutron,chain,enrico,graduate,hanford,dupont,dr,chemist
3,lda_model_7,4,hanford,richland,river,dupont,waste,construction,000,reactor,...,town,tank,columbia,00,study,worker,report,cleanup,camp,law
4,lda_model_7,5,reactor,plutonium,radiation,fuel,power,hanford,design,neutron,...,facility,dupont,system,tube,radioactive,operation,separation,level,engineering,rod
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,lda_model_90,86,barrier,grove,reactor,columbia,dunning,kellex,carbide,mr,...,keith,urey,york,diffusion,c,baker,plastic,power,graphite,production
123,lda_model_90,87,teller,refugee,jew,plutonium,helium,edward,000,3,...,neutron,explosive,weapon,jewish,proton,temperature,camp,germany,begin,low
124,lda_model_90,88,dupont,design,construction,engineering,equipment,hanford,chemical,cell,...,crane,explosive,operation,foot,facility,powder,division,concrete,steel,safety
125,lda_model_90,89,engine,division,trailer,computer,camp,brother,husband,chuckle,...,crew,inaudible,kid,card,barrack,bridge,clean,horse,construction,cut


In [21]:
# Save the top-words table; index=False leaves the row index out of the CSV
lda_words.to_csv('../results/lda_top20_words.csv', index=False)

### Application of the LDA models to the paragraphs

In [47]:
def get_document_topic(document, lda_model):
    """Return the dominant topic of a tokenized document and its contribution.

    Parameters
    ----------
    document : list of str
        Tokens of the document (already pre-processed).
    lda_model : gensim.models.LdaModel
        Trained model; its ``id2word`` dictionary is used to build the
        bag-of-words representation.

    Returns
    -------
    tuple
        ``(topic_id, contribution)`` where ``topic_id`` is the 1-based id of
        the topic with the highest probability (``int``) and ``contribution``
        is that probability. If the model reports no topic for the document,
        ``(None, 0.0)`` is returned.
    """
    # Create a bag-of-words representation
    bow = lda_model.id2word.doc2bow(document)

    # Get the topic distribution for the document as (topic_id, probability) pairs
    topic_distribution = lda_model.get_document_topics(bow)

    # No topic reported: previously this path crashed on `None += 1`.
    if len(topic_distribution) == 0:
        return None, 0.0

    # Highest-probability topic; on ties max() keeps the first pair, which is
    # what a stable descending sort would also place first.
    top_topic_id, top_topic_contribution = max(topic_distribution, key=lambda pair: pair[1])

    # Shift to 1-based topic ids
    return int(top_topic_id) + 1, top_topic_contribution

In [48]:
# Dominant topic and its contribution for every paragraph under the 7-topic model
topic_7_assignments = paragraph_df['lemmatized'].apply(
    lambda tokens: pd.Series(get_document_topic(tokens, lda_model_7)))
paragraph_df[['topic_7', 'contribution_topic_7']] = topic_7_assignments

In [40]:
# Dominant topic and its contribution for every paragraph under the 30-topic model
topic_30_assignments = paragraph_df['lemmatized'].apply(
    lambda tokens: pd.Series(get_document_topic(tokens, lda_model_30)))
paragraph_df[['topic_30', 'contribution_topic_30']] = topic_30_assignments

In [41]:
# Dominant topic and its contribution for every paragraph under the 90-topic model
topic_90_assignments = paragraph_df['lemmatized'].apply(
    lambda tokens: pd.Series(get_document_topic(tokens, lda_model_90)))
paragraph_df[['topic_90', 'contribution_topic_90']] = topic_90_assignments

In [42]:
# Rename the 'name' column to 'interviewee'
paragraph_df = paragraph_df.rename(columns={'name': 'interviewee'})

In [50]:
# Convert the topic id columns to numeric, downcasting to the smallest integer
# dtype where possible; values that cannot be parsed become NaN
topic_columns = ['topic_7', 'topic_30', 'topic_90']
paragraph_df[topic_columns] = paragraph_df[topic_columns].apply(
    pd.to_numeric, downcast='integer', errors='coerce')

In [51]:
# Preview the first rows of the paragraphs together with their assigned topics
paragraph_df.head()

Unnamed: 0,interviewee,paragraph,tokenized,no_stopwords,lemmatized,topic_7,contribution_topic_7,topic_30,contribution_topic_30,topic_90,contribution_topic_90
0,Inge-Juliana Sackmann Christy,"This is Wednesday, March 20.","[This, is, Wednesday, March, 20]","[wednesday, march, 20]","[wednesday, march, 20]",1,0.913163,10,0.812532,10,0.775581
1,Inge-Juliana Sackmann Christy,March 20.,"[March, 20]","[march, 20]","[march, 20]",1,0.876739,10,0.743623,10,0.697666
2,Inge-Juliana Sackmann Christy,"2019. I’m Cindy Kelly, and I’m in Pasadena, C...","[2019, I, m, Cindy, Kelly, and, I, m, in, Pasa...","[2019, cindy, kelly, pasadena, california, ing...","[2019, cindy, kelly, pasadena, california, ing...",1,0.932992,9,0.852029,17,0.821436
3,Inge-Juliana Sackmann Christy,Inge-Juliana Sackmann Christy. And should I s...,"[Inge, Juliana, Sackmann, Christy, And, should...","[inge, juliana, sackmann, christy, spell]","[inge, juliana, sackmann, christy, spell]",1,0.787578,1,0.596854,13,0.02409
4,Inge-Juliana Sackmann Christy,"Yes, please.","[Yes, please]","[yes, please]","[yes, please]",1,0.787563,27,0.600599,27,0.537536


In [53]:
# Export the paragraphs with their topic assignments (the row index is written as the first column)
export_columns = [
    'interviewee',
    'paragraph',
    'topic_7',
    'contribution_topic_7',
    'topic_30',
    'contribution_topic_30',
    'topic_90',
    'contribution_topic_90',
]
paragraph_df[export_columns].to_csv(data_folder + 'paragraph_topic.csv')

---