# Latent Dirichlet Allocation

## Packages

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# LDA
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models

from gensim.models.coherencemodel import CoherenceModel

  from imp import reload


In [2]:
# Load NLTK tools
# The lemmatizer created below reads the WordNet corpus, so it is downloaded
# here together with the stopword list; without it lemmatization raises a
# LookupError on a machine that has never fetched the corpus.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): some NLTK releases also need 'omw-1.4' to load WordNet — confirm
# against the pinned NLTK version and drop this line if it is not required.
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/cindy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Transcripts: read the scraped interviews and keep only the transcript text
data_folder = '../data/'
df = pd.read_csv(f'{data_folder}scraped_data_no_names.csv')
transcript_df = df['transcript'].to_frame()

In [4]:
# Paragraphs: read the paragraph-level dataset
paragraph_df = pd.read_csv(f'{data_folder}paragraphs_dataset.csv')
# Drop every row that has a missing value in any column
paragraph_df.dropna(axis=0, how='any', inplace=True)

## Text Processing: NLP pipeline

In [5]:
def nlp_processing(df, column_name):
    """Run the tokenize -> stopword removal -> lemmatize pipeline on a text column.

    The frame is modified in place; three columns are added (or overwritten):

    * ``tokenized``    - output of the module-level ``tokenizer`` on ``df[column_name]``
    * ``no_stopwords`` - casefolded tokens with module-level ``stop_words`` removed
    * ``lemmatized``   - ``no_stopwords`` passed through the module-level ``lemmatizer``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to process.
    column_name : str
        Name of the column in ``df`` that contains the raw text.

    Returns
    -------
    None
    """
    def drop_stopwords(tokens):
        # A token is discarded when either its raw or its casefolded form is a stopword
        kept = []
        for token in tokens:
            folded = token.casefold()
            if folded in stop_words or token in stop_words:
                continue
            kept.append(folded)
        return kept

    def lemmatize_tokens(tokens):
        return [lemmatizer.lemmatize(token) for token in tokens]

    df['tokenized'] = df[column_name].apply(tokenizer.tokenize)
    df['no_stopwords'] = df['tokenized'].apply(drop_stopwords)
    df['lemmatized'] = df['no_stopwords'].apply(lemmatize_tokens)

In [6]:
# Adds the tokenized / no_stopwords / lemmatized columns to the transcripts
nlp_processing(df=transcript_df, column_name='transcript')

In [7]:
# Adds the tokenized / no_stopwords / lemmatized columns to the paragraphs
nlp_processing(df=paragraph_df, column_name='paragraph')

---

## Bag of Words (BoW)

In [8]:
# Build the gensim token <-> id mapping from the lemmatized transcripts
dictionary = Dictionary(documents=transcript_df['lemmatized'])

In [9]:
# Drop tokens that appear in fewer than 30 documents or in more than 50% of
# the documents, then keep at most the 100000 most frequent remaining tokens
dictionary.filter_extremes(
    no_below=30,
    no_above=0.5,
    keep_n=100000,
)

In [10]:
# Convert every lemmatized transcript into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, transcript_df['lemmatized']))

---

## Topic Modelling: LDA

### Training

In [11]:
# Candidate numbers of topics: 2 through 11
num_topics_range = range(2, 12)

# Fit one LDA model per candidate and record its c_v coherence score
coherence_scores = []
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha='auto',
        eta='auto',
        passes=10,
        iterations=500,
        random_state=42,
    )

    coherence_model = CoherenceModel(
        model=lda_model,
        texts=transcript_df['lemmatized'],
        corpus=corpus,
        coherence='c_v',
        topn=20,
    )
    coherence_score = coherence_model.get_coherence()
    coherence_scores += [coherence_score]
    print(f"Number of topics: {num_topics}. Coherence score: {coherence_score}")

Number of topics: 2. Coherence score: 0.37923667672558947
Number of topics: 3. Coherence score: 0.4473216941747626
Number of topics: 4. Coherence score: 0.46385772490314464
Number of topics: 5. Coherence score: 0.4341760000865092
Number of topics: 6. Coherence score: 0.48432344950920414
Number of topics: 7. Coherence score: 0.5397399082777165
Number of topics: 8. Coherence score: 0.5112633546266613
Number of topics: 9. Coherence score: 0.5058716022974443
Number of topics: 10. Coherence score: 0.530195506167002
Number of topics: 11. Coherence score: 0.5210125550025098


In [12]:
# Train an LDA model using the number of topics with the highest coherence score
num_topics = num_topics_range[int(np.argmax(coherence_scores))]
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [13]:
# Build the pyLDAvis visualization for the selected model and save it as an
# HTML file named after its number of topics
vis_data = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.save_html(vis_data, f'../results/lda_{num_topics}.html')

  default_term_info = default_term_info.sort_values(


Following several qualitative and quantitative evaluations, we chose to train three LDA models with 7, 30 and 90 topics.

In [11]:
# 7-topic model
lda_model_7 = LdaModel(
    num_topics=7,
    corpus=corpus,
    id2word=dictionary,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [12]:
# 30-topic model
lda_model_30 = LdaModel(
    num_topics=30,
    corpus=corpus,
    id2word=dictionary,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [13]:
# 90-topic model
lda_model_90 = LdaModel(
    num_topics=90,
    corpus=corpus,
    id2word=dictionary,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

### Application of the LDA models to the paragraphs

In [34]:
def get_document_topic_info(document, lda_model, num_words=20):
    """Return the most probable topic of a tokenized document under ``lda_model``.

    Parameters
    ----------
    document : list of str
        Tokens of the document. They are converted to a bag-of-words with the
        model's own ``id2word`` dictionary.
    lda_model : object
        Topic model exposing ``id2word.doc2bow``, ``get_document_topics`` and
        ``show_topic`` (e.g. a gensim ``LdaModel``).
    num_words : int, default 20
        Number of top words to return for the dominant topic.

    Returns
    -------
    tuple
        ``(topic_id, topic_words, contribution)`` where ``topic_id`` is the
        1-based id of the dominant topic, ``topic_words`` its top ``num_words``
        words and ``contribution`` its probability for this document.
        When the model reports no topic for the document, ``(0, [], 0.0)`` is
        returned, so ``0`` acts as the "no topic" id.
    """
    # Create a bag-of-words representation
    bow = lda_model.id2word.doc2bow(document)

    # Get the topic distribution for the document as (topic_id, probability) pairs
    topic_distribution = lda_model.get_document_topics(bow)

    # Without any topic there is nothing to rank; previously this path hit an
    # unbound local variable and raised UnboundLocalError.
    if len(topic_distribution) == 0:
        return 0, [], 0.0

    # Only the single most probable topic is needed, so take the maximum
    # instead of sorting the whole distribution (first maximum wins on ties,
    # as with the stable descending sort used before).
    best_topic = max(topic_distribution, key=lambda pair: pair[1])
    top_topic_id = best_topic[0]
    top_topic_contribution = best_topic[1]

    # Get the top N words for the most probable topic
    top_topic_words = [word for word, _ in lda_model.show_topic(top_topic_id, num_words)]

    # Model topic ids are shifted by one so reported ids start at 1
    return top_topic_id + 1, top_topic_words, top_topic_contribution

In [48]:
# Dominant topic, its top 10 words and its contribution under the 7-topic model
paragraph_df[['topic_7', 'words_topic_7', 'contribution_topic_7']] = (
    paragraph_df['lemmatized']
    .apply(lambda tokens: pd.Series(
        get_document_topic_info(document=tokens, lda_model=lda_model_7, num_words=10)))
)

In [49]:
# Dominant topic, its top 10 words and its contribution under the 30-topic model
paragraph_df[['topic_30', 'words_topic_30', 'contribution_topic_30']] = (
    paragraph_df['lemmatized']
    .apply(lambda tokens: pd.Series(
        get_document_topic_info(document=tokens, lda_model=lda_model_30, num_words=10)))
)

In [50]:
# Dominant topic, its top 10 words and its contribution under the 90-topic model
paragraph_df[['topic_90', 'words_topic_90', 'contribution_topic_90']] = (
    paragraph_df['lemmatized']
    .apply(lambda tokens: pd.Series(
        get_document_topic_info(document=tokens, lda_model=lda_model_90, num_words=10)))
)

In [39]:
# Rename the 'name' column to 'interviewee' (modifies paragraph_df in place)
paragraph_df.rename({'name': 'interviewee'}, axis='columns', inplace=True)

In [51]:
# Preview the first five rows with the newly added topic columns
paragraph_df.head(n=5)

Unnamed: 0,interviewee,paragraph,tokenized,no_stopwords,lemmatized,topic_7,words_topic_7,contribution_topic_7,topic_30,words_topic_30,contribution_topic_30,topic_90,words_topic_90,contribution_topic_90
0,Inge-Juliana Sackmann Christy,"This is Wednesday, March 20.","[This, is, Wednesday, March, 20]","[wednesday, march, 20]","[wednesday, march, 20]",1,"[father, mother, child, santa, married, fe, to...",0.913163,10,"[chemistry, barrier, berkeley, plutonium, colu...",0.812533,85,"[unit, lunch, hall, mess, chemistry, box, ohio...",0.775456
1,Inge-Juliana Sackmann Christy,March 20.,"[March, 20]","[march, 20]","[march, 20]",1,"[father, mother, child, santa, married, fe, to...",0.876739,10,"[chemistry, barrier, berkeley, plutonium, colu...",0.743623,10,"[berkeley, caltech, lawrence, barrier, feynman...",0.697666
2,Inge-Juliana Sackmann Christy,"2019. I’m Cindy Kelly, and I’m in Pasadena, C...","[2019, I, m, Cindy, Kelly, and, I, m, in, Pasa...","[2019, cindy, kelly, pasadena, california, ing...","[2019, cindy, kelly, pasadena, california, ing...",1,"[father, mother, child, santa, married, fe, to...",0.932992,9,"[santa, fe, dorothy, hotel, mexico, bus, broth...",0.852029,1,"[mother, happy, father, child, indian, husband...",0.821683
3,Inge-Juliana Sackmann Christy,Inge-Juliana Sackmann Christy. And should I s...,"[Inge, Juliana, Sackmann, Christy, And, should...","[inge, juliana, sackmann, christy, spell]","[inge, juliana, sackmann, christy, spell]",1,"[father, mother, child, santa, married, fe, to...",0.787578,13,"[father, mother, child, dad, kid, sister, pare...",0.605069,13,"[father, mother, child, dad, sister, parent, m...",0.024054
4,Inge-Juliana Sackmann Christy,"Yes, please.","[Yes, please]","[yes, please]","[yes, please]",1,"[father, mother, child, santa, married, fe, to...",0.787563,27,"[girl, bus, married, road, food, dance, badge,...",0.600599,13,"[father, mother, child, dad, sister, parent, m...",0.02409


In [52]:
# Export the paragraphs with their topic assignments for the three models
paragraph_df[
    [
        'interviewee', 'paragraph',
        'topic_7', 'words_topic_7', 'contribution_topic_7',
        'topic_30', 'words_topic_30', 'contribution_topic_30',
        'topic_90', 'words_topic_90', 'contribution_topic_90',
    ]
].to_csv(data_folder + 'paragraph_topic.csv')

---