# Latent Dirichlet Allocation

## Packages

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# LDA
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models

from gensim.models.coherencemodel import CoherenceModel

  from imp import reload


In [2]:
# Load NLTK tools
# Download every corpus this notebook relies on, not just the stop-word list:
# WordNetLemmatizer reads the 'wordnet' corpus and raises LookupError on the
# first lemmatize() call when that corpus is not installed.
# NOTE(review): some NLTK releases also need 'omw-1.4' — confirm for the version in use.
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))  # English stop words as a set for fast membership tests
tokenizer = RegexpTokenizer(r'\w+')           # tokens are maximal runs of word characters
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/cindy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [3]:
# Transcripts: read the scraped dataset and keep the 'transcript' column
# as a one-column DataFrame for the NLP pipeline.
data_folder = '../data/'
df = pd.read_csv(f'{data_folder}scraped_data_no_names.csv')
transcript_df = df['transcript'].to_frame()

In [4]:
# Paragraphs: read the paragraph-level dataset from the same data folder.
paragraph_df = pd.read_csv(data_folder + 'paragraphs_dataset.csv')
paragraph_df.dropna(inplace=True) # Drop, in place, every row with a missing value in any column

## Text Processing: NLP pipeline

In [5]:
def nlp_processing(df, column_name):
    """Run the text-cleaning pipeline on ``df[column_name]``.

    Three list-valued columns are written onto ``df`` itself:

    * ``tokenized``    - output of the module-level ``tokenizer``;
    * ``no_stopwords`` - case-folded tokens that are not in ``stop_words``;
    * ``lemmatized``   - the remaining tokens passed through ``lemmatizer``.

    The frame is modified in place and nothing is returned.
    """
    def _casefold_and_filter(tokens):
        # A token is dropped when its case-folded OR its raw form is a stop word;
        # surviving tokens are stored case-folded.
        kept = []
        for token in tokens:
            folded = token.casefold()
            if folded in stop_words or token in stop_words:
                continue
            kept.append(folded)
        return kept

    def _lemmatize_all(tokens):
        return list(map(lemmatizer.lemmatize, tokens))

    # Step 1: tokenization
    df['tokenized'] = df[column_name].apply(tokenizer.tokenize)

    # Step 2: stop-word removal + case folding
    df['no_stopwords'] = df['tokenized'].apply(_casefold_and_filter)

    # Step 3: lemmatization
    df['lemmatized'] = df['no_stopwords'].apply(_lemmatize_all)

    # Disabled step, kept for reference: removing custom stopwords
    # custom_stop_words = {'000', 'b', 'k', '25', '29'}
    # df['final'] = df['lemmatized'].apply(
    #     lambda l: [s for s in l if s not in custom_stop_words])

In [6]:
# Add the tokenized / no_stopwords / lemmatized columns to the transcripts frame (in place).
nlp_processing(df=transcript_df, column_name='transcript')

In [7]:
# Add the tokenized / no_stopwords / lemmatized columns to the paragraphs frame (in place).
nlp_processing(df=paragraph_df, column_name='paragraph')

---

## Bag of Words (BoW)

In [8]:
# Build the gensim token <-> id mapping from the lemmatized transcripts
dictionary = Dictionary(documents=transcript_df['lemmatized'])

In [9]:
# Drop tokens that appear in fewer than 30 documents or in more than 50% of the documents,
# then keep at most the 100000 most frequent remaining tokens. This mutates `dictionary` in place.
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=100000)

In [10]:
# Convert every lemmatized transcript into its bag-of-words vector
corpus = list(map(dictionary.doc2bow, transcript_df['lemmatized']))

---

## Topic Modelling: LDA

### Training

In [11]:
# Candidate topic counts to evaluate (2 through 11)
num_topics_range = range(2, 12)

# Fit one LDA model per candidate and record its c_v coherence,
# in the same order as num_topics_range.
coherence_scores = []
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha='auto',
        eta='auto',
        passes=10,
        iterations=500,
        random_state=42,
    )

    coherence_model = CoherenceModel(
        model=lda_model,
        texts=transcript_df['lemmatized'],
        corpus=corpus,
        coherence='c_v',
        topn=20,
    )
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)
    print(f"Number of topics: {num_topics}. Coherence score: {coherence_score}")

Number of topics: 2. Coherence score: 0.37923667672558947
Number of topics: 3. Coherence score: 0.4473216941747626
Number of topics: 4. Coherence score: 0.46385772490314464
Number of topics: 5. Coherence score: 0.4341760000865092
Number of topics: 6. Coherence score: 0.48432344950920414
Number of topics: 7. Coherence score: 0.5397399082777165
Number of topics: 8. Coherence score: 0.5112633546266613
Number of topics: 9. Coherence score: 0.5058716022974443
Number of topics: 10. Coherence score: 0.530195506167002
Number of topics: 11. Coherence score: 0.5210125550025098


In [12]:
# Retrain with the topic count that achieved the highest coherence score,
# using the same hyperparameters and seed as the sweep.
best_position = np.array(coherence_scores).argmax()
num_topics = num_topics_range[best_position]
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=500,
    random_state=42,
)

In [13]:
# Build the interactive pyLDAvis view of the trained model and save it as HTML,
# with the topic count encoded in the file name.
lda_vis_path = f'../results/lda_{num_topics}.html'
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis_data, lda_vis_path)

  default_term_info = default_term_info.sort_values(


### Get the top topic for each paragraph

In [14]:
def get_document_topic(document, lda_model):
    """Return the 1-based id of the most probable topic for a tokenized document.

    Parameters
    ----------
    document : iterable of str
        Tokens of one document (here: one row of the ``lemmatized`` column).
    lda_model : gensim LdaModel
        Trained model; its ``id2word`` dictionary is used to vectorize the document.

    Returns
    -------
    int
        The model's topic id with the highest probability, plus 1. If several
        topics tie, the first one in the model's output wins.
    """
    # Create a bag-of-words representation
    bow = lda_model.id2word.doc2bow(document)

    # Ask for the whole distribution. With the model's default probability
    # threshold, low-probability topics are filtered out, and the list can come
    # back empty when no topic clears it, which used to raise IndexError.
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)

    # Only the maximum is needed, so pick it directly instead of sorting everything.
    best_topic_id, _ = max(topic_distribution, key=lambda pair: pair[1])

    # Shift from gensim's 0-based ids to 1-based topic numbers
    return best_topic_id + 1

In [15]:
# Label every paragraph with the 1-based id of its most probable topic
paragraph_df['topic'] = paragraph_df['lemmatized'].apply(get_document_topic, args=(lda_model,))

In [16]:
# Preview the first 10 paragraphs with their pipeline columns and assigned topic
paragraph_df.head(10)

Unnamed: 0,name,paragraph,tokenized,no_stopwords,lemmatized,topic
0,Inge-Juliana Sackmann Christy,"This is Wednesday, March 20.","[This, is, Wednesday, March, 20]","[wednesday, march, 20]","[wednesday, march, 20]",1
1,Inge-Juliana Sackmann Christy,March 20.,"[March, 20]","[march, 20]","[march, 20]",1
2,Inge-Juliana Sackmann Christy,"2019. I’m Cindy Kelly, and I’m in Pasadena, C...","[2019, I, m, Cindy, Kelly, and, I, m, in, Pasa...","[2019, cindy, kelly, pasadena, california, ing...","[2019, cindy, kelly, pasadena, california, ing...",1
3,Inge-Juliana Sackmann Christy,Inge-Juliana Sackmann Christy. And should I s...,"[Inge, Juliana, Sackmann, Christy, And, should...","[inge, juliana, sackmann, christy, spell]","[inge, juliana, sackmann, christy, spell]",1
4,Inge-Juliana Sackmann Christy,"Yes, please.","[Yes, please]","[yes, please]","[yes, please]",1
5,Inge-Juliana Sackmann Christy,"I-n-g-e, Juliana, J-u-l-i-a-n-a, Sackmann, S ...","[I, n, g, e, Juliana, J, u, l, i, a, n, a, Sac...","[n, g, e, juliana, j, u, l, n, sackmann, like,...","[n, g, e, juliana, j, u, l, n, sackmann, like,...",1
6,Inge-Juliana Sackmann Christy,But I want to start with you. I want you to t...,"[But, I, want, to, start, with, you, I, want, ...","[want, start, want, tell, us, born, became, in...","[want, start, want, tell, u, born, became, int...",6
7,Inge-Juliana Sackmann Christy,"Well, that’s a long story. My family backgrou...","[Well, that, s, a, long, story, My, family, ba...","[well, long, story, family, background, german...","[well, long, story, family, background, german...",1
8,Inge-Juliana Sackmann Christy,They were offered something by the Russian Cza...,"[They, were, offered, something, by, the, Russ...","[offered, something, russian, czar, could, go,...","[offered, something, russian, czar, could, go,...",1
9,Inge-Juliana Sackmann Christy,"Napoleon had just gone through much of Europe,...","[Napoleon, had, just, gone, through, much, of,...","[napoleon, gone, much, europe, invaded, much, ...","[napoleon, gone, much, europe, invaded, much, ...",1


In [20]:
# paragraph_df[['name', 'paragraph', 'topic']].to_csv(data_folder + 'paragraph_topic.csv')

In [18]:
# Fraction of paragraphs assigned to each topic (normalize=True returns shares instead of counts)
paragraph_df['topic'].value_counts(normalize=True)

1    0.389489
6    0.152956
2    0.114355
5    0.105793
7    0.082253
3    0.078198
4    0.076955
Name: topic, dtype: float64

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


---