# Latent Dirichlet Allocation

In [2]:
import os
import pandas as pd

from gensim import corpora, models

In [3]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

<br>

## Data Acquisition and Pre-Processing

In [4]:
# List every non-hidden entry of the OCR output directory, sorted by name
# ocred_path = '/work/otb-lab/OCRed'
ocred_path = '/Users/nitingupta/Desktop/OTB/OCRed'

years = sorted(entry for entry in os.listdir(ocred_path) if not entry.startswith('.'))
print(years)

['1873-1874', '1892', '1893', '1894', '1901', '1918', '1921', '1928', '1948', '1956']


In [5]:
# Load the sentence splits, keeping only the 'id' and 'sentence' columns.
# `index_col=0` makes the first retained column the index (expected to be 'id').
# Alternative location: '../split/updated/results/final_splits.csv'
splits_csv = '../Splitting/final-update/final-splits/final_splits.csv'
df = pd.read_csv(splits_csv, usecols=['id', 'sentence'], index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: '../Splitting/final-update/final-splits/final_splits.csv'

In [None]:
# Re-index the rows by year: the part of each id before the first underscore
year_labels = df.index.str.split("_").str[0]
df.index = year_labels.rename('year')

In [None]:
# Display the DataFrame to check the new 'year' index
df

In [None]:
# Start from NLTK's English stopword list
stop_words = stopwords.words('english')

# Corpus-specific additions
stop_words.extend([
    'said', 'shall', 'ee', '00', 'state', 'may', 'src', 'sec', 'sec.',
    'town', 'section', 'county', 'act', 'board', '000', ';', 'approved',
    'one', 'general', 'upon', 'hereby',
])

In [None]:
# Convert the list to a set: removes duplicates and makes the
# `word not in stop_words` test in `clean` a hash lookup
stop_words = set(stop_words)

In [None]:
# Shared WordNetLemmatizer instance, reused by every call to `clean`
lemmatizer = WordNetLemmatizer()

def clean(sentence):
    """
    Turn a raw sentence into a list of normalized word tokens.

    Steps, in order:
        1. Delete every em dash, underscore and hyphen together with any
           spaces that follow it. This re-joins words that were hyphenated
           across a line break; it also fuses any other dash-separated words.
        2. Lowercase the text and tokenize it with `word_tokenize`.
        3. Drop tokens that are not purely alphabetic.
        4. Drop tokens found in the module-level `stop_words`.
        5. Lemmatize the remaining tokens with the module-level `lemmatizer`.
        6. Drop lemmas that are themselves in `stop_words`.

    Args:
        sentence: The raw sentence text (a string).

    Returns:
        A list of lemmatized, lowercase, alphabetic, non-stopword tokens.
    """

    # Hyphen removal
    dehyphenated = re.sub(r'(—|_|-)( )*', '', sentence)

    lemmas = []
    for token in word_tokenize(dehyphenated.lower()):
        # Keep only purely alphabetic tokens that are not stopwords
        if not token.isalpha() or token in stop_words:
            continue

        lemma = lemmatizer.lemmatize(token)

        # Filter again after lemmatization: inflected forms such as 'states'
        # or 'sections' only become stopwords ('state', 'section') once
        # lemmatized, so a single pre-lemmatization filter lets them through.
        if lemma in stop_words:
            continue

        lemmas.append(lemma)

    return lemmas

In [None]:
# Tokenize and normalize every sentence with `clean`
df['cleaned_sent'] = df['sentence'].apply(clean)

In [None]:
# Display the cleaned token lists
df['cleaned_sent']

### Removing Rare Words
Filter out rare words (those occurring fewer than 10 times across the whole corpus), which are unlikely to be helpful in the topic modeling.

In [None]:
# Flatten the per-sentence token lists into one list holding every cleaned
# word of the corpus (duplicates kept, so it can be used for frequency counts).
# A nested comprehension builds the list directly, with no throwaway Series.
allwords = [word for tokens in df['cleaned_sent'] for word in tokens]
len(allwords)

In [None]:
# Tally how often each word occurs across the entire corpus
word_counts = Counter(allwords)

# Minimum corpus-wide count a word needs in order to be kept
frequency_threshold = 10


def _drop_rare(tokens):
    """Return the tokens whose corpus-wide count is at least `frequency_threshold`."""
    return [tok for tok in tokens if word_counts[tok] >= frequency_threshold]


# Strip the rare words from every sentence
df['cleaned_sent'] = df['cleaned_sent'].apply(_drop_rare)

In [None]:
# Display the DataFrame after rare-word removal
df

<br>

## LDA

In [None]:
# Build a gensim Dictionary from the cleaned token lists.
# NOTE(review): the training cell below builds the same mapping again as
# `id2word`, and `dictionary` is not referenced anywhere else in the visible
# notebook — this cell looks redundant; confirm before removing.
dictionary = corpora.Dictionary(df['cleaned_sent'])

### LDA Model Training

In [None]:
# Build the vocabulary (word <-> integer id mapping) and the bag-of-words corpus
cleaned_docs = df['cleaned_sent']
id2word = corpora.Dictionary(cleaned_docs)
# One bag-of-words vector per document
corpus = list(map(id2word.doc2bow, cleaned_docs))

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus.
# A fixed `random_state` makes training reproducible, so the topic listing,
# evaluation scores and visualization stay comparable between runs.
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                     num_topics=10,
                                     random_state=42,
                                     per_word_topics=True)

### Topic Interpretation

In [None]:
# Show every entry returned by `print_topics`, one per line
topics = lda_model.print_topics()
for rendered in map(str, topics):
    print(rendered)

## Evaluation Metrics

Two evaluation metrics to consider for topic modeling: perplexity and coherence.

In [None]:
from gensim.models import CoherenceModel

In [None]:
# `log_perplexity` returns the per-word likelihood bound, not the perplexity:
# for the bound, higher (closer to zero) is better. The perplexity itself is
# 2 ** (-bound), and for that value lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('Per-word likelihood bound:', round(per_word_bound, 2))
print('Perplexity:', round(2 ** (-per_word_bound), 2))

In [None]:
# Topic coherence with the 'c_v' measure (higher is better)
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['cleaned_sent'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', round(coherence_lda, 2))

## Visualization

In [None]:
import warnings

# Install an "ignore" filter so DeprecationWarning messages are not shown
warnings.simplefilter("ignore", category=DeprecationWarning)

In [29]:
import pyLDAvis
import pyLDAvis.gensim

In [30]:
# Turn on pyLDAvis notebook mode so prepared visualizations display inline
pyLDAvis.enable_notebook()

In [32]:
# Prepare the pyLDAvis visualization data for the trained model
p = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

In [34]:
# p