# Latent Dirichlet Allocation

In [1]:
import os
import pandas as pd

from gensim import corpora, models

In [2]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

<br>

## Data Acquisition and Pre-Processing

In [3]:
# Read all folder names in the OCR (or a specified) directory
# ocred_path = '/work/otb-lab/OCRed'
ocred_path = '/Users/nitingupta/Desktop/OTB/OCRed'

years = [name for name in os.listdir(ocred_path) if not name.startswith('.')]
years.sort()
print(years)

['1873-1874', '1892', '1893', '1894', '1901', '1918', '1921', '1928', '1948', '1956']


In [4]:
df = pd.read_csv('../Splitting/final-update/final-splits/final_splits.csv', index_col = 0, usecols=['id', 'sentence'])
# df = pd.read_csv('../split/updated/results/final_splits.csv', index_col = 0, usecols=['id', 'sentence'])

In [5]:
df['year'] = df.index.str.split("_").str[0]
df.set_index('year', inplace=True)

In [6]:
df

Unnamed: 0_level_0,sentence
year,Unnamed: 1_level_1
1873-1874,AN ACT TO REPEAT SECTION FOUR (4) OR AN ACT EN...
1873-1874,| Section 1. Be it enacted by the Senate and H...
1873-1874,Sec. 2. That so much of Section seventytwo (72...
1873-1874,"AN ACT to Revive, RENEw AND AMEND AN ACT ENTIT..."
1873-1874,AN ACT To IncorPorRaTE THE REFORM Apotto Socle...
...,...
1956,1132 An Act To Require The Commissioner Of Agr...
1956,Be it enacted by the General Assembly of the S...
1956,The Commissioner of Agriculture of South Carol...
1956,Any such plants or weeds unlawfully imported i...


In [7]:
stop_words = stopwords.words('english')

# Add some custom words to the list
stop_words.append('said')
stop_words.append('shall')
stop_words.append('ee')
stop_words.append('00')
stop_words.append('state')
stop_words.append('may')
stop_words.append('src')
stop_words.append('sec')
stop_words.append('sec.')
stop_words.append('town')
stop_words.append('section')
stop_words.append('county')
stop_words.append('act')
stop_words.append('board')
stop_words.append('000')
stop_words.append(';')
stop_words.append('approved')
stop_words.append('one')
stop_words.append('general')
stop_words.append('upon')
stop_words.append('hereby')

In [8]:
stop_words = set(stop_words)

In [9]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean(sentence):
    """
    Perform a basic cleaning that includes:
        - Hyphen removals from words appeared at the end of a sentence and were split to the next line.
        - Lowercasing
        - Tokenization
        - Removal of words that do not exclusively contain letters
        - Removing stopwords
        - Lemmatization
    """

    # Hyphen removal
    sentence = re.sub(r'(—|_|-)( )*', '', sentence)
    
    # Lowercase and tokenize
    tokens = word_tokenize(sentence.lower())
    
    # Keep only letters
    words_alpha = [word for word in tokens if word.isalpha()]
    
    # Stopword Removal
    filtered_tokens = [word for word in words_alpha if word not in stop_words]
    
    # Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return lemmatized_words

In [10]:
df['cleaned_sent'] = df['sentence'].apply(lambda x: clean(x))

In [11]:
df['cleaned_sent']

year
1873-1874    [repeat, four, entitled, relieve, south, carol...
1873-1874    [enacted, senate, house, representative, south...
1873-1874    [much, seventytwo, chapter, xii, title, iii, d...
1873-1874    [revive, renew, amend, entitled, acr, incorpor...
1873-1874    [incorporrate, reform, apotto, soclery, charle...
                                   ...                        
1956         [require, commissioner, agriculture, south, ca...
1956         [enacted, assembly, south, carolina, commissio...
1956         [commissioner, agriculture, south, carolina, d...
1956         [plant, weed, unlawfully, imported, seized, co...
1956         [time, effective, take, effect, approval, gove...
Name: cleaned_sent, Length: 30148, dtype: object

### Removing Rare Words
Filter out extreme words that won't be helpful in the topic modeling.

In [12]:
# A list of all cleaned words from the corpus
allwords = []
df['cleaned_sent'].apply(lambda x: [allwords.append(word) for word in x])
len(allwords)

732259

In [13]:
# Count word frequencies across the entire corpus
word_counts = Counter(allwords)

# Frequency threshold for rare words
frequency_threshold = 10

# Remove rare words
df['cleaned_sent'] = df['cleaned_sent'].apply(lambda x: [word for word in x if word_counts[word] >= frequency_threshold])

In [14]:
df

Unnamed: 0_level_0,sentence,cleaned_sent
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1873-1874,AN ACT TO REPEAT SECTION FOUR (4) OR AN ACT EN...,"[repeat, four, entitled, relieve, south, carol..."
1873-1874,| Section 1. Be it enacted by the Senate and H...,"[enacted, senate, house, representative, south..."
1873-1874,Sec. 2. That so much of Section seventytwo (72...,"[much, seventytwo, chapter, title, iii, audito..."
1873-1874,"AN ACT to Revive, RENEw AND AMEND AN ACT ENTIT...","[renew, amend, entitled, incorporate, home, in..."
1873-1874,AN ACT To IncorPorRaTE THE REFORM Apotto Socle...,"[charleston, south, carolina]"
...,...,...
1956,1132 An Act To Require The Commissioner Of Agr...,"[require, commissioner, agriculture, south, ca..."
1956,Be it enacted by the General Assembly of the S...,"[enacted, assembly, south, carolina, commissio..."
1956,The Commissioner of Agriculture of South Carol...,"[commissioner, agriculture, south, carolina, d..."
1956,Any such plants or weeds unlawfully imported i...,"[plant, unlawfully, seized, confiscated, addit..."


<br>

## LDA

In [15]:
dictionary = corpora.Dictionary(df['cleaned_sent'])

### LDA Model Training

In [16]:
# Create a dictionary and a corpus (Bag of Words)
id2word = corpora.Dictionary(df['cleaned_sent'])  # a mapping between words and their integer ids
corpus = [id2word.doc2bow(text) for text in df['cleaned_sent']]  # Convert document into the bag-of-words format

In [17]:
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                     num_topics=10,
                                     per_word_topics=True)

### Topic Interpretation

In [18]:
# Print the topics and associated words
topics = lda_model.print_topics()
for topic in topics:
    print(topic)

(0, '0.032*"education" + 0.022*"service" + 0.020*"elector" + 0.020*"total" + 0.019*"voting" + 0.018*"ey" + 0.017*"eee" + 0.016*"magistrate" + 0.016*"precinct" + 0.015*"commission"')
(1, '0.018*"person" + 0.014*"vehicle" + 0.011*"provided" + 0.010*"corporation" + 0.010*"provision" + 0.008*"court" + 0.008*"company" + 0.008*"law" + 0.008*"commission" + 0.007*"land"')
(2, '0.051*"dollar" + 0.027*"thousand" + 0.021*"hundred" + 0.020*"per" + 0.018*"cent" + 0.015*"sum" + 0.011*"purpose" + 0.011*"equipment" + 0.011*"building" + 0.010*"education"')
(3, '0.043*"bond" + 0.024*"commission" + 0.021*"fund" + 0.019*"year" + 0.019*"interest" + 0.013*"authorized" + 0.012*"retirement" + 0.011*"loan" + 0.011*"issued" + 0.011*"commissioner"')
(4, '0.055*"election" + 0.051*"act" + 0.030*"part" + 0.025*"item" + 0.021*"day" + 0.020*"repealed" + 0.020*"inconsistent" + 0.018*"april" + 0.016*"provision" + 0.013*"person"')
(5, '0.039*"term" + 0.032*"elected" + 0.027*"member" + 0.020*"office" + 0.019*"year" + 0.0

## Evaluation Metrics

Two evaluation metrics to consider for topic modeling: perplexity and coherence.

In [19]:
from gensim.models import CoherenceModel

In [20]:
# Compute Perplexity (lower is better)
print('Perplexity:', round(lda_model.log_perplexity(corpus), 2))

Perplexity: -7.13


In [21]:
# Compute Coherence Score (higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['cleaned_sent'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', round(coherence_lda, 2))

Coherence Score: 0.54


## Visualization

In [28]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [29]:
import pyLDAvis
import pyLDAvis.gensim

In [30]:
pyLDAvis.enable_notebook()

In [32]:
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

In [34]:
# p