In [283]:
import os
import random
import re
import pandas as pd
import numpy as np
import string


from gensim import models
from collections import defaultdict
from gensim import corpora
 
from gensim import similarities 
from sklearn.feature_extraction.text import CountVectorizer

import nltk

import pprint

STOPLIST = list(nltk.corpus.stopwords.words('english'))

# Import data

In [284]:
df = pd.read_csv("chunked_meetings.csv", sep = "|")
print(len(df))
df.sample()

7729


Unnamed: 0,document_id,leaid,state,district,document_year,document_month,document_day,text,chunk_number,chunk_id
4008,OREGON_DALLAS SD 2_2019_11_25_1,4103860.0,OREGON,DALLAS SD 2,2019.0,11.0,25.0,261.0 262.0 263.0 264.0 265.0 266.0 267.0 268....,1,OREGON_DALLAS SD 2_2019_11_25_1chunk1


# Clean and process text

In [285]:
def process_text(
    text: str,
    lower_case: bool = True,
    remove_punct: bool = True,
    remove_stopwords: bool = False,
    lemma: bool = False,
    string_or_list: str = "str",
):
    tokens = nltk.word_tokenize(text)

    if lower_case:
        tokens = [token.lower() if token.isalpha() else token for token in tokens]

    if remove_punct:
        tokens = [token for token in tokens if token.isalpha()]

    if remove_stopwords:
        tokens = [token for token in tokens if not token in STOPLIST]

    if lemma:
        tokens = [nltk.wordnet.WordNetLemmatizer().lemmatize(token) for token in tokens]

    if string_or_list != "list":
        doc = " ".join(tokens)
    else:
        doc = tokens

    return doc

In [286]:
df["tokens"] = df.text.apply(process_text, remove_stopwords = False, string_or_list = "list")
df.sample()

Unnamed: 0,document_id,leaid,state,district,document_year,document_month,document_day,text,chunk_number,chunk_id,tokens
1597,NEBRASKA_RAYMOND CENTRAL PUBLIC SCHOOLS_2021_5...,3104950.0,NEBRASKA,RAYMOND CENTRAL PUBLIC SCHOOLS,2021.0,5.0,12.0,been an active advocate for all students at Ra...,2,NEBRASKA_RAYMOND CENTRAL PUBLIC SCHOOLS_2021_5...,"[been, an, active, advocate, for, all, student..."


# Create a list of tokenized documents, called docs_list

In [287]:
docs_list = list(df.tokens)
# docs_list[0]

# Bi Grams & Tri Grams

The Phrases class from Gensim can be used to identify bigrams (two words that co-occur). 

Key parameters include: 

*min_count* set sets the minimum frequency threshold for a bigram to be considered (default = 5).
*delimeter* describes how bigrams will be combined into one token (default = "_")

*connector_words* allows you to optionally provide a list of connector words that are included in the bigram. For example, "of" would allow "bank of america" to be included. Default is no connector words. A simple option is setting connector_words=phrases.ENGLISH_CONNECTOR_WORDS

The resulting object from Phrases is a trained model (really, list of bigrams) that can be used to transform new documents by combining common phrases into single tokens.

In [288]:
from gensim.models import phrases

bigram_model = Phrases(docs_list, min_count=20, delimiter = "_", connector_words = phrases.ENGLISH_CONNECTOR_WORDS)

# Print bigram in document
for token in bigram_model[docs_list[10]]:
    if "_" in token:
        print(token)

call_to_order
roll_call
pledge_of_allegiance
action_items
memorandum_of_understanding
tuscaloosa_county
board_of_education
not_to_exceed
lowest_responsible
lowest_responsible
listed_below
agreement_between
tuscaloosa_county
delegate_assembly
consent_items
contract_between
elementary_school
fifth_grade
field_trip
contract_between
elementary_school
contract_between
elementary_school
contract_between
elementary_school


In [289]:
## Replace each bigram with a unigram delimited by a _
def replace_bigrams_with_phrases(token_list):
    return bigram_model[token_list]

df["tokens_phrases"] = df.tokens.apply(replace_bigrams_with_phrases)
docs_list = list(df.tokens_phrases)

In [290]:
trigram_model = Phrases(docs_list, min_count=10, delimiter = "_", connector_words = phrases.ENGLISH_CONNECTOR_WORDS)

# Print bigram in document
for token in trigram_model[docs_list[6]]:
    if "_" in token:
        print(token)

does_not
discriminate_on_the_basis
race_color
national_origin
these_policies
th_street
district_office
schuyler_ne
questions_regarding
career_and_technical_education
department_of_education
phone_fax
schuyler_community_schools
de_la
district_office
th_street
e_l
full_day
schuyler_community_schools
start_date


In [291]:
def replace_trigrams_with_phrases(token_list):
    return trigram_model[token_list]
df["tokens_phrases"] = df.tokens_phrases.apply(replace_trigrams_with_phrases)
docs_list = list(df.tokens_phrases)

# Removing single instances of letters

In [292]:
def remove_single_letters(token_list):
    return [token for token in token_list if len(token) > 1]

df["tokens_phrases"] = df.tokens_phrases.apply(remove_single_letters)

# Now Remove Stop Words

In [293]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define a function to remove stop words
def remove_stop_words(token_list):
    return [token for token in token_list if token not in stop_words]

# Apply the function to remove stop words from your 'tokens_phrases' column
df['tokens_phrases'] = df.tokens_phrases.apply(remove_stop_words)

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


# Create gensim formatted corpus

Instead of using CountVectorizer to create a document-term matrix, gensim requires a special module specific format of a document term matrix. 

The Dictionary class generates a unique integer id for each unique token in the input list of documents. It then stores the mapping between the tokens and their ids in a dictionary-like object, which can be accessed using the token2id attribute.

In [294]:
from gensim.corpora import Dictionary

gensim_dictionary = Dictionary(df.tokens_phrases)
gensim_dictionary.token2id

{'accounts': 0,
 'activities': 1,
 'activity': 2,
 'activity_account': 3,
 'activity_fund': 4,
 'ad': 5,
 'administrators': 6,
 'advertisement': 7,
 'any_other': 8,
 'applicants': 9,
 'attend': 10,
 'board_room': 11,
 'book': 12,
 'box': 13,
 'building': 14,
 'calendar': 15,
 'cards': 16,
 'charge': 17,
 'comp': 18,
 'concession': 19,
 'contracts': 20,
 'conversations': 21,
 'custodial': 22,
 'directors': 23,
 'discuss': 24,
 'discussion': 25,
 'discussion_items': 26,
 'district': 27,
 'employees': 28,
 'essers': 29,
 'etc': 30,
 'event': 31,
 'floor': 32,
 'following': 33,
 'food_service': 34,
 'gate': 35,
 'get': 36,
 'gibbons': 37,
 'if_you_are': 38,
 'iii': 39,
 'introductions': 40,
 'involves': 41,
 'jamie': 42,
 'leadership_team': 43,
 'make_sure': 44,
 'management': 45,
 'managing': 46,
 'meeting': 47,
 'monday_july': 48,
 'money': 49,
 'mundil': 50,
 'ne': 51,
 'new': 52,
 'new_hires': 53,
 'newspaper': 54,
 'one': 55,
 'open': 56,
 'opening': 57,
 'options': 58,
 'please': 59,

We can limit the token dictionary to only include tokens that are present a certain number of times.

The filter_extremes() method is used to remove tokens from the dictionary that are either too rare or too common. It takes two arguments:

*no_below*: an integer that specifies the minimum number of documents that a token must appear in to be included in the dictionary (i.e., W). Tokens that appear in fewer than no_below documents are removed.

*no_above*: a float that specifies the maximum fraction of documents that a token can appear in to be included in the dictionary. Tokens that appear in more than no_above of the documents are removed.

In [295]:
gensim_dictionary.filter_extremes(no_below=20, no_above=0.65)

# Creating BOW & TF-IDF representations

Once the dictionary is created, it can be used to convert a document from a list of tokens to a bag-of-words representation, which is a sparse vector where each element represents the count of a particular token in the document. This is done using the doc2bow() method of the Dictionary class.

In [296]:
from gensim.models import TfidfModel

# Step 1: Create a list of BOW representations for the corpus
corpus_bow = []
for doc in docs_list:
    corpus_bow.append(gensim_dictionary.doc2bow(doc))

# Step 2: Train the TF-IDF model using the BoW corpus
tfidf_model = TfidfModel(corpus_bow)

# Step 3: Transform the BoW corpus into a TF-IDF corpus using the trained TF-IDF model
corpus_tfidf = tfidf_model[corpus_bow]

print("BoW Representation of the first document:", corpus_bow[0])
print("TF-IDF Representation of the first document:", list(corpus_tfidf[0]))

BoW Representation of the first document: [(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 3), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 3), (22, 2), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 7), (41, 1), (42, 2), (43, 1), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (57, 3), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 6), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 4), (71, 3), (72, 2), (73, 1), (74, 1), (75, 3), (76, 3), (77, 3), (78, 1), (79, 1), (80, 3), (81, 2), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1)]
TF-IDF Representation of the first document: [(0, 0.07341136143576983), (1, 0.09502698293416723), (2, 0.11894296431064792), (3, 0.

# Topic Modeling

# Train

We can use gensim.models.LdaModel to train an LDA model. 

Key parameters include:

*corpus*: list of documents represented using gensim dictionary doc-term matrix

*id2word*: a dictionary that maps integer word ids to the corresponding word strings. This is used to convert the bag-of-words and TF-IDF vectors back to human-readable text.

*num_topics*: the number of topics to learn from the corpus


In [297]:
from gensim.models import LdaModel

NUM_TOPICS_50 = 50
NUM_TOPICS_75 = 75
NUM_TOPICS_100 = 100

RANDOM_STATE = 5643

temp = gensim_dictionary[0] # required to initialize
word_id = gensim_dictionary.id2token

# Model Descriptions

- LDA Model 1 is trained with 50 topics and BOW representation of the documents
- LDA Model 2 is trained with 50 topics and BOW representation of the documents
- LDA Model 3 is trained with 75 topics and BOW representation of the documents
- LDA Model 4 is trained with 75 topics and TF-IDF representation of the documents
- LDA Model 5 is trained with 100 topics and TF-IDF representation of the documents
- LDA Model 6 is trained with 100 topics and TF-IDF representation of the documents

# BOW Models

In [298]:
lda_model_1 = LdaModel(
    corpus=corpus_bow,
    id2word=word_id,
    num_topics=NUM_TOPICS_50,
    random_state = RANDOM_STATE,
)

lda_model_2 = LdaModel(
    corpus=corpus_bow,
    id2word=word_id,
    num_topics=NUM_TOPICS_75,
    random_state = RANDOM_STATE,
)

lda_model_3 = LdaModel(
    corpus=corpus_bow,
    id2word=word_id,
    num_topics=NUM_TOPICS_100,
    random_state = RANDOM_STATE,
)

# TF-IDF Models

In [299]:
lda_model_4 = LdaModel(
    corpus=corpus_tfidf,
    id2word=word_id,
    num_topics=NUM_TOPICS_50,
    random_state = RANDOM_STATE,
)

lda_model_5 = LdaModel(
    corpus=corpus_tfidf,
    id2word=word_id,
    num_topics=NUM_TOPICS_75,
    random_state = RANDOM_STATE,
)

lda_model_6 = LdaModel(
    corpus=corpus_tfidf,
    id2word=word_id,
    num_topics=NUM_TOPICS_100,
    random_state = RANDOM_STATE,
)

After training the model, we can view the words that are important for each topic using the show_topics method.

Key parameters include:
    

*num_topics*: The number of topics to be selected, if -1 - all topics will be in result.

*num_words*: The number of words to be included per topics.

*formatted*: The formatted=False argument indicates that the method should return a list of (word, probability) pairs for each word in the topic, rather than a formatted string.


# Model 1 with 50 topics and BOW representation of the documents

In [300]:
lda_model_1.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('records', 0.029021408),
  ('program', 0.014085579),
  ('school', 0.011822059),
  ('university', 0.009700679),
  ('disposition', 0.009389855),
  ('records_retention', 0.0065970924),
  ('services', 0.0059656654),
  ('senior_high_school', 0.005943884),
  ('disposed', 0.005894095),
  ('school_counselors', 0.0055533773)])

# Model 2 with 50 topics and BOW representation of the documents

In [301]:
lda_model_2.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('school', 0.011963719),
  ('hardin', 0.011544713),
  ('license', 0.010524992),
  ('be_granted', 0.010330215),
  ('red', 0.009486529),
  ('cases', 0.009424991),
  ('acting', 0.008764949),
  ('positions', 0.007855811),
  ('public', 0.0078251725),
  ('program', 0.0065753376)])

# Model 3 with 75 topics and BOW representation of the documents

In [302]:
lda_model_3.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('program', 0.012152751),
  ('license', 0.011158229),
  ('temporary', 0.010714946),
  ('increase', 0.0103770895),
  ('agency', 0.00971529),
  ('records', 0.009712331),
  ('university', 0.0084626665),
  ('renewal', 0.008063529),
  ('school', 0.007848108),
  ('district', 0.007722864)])

# Model 4 with 75 topics and TF-IDF representation of the documents

In [303]:
lda_model_4.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('motion_unanimously_carried', 0.012804213),
  ('permanent_record', 0.012618397),
  ('extra_curricular', 0.008057409),
  ('tucker', 0.007540574),
  ('erickson', 0.0056745373),
  ('trainings', 0.0054784752),
  ('dunn', 0.0050404966),
  ('enrichment', 0.004779847),
  ('virtual', 0.0043417634),
  ('soil', 0.0042633316)])

# Model 5 with 100 topics and TF-IDF representation of the documents

In [304]:
lda_model_5.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('member', 0.008603532),
  ('policy', 0.008278606),
  ('agenda_item', 0.006793281),
  ('executive_session', 0.0061175684),
  ('zoom', 0.0052946387),
  ('board', 0.0049904846),
  ('public', 0.004968534),
  ('checks', 0.004936207),
  ('total_amount', 0.004444237),
  ('boe', 0.004436045)])

# Model 6 with 100 topics and TF-IDF representation of the documents

In [305]:
lda_model_6.show_topics(num_topics = -1, num_words= 10, formatted = False)[0]

(0,
 [('motion_unanimously_carried', 0.021804113),
  ('instr', 0.012906829),
  ('ba_step', 0.0109865535),
  ('new_business', 0.010129189),
  ('nj', 0.009757953),
  ('erickson', 0.009587086),
  ('unfinished_business', 0.009351331),
  ('social_studies', 0.0093494225),
  ('public_comment_none', 0.009218349),
  ('grade', 0.008941581)])

# Coherence

In [306]:
from gensim.models.coherencemodel import CoherenceModel

def calculate_average_coherence(lda_model, corpus, gensim_dictionary, num_topics, topn=10):
    """
    Calculate the average topic coherence for a given LDA model.

    Parameters:
    - lda_model: The LDA model to evaluate.
    - corpus: The corpus used by the LDA model (BoW or TF-IDF).
    - gensim_dictionary: The Gensim dictionary of the corpus.
    - num_topics: The number of topics used in the LDA model.
    - topn: The number of top terms to consider for calculating coherence (default=10).

    Returns:
    - The average coherence score across all topics in the LDA model.
    """
    topic_coherence_scores = []
    for n in range(lda_model.num_topics):
        topn_terms = [term for term, _ in lda_model.show_topic(n, topn=topn)]
        coherence = CoherenceModel(topics=[topn_terms], corpus=corpus, dictionary=gensim_dictionary, coherence='u_mass', topn=topn)
        score = coherence.get_coherence()
        topic_coherence_scores.append(score)

    avg_coherence = sum(topic_coherence_scores) / num_topics
    return avg_coherence

In [307]:
# Assuming the lda_model_x, corpus_bow, corpus_tfidf, gensim_dictionary, and NUM_TOPICS_X variables are already defined

mod_1_avg_coherence = calculate_average_coherence(lda_model_1, corpus_bow, gensim_dictionary, NUM_TOPICS_50)
mod_2_avg_coherence = calculate_average_coherence(lda_model_2, corpus_bow, gensim_dictionary, NUM_TOPICS_75)
mod_3_avg_coherence = calculate_average_coherence(lda_model_3, corpus_bow, gensim_dictionary, NUM_TOPICS_100)

mod_4_avg_coherence = calculate_average_coherence(lda_model_4, corpus_tfidf, gensim_dictionary, NUM_TOPICS_50)
mod_5_avg_coherence = calculate_average_coherence(lda_model_5, corpus_tfidf, gensim_dictionary, NUM_TOPICS_75)
mod_6_avg_coherence = calculate_average_coherence(lda_model_6, corpus_tfidf, gensim_dictionary, NUM_TOPICS_100)

# You can print the average coherence scores for each model to see the results
print("Model 1 Average Coherence:", mod_1_avg_coherence)
print("Model 2 Average Coherence:", mod_2_avg_coherence)
print("Model 3 Average Coherence:", mod_3_avg_coherence)
print("Model 4 Average Coherence:", mod_4_avg_coherence)
print("Model 5 Average Coherence:", mod_5_avg_coherence)
print("Model 6 Average Coherence:", mod_6_avg_coherence)

Model 1 Average Coherence: -3.3775469953314956
Model 2 Average Coherence: -3.7645702456361656
Model 3 Average Coherence: -4.676187474459707
Model 4 Average Coherence: -8.92425426798147
Model 5 Average Coherence: -10.613656592226308
Model 6 Average Coherence: -9.030562039997365


## Create dataframe of top words per topic for our best model: Model 2

In [308]:
list_of_topic_tables = []
for topic in lda_model_1.show_topics(
    num_topics=-1, num_words=10, formatted=False
):
    list_of_topic_tables.append(
        pd.DataFrame(
            data = topic[1],
            columns=["Word" + "_" + str(topic[0]), "Prob" + "_" + str(topic[0])],
        )
    )
list_of_topic_tables[0]

Unnamed: 0,Word_0,Prob_0
0,records,0.029021
1,program,0.014086
2,school,0.011822
3,university,0.009701
4,disposition,0.00939
5,records_retention,0.006597
6,services,0.005966
7,senior_high_school,0.005944
8,disposed,0.005894
9,school_counselors,0.005553


In [309]:
pd.set_option('display.max_columns', 500)

bigdf_mod2 = pd.concat(list_of_topic_tables, axis=1)
bigdf_mod2

Unnamed: 0,Word_0,Prob_0,Word_1,Prob_1,Word_2,Prob_2,Word_3,Prob_3,Word_4,Prob_4,Word_5,Prob_5,Word_6,Prob_6,Word_7,Prob_7,Word_8,Prob_8,Word_9,Prob_9,Word_10,Prob_10,Word_11,Prob_11,Word_12,Prob_12,Word_13,Prob_13,Word_14,Prob_14,Word_15,Prob_15,Word_16,Prob_16,Word_17,Prob_17,Word_18,Prob_18,Word_19,Prob_19,Word_20,Prob_20,Word_21,Prob_21,Word_22,Prob_22,Word_23,Prob_23,Word_24,Prob_24,Word_25,Prob_25,Word_26,Prob_26,Word_27,Prob_27,Word_28,Prob_28,Word_29,Prob_29,Word_30,Prob_30,Word_31,Prob_31,Word_32,Prob_32,Word_33,Prob_33,Word_34,Prob_34,Word_35,Prob_35,Word_36,Prob_36,Word_37,Prob_37,Word_38,Prob_38,Word_39,Prob_39,Word_40,Prob_40,Word_41,Prob_41,Word_42,Prob_42,Word_43,Prob_43,Word_44,Prob_44,Word_45,Prob_45,Word_46,Prob_46,Word_47,Prob_47,Word_48,Prob_48,Word_49,Prob_49
0,records,0.029021,upon_the_recommendation_of_the_superintendent,0.010339,board,0.019829,services,0.01986,school_year,0.012809,fund,0.045693,said,0.017068,motion_to_approve,0.032386,board,0.039708,school,0.032698,policy,0.029342,policy,0.026909,board,0.021852,motion,0.041343,public,0.019043,board,0.015285,board,0.021289,district,0.014842,board_of_education,0.030227,district,0.013351,district,0.020276,teachers,0.010826,student,0.026984,yes,0.075712,present,0.022486,amount,0.023411,student,0.029879,school_year,0.012543,motion,0.037905,committee,0.061813,student,0.024138,office,0.02171,district,0.015699,students,0.044359,students,0.010798,records,0.026193,meeting,0.014899,board,0.023949,school,0.01233,school,0.017791,teacher,0.018094,dw,0.014273,board,0.029117,seconded,0.030465,public,0.017487,coventry,0.027962,meeting,0.029928,approve,0.042326,summer,0.017235,motion_carried,0.025382
1,program,0.014086,district,0.009612,districtwide,0.016277,maintenance,0.016806,hours,0.011207,budget,0.021528,board,0.014022,assistant,0.022128,superintendent,0.012598,district,0.011206,files,0.013467,board,0.021759,report,0.016671,action,0.027859,medical,0.015433,district,0.014236,contract,0.019448,school,0.01147,personnel,0.019187,elementary,0.010271,students,0.018117,students,0.010604,board,0.016006,second,0.018737,board_of_education,0.021352,approve,0.01208,days,0.023273,teacher,0.012056,pm,0.033555,board,0.019986,students,0.015079,tier,0.009135,reports,0.012757,school,0.01078,hs,0.010601,district,0.022794,board_of_education,0.01476,motion,0.014786,school_year,0.012087,students,0.015259,appointment,0.016544,new_hire,0.013522,board_of_education,0.023733,board,0.025582,board,0.01392,motion,0.022422,board,0.02288,seconded,0.031347,teacher,0.013199,meeting,0.020392
2,school,0.011822,new_york_state,0.008643,effective,0.012306,environmental,0.01426,wohs,0.010359,account,0.021264,school,0.013075,school_year,0.02128,budget,0.011062,regular,0.008826,record,0.012239,motion,0.01048,annual,0.011944,second,0.023378,patient,0.01268,approval,0.011464,project,0.014607,project,0.010111,brush,0.014824,information_technology,0.008371,teachers,0.007415,principal,0.00825,district,0.014797,motion,0.017614,motion,0.013844,board,0.011163,students,0.016278,director,0.010825,board_meeting,0.032593,motion,0.016249,course,0.014314,students,0.008825,including,0.008365,district,0.007515,services,0.009522,employee,0.019278,school,0.012328,approved_the_resolution,0.013854,approve,0.011238,staff,0.011014,effective,0.015577,th_grade,0.012466,yes,0.013947,minutes,0.022134,notice,0.0121,board,0.013291,minutes,0.018019,motion,0.028423,june,0.012444,approve_the_recommendation,0.012364
3,university,0.009701,board_of_education,0.007973,employment,0.011807,sale,0.013763,fte,0.009679,total,0.017664,tax,0.008148,education,0.013591,meeting,0.010545,students,0.008546,travel,0.011312,student,0.008798,school,0.010617,board,0.015305,records,0.012322,meeting,0.010027,services,0.014248,upon_the_recommendation_of_the_superintendent_...,0.008949,teacher_aide,0.014328,effective_october,0.008317,school,0.006826,shared,0.008114,elections,0.011515,closed_session,0.012547,meeting,0.013135,school,0.011083,school,0.011542,coordinator,0.010534,board,0.026607,school,0.013488,school,0.012274,hib,0.00818,records,0.008255,use,0.006836,motion_unanimously_carried,0.006976,employees,0.013399,minutes,0.011909,carried_by_a_vote,0.01224,high_school,0.00961,will_be,0.01008,name,0.015329,marching_band,0.012117,meeting,0.013863,meeting,0.020581,records,0.008243,school,0.012412,public,0.016613,board_of_education,0.024493,approve,0.011935,series,0.012211
4,disposition,0.00939,price,0.007064,approve,0.010633,drill,0.012871,students,0.008301,balance,0.014072,district,0.008017,elementary,0.010489,motion,0.010349,staff,0.008409,district,0.010738,resolution_no,0.008535,presentation,0.009506,final_resolution_motion_carries,0.010702,permit,0.011903,minutes,0.008422,approval,0.011522,approval,0.007493,pending,0.014167,baseball,0.007633,plan,0.006056,will_be,0.007477,school,0.00731,recommended_action_approve,0.010011,minutes,0.008895,it_is_recommended,0.009467,staff,0.009341,school,0.009357,school,0.016278,vote,0.011413,replaces,0.008513,february,0.007644,petition,0.008048,teachers,0.00667,school,0.006322,funds,0.010518,superintendent,0.011191,made_a_motion,0.011394,motion,0.006869,board,0.008723,resignation,0.01223,sub,0.010361,section,0.012828,approve,0.018967,intermediate_school,0.008111,r_s,0.011421,board_of_education,0.011075,motion_carried,0.022892,leave,0.009983,double_click_to_return_to_agenda,0.011615
5,records_retention,0.006597,mitchell,0.007048,action,0.010628,property,0.011904,effective_dates,0.007984,bond,0.013894,budget,0.006701,instructor,0.010232,district,0.010252,building,0.007276,employee,0.009965,district,0.008211,shs,0.00946,trustee,0.007749,meetings,0.008762,that_the_board_of_directors,0.007958,new,0.010144,approved,0.007217,following,0.013735,approval,0.007042,training,0.00575,director,0.007471,e_t,0.007215,action,0.009498,contract,0.008577,donation,0.007799,data,0.009198,elementary,0.008703,meeting,0.012855,services,0.010915,high_school,0.008412,class,0.007246,school,0.007668,staff,0.006621,special_education,0.006284,employment,0.009189,board,0.010015,approve,0.011247,board,0.006584,program,0.007576,salary,0.011137,kyle,0.009934,this_resolution,0.009787,motion,0.015637,west_orange_board_of_education,0.007975,g_e,0.010116,action,0.010923,board,0.011663,district,0.009766,discussion,0.010898
6,services,0.005966,school,0.006581,seconded,0.009286,records,0.011307,district,0.007336,check_total,0.013591,year,0.006521,approval,0.009866,report,0.00893,education,0.006457,shall,0.009545,records,0.007649,june,0.009198,bonds,0.007732,issues,0.008523,resolution,0.007655,approve,0.009517,records,0.006763,memorial,0.013691,board,0.006667,schools,0.005617,virtual,0.007315,parent,0.006262,board,0.009266,second,0.008503,october,0.006493,incident,0.008884,brown,0.008286,second,0.012399,discharge,0.008617,whs,0.008164,business,0.007102,information,0.00752,math,0.006524,board,0.006043,local,0.008744,pm,0.009792,approval,0.011081,march,0.006331,work,0.006377,district,0.010037,assistant_coach,0.009926,public,0.008874,present,0.010414,meeting,0.00759,r_i,0.009728,motion,0.007128,moved,0.010489,effective_july,0.008899,policy,0.010623
7,senior_high_school,0.005944,board,0.006256,motion,0.007809,agreement,0.01113,meeting,0.006413,bonds,0.012968,town,0.00634,grade_teacher,0.009716,school,0.006773,health,0.00629,including,0.008935,school,0.007638,program,0.008533,school,0.007208,board,0.007775,voting_unanimously_approved,0.007155,construction,0.009114,program,0.00599,owen,0.011493,school,0.006542,program,0.005457,school,0.006927,target,0.005875,school,0.009169,board,0.008196,minutes,0.006467,records,0.008049,team,0.00767,abstain,0.008898,meeting,0.008522,including,0.006781,spanish,0.006949,copies,0.006726,would,0.005813,district,0.005914,board,0.00849,approve,0.009373,as_presented,0.009782,minutes,0.006219,district,0.005561,step,0.009272,weber,0.009759,seconded,0.008586,moved,0.009919,election,0.007402,s_t,0.009669,closed_session,0.00625,school_year,0.010174,varsity,0.008303,board,0.009833
8,disposed,0.005894,approve,0.005914,high_school,0.007256,routine,0.010426,name_location_position,0.006248,interest,0.012794,collected,0.005918,boces,0.008256,student,0.00617,system,0.006263,personal,0.008175,review,0.006709,plan,0.008111,minutes,0.007183,sewer,0.007683,information,0.00681,agreement,0.008407,facility,0.00589,new_position,0.010589,title_i,0.006248,support,0.005426,reading,0.006284,she_attended,0.005254,butler,0.006778,award,0.00794,report,0.006284,training,0.007478,approve,0.00753,march,0.008186,board_of_trustees,0.006092,will_be,0.006624,unit,0.006835,student,0.006523,class,0.00524,program,0.005579,note,0.007391,following,0.008973,instructional_assistant,0.00828,following,0.005979,report,0.005091,stipend,0.008955,name_position_location,0.009543,comments,0.007303,as_presented,0.007804,cases,0.007002,stuart,0.009334,may,0.005995,absent,0.008678,effective_june,0.008,school,0.008999
9,school_counselors,0.005553,issuance,0.005714,jhs,0.007144,evacuation,0.008535,superintendent_recommends_approval,0.006179,revenue,0.012071,school_district,0.005871,test_results,0.008149,member,0.006049,community,0.005918,employees,0.008067,approval,0.006599,approved,0.007019,board_of_education,0.006704,seven,0.007677,community_college,0.006219,final_resolution_motion_passes,0.00784,action,0.00575,resolved_that,0.009097,policy,0.006117,children,0.005131,district,0.006099,conference,0.005142,school_year,0.006679,executive_session,0.006965,approval,0.006147,teachers,0.007151,assistant,0.007417,elementary_school,0.007822,discussion,0.005796,provider,0.006239,whereas,0.006723,system,0.006197,stated_that,0.004864,meeting,0.005172,student,0.006805,seconded,0.008959,seconded,0.007992,grade,0.005952,community,0.005034,position,0.008613,diagnostic,0.00852,motion,0.00707,board_meeting,0.007777,county,0.00672,r_m,0.008442,discussion,0.005945,aye,0.008039,esy,0.007886,boe,0.008953


# Get document topics prevalence

The method get_document_topics() retrieves the topic distribution for a single document in the corpus, represented as a list of topic-prevalence pairs.

Key parameters include:

*bow*: Gensim representation of a single document (list of tuples with word IDs and their frequency counts)

*minimum_probability*: Minimum probability threshold for a topic to be included in the output


In [310]:
# What topic has the highest prevalelence in document 0?
lda_model.get_document_topics(corpus[0], minimum_probability=0)

NameError: name 'lda_model' is not defined

## Create a list of list of topic probabilities for each document

In [None]:
topic_probs = []
for document in corpus_bow:
    document_topics = []
    for topic_prob in lda_model_1.get_document_topics(document, minimum_probability=0):
        document_topics.append(topic_prob[1])
    topic_probs.append(document_topics)
topic_probs[0]

[0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.05541529,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.100726694,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.11466677,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.032085173,
 0.08551382,
 0.00012826791,
 0.21743359,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.10296074,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.11651634,
 0.00012826791,
 0.1694226,
 0.00012826791,
 0.00012826791,
 0.00012826791,
 0.00012826791]

In [None]:
topic_probs_df = pd.DataFrame(topic_probs)
topic_probs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0.000128,0.000128,0.000128,0.000128,0.000128,0.055415,0.000128,0.000128,0.000128,0.100727,0.000128,0.000128,0.000128,0.000128,0.114667,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.032085,0.085514,0.000128,0.217434,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.102961,0.000128,0.000128,0.000128,0.000128,0.000128,0.116516,0.000128,0.169423,0.000128,0.000128,0.000128,0.000128
1,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.147561,0.000131,0.000131,0.000131,0.000131,0.181207,0.000131,0.054146,0.000131,0.225820,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.031553,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.100980,0.000131,0.000131,0.000131,0.253108,0.000131,0.000131,0.000131,0.000131
2,0.000135,0.000135,0.000135,0.000135,0.000135,0.043483,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.320260,0.000135,0.000135,0.000135,0.000135,0.021091,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.052637,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.111599,0.000135,0.000135,0.000135,0.105440,0.000135,0.294249,0.000135,0.000135,0.045562,0.000135,0.000135,0.000135
3,0.000158,0.000158,0.000158,0.000158,0.016053,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.064122,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.050789,0.021237,0.000158,0.000158,0.060736,0.000158,0.000158,0.000158,0.000158,0.000158,0.185651,0.000158,0.000158,0.000158,0.000158,0.000158,0.340051,0.000158,0.066590,0.188310,0.000158,0.000158,0.000158
4,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.102983,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.500302,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.108403,0.000171,0.000171,0.034060,0.000171,0.000171,0.000171,0.000171,0.159229,0.000171,0.011131,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.076538,0.000171,0.000171,0.000171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7724,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.014501,0.000173,0.000173,0.000173,0.000173,0.175347,0.000173,0.000173,0.000173,0.000173,0.000173,0.108708,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.355729,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.143394,0.000173,0.000173,0.000173,0.000173,0.000173,0.194730,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173
7725,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.063103,0.020764,0.000155,0.000155,0.087135,0.000155,0.012754,0.000155,0.000155,0.000155,0.070225,0.000155,0.000155,0.000155,0.000155,0.070199,0.000155,0.000155,0.091905,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.335316,0.000155,0.000155,0.000155,0.000155,0.000155,0.204643,0.000155,0.000155,0.000155,0.000155,0.037750,0.000155
7726,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.055194,0.000152,0.000152,0.000152,0.181486,0.000152,0.024985,0.034010,0.000152,0.000152,0.103625,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.178635,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.129626,0.000152,0.028589,0.153032,0.000152,0.000152,0.104756,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152
7727,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.016639,0.000149,0.000149,0.000149,0.152772,0.000149,0.021781,0.000149,0.000149,0.000149,0.042854,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.194659,0.000149,0.095347,0.000149,0.000149,0.000149,0.000149,0.056246,0.000149,0.000149,0.064011,0.000149,0.000149,0.349569,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149


In [None]:
topic_probs_df.sum(axis = 1)

0       1.000000
1       0.999999
2       1.000000
3       1.000000
4       1.000000
          ...   
7724    1.000000
7725    1.000000
7726    1.000000
7727    1.000000
7728    1.000000
Length: 7729, dtype: float32

In [None]:
doc_topics = df.reset_index().merge(topic_probs_df, left_index=True, right_index=True)
doc_topics

Unnamed: 0,index,document_id,leaid,state,district,document_year,document_month,document_day,text,chunk_number,chunk_id,tokens,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_1,3176450.0,NEBRASKA,SCHUYLER COMMUNITY SCHOOLS,2021.0,7.0,26.0,Schuyler Community Schools SAA Weekly Meeting/...,1,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_...,"[schuyler, community, school, saa, weekly, tea...",0.000128,0.000128,0.000128,0.000128,0.000128,0.054480,0.000128,0.000128,0.000128,0.099538,0.000128,0.000128,0.000128,0.000128,0.115535,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.031644,0.086925,0.000128,0.212583,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.102294,0.000128,0.000128,0.000128,0.000128,0.000128,0.122686,0.000128,0.169055,0.000128,0.000128,0.000128,0.000128
1,1,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_1,3176450.0,NEBRASKA,SCHUYLER COMMUNITY SCHOOLS,2021.0,7.0,26.0,positions open at this time? 5. Friday Substit...,2,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_...,"[position, open, time, friday, substitute, tea...",0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.147557,0.000131,0.000131,0.000131,0.000131,0.181140,0.000131,0.054157,0.000131,0.225827,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.031562,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.101020,0.000131,0.000131,0.000131,0.253112,0.000131,0.000131,0.000131,0.000131
2,2,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_1,3176450.0,NEBRASKA,SCHUYLER COMMUNITY SCHOOLS,2021.0,7.0,26.0,Town to Fisher's: 11 Miles Town to Richland: 2...,3,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_...,"[town, fisher, mile, town, richland, mile, fis...",0.000135,0.000135,0.000135,0.000135,0.000135,0.043564,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.320325,0.000135,0.000135,0.000135,0.000135,0.021082,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.052712,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.111492,0.000135,0.000135,0.000135,0.105423,0.000135,0.294203,0.000135,0.000135,0.045521,0.000135,0.000135,0.000135
3,3,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_1,3176450.0,NEBRASKA,SCHUYLER COMMUNITY SCHOOLS,2021.0,7.0,26.0,each week. 21. 403B Elective and Non-Elective ...,4,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_...,"[week, elective, contribution, employee, optio...",0.000158,0.000158,0.000158,0.000158,0.017556,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.000158,0.047972,0.022711,0.000158,0.033019,0.059070,0.000158,0.000158,0.000158,0.000158,0.000158,0.179962,0.000158,0.000158,0.000158,0.000158,0.000158,0.348996,0.000158,0.084292,0.199962,0.000158,0.000158,0.000158
4,4,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_1,3176450.0,NEBRASKA,SCHUYLER COMMUNITY SCHOOLS,2021.0,7.0,26.0,a.m. Preschool7:55 a.m. SES 8:05 a.m. Richland...,5,NEBRASKA_SCHUYLER COMMUNITY SCHOOLS_2021_7_26_...,"[s, richland, pick, s, trailer, pm, s, sm, s, ...",0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.108525,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.506671,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.092126,0.000171,0.000171,0.034841,0.000171,0.000171,0.000171,0.000171,0.161368,0.000171,0.011271,0.000171,0.000171,0.000171,0.000171,0.000171,0.000171,0.077845,0.000171,0.000171,0.000171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7724,7724,RHODE ISLAND_COVENTRY_2020_8_26_1,4400210.0,RHODE ISLAND,COVENTRY,2020.0,8.0,26.0,looked like had your meeting been held in pers...,38,RHODE ISLAND_COVENTRY_2020_8_26_1chunk38,"[looked, like, meeting, held, person, regardle...",0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.014500,0.000173,0.000173,0.000173,0.000173,0.175347,0.000173,0.000173,0.000173,0.000173,0.000173,0.108742,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.355691,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173,0.143405,0.000173,0.000173,0.000173,0.000173,0.000173,0.194723,0.000173,0.000173,0.000173,0.000173,0.000173,0.000173
7725,7725,RHODE ISLAND_COVENTRY_2020_8_26_1,4400210.0,RHODE ISLAND,COVENTRY,2020.0,8.0,26.0,"what plan we all return to school under, no ma...",39,RHODE ISLAND_COVENTRY_2020_8_26_1chunk39,"[plan, return, school, matter, much, time, spe...",0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.063062,0.020770,0.000155,0.000155,0.087533,0.000155,0.012762,0.000155,0.000155,0.000155,0.071249,0.000155,0.000155,0.000155,0.000155,0.070218,0.000155,0.000155,0.091489,0.000155,0.000155,0.000155,0.000155,0.000155,0.000155,0.335152,0.000155,0.000155,0.000155,0.000155,0.000155,0.204561,0.000155,0.000155,0.000155,0.000155,0.037000,0.000155
7726,7726,RHODE ISLAND_COVENTRY_2020_8_26_1,4400210.0,RHODE ISLAND,COVENTRY,2020.0,8.0,26.0,and tested throughout the country and I am sur...,40,RHODE ISLAND_COVENTRY_2020_8_26_1chunk40,"[tested, throughout, country, sure, one, could...",0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.055190,0.000152,0.000152,0.000152,0.181013,0.000152,0.024990,0.033256,0.000152,0.000152,0.102430,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.182206,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.128883,0.000152,0.028615,0.152942,0.000152,0.000152,0.104411,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152
7727,7727,RHODE ISLAND_COVENTRY_2020_8_26_1,4400210.0,RHODE ISLAND,COVENTRY,2020.0,8.0,26.0,We can only lean on the science and facts to h...,41,RHODE ISLAND_COVENTRY_2020_8_26_1chunk41,"[lean, science, fact, help, guide, decision, e...",0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.016639,0.000149,0.000149,0.000149,0.152805,0.000149,0.021776,0.000149,0.000149,0.000149,0.042489,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149,0.194724,0.000149,0.095374,0.000149,0.000149,0.000149,0.000149,0.056382,0.000149,0.000149,0.064040,0.000149,0.000149,0.349648,0.000149,0.000149,0.000149,0.000149,0.000149,0.000149


In [None]:
doc_topics.sort_values(by = 7, ascending = False)

Unnamed: 0,index,document_id,leaid,state,district,document_year,document_month,document_day,text,chunk_number,chunk_id,tokens,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
2511,2511,NEW YORK_VESTAL CENTRAL SCHOOL DISTRICT_2021_3...,3629610.0,NEW YORK,VESTAL CENTRAL SCHOOL DISTRICT,2021.0,3.0,9.0,NAME POSITION SHIFT/SCHOOL EFFECTIVE DATE REMA...,5,NEW YORK_VESTAL CENTRAL SCHOOL DISTRICT_2021_3...,"[name, position, effective, date, remark, dela...",0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.974003,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.020213,0.000120,0.000120,0.000120
1697,1697,PENNSYLVANIA_UPPER SAINT CLAIR SD_2020_8_17_1,4224570.0,PENNSYLVANIA,UPPER SAINT CLAIR SD,2020.0,8.0,17.0,"Assignment Status Effective Date DeCroo, Jenni...",31,PENNSYLVANIA_UPPER SAINT CLAIR SD_2020_8_17_1c...,"[assignment, status, effective, date, decroo, ...",0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.969155,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.025791,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105,0.000105
6098,6098,MONTANA_VICTOR K-12 SCHOOLS_2021_6_10_1,3027270.0,MONTANA,VICTOR K-12 SCHOOLS,2021.0,6.0,10.0,2021. Seniority in Elementary Education will b...,7,MONTANA_VICTOR K-12 SCHOOLS_2021_6_10_1chunk7,"[seniority, elementary, education, frozen, eff...",0.000142,0.000142,0.000142,0.009846,0.000142,0.000142,0.000142,0.966749,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.016737,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142,0.000142
689,689,NEW YORK_SOUTH ORANGETOWN CENTRAL SCHOOL DISTR...,3627450.0,NEW YORK,SOUTH ORANGETOWN CENTRAL SCHOOL DISTRICT,2022.0,5.0,5.0,A. Certificated Personnel Agenda Resolution: I...,5,NEW YORK_SOUTH ORANGETOWN CENTRAL SCHOOL DISTR...,"[certificated, personnel, agenda, resolution, ...",0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.966326,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.026400,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152,0.000152
5293,5293,PENNSYLVANIA_HANOVER PUBLIC SD_2018_9_10_1,4211450.0,PENNSYLVANIA,HANOVER PUBLIC SD,2018.0,9.0,10.0,All grades are holding the same from start to ...,2,PENNSYLVANIA_HANOVER PUBLIC SD_2018_9_10_1chunk2,"[grade, holding, start, finish, year, high, sc...",0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.964706,0.000133,0.023057,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.006009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,1038,CALIFORNIA_EAST SIDE UNION HIGH_2022_4_7_1,611820.0,CALIFORNIA,EAST SIDE UNION HIGH,2022.0,4.0,7.0,Items Board Member Lorena Chavez Item Date (te...,11,CALIFORNIA_EAST SIDE UNION HIGH_2022_4_7_1chunk11,"[item, board, member, lorena, chavez, item, da...",0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.014289,0.000086,0.077804,0.000086,0.000086,0.000086,0.043504,0.000086,0.783299,0.000086,0.000086,0.077223,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086
3157,3157,CALIFORNIA_EAST SIDE UNION HIGH_2019_12_12_1,611820.0,CALIFORNIA,EAST SIDE UNION HIGH,2019.0,12.0,12.0,2020 Study Session Presentation/Discussion Mar...,20,CALIFORNIA_EAST SIDE UNION HIGH_2019_12_12_1ch...,"[study, session, marketing, tbd, sexual, haras...",0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.074446,0.000086,0.000086,0.061734,0.000086,0.000086,0.000086,0.000086,0.747913,0.000086,0.000086,0.043554,0.000086,0.000086,0.000086,0.068490,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086
3627,3627,CALIFORNIA_EAST SIDE UNION HIGH_2019_10_17_1,611820.0,CALIFORNIA,EAST SIDE UNION HIGH,2019.0,10.0,17.0,Presentation/Discussion Board Member Lorena Ch...,19,CALIFORNIA_EAST SIDE UNION HIGH_2019_10_17_1ch...,"[board, member, lorena, chavez, item, date, te...",0.000085,0.000085,0.044576,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.010695,0.000085,0.000085,0.000085,0.000085,0.000085,0.053491,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.855769,0.000085,0.000085,0.031638,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085
1868,1868,NEW YORK_MEXICO CENTRAL SCHOOL DISTRICT_2021_1...,3619170.0,NEW YORK,MEXICO CENTRAL SCHOOL DISTRICT,2021.0,10.0,14.0,"Resolution: Motion Carries 7-0 Yea: Amy Shaw, ...",21,NEW YORK_MEXICO CENTRAL SCHOOL DISTRICT_2021_1...,"[resolution, motion, carry, yea, amy, shaw, ch...",0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.949684,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.000085,0.046248,0.000085,0.000085
