## Setup

In [2]:
#imports
import os, sys, json, datetime, re # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import ast
import requests
import urllib3
import time
import glob
from tqdm import tqdm
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
# from pandas.core.common import SettingWithCopyWarning

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.defined_functions import create_pk, add_seats, get_recent_file

from IPython.display import display_markdown


## File Gathering

In [None]:
#new file gathering
# Collect every Quorum download in the folder, skipping anything archived under an "old..." name.

path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\bill_data\quorum downloads"

# The earlier pattern r'[!old]*' was a glob character class: it rejected any entry whose FIRST
# character was 'o', 'l' or 'd' instead of entries whose name starts with "old".
# Match everything and filter on the name prefix explicitly instead.
glob_pat = os.path.join(path, '*')

# Keep regular files only (pd.read_excel cannot open a directory) and sort so the
# concatenation order downstream is the same on every run.
bill_files = sorted(
    candidate
    for candidate in glob.glob(glob_pat)
    if os.path.isfile(candidate)
    and not os.path.basename(candidate).lower().startswith('old')
)




In [None]:
# File declaration (deprecated)
# Hand-maintained list of downloads that predates the glob-based gathering.
thi_bills = r"C:\Users\clutz\Downloads\2024_thi_states_bills.xlsx"
nd_bills = r"C:\Users\clutz\Downloads\nd-bills.xlsx"
non_thi_bills = r"C:\Users\clutz\Downloads\2024_non_thi_states_bills.xlsx"

legacy_bill_files = [thi_bills, nd_bills, non_thi_bills]

# This cell used to assign bill_files unconditionally, so a top-to-bottom run silently
# replaced the freshly gathered file list with these deprecated paths.
# Only fall back to the legacy list when no file list exists yet (or it is empty).
try:
    bill_files
except NameError:
    bill_files = []
if not bill_files:
    bill_files = legacy_bill_files


In [None]:
#format dfs
# Read every bill workbook, give all of them the same column names, and stack them.

# Positional column names applied to each workbook.
# NOTE(review): 'detailed_status' appears twice, so all_bills carries a duplicated column
# label. The names are left unchanged here because downstream exports may rely on them;
# confirm what the second column holds and give it its own name.
BILL_COLUMNS = ['full_bill_label','bill_label', 'state','title', 'bill_sum','ai_sum','sponsors','status','status_date','detailed_status','detailed_status','quorum_link']

dfs = []
for f in bill_files:
    df = pd.read_excel(f)
    # Fail with the offending file name instead of a bare length-mismatch error.
    if len(df.columns) != len(BILL_COLUMNS):
        raise ValueError(
            f'{f} has {len(df.columns)} columns, expected {len(BILL_COLUMNS)}'
        )
    df.columns = BILL_COLUMNS
    dfs.append(df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of repeating
# each workbook's own row numbers.
all_bills = pd.concat(dfs, ignore_index=True)


## Filtering Bills

In [None]:
#education bills filter
# Keep bills whose title or summary mentions an education keyword, then drop ceremonial ones.

# Title keywords (searched anywhere in the title; '^Sch' is anchored to the start).
title_pat = '|'.join([
    r'[Ii]ccb-?',
    r'[Oo]ce-',
    r'[Ee]duc-.?',
    r'[Ee]arly [Cc]h(?:i)?ld',
    r'^Sch\s?',
    r'[Hh]ighr [Ee]d',  # was '[Hg]ighr', a typo for '[Hh]ighr'
    r'[Hh]igher (?:[Ee]d(?:ucation)?)?\s?(?:[Ll]earning)?',
    r'[Ee]ducation',
    r'[Cc]hild',
    r'[Ss]chool',
    r'[Cc]harter',
    r'[Mm]ath',
    r'[Rr]ead',
    r'[Tt]each',
    r'[Pp]arent',
    r'[Kk]id',
    r'[Ss]tudent',
    r'[Cc]ollege',
    r'[Uu]niversit',
    r'[Tt]uition',
])

# Summary keywords.
summary_pat = r'[Ee]arly\s?-?[Cc]hildhood|[Ee]ducational\s?[Rr]equirements|[Pp]ost-?\s?[Ss]econdary\s?[Ee]ducation|[Cc]hild\s?[Ee]mploy'

# Ceremonial resolutions (congratulations, memorials, ...) that should not count as education bills.
ceremonial_pat = r'[Cc]ongrat[Ss]?(?:ulations)?|[Mm]ourn|[Mm]emorial|[Cc]ommending|[Hh]onoring'

# na=False: a missing title never counts as a match.
title_hit = all_bills['title'].str.contains(title_pat, regex=True, na=False)
# na=False here too. The summary filter previously used na=True, which classified every
# bill with a missing summary as an education bill regardless of its title.
summary_hit = all_bills['bill_sum'].str.contains(summary_pat, regex=True, na=False)

ed_bills = all_bills.loc[title_hit | summary_hit]

# na=True is intentional here: a bill without a title is treated as ceremonial and dropped.
# The original OR'd this same title test with an identical copy of itself; one test is enough.
# NOTE(review): the duplicate may have been meant to test bill_sum - confirm.
is_ceremonial = ed_bills['title'].str.contains(ceremonial_pat, regex=True, na=True)

# Fresh 0..n-1 index so later cells can address rows by position.
ed_bills = ed_bills.loc[~is_ceremonial].reset_index(drop=True)

## Final Bills exporting (creates ed_bills)

In [None]:

# splitting and getting sponsors
# Turn each bill's comma-separated sponsor string into '|'-joined legislator keys of the
# form <state code><chamber code><3-digit district>(<last name>).

# Chamber title as written in the sponsor string.
_CHAMBER_PAT = re.compile(r'[Ss]en\.|[Rr]ep\.|[Ss]peaker|[Dd]el\.')
# Party-state-district token such as "R-NC-12".
_DISTRICT_PAT = re.compile(r'(?!\()[A-Z]{1}-[A-Z]{2}-\d{1,3}(?<!\))')


def _sponsor_to_pk(sponsor, state_codes):
    """Build the key for one sponsor entry, or return None when it cannot be parsed.

    sponsor      -- one comma-separated piece of the 'sponsors' cell.
    state_codes  -- mapping from two-letter state abbreviation to the state code used in keys.

    A diagnostic line is printed for every entry that is skipped.
    """
    sponsor = str(sponsor)
    # Last whitespace-separated word before the opening parenthesis.
    last_name = sponsor.split('(')[0].strip().split(' ')[-1]

    chamber_match = _CHAMBER_PAT.search(sponsor)
    if chamber_match is None:
        print(f'here is what has no chamber: {sponsor}')
        return None
    # Sen. -> '1'; Rep., Del. and Speaker -> '0'.
    chamber_code = '1' if 'sen' in chamber_match.group(0).lower() else '0'

    district_match = _DISTRICT_PAT.search(sponsor)
    if district_match is None:
        print('regex failed')
        print(sponsor)
        return None
    _party, state_abbr, district_code = district_match.group(0).split('-')

    state_code = state_codes.get(state_abbr)
    if state_code is None:
        # Previously this produced a key beginning with the text "None".
        print(f'unknown state abbreviation: {sponsor}')
        return None

    # Districts are zero-padded to three digits.
    return f'{state_code}{chamber_code}{district_code.zfill(3)}({last_name})'


pk_column = []
for sponsors, title in zip(ed_bills['sponsors'], ed_bills['title']):
    if not isinstance(sponsors, str):
        if not pd.isna(sponsors):
            # Non-text, non-missing value: report it and leave the bill without keys
            # (the old code printed this and then carried on with leftover data).
            print('************')
            print(f'sponsors: {sponsors}')
            print(f'title: {title}')
            print('************')
        pk_column.append(np.nan)
        continue

    pks = []
    for sponsor in sponsors.split(','):
        pk = _sponsor_to_pk(sponsor, state_coding)
        if pk is not None:
            pks.append(pk)
    pk_column.append('|'.join(pks))

# Assigned in one step as an object column: writing strings cell-by-cell into a float
# NaN column is deprecated in recent pandas, and this no longer depends on the index
# being exactly 0..n-1.
ed_bills['pk_sponsors'] = pd.Series(pk_column, index=ed_bills.index, dtype='object')


## Tagging

In [None]:
# Print every education bill whose title or summary mentions the keyword below.
pat = r'[Dd]iversity'


for a,b in zip(ed_bills.title, ed_bills.bill_sum):
    if re.search(pat, str(a)):
        print('###### title #######')
        print(a)
        # '\n' gives a blank separator line; the original '/n' printed the two characters "/n".
        print('\n')
    if re.search(pat, str(b)):
        print('##### summary ########')
        print(b)
        print('\n')

In [None]:
#tagging bills
# Flag higher-education (HE) and early-childhood (EC) bills from title and summary keywords.

# All patterns are raw strings: the originals contained '\s' inside normal string literals,
# which is an invalid escape sequence. '[Hh]ighr' replaces the '[Hg]ighr' typo.
he_title_pat = '|'.join([
    r'[Ii]ccb',
    r'[Ee]duc-.?',
    r'[Hh]ighr [Ee]d',
    r'[Hh]igher (?:[Ee]d(?:ucation)?)?\s?(?:[Ll]earning)?',
    r'[Cc]ollege',
    r'[Uu]niversit',
    r'[Tt]uition',
])
he_summary_pat = r'[Pp]ost-?\s?[Ss]econdary\s?[Ee]ducation'
ec_title_pat = r'[Ee]arly [Cc]h(?:i)?ld'
ec_summary_pat = r'[Ee]arly\s?-?[Cc]hildhood'

# str() conversion mirrors the original per-row str(...) calls (missing values become 'nan',
# which matches none of the patterns).
titles_as_text = ed_bills['title'].astype(str)
summaries_as_text = ed_bills['bill_sum'].astype(str)

# Column-wise matching replaces the manual row counter, which only worked while the
# index was exactly 0..n-1.
ed_bills['HE_tag'] = (
    titles_as_text.str.contains(he_title_pat, regex=True)
    | summaries_as_text.str.contains(he_summary_pat, regex=True)
)
ed_bills['EC_tag'] = (
    titles_as_text.str.contains(ec_title_pat, regex=True)
    | summaries_as_text.str.contains(ec_summary_pat, regex=True)
)
# NOTE(review): ED_tag is created but nothing in this cell ever sets it to True.
ed_bills['ED_tag'] = False


ec_bills = ed_bills[ed_bills['EC_tag']]
he_bills = ed_bills[ed_bills['HE_tag']]

# Bills whose label occurs more than once (labels repeat across states).
# Two bare expressions that followed here were removed: only a cell's last expression is
# displayed, so they had no effect.
test_dupes = ed_bills[ed_bills['bill_label'].duplicated(keep=False)]

# Join key of the form "<bill_label>-[<state>]".
ed_bills['lookup'] = ed_bills['bill_label'] + "-[" + ed_bills['state'] + "]"

ed_bills.columns

# leg_info file
Takes the education bills and compiles each bill's sponsors (in primary-key form) into one row per legislator.
The output feeds into act_and_infl_scores.ipynb.

In [None]:
#Leg_bills_info creation
##creates list of bills and counts of bills per legislator

# One (primary_key, bill_label) pair per sponsor per bill. Bills without sponsor keys are
# skipped, as are empty pieces left by the '|' split.
# Pairing the two columns with zip avoids looking rows up by position, and a single
# DataFrame is built at the end instead of one per bill.
sponsor_bill_pairs = [
    (primary_key, bill_label)
    for pk_string, bill_label in zip(ed_bills['pk_sponsors'], ed_bills['bill_label'])
    if not pd.isna(pk_string)
    for primary_key in str(pk_string).split('|')
    if len(primary_key) > 0
]

# Explicit column names keep the frame well-formed even when no bill has sponsors
# (the original pd.concat raised on an empty list).
leg_bill_lookup = pd.DataFrame(sponsor_bill_pairs, columns=['primary_key', 'bill_labels'])


#bills per primary key
# For each legislator: all bill labels joined with '|' and the number of bills.
leg_bill_lookup_grouped = leg_bill_lookup.groupby(['primary_key']).agg({
    'bill_labels': ['|'.join, 'count']
}).reset_index()
# Flatten the two-level column header produced by the multi-function aggregation.
leg_bill_lookup_grouped.columns = ['primary_key', 'bill_labels', 'bill_counts']


In [None]:
#export
# Write the per-legislator bill summary to a dated workbook in the build-files folder.
leg_export_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\Bill Data'

# strftime yields the same YYYY_MM_DD text as str(date.today()).replace('-', '_').
# The original f-string reused single quotes inside single quotes, which is a syntax
# error on Python versions before 3.12.
leg_filename = f"leg_bills_info_{date.today().strftime('%Y_%m_%d')}.xlsx"

leg_bill_lookup_grouped.to_excel(
    os.path.join(leg_export_dir, leg_filename),
    # Sheet is named after the file, without the extension.
    sheet_name=leg_filename.replace(".xlsx", ""),
    index=False,
)




# Tableau Ed Bills
Joins the education bills with the bills-and-legislators file so that each bill carries its legislators and their associated info.
Used in making:
- Legislative Bill Tracking

In [None]:
#pull in bills and legislators
# Folder holding the bills-and-legislators build files.
key_path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\bills and legislators\2025"

# get_recent_file is a project helper; presumably it returns the newest file in key_path
# matching the pattern - confirm against its definition.
compiled_file = get_recent_file("bills_and_legislators*.xlsx", key_path)
compiled_df = pd.read_excel(compiled_file)

# Distinct values of the 'bills' column (order is not guaranteed).
leg_bills = list(set(compiled_df['bills'].to_list()))

In [54]:
#moving columns around
# Bring the key columns to the front of ed_bills. Each insert goes to position 0,
# so 'lookup' (moved last) ends up first and 'bill_label' second.
for _front_col in ('bill_label', 'lookup'):
    ed_bills.insert(0, _front_col, ed_bills.pop(_front_col))

# Same for the join key on the bills-and-legislators frame.
compiled_df.insert(0, 'bill_lookup', compiled_df.pop('bill_lookup'))

In [None]:
#troubleshooting
# Show the column names of both frames, '|'-separated, to compare them before the merge.
for _frame in (ed_bills, compiled_df):
    print("|".join(str(_col) for _col in _frame.columns))

In [None]:
# ed_bills is the finalized list of education bills; compiled_df comes from the
# bills-and-legislators file and holds one row per bill/legislator pairing.
# Left join: every education bill is kept, and a bill is repeated once for each
# matching row in compiled_df.
full_df = ed_bills.merge(
    compiled_df,
    how='left',
    left_on='lookup',
    right_on='bill_lookup',
)

In [None]:
# Export the joined bills/legislators table as a dated workbook and a matching CSV.
full_join_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\Bill Data'

# Same YYYY_MM_DD stamp as before, computed once so both files always share it.
# The original f-strings nested single quotes inside single quotes, which only parses
# on Python 3.12 and later.
full_join_stem = f"full_join{date.today().strftime('%Y_%m_%d')}"
file_name_full = f'{full_join_stem}.xlsx'
file_name_full_csv = f'{full_join_stem}.csv'

full_df.to_excel(
    os.path.join(full_join_dir, file_name_full),
    # Sheet is named after the file, without the extension.
    sheet_name=file_name_full.replace(".xlsx", ""),
    index=False,
)
full_df.to_csv(os.path.join(full_join_dir, file_name_full_csv), index=False)

In [None]:
# Quick look at the first rows of ed_bills.
ed_bills.head()

# END


## Tagging for Program and Content Teams



In [None]:
# Print every education-bill title that mentions day care and report how many were found.
day_care_pat = re.compile(r'[Dd]ay\s?[Cc]are')

count = 0
for title in ed_bills['title']:
    title_text = str(title)
    if day_care_pat.search(title_text) is None:
        continue
    print(title_text)
    print('###################')
    count += 1

print(count)

3


In [None]:

# Build a '|'-joined helper key per bill: "<state>-<chamber>-<district>" for each sponsor
# (with the last name appended for ND and an 'S' district prefix for the CT Senate).

# "<name> (<party>-<state>-<district>)"; groups capture name, state and district.
# NOTE(review): only parties R and D are recognised, so sponsors of any other party are
# skipped silently - confirm that is intended.
_SPONSOR_ID_PAT = re.compile(r'(\w+)\s\([RD]-([A-Z]{2})-(\d+)\)')
_CHAMBER_TITLE_PAT = re.compile(r'[Ss]en\.|[Rr]ep\.|[Dd]el\.')


def _sponsor_helper_key(sponsor):
    """Return the helper key for one sponsor entry, or None when it cannot be parsed."""
    sponsor = str(sponsor)

    id_match = _SPONSOR_ID_PAT.search(sponsor)
    if id_match is None:
        return None
    name, state, district = id_match.groups()
    # Leading zeros are dropped from the district number.
    district = district.lstrip('0')

    title_match = _CHAMBER_TITLE_PAT.search(sponsor)
    if title_match is not None:
        # Compared case-insensitively: the pattern accepts "sen." as well as "Sen.",
        # but the original only recognised the capitalised spellings.
        chamber = 'Senate' if title_match.group(0).lower() == 'sen.' else 'House'
    elif re.search(r'[Ss]peaker', sponsor):
        chamber = 'House'
    else:
        print('not speaker but something else')
        print(sponsor)
        # Skip the entry. The original carried on and reused whatever chamber the
        # previous sponsor had (or failed if there was none).
        return None

    if 'ND' in state:
        return state + '-' + chamber + '-' + district + '-' + name
    if 'CT' in state and chamber == 'Senate':
        return state + '-' + chamber + '-S' + district
    return state + '-' + chamber + '-' + district


helper_column = []
for sponsors in ed_bills['sponsors']:
    # Bills with no sponsors keep a missing value.
    if pd.isna(sponsors):
        helper_column.append(np.nan)
        continue

    s_list = []
    for sponsor in str(sponsors).split(','):
        helper = _sponsor_helper_key(sponsor)
        if helper is not None:
            s_list.append(helper)
    helper_column.append('|'.join(s_list))

# Assigned in one step as an object column: writing strings cell-by-cell into a float
# NaN column is deprecated in recent pandas, and this does not depend on the index.
ed_bills['sponsors_helper'] = pd.Series(helper_column, index=ed_bills.index, dtype='object')


    

In [97]:
# Put the helper column first, then renumber the rows 0..n-1 in place.
ed_bills.insert(0, 'sponsors_helper', ed_bills.pop('sponsors_helper'))
ed_bills.reset_index(drop=True, inplace=True)



In [None]:
#This is used to check if the values match the compiled_info csv
# Export the distinct helper keys found across all education bills.

# Missing cells are dropped first (str() used to turn them into the literal key 'nan'),
# empty pieces are discarded, and the result is sorted so the export is identical
# from run to run.
helper_values = sorted({
    helper
    for helper_string in ed_bills['sponsors_helper'].dropna()
    for helper in str(helper_string).split('|')
    if helper
})

df = pd.DataFrame({"helper":helper_values})
df.to_excel(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\quorum\bill data downloads\exports\unique_helpers.xlsx', index=False)

In [None]:
#taking bill data and providing cumulative totals
# Long table with one row per (sponsor helper key, bill label).

# Bills without a helper value are skipped, as are empty pieces of the '|' split;
# previously these became rows for a sponsor called 'nan' or ''.
# The two columns are paired with zip, so rows are not looked up by position, and the
# table is built once rather than one small DataFrame (plus a full printout) per bill.
sponsor_bill_records = [
    (sponsor, str(bill_number))
    for helper_string, bill_number in zip(ed_bills['sponsors_helper'], ed_bills['bill_label'])
    if not pd.isna(helper_string)
    for sponsor in str(helper_string).split('|')
    if sponsor
]

bills_and_sponsors = pd.DataFrame(sponsor_bill_records, columns=['sponsors', 'bill_labels'])

In [None]:
# Group the sponsor/bill rows by sponsor helper key.
# Only the GroupBy object is created here; no aggregation is applied yet
# (the trailing comment is an unfinished '|'-join of bill labels).
grouped_df = bills_and_sponsors.groupby('sponsors')#['bill_labels']#.apply('|'.join).reset_index()


# agg({
#     'bills': lambda x: 
#         f"{sc} ({ac})" if not pd.isna(ac) else f"{sc}"
#         for sc, ac in zip(thi_states_df.loc[x.index, 'event name'], thi_states_df.loc[x.index, 'role'])),

# }).reset_index()
# # grouped_df.reset_index()
# grouped_df.rename(columns={'event name': 'events'}, inplace=True)

: 

In [None]:

# Rows of non_ed_bills whose summary is present and mentions "higher ed(ucation)" / "higher learning".
# NOTE(review): non_ed_bills is not defined in any cell visible in this notebook, so this
# line fails on a fresh kernel unless it is created elsewhere - confirm where it comes from.
bills_clean_up = non_ed_bills.loc[non_ed_bills['bill_sum'].notna() & (non_ed_bills['bill_sum'].str.contains(r'[Hh]igher ([Ee]d(ucation)?)?\s?([Ll]earning)?'))]

# for bill in non_ed_bills['title']:
#     print('##############')
#     print(bill)


In [None]:

# Print, for every non-missing bill title, the lines that contain an education keyword.
# Each alternative is wrapped in '.*' so a hit returns the whole line, not just the keyword.
_keyword_line_pat = re.compile(
    r'.*[Ee]ducation.*|.*[Cc]hild.*|.*[Ss]chool.*|.*[Cc]harter.*|.*[Mm]ath.*|.*[Rr]ead.*|.*[Tt]each.*|.*[Pp]arent.*|.*[Kk]id.*|.*[Ss]tudent.*|.*[Cc]ollege.*|.*[Uu]niversit.*|.*[Tt]uition.*'
)

for text in all_bills['title'].dropna():
    # A second "words of context" search used to run here as well, but its result was
    # never used, so it has been removed.
    matches_v2 = _keyword_line_pat.findall(str(text))

    # One separator per title, followed by its matches (if any).
    print('####################')
    for match in matches_v2:
        print("****")
        print(str(match))
        print('\n')






In [None]:
# Early-childhood keywords, each allowing an optional single whitespace between the words.
# Raw strings: '\s' inside a normal string literal is an invalid escape sequence.
ec_subs = [r'child\s{0,1}care', r'early\s{0,1}childhood', r'preschool']
# One alternation pattern matching any of the keywords.
ec_pat = "|".join(ec_subs)
print(ec_pat)

%%

In [None]:
print(all_bills.columns)
# Show every bill summary containing an early-childhood keyword, with the keywords found.
for summary in all_bills['bill_sum']:
    # Matching is done on the lower-cased text; the summary is printed as stored.
    matches = re.findall(ec_pat, str(summary).lower())
    if not matches:
        continue
    print('############################')
    print('***********')
    print(*matches)
    print('***********')
    print(summary)
    print('\n')
# %%

In [None]:
# Bills whose `subjects` text matches any early-childhood keyword in ec_pat (case-insensitive).
# NOTE(review): none of the column names assigned to all_bills in this notebook is
# `subjects`, so this lookup is expected to fail as written - confirm the intended column.
ec_ed_bills = all_bills[all_bills.subjects.str.contains(ec_pat, regex = True, case=False)]

In [None]:
# Renumber the filtered rows 0..n-1 in place, discarding the old index.
ec_ed_bills.reset_index(inplace=True, drop=True)

## Higher Ed

In [None]:
# Higher-education keywords and phrases; joined into a single search pattern (he_pat)
# in the next cell and matched against bill summaries.
# NOTE(review): "resistance" looks out of place next to "persistence" - confirm it belongs.
he_keywords = [
    "post-secondary transition",
    "equity gaps",
    "college-going rates",
    "workforce readiness",
    "certificate programs",
    "wraparound services",
    "stackable credentials",
    "student persistence",
    "retention strategies",
    "lifelong learning",
    "postsecondary barriers",
    "alternative pathways",
    "higher education institutions",
    "higher education",
    "legislative support for education",
    "student success metrics",
    "high quality credential",
    "vocational training",
    "attainment",
    "persistence",
    "resistance",
    "graduation"
]

In [None]:
# Alternation of all higher-education keywords, matched as plain substrings.
# A word-boundary variant,
#     r'\b(' + '|'.join(map(re.escape, he_keywords)) + r')\b'
# used to be assigned first and was overwritten immediately by this line, so it never
# took effect. The dead assignment is removed and the pattern actually in use is kept.
# NOTE(review): switch to the word-boundary form if whole-word matching was intended.
he_pat = "|".join(he_keywords)

In [None]:
# Show the combined higher-education pattern for a visual check.
print(he_pat)

In [None]:
# Replace missing summaries with the text 'nan' so the string search below sees text in every row.
# Assigning the result back is required: calling fillna(..., inplace=True) on a selected
# column is chained assignment, which pandas warns about and which does not update
# all_bills under copy-on-write.
all_bills['bill_sum'] = all_bills['bill_sum'].fillna('nan')

In [None]:
# Bills whose summary contains any higher-education keyword (case-insensitive),
# renumbered 0..n-1.
_he_summary_hit = all_bills['bill_sum'].str.contains(he_pat, regex=True, case=False)
he_ed_bills = all_bills[_he_summary_hit].reset_index(drop=True)

In [None]:
# Export the higher-ed bills, then list the ones that matched only through "graduation".
# The file is written by full path instead of changing the working directory, which
# would affect every later relative path in the session.
he_export_dir = r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\bill_data\Higher Ed'
he_ed_bills.to_csv(os.path.join(he_export_dir, 'higher_ed_bills.csv'), index=False)
# %%
# "graduation" alone is a weak signal, so print each bill that mentions it without also
# mentioning higher ed / post-secondary, for manual review.
for state, bill_label, bill in zip(he_ed_bills['state'], he_ed_bills['bill_label'], he_ed_bills['bill_sum']):
    summary_lower = str(bill).lower()
    matches = re.findall(r'graduation', summary_lower)
    if not matches:
        continue
    # Clearly higher-ed context: nothing to review.
    if re.search(r'higher ed|post-{0,1}secondary', summary_lower):
        continue

    # Each bill is printed once. The original printed it twice, the first time with an
    # always-empty match list, and read a 'bill' column that all_bills does not have
    # ('bill_label' is the column assigned when the files are loaded).
    print('############################')
    print(state)
    print(bill_label)
    print('############################')
    print('***********')
    print(*matches)
    print('***********')
    print(bill)
    print('\n')
# %%