In [116]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
from lxml import etree, html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file
from unidecode import unidecode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

## Driver Set up

In [66]:
house_url = "https://alison.legislature.state.al.us/committees-house-standing-current"
senate_url = "https://alison.legislature.state.al.us/committees-senate-standing-current-year"

In [67]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"
chrome_options = Options()

prefs = {"download.default_directory" : r"c:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt"};

chrome_options.add_experimental_option("prefs",prefs);
# chrome_options.add_argument('--headless')
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
# Set up WebDriver service
service = Service(webdriver_path)

In [68]:
#call on driver and get data
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)


# Definitions

In [69]:
def clean_string(s):
    return s.strip().lower()

In [70]:
#split text, clean, and append
def split_and_clean(element):
    text_split = element.text.split('\n')
    com_name = text_split[0]
    com_name = clean_string(com_name)
    # print(com_name)

    return com_name


def look_for_coms(app_list, chamber):
    elements = driver.find_elements(By.XPATH, "//tr[@role='button']")
    # print(f'length of elements results: {len(elements)}')
    for ele in elements:
        # print(ele)
        com_name = split_and_clean(ele)
        if chamber == "house":
            if com_name in h_com_names:
                app_list.append(ele)
                # print(f'appended {com_name}')
        if chamber == "senate":
            if com_name in s_com_names:
                app_list.append(ele)
            # print(f'appended {com_name}')


def look_for_coms_by_keywords(app_list):
    #find committee rows
    elements = driver.find_elements(By.XPATH, "//tr[@role='button']")
    # print(f'length of elements results: {len(elements)}')
    keywords = ['[Ee]ducation', '[Cc]hildren', '[Yy]oung']
    pat = re.compile("|".join(keywords))
    for ele in elements:
        # print(ele)
    
        
        com_name = split_and_clean(ele)
        if re.search(pat,com_name):
            app_list.append(ele)


    

In [71]:
# testing chunk
# driver.get(house_url)
# time.sleep(2)
# elements = driver.find_elements(By.XPATH, "//button[@aria-label='Next Page']")

# processed = []
# # print(elements)
# for ele in elements:
#     print(ele)
#     ele.click()

# Data Setup

In [72]:
#senate com setup
s_com_names = """
    Committe
Education Policy
Children and Youth Health
Finance and Taxation General Fund
Finance and Taxation Education
Education Policy
Children and Youth Health
    """
s_com_names = list(map(clean_string, s_com_names.split('\n')))
s_com_names = [x for x in s_com_names if len(x) > 0]
print(s_com_names)


['committe', 'education policy', 'children and youth health', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health']


In [73]:
#house com setup
h_com_names = """
    Children and Senior Advocacy
    Education Policy
    Ways and Means Education
    Ways and Means General Fund 
    """
h_com_names = list(map(clean_string, h_com_names.split('\n')))
h_com_names = [x for x in h_com_names if len(x) > 0]
print(h_com_names)


['children and senior advocacy', 'education policy', 'ways and means education', 'ways and means general fund']


# Main

In [83]:
#house comm info
driver.get(house_url)
time.sleep(2)

processed_h = {}
while len(h_com_names) > len(processed_h):
#first look
    page_processed = []
    look_for_coms(page_processed, "house")
    for ele in page_processed:
        time.sleep(1)
        ele.click()
    
        com_name = ele.text.split('\n')[0].strip().lower()
        print(type(com_name))
        # state_leg_ref
        wait = WebDriverWait(driver, 10)

        # download_button = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        # driver.execute_script("arguments[0].click();", download_button)
        download_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[starts-with(@id, 'headlessui-dialog-panel')]/div[2]/div/div[1]/div/button[1]")))
        # download = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        # download.click()
        download_button.click()
        time.sleep(1)
        close = driver.find_element(By.XPATH, "//button[starts-with(@aria-label, 'Close')]")
        close.click()
        processed_h[com_name]= ele

        dl_path = r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt'
        
        try:
            current = get_recent_file('*Com*', dl_path)
            new = f'al_house_{com_name}.csv'
            os.rename(current, new)
            print(f"File '{current}' renamed to '{new}' successfully.")
        except FileNotFoundError:
            print(f"Error: File '{current}' not found.")
        except FileExistsError:
            print(f"Error: File '{new}' already exists.")
        except Exception as e:
            print(f"An error occurred: {e}")



    next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
    next_page.click()
    time.sleep(2)
    
print(processed_h)

<class 'str'>
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_house_children and senior advocacy.csv' successfully.
<class 'str'>
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_house_education policy.csv' successfully.
<class 'str'>
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_house_ways and means education.csv' successfully.
<class 'str'>
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_house_ways and means general fund.csv' successfully.
{'children and senior advocacy': <selenium.webdriver.remote.webelement.WebElement (session="acfe72c275fac7ee705f82af0515f2df", element="f.39988506FBF10FCB85D658EB1F2FBC40.d.CA2F36D1E1FC3F3B2F5383906DB931AA.e.593")>, 'education policy': <selenium.webdriver.remote.webelement.WebElem

In [82]:
#senate comm info
driver.get(senate_url)
time.sleep(2)

from selenium.common.exceptions import WebDriverException

processed_s = {}
i = 0
while i < 5:
#first look
    i += 1
    print(i)
    # print(len())
    page_processed = []
    look_for_coms_by_keywords(page_processed)
    print(len(page_processed))
    # next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
    
    # next_page.click()
    for ele in page_processed:
        time.sleep(1)
        ele.click()
        # print(ele.text)
        com_name = ele.text.split('\n')[0]

        # state_leg_ref
        wait = WebDriverWait(driver, 10)
        
        # download_button = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        # driver.execute_script("arguments[0].click();", download_button)
        download_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[starts-with(@id, 'headlessui-dialog-panel')]/div[2]/div/div[1]/div/button[1]")))
        # download = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        # download.click()
        download_button.click()
        time.sleep(1)
        close = driver.find_element(By.XPATH, "//button[starts-with(@aria-label, 'Close')]")
        close.click()
        processed_s[com_name] = ele
        try:
            current = get_recent_file('*Com*', dl_path)
            new = f'al_senate_{com_name}.csv'
            os.rename(current, new)
            print(f"File '{current}' renamed to '{new}' successfully.")
        except FileNotFoundError:
            print(f"Error: File '{current}' not found.")
        except FileExistsError:
            print(f"Error: File '{new}' already exists.")
        except Exception as e:
            print(f"An error occurred: {e}")

    

    
    try:
        next_page = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//button[@aria-label='Next Page']"))
        )

        # Check if the element is clickable
        if next_page.is_enabled() and next_page.is_displayed():
            next_page.click()
            print("Next page button clicked.")
        else:
            print("Next page button is not clickable.")
        
    except Exception as e:
        i = 5


        # next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
        # next_page.click()

        
        
        # ele.click()
        # time.sleep(2)
        # download_button = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV' and starts-with(@id='headlessui')]")
        # driver.execute_script("arguments[0].click();", download_button)
    
    # download = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
    # download.click()
    time.sleep(1)
    # close = driver.find_element(By.XPATH, "//button[starts-with(@aria-label, 'Close')]")
    # close.click()
    # print("clicked")
    
time.sleep(2)

# /html/body/div[4]/div[3]/div/div/div[2]/div[2]/div/div[1]/div/button[1]
# //*[@id="headlessui-dialog-panel-25"]/div[2]/div/div[1]/div/button[1]

for k,v in processed_s.items():
    print(k)


1
3
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_senate_Children and Youth Health.csv' successfully.
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_senate_Education Policy.csv' successfully.
File 'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt\Committee_Members (25).csv' renamed to 'al_senate_Finance and Taxation Education.csv' successfully.
Next page button clicked.
2
0
Next page button is not clickable.
3
0
Next page button is not clickable.
4
0
Next page button is not clickable.
5
0
Next page button is not clickable.
Children and Youth Health
Education Policy
Finance and Taxation Education


In [112]:
leg_lookup_file = get_recent_file("*.csv", r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025")
leg_ref = pd.read_csv(leg_lookup_file)
leg_ref = leg_ref[leg_ref['state abbreviation'] == "AL"].reset_index(drop = True)
# print(leg_ref.columns)

leg_ref['last name'] = leg_ref['last name'].str.lower().str.strip()
leg_ref.head()

Unnamed: 0,full_pk,primary_key,district_code,state abbreviation,chamber,title,first name,last name,party,district,date assumed office,name,tenure,leader,state_code,chamber_code
0,10006300.0,100063,63.0,AL,House,Alabama Representative,Cynthia,almond,Republican,63.0,2021,AL Rep. Cynthia Almond (R-AL-063),4,,10.0,0.0
1,10006600.0,100066,66.0,AL,House,Alabama Representative,Alan,baker,Republican,66.0,2006,AL Rep. Alan Baker (R-AL-066),19,,10.0,0.0
2,10004900.0,100049,49.0,AL,House,Alabama Representative,Russell,bedsole,Republican,49.0,2020,AL Rep. Russell Bedsole (R-AL-049),5,,10.0,0.0
3,10008000.0,100080,80.0,AL,House,Alabama Representative,Chris,blackshear,Republican,80.0,2016,AL Rep. Chris Blackshear (R-AL-080),9,,10.0,0.0
4,10006100.0,100061,61.0,AL,House,Alabama Representative,Ronald,bolton,Republican,61.0,2022,"AL Rep. Ronald ""Ron"" Bolton (R-AL-061)",3,,10.0,0.0


In [139]:
os.chdir(r'c:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Desktop\downloads_alt')
senate_files = glob.glob('al_senate*.csv')
house_files = glob.glob('al_house*.csv')
files = senate_files+house_files
print(files)

for file in files:
    print(file)
    df = pd.read_csv(file)
    print(df.columns)





    # Regex pattern to split at the last name
    pattern = r"(.*\s(?:'[^']+?'\s)*)\s([A-Z][a-z]+(?:-[A-Z][a-z]+)?|\bO'\w+|\bVan\s\w+|\bDe\s\w+)$"

    # Function to split names
    def split_name(name):
        match = re.match(pattern, name)
        if match:
            first_part, last_name = match.groups()
            return first_part.strip(), last_name.strip()
        return name, None  # If no match, return original name

    # Apply the function to split names
    df[['f_name', 'l_name']] = df['member'].apply(lambda x: pd.Series(split_name(x)))

    # Drop the original column if needed
    df.drop(columns=['member'], inplace=True)
    print('##############')
    # for i,j in enumerate(df['Last Name']):
    # print(df.head(2))
    

    if 'senate' in file:
        chamber = 'Senate'
    else:        
        chamber = 'House'
    for row in df.itertuples():
        # print(row.f_name)
        if row.l_name is None:
            print(row)
            print('^^^^^^^^'+'\n'+'this one is nonetype'+'\n'+'^^^^^^^^')
            break
        # l_name = row.
        results = leg_ref[leg_ref['last name'] == row.l_name.lower().strip()].reset_index(drop=True)

        if len(results) == 1:
            continue
        else:
            display_markdown(f' ## {row.f_name} {row.l_name}', raw = True)
            print(len(results))
            print(results)
            break


        # print(type(row))
        # print(row)
        
    # Print the result
    # print(df.to_string())

    # mem_list = df['member'].to_list()
    
    # # Regex pattern to split at the last name
    # pattern = r"(?:\b[A-Z][a-z]+(?:\s[A-Z]\.)?\s)([A-Z][a-z]+(?:-[A-Z][a-z]+)?|\bO'\w+|\bVan\s\w+|\bDe\s\w+)"

    # # Split names into (first part, last name)
    # split_names = [re.split(pattern, name, maxsplit=1) for name in names]

    # # Clean up results (removing None values)
    # split_names = [[part.strip() for part in name_parts if part] for name_parts in split_names]



['al_senate_Children and Youth Health.csv', 'al_senate_Education Policy.csv', 'al_senate_Finance and Taxation Education.csv', 'al_house_children and senior advocacy.csv', 'al_house_education policy.csv', 'al_house_ways and means education.csv', 'al_house_ways and means general fund.csv']
al_senate_Children and Youth Health.csv
Index(['member', 'position'], dtype='object')
##############
Pandas(Index=0, position='Chairperson', f_name='Larry Stutts', l_name=None)
^^^^^^^^
this one is nonetype
^^^^^^^^
al_senate_Education Policy.csv
Index(['member', 'position'], dtype='object')
##############
Pandas(Index=0, position='Chairperson', f_name='Donnie Chesteen', l_name=None)
^^^^^^^^
this one is nonetype
^^^^^^^^
al_senate_Finance and Taxation Education.csv
Index(['member', 'position'], dtype='object')
##############
Pandas(Index=0, position='Chairperson', f_name='Arthur Orr', l_name=None)
^^^^^^^^
this one is nonetype
^^^^^^^^
al_house_children and senior advocacy.csv
Index(['member', 'positi

In [None]:
processed = processed_s + processed_h

for p in processed:
    

In [52]:
print(processed)
print(s_com_names)

['children and youth health', 'education policy', 'finance and taxation education', 'finance and taxation general fund']
['committee', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health']


In [None]:




html_from_page = driver.page_source
soup = BeautifulSoup(html_from_page, 'html.parser')
links = soup.find_all("a", href = True)
print(links)



In [None]:

page_url = r'https://www.ncleg.gov/Committees#HouseStanding'
driver.get(page_url)

html_from_page = driver.page_source
soup = BeautifulSoup(html_from_page, 'html.parser')

links = soup.find_all("a", {"class": "list-group-item list-group-item-action filteredGroup searchable"})


In [None]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm

from bs4 import BeautifulSoup

In [None]:
response = requests.get(url, verify = False).content
df_list = pd.read_html(response)
print(type(df_list[0]))
df = df_list[0]

%% For AL


<br>
This will require committee names <br>
to match what is on the website.<br>
***double check that this is the case****<br>


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Path to your WebDriver executable (adjust if necessary)

In [None]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"

Set up Chrome options

In [None]:
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)

Set up WebDriver service

In [None]:
service = Service(webdriver_path)

Initialize WebDriver

In [None]:
driver = webdriver.Chrome(service=service, options=chrome_options)

%%% senate

In [None]:
dfs = {}
try:
    # Open the target URL
    driver.get('https://alison.legislature.state.al.us/committees-senate-standing-current-year')

    # Wait for the page to load and the tbody to be present
    wait = WebDriverWait(driver, 20)

    # Use a broader selector or additional waits to ensure the page is fully loaded
    tbody = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))

    # Debug: Print the HTML of the tbody to verify its presence
    # print(tbody.get_attribute('outerHTML'))

    # Find all rows within the tbody
    rows = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "tbody tr")))
    #list of committee names (replace with logic to pull from csv and filter by appropriate state)
    coms = ['Education Policy','Finance and Taxation Education','Finance and Taxation General Fund','Children and Youth Health','Healthcare']

In [None]:
    # go through each committee and check if relevant and save table to dict
    for row in rows:
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        ActionChains(driver).move_to_element(row).click().perform()
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        html_from_page = driver.page_source
        soup = BeautifulSoup(html_from_page, 'html.parser')
        modal_div = soup.find_all("div", class_= "ReactModalPortal")
        for m in modal_div:
            # print(m)
            if len(str(m)) > 36:
                div = m
                break

        # print(div)
        header = div.find("h1")
        header = str(header).split('">', 1)[-1].split('</h1', 1)[0].replace('Members','').strip()
        pass_through = False
        for h in coms:
            if str(header) in h:
                pass_through = True
            else:
                continue
        #close if not a relevant committee
        if pass_through != True:
            print(str(header) + ' is not a valid committee')
            close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            ActionChains(driver).move_to_element(close_button_pot).click().perform()
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            continue
        
        #fetch table
        df_list = pd.read_html(html_from_page)
        df = df_list[-1]
        
       
        #close popup
        close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
        ActionChains(driver).move_to_element(close_button_pot).click().perform()
        
        # save table
        dfs[header] = df

In [None]:
finally:
    # Close the WebDriver
    driver.quit()

%%% house

In [None]:
dfs = {}
try:
    # Open the target URL
    driver.get('https://alison.legislature.state.al.us/committees-house-standing-current')

    # Wait for the page to load and the tbody to be present
    wait = WebDriverWait(driver, 20)

    # Use a broader selector or additional waits to ensure the page is fully loaded
    tbody = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))

    # Debug: Print the HTML of the tbody to verify its presence
    # print(tbody.get_attribute('outerHTML'))
    
    coms = ["Education Policy","Ways and Means General Fund","Ways and Means Education","Health","Children and Senior Advocacy"]
    
    
    
    
    # Find all rows within the tbody
    rows = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "tbody tr")))
    #list of committee names (replace with logic to pull from csv and filter by appropriate state)

    # go through each committee and check if relevant and save table to dict
    for row in rows:
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        ActionChains(driver).move_to_element(row).click().perform()
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        html_from_page = driver.page_source
        soup = BeautifulSoup(html_from_page, 'html.parser')
        modal_div = soup.find_all("div", class_= "ReactModalPortal")
        for m in modal_div:
            # print(m)
            if len(str(m)) > 36:
                div = m
                break

        # print(div)
        header = div.find("h1")
        header = str(header).split('">', 1)[-1].split('</h1', 1)[0].replace('Members','').strip()
        pass_through = False
        for h in coms:
            if str(header) in h:
                pass_through = True
            else:
                continue
        #close if not a relevant committee
        if pass_through != True:
            print(str(header) + ' is not a valid committee')
            close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            ActionChains(driver).move_to_element(close_button_pot).click().perform()
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            continue
        
        #fetch table
        df_list = pd.read_html(html_from_page)
        df = df_list[-1]
        
       
        #close popup
        close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
        ActionChains(driver).move_to_element(close_button_pot).click().perform()
        
        # save table
        dfs[header] = df

In [None]:
finally:
    # Close the WebDriver
    driver.quit()
# %%
for k,v in dfs.items():
    print(k)
    print(v)

%%

In [None]:
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
r = requests.get(url, headers=headers)

%%