In [66]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
from lxml import etree, html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file
from unidecode import unidecode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

## Driver Set up

In [67]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"
chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
# Set up WebDriver service
service = Service(webdriver_path)

In [68]:
#call on driver and get data
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)


In [69]:
house_url = "https://alison.legislature.state.al.us/committees-house-standing-current"
senate_url = "https://alison.legislature.state.al.us/committees-senate-standing-current-year"

In [70]:
def clean_string(s):
    return s.strip().lower()

In [71]:
#senate com setup
s_com_names = """
    Committe
Education Policy
Children and Youth Health
Finance and Taxation General Fund
Finance and Taxation Education
Education Policy
Children and Youth Health
    """
s_com_names = list(map(clean_string, s_com_names.split('\n')))
s_com_names = [x for x in s_com_names if len(x) > 0]
print(s_com_names)


['committe', 'education policy', 'children and youth health', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health']


In [72]:
#house com setup
h_com_names = """
    Children and Senior Advocacy
    Education Policy
    Ways and Means Education
    Ways and Means General Fund 
    """
h_com_names = list(map(clean_string, h_com_names.split('\n')))
h_com_names = [x for x in h_com_names if len(x) > 0]
print(h_com_names)


['children and senior advocacy', 'education policy', 'ways and means education', 'ways and means general fund']


In [73]:
# testing chunk
# driver.get(house_url)
# time.sleep(2)
# elements = driver.find_elements(By.XPATH, "//button[@aria-label='Next Page']")

# processed = []
# # print(elements)
# for ele in elements:
#     print(ele)
#     ele.click()

In [None]:
#split text, clean, and append
def split_and_clean(element):
    text_split = element.text.split('\n')
    com_name = text_split[0]
    com_name = clean_string(com_name)
    # print(com_name)

    return com_name


def look_for_coms(app_list, chamber):
    elements = driver.find_elements(By.XPATH, "//tr[@role='button']")
    # print(f'length of elements results: {len(elements)}')
    for ele in elements:
        # print(ele)
        com_name = split_and_clean(ele)
        if chamber == "house":
            if com_name in h_com_names:
                app_list.append(ele)
                # print(f'appended {com_name}')
        if chamber == "senate":
            if com_name in s_com_names:
                app_list.append(ele)
            # print(f'appended {com_name}')


def look_for_coms_by_keywords(app_list):
    #find committee rows
    elements = driver.find_elements(By.XPATH, "//tr[@role='button']")
    # print(f'length of elements results: {len(elements)}')
    keywords = ['[Ee]ducation', '[Cc]hildren', '[Yy]oung']
    pat = re.compile("|".join(keywords))
    for ele in elements:
        # print(ele)
    
        
        com_name = split_and_clean(ele)
        if re.search(pat,com_name):
            app_list.append(ele)


    

In [75]:
#house comm info
driver.get(house_url)
time.sleep(2)


processed_h = []
while len(h_com_names) > len(processed_h):
#first look
    page_processed = []
    look_for_coms(page_processed, "house")
    for ele in page_processed:
        time.sleep(1)
        ele.click()

        download_button = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        driver.execute_script("arguments[0].click();", download_button)

        # download = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
        # download.click()
        time.sleep(1)
        close = driver.find_element(By.XPATH, "//button[starts-with(@aria-label, 'Close')]")
        close.click()
        processed_h.append(ele)



    next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
    next_page.click()
    time.sleep(2)
    
print(processed_h)

[<selenium.webdriver.remote.webelement.WebElement (session="43dd50451b5ce31d55dbfc14eacc71d8", element="f.4D6FFF93B8D9BE8937AAF8574364FEFA.d.71DA3AA6C74AAC9E9857AE442A02A265.e.25")>, <selenium.webdriver.remote.webelement.WebElement (session="43dd50451b5ce31d55dbfc14eacc71d8", element="f.4D6FFF93B8D9BE8937AAF8574364FEFA.d.71DA3AA6C74AAC9E9857AE442A02A265.e.30")>, <selenium.webdriver.remote.webelement.WebElement (session="43dd50451b5ce31d55dbfc14eacc71d8", element="f.4D6FFF93B8D9BE8937AAF8574364FEFA.d.71DA3AA6C74AAC9E9857AE442A02A265.e.77")>, <selenium.webdriver.remote.webelement.WebElement (session="43dd50451b5ce31d55dbfc14eacc71d8", element="f.4D6FFF93B8D9BE8937AAF8574364FEFA.d.71DA3AA6C74AAC9E9857AE442A02A265.e.78")>]


In [84]:
#senate comm info
driver.get(senate_url)
time.sleep(2)

from selenium.common.exceptions import WebDriverException

processed_s = []
i = 0
while i < 5:
#first look
    i += 1
    print(i)

    look_for_coms_by_keywords(processed_s)
    # next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
    
    # next_page.click()
    for ele in processed_s:
        ele.click()
        time.sleep(2)
        download_button = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV' and starts-with(@id='headlessui')]")
        driver.execute_script("arguments[0].click();", download_button)
    
    # download = driver.find_element(By.XPATH, "//button[@aria-label='Download CSV']")
    # download.click()
    time.sleep(1)
    # close = driver.find_element(By.XPATH, "//button[starts-with(@aria-label, 'Close')]")
    # close.click()
    # print("clicked")
next_page = driver.find_element(By.XPATH, "//button[@aria-label='Next Page']")
next_page.click()
time.sleep(2)

# /html/body/div[4]/div[3]/div/div/div[2]/div[2]/div/div[1]/div/button[1]
# //*[@id="headlessui-dialog-panel-25"]/div[2]/div/div[1]/div/button[1]

print(processed_s)


1


InvalidSelectorException: Message: invalid selector: Unable to locate an element with the xpath expression //button[@aria-label='Download CSV' and starts-with(@id='headlessui')] because of the following error:
SyntaxError: Failed to execute 'evaluate' on 'Document': The string '//button[@aria-label='Download CSV' and starts-with(@id='headlessui')]' is not a valid XPath expression.
  (Session info: chrome=134.0.6998.36); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalid-selector-exception
Stacktrace:
	GetHandleVerifier [0x00E6B5A3+24387]
	(No symbol) [0x00DF5904]
	(No symbol) [0x00CD0753]
	(No symbol) [0x00CD7137]
	(No symbol) [0x00CD93AA]
	(No symbol) [0x00CD9427]
	(No symbol) [0x00D18431]
	(No symbol) [0x00D18EFB]
	(No symbol) [0x00D619C2]
	(No symbol) [0x00D3D894]
	(No symbol) [0x00D5F138]
	(No symbol) [0x00D3D646]
	(No symbol) [0x00D0C59F]
	(No symbol) [0x00D0D8E4]
	GetHandleVerifier [0x0116D883+3179043]
	GetHandleVerifier [0x01186CF9+3282585]
	GetHandleVerifier [0x0118167C+3260444]
	GetHandleVerifier [0x00F04330+650448]
	(No symbol) [0x00DFED0D]
	(No symbol) [0x00DFBAF8]
	(No symbol) [0x00DFBC99]
	(No symbol) [0x00DEE530]
	BaseThreadInitThunk [0x75F05D49+25]
	RtlInitializeExceptionChain [0x770CCDEB+107]
	RtlGetAppContainerNamedObjectPath [0x770CCD71+561]


In [None]:
processed = processed_s + processed_h

for p in processed:
    

In [52]:
print(processed)
print(s_com_names)

['children and youth health', 'education policy', 'finance and taxation education', 'finance and taxation general fund']
['committee', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health', 'finance and taxation general fund', 'finance and taxation education', 'education policy', 'children and youth health']


In [None]:




html_from_page = driver.page_source
soup = BeautifulSoup(html_from_page, 'html.parser')
links = soup.find_all("a", href = True)
print(links)



In [None]:

page_url = r'https://www.ncleg.gov/Committees#HouseStanding'
driver.get(page_url)

html_from_page = driver.page_source
soup = BeautifulSoup(html_from_page, 'html.parser')

links = soup.find_all("a", {"class": "list-group-item list-group-item-action filteredGroup searchable"})


In [None]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm

from bs4 import BeautifulSoup

In [None]:
response = requests.get(url, verify = False).content
df_list = pd.read_html(response)
print(type(df_list[0]))
df = df_list[0]

%% For AL


<br>
This will require committee names <br>
to match what is on the website.<br>
***double check that this is the case****<br>


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Path to your WebDriver executable (adjust if necessary)

In [None]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"

Set up Chrome options

In [None]:
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)

Set up WebDriver service

In [None]:
service = Service(webdriver_path)

Initialize WebDriver

In [None]:
driver = webdriver.Chrome(service=service, options=chrome_options)

%%% senate

In [None]:
dfs = {}
try:
    # Open the target URL
    driver.get('https://alison.legislature.state.al.us/committees-senate-standing-current-year')

    # Wait for the page to load and the tbody to be present
    wait = WebDriverWait(driver, 20)

    # Use a broader selector or additional waits to ensure the page is fully loaded
    tbody = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))

    # Debug: Print the HTML of the tbody to verify its presence
    # print(tbody.get_attribute('outerHTML'))

    # Find all rows within the tbody
    rows = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "tbody tr")))
    #list of committee names (replace with logic to pull from csv and filter by appropriate state)
    coms = ['Education Policy','Finance and Taxation Education','Finance and Taxation General Fund','Children and Youth Health','Healthcare']

In [None]:
    # go through each committee and check if relevant and save table to dict
    for row in rows:
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        ActionChains(driver).move_to_element(row).click().perform()
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        html_from_page = driver.page_source
        soup = BeautifulSoup(html_from_page, 'html.parser')
        modal_div = soup.find_all("div", class_= "ReactModalPortal")
        for m in modal_div:
            # print(m)
            if len(str(m)) > 36:
                div = m
                break

        # print(div)
        header = div.find("h1")
        header = str(header).split('">', 1)[-1].split('</h1', 1)[0].replace('Members','').strip()
        pass_through = False
        for h in coms:
            if str(header) in h:
                pass_through = True
            else:
                continue
        #close if not a relevant committee
        if pass_through != True:
            print(str(header) + ' is not a valid committee')
            close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            ActionChains(driver).move_to_element(close_button_pot).click().perform()
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            continue
        
        #fetch table
        df_list = pd.read_html(html_from_page)
        df = df_list[-1]
        
       
        #close popup
        close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
        ActionChains(driver).move_to_element(close_button_pot).click().perform()
        
        # save table
        dfs[header] = df

In [None]:
finally:
    # Close the WebDriver
    driver.quit()

%%% house

In [None]:
dfs = {}
try:
    # Open the target URL
    driver.get('https://alison.legislature.state.al.us/committees-house-standing-current')

    # Wait for the page to load and the tbody to be present
    wait = WebDriverWait(driver, 20)

    # Use a broader selector or additional waits to ensure the page is fully loaded
    tbody = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))

    # Debug: Print the HTML of the tbody to verify its presence
    # print(tbody.get_attribute('outerHTML'))
    
    coms = ["Education Policy","Ways and Means General Fund","Ways and Means Education","Health","Children and Senior Advocacy"]
    
    
    
    
    # Find all rows within the tbody
    rows = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "tbody tr")))
    #list of committee names (replace with logic to pull from csv and filter by appropriate state)

    # go through each committee and check if relevant and save table to dict
    for row in rows:
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        ActionChains(driver).move_to_element(row).click().perform()
        time.sleep(2)  # Adjust sleep time if necessary to allow the table to load
        html_from_page = driver.page_source
        soup = BeautifulSoup(html_from_page, 'html.parser')
        modal_div = soup.find_all("div", class_= "ReactModalPortal")
        for m in modal_div:
            # print(m)
            if len(str(m)) > 36:
                div = m
                break

        # print(div)
        header = div.find("h1")
        header = str(header).split('">', 1)[-1].split('</h1', 1)[0].replace('Members','').strip()
        pass_through = False
        for h in coms:
            if str(header) in h:
                pass_through = True
            else:
                continue
        #close if not a relevant committee
        if pass_through != True:
            print(str(header) + ' is not a valid committee')
            close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            ActionChains(driver).move_to_element(close_button_pot).click().perform()
            # time.sleep(5)  # Adjust sleep time if necessary to allow the table to load
            continue
        
        #fetch table
        df_list = pd.read_html(html_from_page)
        df = df_list[-1]
        
       
        #close popup
        close_button_pot = driver.find_element(By.XPATH, "/html/body/div[6]/div/div/div/div[1]/button")
        ActionChains(driver).move_to_element(close_button_pot).click().perform()
        
        # save table
        dfs[header] = df

In [None]:
finally:
    # Close the WebDriver
    driver.quit()
# %%
for k,v in dfs.items():
    print(k)
    print(v)

%%

In [None]:
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
r = requests.get(url, headers=headers)

%%