%%

In [55]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
from lxml import etree, html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file
from unidecode import unidecode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

## Driver Set up

In [56]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"
chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
# Set up WebDriver service
service = Service(webdriver_path)

#call on driver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)


In [57]:
#get data
senate_page_url = r'https://oksenate.gov/committees-list'
house_page_url = r'https://www.okhouse.gov/committees/house'

urls = pd.DataFrame({"chamber":['senate', 'house'],"urls":[senate_page_url,house_page_url]}).reset_index(drop = True)
urls_dict = urls.set_index('chamber')['urls'].to_dict()

for k,u in urls_dict.items():
    print(f'{k}: {u}{"\n"}')

    

senate: https://oksenate.gov/committees-list

house: https://www.okhouse.gov/committees/house



In [None]:
driver.get("https://oksenate.gov/committees/appropriations")
# time.sleep(2)

# class="field field--name-title field--type-string field--label-hidden"

title = driver.find_element(By.XPATH,'//span[@class="field field--name-title field--type-string field--label-hidden"]')
print(title.text)
if re.search(title.text

Appropriations


In [None]:
## House

In [58]:

house_links = []

comms_dict = {}
for k,u in urls_dict.items():
    if k.lower().strip() == "house":
        driver.get(u)

        html_from_page = driver.page_source
        soup = BeautifulSoup(html_from_page, 'html.parser')
        links = soup.find_all("a", {"class": "theme-shape theme-border relative flex flex-nowrap py-4 px-6 items-center border border-warmgray-300 shadow-md bg-white hover:shadow-xl min-h-[120px]"}, href=True)
        
        for link in links:
            link_dl = f'https://www.okhouse.gov/{link.attrs.get("href")}'
            driver.get(link_dl)
            time.sleep(2)
            subs = link.get_attribute
            html_from_page = driver.page_source
            sub_soup = BeautifulSoup(html_from_page, 'html.parser')
            
            subs = sub_soup.find_all("a", {'class':'cursor-pointer text-primary underline'}, href = True)
            print(len(subs))
            
            for sub in subs:
                
                keywords = ['[Ee]ducation', '[Cc]hildren', '[Yy]oung', '[Hh]ealth']
                pat = re.compile("|".join(keywords))
                if re.search(pat, sub.text):
                    print(f'com name: {sub.text}')
                else:
                    print(f'not a relevant comm: {sub.text}')
                    continue
                
                
                sub_link = f'https://www.okhouse.gov/{sub['href']}'
                sub_name = sub.text
                comms_dict[sub_name] = sub_link


                # print(f'sub: {sub}')
            # house_links.append(link_dl)




0
10
com name: A&B Education Subcommittee
not a relevant comm: A&B Finance Subcommittee
not a relevant comm: A&B General Government Subcommittee
com name: A&B Health Subcommittee
not a relevant comm: A&B Human Services Subcommittee
not a relevant comm: A&B Judiciary Subcommittee
not a relevant comm: A&B Natural Resources Subcommittee
not a relevant comm: A&B Public Safety Subcommittee
not a relevant comm: A&B Select Agencies Subcommittee
not a relevant comm: A&B Transportation Subcommittee
5
not a relevant comm: Business
not a relevant comm: Government Modernization and Technology
not a relevant comm: Insurance
not a relevant comm: Tourism
not a relevant comm: Transportation
2
com name: Common Education
com name: Postsecondary Education
4
not a relevant comm: Agriculture
not a relevant comm: Utilities
not a relevant comm: Wildlife
not a relevant comm: Energy
5
not a relevant comm: County and Municipal Government
not a relevant comm: Elections and Ethics
not a relevant comm: General Gov

In [59]:

# //*[@id="__next"]/div/div[2]/div/div[1]/div[2]/div/h2
for k,v in comms_dict.items():
    print(f'{k}: {v}')


A&B Education Subcommittee: https://www.okhouse.gov//committees/house/approp/ap-edu
A&B Health Subcommittee: https://www.okhouse.gov//committees/house/approp/ap-hlth
Common Education: https://www.okhouse.gov//committees/house/edu/comed
Postsecondary Education: https://www.okhouse.gov//committees/house/edu/posted
Public Health: https://www.okhouse.gov//committees/house/hhs/pubhlth
Children, Youth and Family Services: https://www.okhouse.gov//committees/house/hhs/child


In [None]:
comm_dict = {}
for k,lk in comms_dict.items():
    driver.get(lk)


    # html_from_page = driver.page_source
    # soup = BeautifulSoup(html_from_page, 'html.parser')
    # <h1 class="brand-font text-center type-h2 mb-4">Health and Human Services Oversight</h1>
    time.sleep(2)
    com_name = driver.find_element(By.XPATH, "//h1[@class='brand-font text-center type-h2 mb-4']")
    com_name = com_name.text
    keywords = ['[Ee]ducation', '[Cc]hildren', '[Yy]oung']
    pat = re.compile("|".join(keywords))
    if re.search(pat, com_name):
        print(f'com name: {com_name}')
    else:
        print(f'not a relevant comm: {com_name}')
        continue


    elements = driver.find_elements(By.XPATH, "//article[starts-with(@aria-label, 'see details on')]")
    

    # leg_arts = soup.find_all("article")
    dfs = []
    for leg in elements:
        # print(leg.get_attribute('outerHTML'))
        print('\n')

        # print(type(leg))
        leg_name = leg.find_element(By.XPATH, ".//p[@class = 'text-primary cta utility-font mb-1']")
        leg_district = leg.find_element(By.XPATH, ".//p[@class = 'ml-2 utility-font label']")
        try:
            leg_position = leg.find_element(By.XPATH, ".//p[@class = 'utility-font caption mb-1']")
            leg_position = leg.text
        except:
            leg_position = np.nan
            print('not a comm head')
        
        
        df = pd.DataFrame({'name': [leg_name.text], 'position': [leg_position], 'district': [leg_district.text]})
        dfs.append(df)
    
    leg_data = pd.concat(dfs).reset_index(drop=True)
    print(f'length: {len(leg_data)}')
        # print(leg_name.text)
        # print(leg_name.get_attribute('outerHTML'))
    comm_dict[com_name] = leg_data
    
    # name = <p class="text-primary cta utility-font mb-1">Brian Hill</p>
    # position = <p class="utility-font caption mb-1">Chair</p>
    # district = <p class="ml-2 utility-font label">District 47</p> #only for chair and vice chair
    # break


In [61]:
for k,v in comm_dict.items():
    
    print(f'{k}: {v}')

A&B Education Subcommittee:               name                                    position     district
0    CHAD CALDWELL        Chair\nCHAD CALDWELL\nR\nDistrict 40  District 40
1   TONI HASENBECK  Vice Chair\nTONI HASENBECK\nR\nDistrict 65  District 65
2    CHRIS BANNING                                         NaN  District 24
3      RONNY JOHNS                                         NaN  District 25
4        DICK LOWE                                         NaN  District 56
5  MICHELLE MCCANE                                         NaN  District 72
6      MIKE OSBURN                                         NaN  District 81
7     MARK TEDFORD                                         NaN  District 69
8     JOHN WALDRON                                         NaN  District 77
9     GABE WOOLLEY                                         NaN  District 98
Common Education:                 name                                    position     district
0          DICK LOWE            Chair\nD

In [None]:
main_url = r"https://www.ncleg.gov"

In [None]:
import fnmatch


hrefs = {}
for l in links:
    if 'NonStanding' in str(l):
        continue
    elif 'Standing' in str(l):
        half_link = str(l).split('href="', 1)[-1].split('">', 1)[0].strip()
        link = f'https://www.ncleg.gov{half_link}'
        title = str(l).split('title">', 1)[-1].split("</span",1)[0].strip()
        keywords = ["[Hh]ealth", "[Ee]ducation"]
        pattern = re.compile('|'.join(keywords))
        # print(pattern)
        res = bool(re.search(pattern, title))  # Use re.search() to match anywhere in the string
        # print(res)
        if res is True:
            hrefs[title] = link
        else:
            print(f'title was : {title}')


title was : Agriculture and Environment
title was : Alcoholic Beverage Control
title was : Appropriations
title was : Appropriations, Agriculture and Natural and Economic Resources
title was : Appropriations, Capital and Information Technology
title was : Appropriations, General Government
title was : Appropriations, Justice and Public Safety
title was : Appropriations, Transportation
title was : Commerce and Economic Development
title was : Election Law
title was : Emergency Management and Disaster Recovery
title was : Energy and Public Utilities
title was : Ethics
title was : Federal Relations and American Indian Affairs
title was : Finance
title was : Homeland Security and Military and Veterans Affairs
title was : Housing and Development
title was : Insurance
title was : Judiciary 1
title was : Judiciary 2
title was : Judiciary 3
title was : Oversight
title was : Pensions and Retirement
title was : Regulatory Reform
title was : Rules, Calendar, and Operations of the House
title was 

## THI Leg Data

In [None]:
#set up
ref_path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025"
leg_ref =pd.read_excel(get_recent_file("*", ref_path))
leg_ref = leg_ref[leg_ref['state abbreviation'] == "NC"]
print(leg_ref.columns)
print()
#loop_group creation (last_names associated with key)


Index(['full_pk', 'primary_key', 'district_code', 'state abbreviation',
       'chamber', 'title', 'first name', 'last name', 'party', 'district',
       'date assumed office', 'name', 'tenure', 'leader', 'state_code',
       'chamber_code'],
      dtype='object')



In [None]:

#groupby data to get primary key and last names associated with it
leg_lname_counts = leg_ref.groupby(['full_pk'])['last name'].nunique().reset_index()
print(leg_lname_counts.columns)

leg_lname_counts = leg_lname_counts[leg_lname_counts['last name']>1].reset_index(drop=True)
print(len(leg_lname_counts))

if len(leg_lname_counts) == 0:
    print(True)

leg_ref['full_pk'] = leg_ref['full_pk'].fillna(0).astype(int)

#set up dict for lookup
leg_ref_dict = leg_ref.set_index('last name')['full_pk'].to_dict()
for name, data in leg_ref_dict.items():
    if "Biggs" in name:
        print(name)
        print(data)

In [None]:
#refining to only members
members_dict = {}
for title,com_url in hrefs.items():
    
    
    driver.get(com_url)
    html_from_page = driver.page_source
    com_soup = BeautifulSoup(html_from_page, 'html.parser')

    members = com_soup.find_all("a", href=True)
    print('#')

    
    acceptable = []
    # [print(x.text) for x in members]
    for mem in members:
        search_str = str(mem).lower().strip()
        
        if re.search(r'members\/bio.+\n<img', search_str):
            acceptable.append(mem)
    
    members_dict[title] = acceptable



In [None]:
# bio dict creation
bios_dfs = []
names = []
bios = []
for title,ls in members_dict.items():
    
    #look through member bs4 tags
    for l in ls:
        name = l.text.strip()
        names.append(name)
        bio_url = main_url + l.attrs.get('href')
        bios.append(bio_url)

#create dict
bio_dict = dict(zip(names,bios))

# for key, value in bio_dict.items():
#     print(f"{key}: {value}")



In [None]:
#make bio dict

comm_info_dict = {}
for title, com_members in members_dict.items():
    #go through members list
    dfs_concat = []
    for mem in com_members:
        
        #get name
        name = re.sub('\n','',str(mem.text).strip())
        leg_titles_exp = name

        #get position
        parents = mem.find_parents(limit=2)
        for x in parents:
            # print('###')
            # next_siblings = x.find_next_siblings()
            # print(x.name)
            # print(x.attrs)
            f_class_o_parent = x.attrs.get('class')
            matches_parent = False
            for f in f_class_o_parent:
                if 'row' in str(f).lower():
                    matches_parent = True
            if matches_parent == True:
                header_tag_div = x
                break
            
        position_exp = header_tag_div.find_previous_sibling().text

        
        # get link to leg bio
        bio_link = f'https://www.ncleg.gov{mem.attrs.get('href')}'
        
        #df creation
        df_exp = pd.DataFrame({"com":[title],"name":[leg_titles_exp], "position": [position_exp], "bio_link": [bio_link]})
        dfs_concat.append(df_exp)

        
    #pull dfs for com together
    com_df = pd.concat(dfs_concat).reset_index(drop = True)
    comm_info_dict[title] = com_df


In [None]:
#assign fpk
for k,v in comm_info_dict.items():
    
    v['full_pk'] = np.nan 

    #individual com df's
    for i,j in enumerate(v['name']):
        name = re.split(r'^[Rr]ep.|^[Ss]en.', str(j), maxsplit=1)[-1].strip()


        # if there is an initial remove and cache
        if re.search(r'^[A-Za-z]{1}\.\s', name):
            name_split = re.split(r'\.\s', name, maxsplit=1)
            name = name_split[-1]
            initial = name_split[0]
        
        #gets all matches for the last name
        name = unidecode(name)
        check = leg_ref[leg_ref['last name'].str.contains(name)].reset_index(drop=True)
        
        print('=========================================')
        print(f'name: {name}')
        
        print('#')
        #only one match go ahead and assign
        if len(check) == 1:
            printit = False
            full_pk = str(int(check.loc[0,'full_pk']))
            v.loc[i,'full_pk'] = full_pk
            if printit == True:
                print("intial route 1: matched 1")
                print('_________________')            
            # print(check.to_string())

        #more than one, check intials
        elif len(check) > 1:
            printit = False

            if printit == True:
                print("intial route 2: >1 result")
            # print(check.to_string())
            initials = check['first name'].to_list()
            iis = []
            
            #list of first names from leg_ref
            for ii,jj in enumerate(initials):
                if re.search(fr'^{initial}', str(jj)):
                    iis.append(ii)
            
            #intials checked
            printit = False
            if len(iis) == 1:
                df_i = iis[0]
                full_pk = str(int(check.loc[df_i,'full_pk']))
                v.loc[i,'full_pk'] = full_pk
                if printit == True:
                    print('narrowed it down')
                    print('_________________')
            elif len(iis) > 1:
                if printit == True:
                    print('more than 1 still')
                    print('_________________')
                break
            
            #no results from initials look up
            else:
                
                print('no match on intials, looking in bio')
                
                #search website
                route_2_url = bio_dict.get(j)
                driver.get(route_2_url)
                html_from_page = driver.page_source
                bio_soup = BeautifulSoup(html_from_page, 'html.parser')

                #retrieve info
                title = bio_soup.find("h1", {"class": "section-title"})
                cl_title = re.sub("[Rr]ep[resentative]*|[Ss]en[ator]*",'', title.text).replace('()','').strip()
                fname = cl_title.split(' ', 1)[0]
                print(f'first name: {fname}')
                half = len(fname)//2
                fname_pt1 = fname[:half]
                fname_pt2 = fname[half:]
                    
                for ii,jj in enumerate(initials):
                    if re.search(f'{fname_pt1}*{fname_pt2}*', str(jj)):
                        print(f'found {jj}')
                        full_pk = str(int(check.loc[ii,'full_pk']))
                        v.loc[i,'full_pk'] = full_pk



                
                # print(cl_title)
                
                
                print(initials)
                print(f'searched value: {name}')

                print('_________________')
        
        
        #this means there were no results from the check
        else:
            print('##')

            print("backup")
            name2 = name.title()
            check2 = leg_ref[leg_ref['last name'].str.contains(name2)].reset_index(drop=True)
            print(f'name2: {name2}')
            if len(check2) == 1:
                full_pk = str(int(check2.loc[0,'full_pk']))
                v.loc[i,'full_pk'] = full_pk
                print("backup: sucess")
                print('_________________')
            elif len(check2) == 0:
                print("backup: fail [no results from 2nd check]")
                print('_________________')
                print(j)
                print(name)
                print(check2.to_string())
                break

            elif len(check2) > 1:

                bio_ref = bio_dict.get(j)
                print(bio_ref)
                print("backup: fail [still more than one]")
                print('_________________')
                print(j)
                print(name)
                print(check2.to_string())
                break


    # check for nan values
    no_values = v[v['full_pk'].isna()]
    indexs = no_values.index.to_list()
    if no_values.empty:
        continue
    else:
        print('still something left in this one')
        print(no_values.to_string)

In [None]:
#final df creation
nc_com_df = pd.concat(comm_info_dict.values()).reset_index(drop=True)
pop_column = nc_com_df.pop('full_pk')


# Insert 'col_B' at the beginning of the DataFrame (index 0)
nc_com_df.insert(0, 'full_pk', pop_column)



%%