%%

In [2]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
from lxml import etree, html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file
from unidecode import unidecode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

## Driver setup

In [3]:
# Options object handed to the Chrome session; headless mode stays disabled,
# so the browser window is shown while scraping.
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)

# Service built from a locally installed chromedriver binary.
# NOTE(review): the cell that creates the driver builds its own service through
# ChromeDriverManager, so `service` appears to be unused there -- confirm.
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"
service = Service(webdriver_path)

In [4]:
# Start a Chrome session and gather the committee anchor tags from both listing URLs.
driver = webdriver.Chrome(
    service=ChromeService(ChromeDriverManager().install()),
    options=chrome_options,
)
house_url = r'https://www.ncleg.gov/Committees#HouseStanding'
sen_url = r'https://www.ncleg.gov/Committees#SenateStanding'
urls = [house_url, sen_url]

# Every <a> whose class attribute is exactly the committee list-item class string,
# accumulated across both pages in visiting order.
links = []
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    links += soup.find_all(
        "a",
        class_="list-group-item list-group-item-action filteredGroup searchable",
    )


In [5]:

# One scraped anchor tag per output line.
print('\n'.join(map(str, links)))


<a class="list-group-item list-group-item-action filteredGroup searchable" href="/Committees/CommitteeInfo/HouseStanding/225"><span class="comms-item-title">Agriculture and Environment </span></a>
<a class="list-group-item list-group-item-action filteredGroup searchable" href="/Committees/CommitteeInfo/HouseStanding/4"><span class="comms-item-title">Alcoholic Beverage Control </span></a>
<a class="list-group-item list-group-item-action filteredGroup searchable" href="/Committees/CommitteeInfo/HouseStanding/6"><span class="comms-item-title">Appropriations </span></a>
<a class="list-group-item list-group-item-action filteredGroup searchable" href="/Committees/CommitteeInfo/HouseStanding/12"><span class="comms-item-title">Appropriations, Agriculture and Natural and Economic Resources </span></a>
<a class="list-group-item list-group-item-action filteredGroup searchable" href="/Committees/CommitteeInfo/HouseStanding/227"><span class="comms-item-title">Appropriations, Capital and Information

In [6]:
# Site root; relative hrefs scraped from the pages are appended to it.
main_url = "https://www.ncleg.gov"

In [7]:
import fnmatch


# Title filter for the committees of interest. Compiled once here instead of on
# every loop iteration.
keywords = ["[Hh]ealth", "[Ee]ducation","[Cc]hildren"]
pattern = re.compile('|'.join(keywords))

# hrefs maps committee title -> absolute committee URL, for standing committees
# whose title matches one of the keywords.
hrefs = {}
for l in links:
    # Read the href from the tag itself rather than slicing the tag's HTML string.
    half_link = (l.get('href') or '').strip()

    # Keep standing committees only ("NonStanding" also contains "Standing").
    if 'NonStanding' in half_link or 'Standing' not in half_link:
        continue

    link = f'https://www.ncleg.gov{half_link}'

    # Title text comes from the title span when present, otherwise the whole anchor.
    # get_text() also decodes HTML entities that string slicing would leave escaped.
    title_tag = l.find('span', class_='comms-item-title')
    title = (title_tag if title_tag is not None else l).get_text().strip()

    if pattern.search(title):  # search() matches anywhere in the title
        # The dict is keyed by title alone, so two committees with the same title
        # (e.g. one per chamber) would overwrite each other; make that visible.
        if title in hrefs and hrefs[title] != link:
            print(f'duplicate title {title}: replacing {hrefs[title]} with {link}')
        hrefs[title] = link
    else:
        print(f'title was : {title}')


title was : Agriculture and Environment
title was : Alcoholic Beverage Control
title was : Appropriations
title was : Appropriations, Agriculture and Natural and Economic Resources
title was : Appropriations, Capital and Information Technology
title was : Appropriations, General Government
title was : Appropriations, Justice and Public Safety
title was : Appropriations, Transportation
title was : Commerce and Economic Development
title was : Election Law
title was : Emergency Management and Disaster Recovery
title was : Energy and Public Utilities
title was : Ethics
title was : Federal Relations and American Indian Affairs
title was : Finance
title was : Homeland Security and Military and Veterans Affairs
title was : Housing and Development
title was : Insurance
title was : Judiciary 1
title was : Judiciary 2
title was : Judiciary 3
title was : Oversight
title was : Pensions and Retirement
title was : Regulatory Reform
title was : Rules, Calendar, and Operations of the House
title was 

In [8]:
# Show the URL of every committee that passed the keyword filter.
for com_link in hrefs.values():
    print(com_link)

https://www.ncleg.gov/Committees/CommitteeInfo/HouseStanding/7
https://www.ncleg.gov/Committees/CommitteeInfo/HouseStanding/10
https://www.ncleg.gov/Committees/CommitteeInfo/HouseStanding/166
https://www.ncleg.gov/Committees/CommitteeInfo/HouseStanding/26
https://www.ncleg.gov/Committees/CommitteeInfo/HouseStanding/226
https://www.ncleg.gov/Committees/CommitteeInfo/SenateStanding/141
https://www.ncleg.gov/Committees/CommitteeInfo/SenateStanding/143
https://www.ncleg.gov/Committees/CommitteeInfo/SenateStanding/134
https://www.ncleg.gov/Committees/CommitteeInfo/SenateStanding/139


## THI Legislator Data

In [9]:
#set up
ref_path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025"

# get_recent_file is a project helper -- presumably it returns the newest file in
# ref_path matching the "*" pattern; confirm against its definition.
leg_ref = pd.read_excel(get_recent_file("*", ref_path))

# Keep North Carolina rows only. The explicit .copy() makes leg_ref an independent
# frame, so the column assignment done on it later does not trigger pandas'
# SettingWithCopyWarning.
leg_ref = leg_ref[leg_ref['state abbreviation'] == "NC"].copy()
print(leg_ref.columns)
print()
#loop_group creation (last_names associated with key)


Index(['full_pk', 'primary_key', 'district_code', 'state abbreviation',
       'chamber', 'title', 'first name', 'last name', 'party', 'district',
       'date assumed office', 'name', 'tenure', 'leader', 'state_code',
       'chamber_code'],
      dtype='object')



In [10]:

# Count the distinct last names found under each full_pk.
leg_lname_counts = (
    leg_ref.groupby(['full_pk'])['last name']
    .nunique()
    .reset_index()
)
print(leg_lname_counts.columns)

# Keep only the keys that carry more than one last name, then report how many there are.
shared_key_mask = leg_lname_counts['last name'] > 1
leg_lname_counts = leg_lname_counts[shared_key_mask].reset_index(drop=True)
print(len(leg_lname_counts))

# Prints True when no key is shared between different last names.
if leg_lname_counts.empty:
    print(True)

# Missing keys become 0 so the column can be held as integers.
leg_ref['full_pk'] = leg_ref['full_pk'].fillna(0).astype(int)

# last name -> full_pk lookup. Rows sharing a last name collapse to a single
# entry (the last one wins), since dict keys are unique.
leg_ref_dict = leg_ref.set_index('last name')['full_pk'].to_dict()

# Spot check: show every entry whose last name contains "Biggs".
for last_name, pk in leg_ref_dict.items():
    if "Biggs" not in last_name:
        continue
    print(last_name)
    print(pk)

Index(['full_pk', 'last name'], dtype='object')
0
True
Biggs
42007000


In [11]:
# For each selected committee page, keep only the anchors that look like member
# entries: a biography link whose markup is followed by a newline and an <img> tag.
members_dict = {}
for title, com_url in hrefs.items():
    driver.get(com_url)
    com_soup = BeautifulSoup(driver.page_source, 'html.parser')

    anchors = com_soup.find_all("a", href=True)
    print('#')  # one progress marker per committee page

    # The test runs on the lower-cased, stripped HTML of each anchor.
    members_dict[title] = [
        anchor
        for anchor in anchors
        if re.search(r'members\/bio.+\n<img', str(anchor).lower().strip())
    ]



#
#
#
#
#
#
#
#
#


In [12]:
# bio dict creation
# Maps each member's display name to the absolute URL of their biography page.
bios_dfs = []
names = []
bios = []
for ls in members_dict.values():

    #look through member bs4 tags
    for l in ls:
        # Normalise the name exactly as the committee-table cell does (strip, then
        # drop embedded newlines) so that lookups by the table's `name` column hit
        # the same key.
        name = re.sub('\n', '', l.text.strip())
        names.append(name)
        bio_url = main_url + l.attrs.get('href')
        bios.append(bio_url)

#create dict
# A member listed on several committees appears several times in `names`; the dict
# keeps one entry per display name (the last one seen).
bio_dict = dict(zip(names,bios))

# for key, value in bio_dict.items():
#     print(f"{key}: {value}")



In [13]:
#make bio dict

# comm_info_dict maps committee title -> DataFrame with one row per member and the
# columns com, name, position, bio_link.
comm_info_dict = {}
for title, com_members in members_dict.items():
    # Collect plain records and build the frame once, instead of concatenating
    # one-row DataFrames (which also fails outright on an empty member list).
    records = []
    for mem in com_members:

        #get name (embedded newlines removed)
        name = re.sub('\n','',str(mem.text).strip())
        leg_titles_exp = name

        #get position
        # Look at up to two ancestors for the one whose class list contains "row".
        # Reset per member so a miss cannot silently reuse the previous member's tag.
        header_tag_div = None
        for x in mem.find_parents(limit=2):
            # A tag without a class attribute yields None; treat it as no classes.
            parent_classes = x.attrs.get('class') or []
            if any('row' in str(f).lower() for f in parent_classes):
                header_tag_div = x
                break

        # The position heading is the element just before that row; None when the
        # row or its preceding sibling is not found.
        position_exp = None
        if header_tag_div is not None:
            position_tag = header_tag_div.find_previous_sibling()
            if position_tag is not None:
                position_exp = position_tag.text

        # get link to leg bio
        # Outer double quotes: reusing the same quote type inside an f-string is a
        # SyntaxError before Python 3.12.
        bio_link = f"https://www.ncleg.gov{mem.attrs.get('href')}"

        records.append({
            "com": title,
            "name": leg_titles_exp,
            "position": position_exp,
            "bio_link": bio_link,
        })

    #pull rows for com together
    com_df = pd.DataFrame(records, columns=["com", "name", "position", "bio_link"])
    comm_info_dict[title] = com_df


In [14]:
#assign fpk

# Leading chamber title on a committee-page name ("Rep." / "Sen."), with or
# without the period. Requires a period or whitespace after the title so surnames
# that merely start with those letters are left intact.
_TITLE_PREFIX = re.compile(r'^(?:[Rr]ep|[Ss]en)(?:\.\s*|\s+)')
# Leading chamber title on a bio-page heading, long or short form. Anchored at the
# start so "rep"/"sen" inside a person's name is never removed.
_BIO_TITLE_PREFIX = re.compile(r'^\s*(?:Representative|Senator|[Rr]ep\.?|[Ss]en\.?)\s+')


def _split_display_name(display_name):
    """Split a committee-page name into (initial, last_name).

    The chamber title prefix is removed. If the remainder starts with a
    single-letter initial ("J. Smith"), the initial is returned separately;
    otherwise initial is None. The last name is transliterated with unidecode.
    """
    last_name = _TITLE_PREFIX.sub('', str(display_name), count=1).strip()
    initial = None
    if re.search(r'^[A-Za-z]\.\s', last_name):
        initial, last_name = re.split(r'\.\s', last_name, maxsplit=1)
    return initial, unidecode(last_name)


def _last_name_candidates(last_name):
    """Return leg_ref rows whose 'last name' contains last_name as a literal substring.

    regex=False keeps punctuation in a name from being read as a pattern, and
    na=False lets rows with a missing last name simply not match.
    """
    mask = leg_ref['last name'].str.contains(last_name, regex=False, na=False)
    return leg_ref[mask].reset_index(drop=True)


def _first_name_matches(first_names, first_name):
    """Return the positions in first_names that agree with first_name.

    Comparison is case-insensitive. Exact matches are preferred; if there are
    none, a candidate matches when either name is a prefix of the other.
    """
    wanted = str(first_name).strip().lower()
    if not wanted:
        return []
    cleaned = [str(fn).strip().lower() for fn in first_names]
    exact = [pos for pos, fn in enumerate(cleaned) if fn == wanted]
    if exact:
        return exact
    return [
        pos for pos, fn in enumerate(cleaned)
        if fn and (fn.startswith(wanted) or wanted.startswith(fn))
    ]


for k,v in comm_info_dict.items():

    # Object dtype: the keys are stored as strings, which a float (NaN) column
    # would not accept without a dtype-mismatch warning.
    v['full_pk'] = pd.Series(np.nan, index=v.index, dtype=object)

    #individual com df's
    for i,j in enumerate(v['name']):
        # `initial` is recomputed for every member, so a value left over from an
        # earlier name can never be used to disambiguate this one.
        initial, name = _split_display_name(j)

        #gets all matches for the last name
        check = _last_name_candidates(name)

        print('=========================================')
        print(f'name: {name}')

        print('#')
        #only one match go ahead and assign
        if len(check) == 1:
            v.loc[i,'full_pk'] = str(int(check.loc[0,'full_pk']))

        #more than one, check initials
        elif len(check) > 1:
            first_names = check['first name'].to_list()

            # positions of candidates whose first name starts with the initial
            iis = []
            if initial is not None:
                iis = [ii for ii, jj in enumerate(first_names) if str(jj).startswith(initial)]

            if len(iis) == 1:
                v.loc[i,'full_pk'] = str(int(check.loc[iis[0],'full_pk']))

            elif len(iis) > 1:
                # Leave this member unassigned but keep processing the rest of the
                # committee; unresolved rows are reported after the loop.
                print(f'unresolved: initial {initial} matches more than one {name}')
                print('_________________')

            #no results from initials look up
            else:
                print('no match on intials, looking in bio')

                #search website
                route_2_url = bio_dict.get(j)
                if route_2_url is None:
                    print(f'unresolved: no bio link stored for {j}')
                    print('_________________')
                    continue

                driver.get(route_2_url)
                bio_soup = BeautifulSoup(driver.page_source, 'html.parser')

                #retrieve info
                # Named bio_title so the committee `title` variable is not clobbered.
                bio_title = bio_soup.find("h1", {"class": "section-title"})
                if bio_title is None:
                    print(f'unresolved: no section title on {route_2_url}')
                    print('_________________')
                    continue

                cl_title = _BIO_TITLE_PREFIX.sub('', bio_title.text, count=1).strip()
                title_words = cl_title.split()
                fname = title_words[0] if title_words else ''
                print(f'first name: {fname}')

                # Assign only when exactly one candidate agrees with the bio's
                # first name, so an ambiguous match cannot pick a key arbitrarily.
                found = _first_name_matches(first_names, fname)
                if len(found) == 1:
                    print(f'found {first_names[found[0]]}')
                    v.loc[i,'full_pk'] = str(int(check.loc[found[0],'full_pk']))
                else:
                    print(f'unresolved: {len(found)} candidates match first name {fname}')

                print(first_names)
                print(f'searched value: {name}')

                print('_________________')

        #this means there were no results from the check
        else:
            print('##')

            print("backup")
            # Retry with title-cased spelling (e.g. "von Haefen" -> "Von Haefen").
            name2 = name.title()
            check2 = _last_name_candidates(name2)
            print(f'name2: {name2}')
            if len(check2) == 1:
                v.loc[i,'full_pk'] = str(int(check2.loc[0,'full_pk']))
                print("backup: sucess")
                print('_________________')
            elif len(check2) == 0:
                # Unresolved: report and move on to the next member.
                print("backup: fail [no results from 2nd check]")
                print('_________________')
                print(j)
                print(name)
                print(check2.to_string())

            else:
                bio_ref = bio_dict.get(j)
                print(bio_ref)
                print("backup: fail [still more than one]")
                print('_________________')
                print(j)
                print(name)
                print(check2.to_string())


    # check for nan values
    no_values = v[v['full_pk'].isna()]
    if not no_values.empty:
        print('still something left in this one')
        # to_string must be called; printing the bare attribute only shows the
        # bound-method repr.
        print(no_values.to_string())

name: Biggs
#
name: Cotham
#
name: Willis
#
name: Blackwell
#
name: Ball
#
name: Brockman
#
name: Gable
#
name: Hawkins
#
name: Johnson-Hostler
#
name: Pickett
#
name: Prather
#
name: Potts
#
name: Reeder
#
name: White
#
name: Lambeth
#
name: Almond
#
name: Buansi
#
name: Campbell
#
name: Cervania
#
name: Crawford
#
name: Cunningham
#
name: Huneycutt
#
name: Liu
#
name: Pless
#
name: Rhyne
#
name: von Haefen
#
##
backup
name2: Von Haefen
backup: sucess
_________________
name: Biggs
#
name: Cotham
#
name: Willis
#
name: Blackwell
#
name: Brockman
#
name: Rhyne
#
name: Wheatley
#
name: Balkcom
#
name: Ball
#
name: Brown
#
name: Budd
#
name: Dew
#
name: Greenfield
#
name: Iler
#
name: Johnson
#
no match on intials, looking in bio
first name: Jake
found Jake
['Jake', 'Monika', 'Todd']
searched value: Johnson
_________________
name: Lambeth
#
name: Lofton
#
name: Morey
#
name: Potts
#
name: Riddell
#
name: Schietzelt
#
name: Shepard
#
name: Torbett
#
name: von Haefen
#
##
backup
name2: Von 

In [15]:
#final df creation
# Stack every committee's member table into one frame with a fresh 0..n-1 index.
nc_com_df = pd.concat(comm_info_dict.values()).reset_index(drop=True)

# Move the key column to the front: pop() removes and returns it, insert() puts it
# back at position 0.
nc_com_df.insert(0, 'full_pk', nc_com_df.pop('full_pk'))



In [16]:
#final df cleaning

# String view of the key, used for the pattern tests and the prefix below.
full_pk_text = nc_com_df['full_pk'].astype(str)

#get list of comms
# Rows are split on the third character of the key: "1" -> sen, "0" -> hou.
sen = nc_com_df[full_pk_text.str.contains(r'^\d{2}1\d+', regex = True)]
hou = nc_com_df[full_pk_text.str.contains(r'^\d{2}0\d+', regex = True)]

# First two characters of each key. `.str[:2]` slices every string; a plain
# `[:2]` on the Series would instead select its first two ROWS and leave the rest
# of the column empty. Rows with no key stay missing rather than becoming "na".
nc_com_df['com_id'] = full_pk_text.str[:2].where(nc_com_df['full_pk'].notna())

# Keep full_pk as the first column.
pop_column = nc_com_df.pop('full_pk')
nc_com_df.insert(0, 'full_pk', pop_column)

In [17]:


# Write the combined committee table to the shared workbook (row index omitted).
nc_com_df.to_excel(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\committee info\committee update files\NC_coms.xlsx', index=False)

# Scraping is finished: close the browser session so the Chrome and chromedriver
# processes started earlier in the notebook do not linger.
driver.quit()


%%