%%

In [78]:
import os, sys, json, datetime, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
from tqdm import tqdm
import urllib3
import tabula as tb
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [79]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file
from unidecode import unidecode

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [80]:
from bs4 import BeautifulSoup

In [81]:
nd_coms_url = r"https://ndlegis.gov/assembly/69-2025/committees"


In [82]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"
chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
# Set up WebDriver service
service = Service(webdriver_path)

#call on driver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)


In [83]:
driver.get(nd_coms_url)
html_from_page = driver.page_source
com_soup = BeautifulSoup(html_from_page, 'html.parser')
print(type(com_soup))



<class 'bs4.BeautifulSoup'>


In [84]:
hrefs = com_soup.find_all("a", href = True)
print(type(hrefs))
com_hrefs = []
for h in hrefs:
    # print(h)
    if "69-2025/committees" in str(h):
        url = h.get("href")
        # print(url)
        if re.search(f'senate|house', str(h)):
            keywords = ["[Hh]ealth", "[Ee]ducation","[Cc]hildren", "[Ss]ervices", "[Hh]uman"]
            pattern = re.compile('|'.join(keywords))
        # print(pattern)
            res = bool(re.search(pattern, str(h)))  # Use re.search() to match anywhere in the string
            # print(res)
            if res == True:
                com_hrefs.append("https://ndlegis.gov"+url)        
        


<class 'bs4.element.ResultSet'>


In [85]:
# t = com_hrefs[0]
# print(t)
# driver.get(t)

print(*com_hrefs, sep = '\n')

https://ndlegis.gov/assembly/69-2025/committees/house/appropriations-education-and-environment-division
https://ndlegis.gov/assembly/69-2025/committees/house/appropriations-human-resources-division
https://ndlegis.gov/assembly/69-2025/committees/house/education
https://ndlegis.gov/assembly/69-2025/committees/house/human-services
https://ndlegis.gov/assembly/69-2025/committees/senate/appropriations-education-and-environment-division
https://ndlegis.gov/assembly/69-2025/committees/senate/appropriations-human-resources-division
https://ndlegis.gov/assembly/69-2025/committees/senate/education
https://ndlegis.gov/assembly/69-2025/committees/senate/human-services


In [86]:
coms_dict = {}
for h in com_hrefs:
    driver.get(h)
    url_split = h.split('/')


    com_name = url_split[-1]
    chamber = url_split[-2]
    com_name = f'{chamber}_{com_name}'
    print(chamber)
    comms_elements = driver.find_elements(By.CLASS_NAME, "member-wrapper")
    print('#####################################')
    print(com_name)
    print('#####################################')
    members_preview = []
    # for c in comms_elements:
    #     print('_________________')
    #     print(c.text)

    members = []
    for mem in comms_elements:
        member_info = mem.text.split('\n')
        # print(member_info)

        mem_len = len(member_info)
        if mem_len > 3:
            pos_grab = member_info[-1]
            position = str(pos_grab)

            # print(f'position is {position}')
        else:
            position = "Member"
            
        
        for i,m in enumerate(member_info):
            m_cl = str(m).strip()
            if re.search('[Rr]epres|[Ss]ena', m_cl):
                continue
            elif re.search('[Dd]istrict', m_cl):
                district = re.sub(r'[Dd]istrict', '',m_cl).strip().split('|', maxsplit=1)[0].strip()
                # print(f'district: {district}')
            
            
            else:
                if mem_len > 3 and i == 3:
                    position = m_cl
                    
                elif mem_len > 3 and i == 2:
                    name = m_cl
                else:
                    name = m_cl
                
            
        mem_df = pd.DataFrame({'com_name':[com_name], 'name':[name], 'position':[position], 'distict':[district]})
        # print(mem_df.to_string())
        members.append(mem_df)


    com_membership = pd.concat(members).reset_index(drop = False)
    print(com_membership.to_string())

    coms_dict[com_name] = com_membership




house
#####################################
house_appropriations-education-and-environment-division
#####################################
   index                                                 com_name               name       position distict
0      0  house_appropriations-education-and-environment-division         Mike Nathe       Chairman      30
1      0  house_appropriations-education-and-environment-division     Steve Swiontek  Vice Chairman      10
2      0  house_appropriations-education-and-environment-division  Karla Rose Hanson         Member      44
3      0  house_appropriations-education-and-environment-division       Scott Louser         Member       5
4      0  house_appropriations-education-and-environment-division      Bob Martinson         Member      35
5      0  house_appropriations-education-and-environment-division      David Richter         Member       1
6      0  house_appropriations-education-and-environment-division       Mark Sanford         Member      1

In [87]:
dfs = []
for k,v in coms_dict.items():
    print(f'{k}: {v}')
    dfs.append(v)

nd_coms = pd.concat(dfs).reset_index(drop=True)

nd_coms = nd_coms.drop('index', axis = 1)

house_appropriations-education-and-environment-division:    index                                           com_name  \
0      0  house_appropriations-education-and-environment...   
1      0  house_appropriations-education-and-environment...   
2      0  house_appropriations-education-and-environment...   
3      0  house_appropriations-education-and-environment...   
4      0  house_appropriations-education-and-environment...   
5      0  house_appropriations-education-and-environment...   
6      0  house_appropriations-education-and-environment...   

                name       position distict  
0         Mike Nathe       Chairman      30  
1     Steve Swiontek  Vice Chairman      10  
2  Karla Rose Hanson         Member      44  
3       Scott Louser         Member       5  
4      Bob Martinson         Member      35  
5      David Richter         Member       1  
6       Mark Sanford         Member      17  
house_appropriations-human-resources-division:    index               

In [88]:
def create_pk2(state, chamber, district, mls):
    
    if not re.search(r'^[A-Z]{2}$', state):
        
        error = 'func error: state is wrong'
        return print(error)
        

    state_code = state_coding.get(state)
    
    if not re.search(r'[Hh]ouse|[Ss]enate', chamber):
        error = 'func error: chamber wrong'
        return print(error)
    elif re.search(r'[Hh]ouse',chamber):
        chamber_code = 0
    else:
        chamber_code = 1

    if not re.search(r'^\d+$', district):
        match = re.search(r'.(\d{0,4}).', district)
        district = match.group(1)
        dist_code = str(district)
        # return print('func error: district is more than just numbers')
    else:
        dist_code = str(district).strip().zfill(3)

    if mls == 'y':
        # print('we are just gonna do the primary key')
        pk = int(f'{state_code}{chamber_code}{dist_code}')

        return pk

    else:
        seat = '00'

    fpk = int(f'{state_code}{chamber_code}{dist_code}{seat}')
    return fpk

    

In [89]:
def get_lname(name):
    # This pattern captures either a compound last name starting with a common prefix (La, Du, De)
    # or a single-word last name, ignoring optional suffixes.
    pattern = re.compile(
        r'(?P<last>(?:(?:La|Du|De)\s+\S+)|\S+)\s*(?:Jr\.?|Sr\.?|II|III|IV)?\s*$',
        re.UNICODE
    )
    match = pattern.search(name)
    if match:
        return match.group('last')
    else:
        print(f"No match for: {name}")
        return None

    

In [90]:
#set up
ref_path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025"
leg_ref =pd.read_excel(get_recent_file("*", ref_path))
leg_ref = leg_ref[leg_ref['state abbreviation'] == "ND"]
# print(leg_ref.columns)
print(leg_ref.head().to_string())
leg_ref


         full_pk primary_key  district_code state abbreviation chamber                        title first name last name       party  district  date assumed office                               name  tenure leader  state_code  chamber_code
1184  43000201.0      430002            2.0                 ND   House  North Dakota Representative       Bert  Anderson  Republican       2.0                 2014   ND Rep. Bert Anderson (R-ND-002)      11    NaN        43.0           0.0
1185  43000601.0      430006            6.0                 ND   House  North Dakota Representative       Dick  Anderson  Republican       6.0                 2010   ND Rep. Dick Anderson (R-ND-006)      15    NaN        43.0           0.0
1186  43001901.0      430019           19.0                 ND   House  North Dakota Representative      Karen  Anderson  Republican      19.0                 2022  ND Rep. Karen Anderson (R-ND-019)       3    NaN        43.0           0.0
1187  43001701.0      430017           1

Unnamed: 0,full_pk,primary_key,district_code,state abbreviation,chamber,title,first name,last name,party,district,date assumed office,name,tenure,leader,state_code,chamber_code
1184,43000201.0,430002,2.0,ND,House,North Dakota Representative,Bert,Anderson,Republican,2.0,2014,ND Rep. Bert Anderson (R-ND-002),11,,43.0,0.0
1185,43000601.0,430006,6.0,ND,House,North Dakota Representative,Dick,Anderson,Republican,6.0,2010,ND Rep. Dick Anderson (R-ND-006),15,,43.0,0.0
1186,43001901.0,430019,19.0,ND,House,North Dakota Representative,Karen,Anderson,Republican,19.0,2022,ND Rep. Karen Anderson (R-ND-019),3,,43.0,0.0
1187,43001701.0,430017,17.0,ND,House,North Dakota Representative,Landon,Bahl,Republican,17.0,2022,ND Rep. Landon Bahl (R-ND-017),3,,43.0,0.0
1188,43002001.0,430020,20.0,ND,House,North Dakota Representative,Mike,Beltz,Republican,20.0,2020,ND Rep. Mike Beltz (R-ND-020),5,,43.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1320,43100400.0,431004,4.0,ND,Senate,North Dakota Senator,Chuck,Walen,Republican,4.0,2024,ND Sen. Chuck Walen (R-ND-004),1,,43.0,1.0
1321,43102900.0,431029,29.0,ND,Senate,North Dakota Senator,Terry,Wanzek,Republican,29.0,2006,ND Sen. Terry Wanzek (R-ND-029),19,,43.0,1.0
1322,43102200.0,431022,22.0,ND,Senate,North Dakota Senator,Mark,Weber,Republican,22.0,2020,ND Sen. Mark Weber (R-ND-022),5,,43.0,1.0
1323,43101500.0,431015,15.0,ND,Senate,North Dakota Senator,Kent,Weston,Republican,15.0,2024,ND Sen. Kent Weston (R-ND-015),1,,43.0,1.0


In [91]:
nd_coms

Unnamed: 0,com_name,name,position,distict
0,house_appropriations-education-and-environment...,Mike Nathe,Chairman,30
1,house_appropriations-education-and-environment...,Steve Swiontek,Vice Chairman,10
2,house_appropriations-education-and-environment...,Karla Rose Hanson,Member,44
3,house_appropriations-education-and-environment...,Scott Louser,Member,5
4,house_appropriations-education-and-environment...,Bob Martinson,Member,35
...,...,...,...,...
59,senate_human-services,Kent Weston,Vice Chairman,15
60,senate_human-services,David A. Clemens,Member,16
61,senate_human-services,Kathy Hogan,Member,21
62,senate_human-services,Kristin Roers,Member,27


In [92]:
def get_subset(df, fpk_col, fpk):
    df_new = df[df[fpk_col] == fpk]
    return df_new

In [93]:
nd_coms

Unnamed: 0,com_name,name,position,distict
0,house_appropriations-education-and-environment...,Mike Nathe,Chairman,30
1,house_appropriations-education-and-environment...,Steve Swiontek,Vice Chairman,10
2,house_appropriations-education-and-environment...,Karla Rose Hanson,Member,44
3,house_appropriations-education-and-environment...,Scott Louser,Member,5
4,house_appropriations-education-and-environment...,Bob Martinson,Member,35
...,...,...,...,...
59,senate_human-services,Kent Weston,Vice Chairman,15
60,senate_human-services,David A. Clemens,Member,16
61,senate_human-services,Kathy Hogan,Member,21
62,senate_human-services,Kristin Roers,Member,27


In [94]:

nd_coms['chamber'] = np.nan
for row in nd_coms.itertuples(index=True):
    com_string_split = row.com_name.split('_', 1)
    chamber = com_string_split[0]
    comname = com_string_split[1].replace('-', ' ')
    

    last_name = get_lname(row.name)


    #get pk
    pk = create_pk2('ND', chamber, row.distict, mls = 'y')
    
    #pull in ref
    results = get_subset(leg_ref, 'primary_key', str(pk))
    for j in results.itertuples(index=True):
        if re.search(fr'{last_name[:4]}',j.name):
            # print('matched')
            fpk = int(j.full_pk)
            break
        elif j.Index == (len(results)-1):
            print("something didnt work")
    
    #would print if something didnt work
    # print(f'length of results: {len(results)}')
 

    #assignments
    nd_coms.loc[row.Index, 'chamber'] = chamber
    nd_coms.loc[row.Index, 'com_name'] = comname
    nd_coms.loc[row.Index, 'fpk'] = fpk



    # print(fpk)
    # print(row.name)
    # print(results.to_string())
 
    # print(get_lname(row.name))
    # print(pk)


In [95]:
fpk_pop = nd_coms.pop('fpk')
nd_coms.insert(0,'fpk', fpk_pop)


In [None]:
#export
com_folder = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\committee info\committee update files\full_pk_assigned'
file_name = os.path.join(com_folder,"nd_com_info.xlsx")
nd_coms.to_excel(file_name, index=False)

: 

%%

In [None]:
ks_coms = ks_coms.drop_duplicates(subset=['committee', 'url'], keep='first')
ks_coms.reset_index(inplace=True, drop=True)
# %%
# url = ks_coms['url'].iloc[1]

In [None]:
site = "https://ndlegis.gov/assembly"

In [None]:
response = requests.get(site, verify = False)
soup = BeautifulSoup(response.content, 'html.parser')

print(soup.text)

In [None]:
assemb = soup.find_all('a')

In [None]:
for a in assemb:
    print("######################################")
    print(a)
    if 'Current' in str(a):
        base = "https://ndlegis.gov"
        href = a.get('href')
        link = f'{base}{href}'
        print(link)
        break

In [None]:
current_session = f'{link}'

%%

In [None]:
response = requests.get(current_session, verify = False)
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
links = soup.find_all('a')

In [None]:
for link in links:
    if 'standing' in str(link):
        print("######################")
        print(link)

In [None]:
file = "C:\Users\clutz\Downloads\2023-ndla-senate-standing-committees.pdf"

%% loop to get data

In [None]:
ks_com_dict = {}
dfs_to_append = []
for i,url in tqdm(enumerate(ks_coms['url'])):
    response = requests.get(url, verify = False)
    soup = BeautifulSoup(response.content, 'html.parser')
    side_bar = soup.find('div', id='sidebar')
    # print(type(side_bar))
    side_contents = side_bar.find_all("ul")
    print(ks_coms['committee'].iloc[i])
    print(len(side_contents))
    
    for i,s in enumerate(side_contents):
        members = []
        roles = []# print(i)
        # Chair
        
            
        name = s.get_text()
        name = name.split('\n')
        name = [x for x in name if len(x) != 0]
        if not isinstance(name, list):
            name = [name]
                
        if i < 2:
            position = len(name) * ['Chair']

        # Ranking Minority Member
        elif i > 2:
            position = len(name) * ['Member']
            

In [None]:
        elif i > 3:
            break
        
        members.extend(name)
        roles.extend(position)
        append_df = pd.DataFrame({"members": members, "roles": roles})
        print("append_df length: " + str(len(append_df)))
        dfs_to_append.append(append_df)
        print("dfs to append: " + str(len(dfs_to_append)))

In [None]:
        ks_dfs = pd.concat(dfs_to_append)
        ks_com_dict[ks_coms['committee'].iloc[i]] = ks_dfs

%%

In [None]:
for k,v in ks_com_dict.items():
    print("##################")
    print(k)
    # print(v)
# %%
    print(s.get_text())
    new = s.find("li")
    text = new.get_text(strip=True)
    print(text)
print(type(side_bar))

In [None]:
for s in side_contents:
    print(s)
    
# %%

In [None]:
print(response.text)
# %%
df_list = pd.read_html(response)

In [None]:
print(url)
# %%

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Path to your WebDriver executable (adjust if necessary)

In [None]:
webdriver_path = r"C:\Users\clutz\hunt_env\chrome driver\chromedriver-win64\chromedriver.exe"

Set up Chrome options

In [None]:
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)

Set up WebDriver service

In [None]:
service = Service(webdriver_path)

Initialize WebDriver

In [None]:
driver = webdriver.Chrome(service=service, options=chrome_options)

%%<br>
url = in_coms['url'].iloc[0]<br>
print(in_coms['committee'].iloc[0])

In [None]:
member_dict = {}
for i,url in enumerate(in_coms["url"]):
    driver.get(url)
    time.sleep(5)

    # print(response.text)  # Check the raw HTML
    html_from_page = driver.page_source
    soup = BeautifulSoup(html_from_page, 'html.parser')
    member_cards = soup.find_all(class_='MemberCard_memberCardContent__5CHYi')
    if len(member_cards) == 0:
        print(in_coms['committee'].iloc[i])
        break

In [None]:
    members = []
    role = []
    members_df = []
    for member in member_cards:
        
        name = member.select_one('.MemberCard_memberCardName__AA367 span').text.strip()
        position = member.select_one('.MemberCard_memberCardPosition__3b90Z p').text.strip()
        members.extend([name])
        role.extend([position])
        try:
            df = pd.DataFrame({"member": members, "position": role})
        except:
            print("members list length: " + str(len(members)))
            print(members)
            print("roles list length: " + str(len(role)))
            print(role)
            break
        members_df.append(df)
    in_com_dfs = pd.concat(members_df)
    member_dict[in_coms['committee'].iloc[i]] = in_com_dfs

%%

In [None]:
for k,v in member_dict.items():
    print('########################')
    print(k)
    print(v)
    print('\n')