## Imports and Definitions

### Imports

In [1]:
import os, sys, json, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
import glob
import xlsxwriter
from tqdm import tqdm
from datetime import date #date/time manipulation
import lxml
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat, state_abbreviations
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_recent_file

import requests

from bs4 import BeautifulSoup,SoupStrainer
from io import StringIO

### Definitions

In [2]:
def filter_row(row, string_column):
    """Classify a row as "House", "Senate" or 'Unknown' from one of its text fields.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``string_column`` (e.g. a DataFrame row).
    string_column : hashable
        Key of the field holding the district description.

    Returns
    -------
    str
        "House" when the text mentions house/representative, otherwise
        "Senate" when it mentions senate, otherwise 'Unknown'.
    """
    text = str(row[string_column])

    # Order matters: the House pattern is tested before the Senate pattern.
    chamber_patterns = (
        (r'[Hh]ouse|[Rr]epresentative', "House"),
        (r'[Ss]enate', "Senate"),
    )
    for pattern, chamber in chamber_patterns:
        if re.search(pattern, text):
            return chamber
    return 'Unknown'


## Ballotpedia Info Pull

This section stays commented out until a re-pull is needed; otherwise, use the saved file loaded in the "JSON File Load" section.

In [3]:
# Initial Pull

# #initializing webscraping info
# soup_url = r'https://ballotpedia.org/State_Legislative_Districts'
# all_districts = []
# response = requests.get(soup_url, verify = False)
# soup = BeautifulSoup(response.content, 'html.parser')
# state_districts = soup.find_all("a", href = True)
# h_refs = []
# for url in state_districts:
#     if 'state legislative districts' in str(url):
#         # print(url)
#         base = "https://ballotpedia.org/"
#         full_url = base + str(url.text).replace(' ',"_")
#         h_refs.append(full_url)

In [4]:
#Main Webscrape


# Fetches all of the districts (commented out until needed to repull)
# for ref in h_refs:
#     url = ref
    
#     page = requests.get(url)
    
#     os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\txt files for troubleshooting')

#     # Write the page's text content to a file
#     # with open('output_soup_strainer.txt', "w", encoding="utf-8") as f:
#     #     f.write(page.text)
#     # print(page.content)
#     table_strainer = SoupStrainer('table', id='officeholder-table')
#     page_soup = BeautifulSoup(page.content, 'html.parser', parse_only=table_strainer)

#     # print(page_soup.content)
#     # print(type(page_soup))
#     districts = page_soup.find_all("a")
#     total_districts = []
#     # print(page_soup.prettify())
#     for d in districts:
#         total_districts.append(d.text)
#         # print(d.text)
#     all_districts.extend(total_districts)


### JSON File Load

In [5]:
# # Save data to JSON file
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\json save data')
# with open(f"all_districts_{str(date.today()).replace('-', '_')}.json", "w") as f:
#     json.dump(all_districts, f)
#     save_file_name = f.name
#     print(save_file_name)

In [6]:
#loading districts webscraping data
# The working directory is still switched to the save folder (as before), so the
# file names below are relative to it.
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\json save data')

# Only the saved scrape files are candidates. Previously every file in the folder
# tree was considered, so an unrelated (or nested) file could be picked and then
# fail to open or parse.
json_files = glob.glob('all_districts_*.json')
if not json_files:
    raise FileNotFoundError(
        "No 'all_districts_*.json' file found in " + os.getcwd()
        + "; re-run the Ballotpedia pull and save cells to create one."
    )

# Most recently modified scrape wins.
save_file_name = max(json_files, key=os.path.getmtime)
print(save_file_name)

with open(save_file_name, "r") as f:
    all_districts = json.load(f)


all_districts_2024_11_22.json


### Compile and Curate df for Seats

In [7]:
#pull together state abbreviations for each scraped district string
# NOTE: the (misspelled) name `state_intitals` is kept because the next cell reads it.
state_intitals = []
unmatched_districts = []
for district_name in all_districts:
    state_matches = re.findall(state_pat, str(district_name))
    if not state_matches:
        # No state recognised in this string: record None instead of raising
        # IndexError, so a single odd entry does not abort the whole run.
        unmatched_districts.append(district_name)
        state_intitals.append(None)
        continue
    state = state_matches[0].strip()
    # state_ref.get returns None when the state name has no abbreviation entry.
    state_intitals.append(state_ref.get(state))

if unmatched_districts:
    print('no state found for:', *unmatched_districts, sep='\n')


In [8]:
#compile and clean districts data
districts_w_intials = pd.DataFrame({'state_abbreviation': state_intitals,'district_string': all_districts})

# Drop entries whose text mentions "historical" or contains 9A / 9B.
is_excluded = districts_w_intials['district_string'].str.contains(r'[Hh]istorical|9[AB]{1}', regex=True)
districts_w_intials = districts_w_intials[~is_excluded]

# Keep only the states listed in thi_states. reset_index returns a fresh frame,
# so the column assignments below no longer write into a filtered slice (which
# previously relied on the chained-assignment warning being switched off).
thi_state_districts = (
    districts_w_intials[districts_w_intials['state_abbreviation'].isin(thi_states)]
    .reset_index(drop=True)
)

# Label each district as House / Senate / Unknown from its description.
thi_state_districts['chamber'] = thi_state_districts.apply(
    filter_row, args=('district_string',), axis=1
)

# Collect every run of digits in the description, joined by single spaces.
# Rows without any digits receive NaN on assignment.
# NOTE(review): if any description holds more than one number, rows with fewer
# numbers get trailing spaces from the ''-filled columns - confirm create_pk
# tolerates that.
thi_state_districts["district"] = thi_state_districts["district_string"].str.extractall(r"(\d+)").unstack().fillna('').apply(' '.join, 1)



In [9]:
#create pk for leg seats from ballot pedia

leg_keys, leg_keys_dupes = create_pk(thi_state_districts, 'district', 'chamber')

# Stack the unique-key rows and the duplicate-key rows into one lookup table.
leg_lookup = pd.concat([leg_keys,leg_keys_dupes]).reset_index(drop = True)
# Unpacking a DataFrame iterates its column labels, so this prints the column
# names of the duplicate-key frame (not its rows).
print(*leg_keys_dupes, sep = ', ')

#uncomment for help troubleshooting
#  os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# leg_lookup.to_csv('leg_lookup.csv')

#get states with multi_seat legislature
# Primary-key prefixes treated as multi-seat.
# NOTE(review): presumably state+chamber codes produced by create_pk - confirm.
MULTI_SEAT_PK_PREFIXES = ('430', '571')
# na=False keeps rows with a missing key out of the mask instead of breaking it.
in_multi_seat = leg_lookup['primary_key'].str.startswith(MULTI_SEAT_PK_PREFIXES, na=False)
multi_seats = list(set(leg_lookup.loc[in_multi_seat, 'primary_key'].to_list()))

#assign seats
# Object dtype from the start: the column holds seat numbers as strings, and
# writing strings into a float NaN column is deprecated in newer pandas.
leg_lookup['seat'] = pd.Series(np.nan, index=leg_lookup.index, dtype=object)

# Number the rows sharing a multi-seat primary key 1, 2, ... in row order
# (same numbering the per-key loop produced); all other rows keep NaN.
leg_lookup.loc[in_multi_seat, 'seat'] = (
    leg_lookup[in_multi_seat].groupby('primary_key').cumcount() + 1
).astype(str)

primary_key, district_code, state_abbreviation, district_string, chamber, district, state_code, chamber_code


In [10]:
#create full_pk for leg_lookup
# full_pk = primary_key + "0" + seat when a seat is set, otherwise primary_key + '00'.

leg_lookup = leg_lookup.copy()

has_seat = leg_lookup['seat'].notna()
keyed_with_seat = leg_lookup['primary_key'] + "0" + leg_lookup['seat']
leg_lookup.loc[has_seat, 'full_pk'] = keyed_with_seat

no_seat = leg_lookup['seat'].isnull()
leg_lookup.loc[no_seat, 'full_pk'] = leg_lookup.loc[no_seat, 'primary_key'] + '00'

# Put full_pk in the first column position.
leg_lookup.insert(0, 'full_pk', leg_lookup.pop('full_pk'))

# Snapshot of the Ballotpedia-derived lookup used by the later cells.
bp_leg_lookup = leg_lookup.copy()

## Grabbing actual people

### Initial Data set up

In [11]:
#pull in current year file
# get_recent_file is given the glob pattern and the folder; its result is read as an Excel workbook.
path = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\all_leg_files\2025"
all_leg_data = get_recent_file(r'*.xlsx', path)
all_leg_df = pd.read_excel(all_leg_data)

# Lower-case every column header so later cells can use names like 'last name'.
all_leg_df.columns = all_leg_df.columns.map(lambda header: header.lower())
# all_leg_df = all_leg_df.iloc[:,2:].reset_index(drop = True)


# all_leg_df


In [12]:
# only for if you have all records
#getting and creating key for all leg files
# dir = r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files'
# all_leg_data = get_recent_file('*.xlsx', dir)

# all_leg_data = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\all_leg_records.xlsx"


# all_leg_df = pd.read_excel(all_leg_data)
# all_leg_df.columns = [x.lower() for x in all_leg_df.columns]
# all_leg_df = all_leg_df.iloc[:,2:].reset_index(drop = True)

# all_leg_df = all_leg_df[all_leg_df['recorded_year']==2025].reset_index(drop=True)
# all_leg_df


In [13]:

#extract district from district string and replace 
# all_leg_df["district"] = all_leg_df["district"].str.extractall(r"(\d+)")[0].unstack().fillna('').apply(' '.join, 1)
# all_leg_df.drop(['District'], axis = 1)

# all_leg_df

## Key Creation

In [14]:
#DEPRECATED all_legs already has pk

#bring in leg files
# all_leg_wkey, all_leg_dupes_wkey = create_pk(all_leg_df, 'district', 'chamber')
# all_leg_wkey, all_leg_dupes_wkey = add_seats(all_leg_wkey, all_leg_dupes_wkey, 'First Name', 'Last Name', keep_names = True)

#pull back in all people into one file
# all_leg_lookup = pd.concat([all_leg_wkey, all_leg_dupes_wkey]).reset_index(drop = True)
# all_leg_lookup.drop(['full_pk'], axis = 0).reset_index()

In [15]:

#make dictionary to show full pks available at each primary key
leg_lookbook = bp_leg_lookup.groupby(['primary_key'])['full_pk'].apply(list).reset_index()
leg_dict = dict(zip(leg_lookbook['primary_key'], leg_lookbook['full_pk']))

#go through legislator data and apply
# Keys are looked up by their string form, as before (str(key)).
pk_str = all_leg_df['primary_key'].map(str)

# Number of full_pks (seats) listed for each legislator's primary key;
# NaN when the key is not in leg_dict.
slots_per_key = {key: len(full_pks) for key, full_pks in leg_dict.items()}
n_slots = pk_str.map(slots_per_key)

# Report keys that have no match; those rows keep a missing full_pk.
for raw_key in all_leg_df.loc[n_slots.isna(), 'primary_key']:
    print('something wrong')
    print(raw_key)

# Seat number = alphabetical position of the last name among the legislators
# sharing a primary key. The stable sort breaks ties by row order, so two people
# with the same last name get distinct seats, and a third or later person gets
# 3, 4, ... instead of reusing whatever seat the previous row was given.
# Assumes all_leg_df has unique index labels (the default after read_excel).
seat_number = (
    pd.DataFrame({'pk': pk_str, 'last_name': all_leg_df['last name'].map(str)})
    .sort_values('last_name', kind='stable')
    .groupby('pk')
    .cumcount()
    + 1
).reindex(all_leg_df.index)

single_seat = n_slots == 1
multi_seat = n_slots > 1

# Object dtype so the string keys are not written into a float column.
full_pk = pd.Series(np.nan, index=all_leg_df.index, dtype=object)
full_pk.loc[single_seat] = pk_str[single_seat] + "00"
full_pk.loc[multi_seat] = pk_str[multi_seat] + "0" + seat_number[multi_seat].astype(str)

# Flag keys that have more legislators than listed seats, so they can be reviewed by hand.
overfull = n_slots.notna() & (seat_number > n_slots)
if overfull.any():
    print('more legislators than seats for primary keys:', sorted(set(pk_str[overfull])))

#this is the final full_pk for the year
all_leg_df['full_pk'] = full_pk

# Move the full_pk column to the first position
all_leg_df.insert(0, 'full_pk', all_leg_df.pop('full_pk'))


something wrong
nannannan


In [16]:
#export Final Data
# `date` and `os` come from the imports cell at the top of the notebook.
export_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025'

# Build the YYYY_MM_DD stamp outside the f-strings: reusing the same quote
# character inside an f-string is a SyntaxError before Python 3.12.
date_stamp = date.today().strftime('%Y_%m_%d')
file_name = f'leg_lookup_{date_stamp}.csv'
file_name_ex = f'leg_lookup_{date_stamp}.xlsx'

# Write the keyed legislator table in both formats, without the index column.
all_leg_df.to_csv(os.path.join(export_dir, file_name), index = False)
all_leg_df.to_excel(os.path.join(export_dir, file_name_ex), index = False)
