## Imports and Definitions

### Imports

In [91]:
import os, sys, json, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
import glob
import xlsxwriter
from tqdm import tqdm
from datetime import date #date/time manipulation
import lxml
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat, state_abbreviations
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats

import requests

from bs4 import BeautifulSoup,SoupStrainer
from io import StringIO

### Definitions

In [92]:
def filter_row(row, string_column):
    """Label a row's chamber from the text held in one of its columns.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Indexed with ``string_column``; the value is passed through ``str()``.
    string_column : hashable
        Key/column whose text is inspected.

    Returns
    -------
    str
        "House" when the text mentions house/representative, otherwise
        "Senate" when it mentions senate, otherwise 'Unknown'.
    """
    text = str(row[string_column])
    # Order matters: the House pattern is tried before the Senate pattern.
    chamber_patterns = (
        ("House", r'[Hh]ouse|[Rr]epresentative'),
        ("Senate", r'[Ss]enate'),
    )
    for label, pattern in chamber_patterns:
        if re.search(pattern, text):
            return label
    return 'Unknown'


## Get seats from Ballotpedia

### Initial Pull

In [93]:
# Fetch the Ballotpedia index page and build one URL per linked
# "... state legislative districts" page, collected in h_refs.
soup_url = r'https://ballotpedia.org/State_Legislative_Districts'
all_districts = []  # starts empty; the JSON-load cell further down rebinds it
response = requests.get(soup_url, verify = False)  # verify=False: certificate checking is disabled for this request
soup = BeautifulSoup(response.content, 'html.parser')
state_districts = soup.find_all("a", href = True)  # every <a> tag that carries an href attribute
h_refs = []
for url in state_districts:
    # str(url) is the tag's full markup, so the phrase is matched anywhere in the tag (case-sensitive)
    if 'state legislative districts' in str(url):
        # print(url)
        base = "https://ballotpedia.org/"
        # the URL is rebuilt from the link text (spaces -> underscores), not read from the href itself
        full_url = base + str(url.text).replace(' ',"_")
        h_refs.append(full_url)

### Main Webscrape

In [94]:
# Fetches all of the districts (commented out until needed to repull)
# for ref in h_refs:
#     url = ref
    
#     page = requests.get(url)
    
#     os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\txt files for troubleshooting')

#     # Write the page's text content to a file
#     # with open('output_soup_strainer.txt', "w", encoding="utf-8") as f:
#     #     f.write(page.text)
#     # print(page.content)
#     table_strainer = SoupStrainer('table', id='officeholder-table')
#     page_soup = BeautifulSoup(page.content, 'html.parser', parse_only=table_strainer)

#     # print(page_soup.content)
#     # print(type(page_soup))
#     districts = page_soup.find_all("a")
#     total_districts = []
#     # print(page_soup.prettify())
#     for d in districts:
#         total_districts.append(d.text)
#         # print(d.text)
#     all_districts.extend(total_districts)


In [95]:
# # Save data to JSON file
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\json save data')
# with open(f"all_districts_{str(date.today()).replace('-', '_')}.json", "w") as f:
#     json.dump(all_districts, f)
#     save_file_name = f.name
#     print(save_file_name)

### Load webscrape file from json save

In [96]:
#loading districts webscraping data
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\json save data')

# Only the dated scrape dumps are candidates. Walking the whole tree and taking
# the newest file of any kind (then opening it by bare name) loads the wrong
# file, or fails, whenever the newest file is not an all_districts_*.json that
# sits directly in this folder.
json_files = glob.glob('all_districts_*.json')
if not json_files:
    raise FileNotFoundError(
        "no all_districts_*.json save file found in " + os.getcwd()
    )

# the most recently modified save file wins
save_file_name = max(json_files, key=os.path.getmtime)
print(save_file_name)

with open(save_file_name, "r") as f:
    all_districts = json.load(f)


all_districts_2024_11_22.json


### Compile and Curate df for Seats

In [97]:
# Build the list of state abbreviations, one entry per scraped district name.
state_intitals = []
for district_name in all_districts:
    # first state_pat hit in the name; [0] raises IndexError when nothing matches
    matched_state = re.findall(state_pat, str(district_name))[0].strip()
    # state_ref.get() yields None for a name it does not know
    state_intitals.append(state_ref.get(matched_state))


In [98]:
#compile and clean districts data
districts_w_intials = pd.DataFrame({'state_abbreviation': state_intitals,'district_string': all_districts})

# drop rows whose name mentions "historical" or contains 9A / 9B
excluded = districts_w_intials['district_string'].str.contains(r'[Hh]istorical|9[AB]{1}', regex=True)
districts_w_intials = districts_w_intials[~excluded]

# Keep only the states listed in thi_states. The explicit .copy() makes this an
# independent frame, so the column assignments below never write into a slice
# of districts_w_intials (chained-assignment warnings are silenced at the top
# of the notebook, so such a write would go unnoticed).
thi_state_districts = (
    districts_w_intials[districts_w_intials['state_abbreviation'].isin(thi_states)]
    .copy()
    .reset_index(drop=True)
)

thi_state_districts['chamber'] = thi_state_districts.apply(
    filter_row, args=('district_string',), axis=1
)

# Every digit run in the name, joined by single spaces, per row.
# Grouping on the original row label (level 0 of the extractall index) avoids
# the trailing spaces that unstack().fillna('') + ' '.join adds to rows having
# fewer numbers than the widest row. Rows without digits stay NaN.
district_numbers = thi_state_districts["district_string"].str.extractall(r"(\d+)")[0]
thi_state_districts["district"] = district_numbers.groupby(level=0).agg(' '.join)


# Apply the filtering function row-wise
# filtered_df = thi_state_districts[thi_state_districts.apply(filter_row, args=('district_string',), axis=1)]
# thi_state_districts['chamber'] = ""
# thi_state_districts['chamber'] = ""
# thi_state_districts

In [99]:
# Create primary keys for the Ballotpedia seats.
# create_pk comes from cprl_functions.defined_functions; here it is unpacked into
# two frames (presumably unique keys and duplicated keys -- confirm in the helper).

leg_keys, leg_keys_dupes = create_pk(thi_state_districts, 'district', 'chamber')

# leg_lookup = pd.concat([leg_keys_wseats,leg_keys_dupes_wseats])
leg_lookup = pd.concat([leg_keys,leg_keys_dupes])
# iterating a DataFrame yields its column labels, so this prints the column names
print(*leg_keys_dupes, sep = ', ')

#uncomment for help troubleshooting
#  os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# leg_lookup.to_csv('leg_lookup.csv')

leg_lookup


primary_key, district_code, state_abbreviation, district_string, chamber, district, state_code, chamber_code


Unnamed: 0,primary_key,district_code,state_abbreviation,district_string,chamber,district,state_code,chamber_code
0,101001,001,AL,Alabama State Senate District 1,Senate,1,10,1
1,101002,002,AL,Alabama State Senate District 2,Senate,2,10,1
2,101003,003,AL,Alabama State Senate District 3,Senate,3,10,1
3,101004,004,AL,Alabama State Senate District 4,Senate,4,10,1
4,101005,005,AL,Alabama State Senate District 5,Senate,5,10,1
...,...,...,...,...,...,...,...,...
123,571015,015,WV,West Virginia State Senate District 15,Senate,15,57,1
124,571016,016,WV,West Virginia State Senate District 16,Senate,16,57,1
125,571016,016,WV,West Virginia State Senate District 16,Senate,16,57,1
126,571017,017,WV,West Virginia State Senate District 17,Senate,17,57,1


## Grabbing actual people

### Initial Data set up

In [None]:

# Load the combined legislator file (one dated CSV) into all_leg_df.

all_leg_data = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\all_legs_files_2024_12_11.csv" 
all_leg_df = pd.read_csv(all_leg_data)
print(all_leg_df.columns)  # quick check of the column names that were read


all_leg_df


Index(['primary_key', 'district_code', 'State Abbreviation', 'Chamber',
       'Title', 'First Name', 'Last Name', 'Party', 'tenure', 'leader',
       'district', 'state_code', 'chamber_code'],
      dtype='object')


Unnamed: 0,primary_key,district_code,State Abbreviation,Chamber,Title,First Name,Last Name,Party,tenure,leader,district,state_code,chamber_code
0,100063,63.0,AL,House,Alabama Representative,Cynthia,Almond,Republican,4.0,,63.0,10.0,0.0
1,100066,66.0,AL,House,Alabama Representative,Alan,Baker,Republican,19.0,,66.0,10.0,0.0
2,100049,49.0,AL,House,Alabama Representative,Russell,Bedsole,Republican,5.0,,49.0,10.0,0.0
3,100080,80.0,AL,House,Alabama Representative,Chris,Blackshear,Republican,9.0,,80.0,10.0,0.0
4,100061,61.0,AL,House,Alabama Representative,Ronald,Bolton,Republican,3.0,,61.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955,571004,4.0,WV,Senate,West Virginia Senator,Eric,Tarr,Republican,13.0,,4.0,57.0,1.0
1956,571014,14.0,WV,Senate,West Virginia Senator,Jay,Taylor,Republican,3.0,,14.0,57.0,1.0
1957,571001,1.0,WV,Senate,West Virginia Senator,Ryan,Weld,Republican,9.0,,1.0,57.0,1.0
1958,571015,15.0,WV,Senate,West Virginia Senator,Thomas,Willis,Republican,1.0,,15.0,57.0,1.0


In [None]:

# Pull every digit run out of each district value and join multiple hits with spaces,
# then overwrite the district column with the result.
# NOTE(review): the frame preview printed above shows `district` holding floats (e.g. 63.0);
# the .str accessor only works on string values -- confirm the column dtype before running.
# NOTE(review): fillna('') + ' '.join leaves trailing spaces on rows with fewer hits than the widest row.
all_leg_df["district"] = all_leg_df["district"].str.extractall(r"(\d+)")[0].unstack().fillna('').apply(' '.join, 1)
# all_leg_df.drop(['District'], axis = 1)

# all_leg_df

### Key Creation

In [80]:
# Key the legislator file, then stack the two frames create_pk returns
# back into a single lookup of people.
all_leg_wkey, all_leg_dupes_wkey = create_pk(all_leg_df, 'district', 'Chamber')
all_leg_lookup = pd.concat([all_leg_wkey, all_leg_dupes_wkey])

#uncomment for help troubleshooting
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# all_leg_lookup.to_csv('all_leg_lookup.csv')

# Columns carried into the merge: names from the legislator data,
# state abbreviation from the Ballotpedia seats.
legislator_cols = ['primary_key', 'First Name', 'Last Name']
seat_cols = ['primary_key','state_abbreviation']
all_leg_lookup_for_merge = all_leg_lookup[legislator_cols]
leg_lookup_for_merge = leg_lookup[seat_cols]

# Left join: every seat row is kept, with names attached where a key matches.
merge_1 = leg_lookup_for_merge.merge(all_leg_lookup_for_merge, how='left', on='primary_key')


issues with the district match
State Abbreviation                                   NC
Chamber                                           House
Title                 State Upper Body Committee Member
First Name                                        Joyce
Last Name                                       Krawiec
Party                                        Republican
District                                            NaN
tenure                                             10.0
leader                                              NaN
district                                            NaN
state_code                                          NaN
chamber_code                                        NaN
district_code                                       NaN
issues with the district match
State Abbreviation               WV
Chamber                       House
Title                  Lt. Governor
First Name                    Craig
Last Name                     Blair
Party                    Repub

### Deal with Dupes

In [81]:

# Rows whose primary_key occurs more than once belong to multi-seat districts.
shares_key = merge_1.duplicated(subset='primary_key', keep=False)

# multi-seat rows, with fully identical rows collapsed
merge_1_dupes = merge_1[shares_key].drop_duplicates()

# single-seat rows get their seat information now; they are combined at the end
merge_1_nodupes = merge_1[~shares_key]
merge_wseats = add_seats(df = merge_1_nodupes, keep_names = True)



In [82]:
# Trim the duplicated-key seats down to the two columns the next merge needs.
leg_keys_dupes_for_merge = leg_keys_dupes[['primary_key','state_abbreviation']]


In [83]:

# Left-join the duplicated-key seats onto the multi-seat legislator rows.
# Overlapping right-hand columns get the '_y' suffix and are dropped on the next line.
merge_2 = pd.merge(leg_keys_dupes_for_merge, merge_1_dupes, how='left', left_on='primary_key', right_on='primary_key', suffixes=('', '_y'))
merge_2.drop(merge_2.filter(regex='_y$').columns, axis=1, inplace=True)
merge_2 = merge_2.drop_duplicates()
# print(*merge_2.columns, sep=', ')

#add seats to last merge
# NOTE(review): the two column names are passed positionally here, while the other add_seats
# calls in this notebook pass df= / df_duplicates= by keyword -- confirm against add_seats' signature.
merge_2 = add_seats('First Name', 'Last Name', df_duplicates = merge_2, keep_names = True)



### Pull all data back together

In [84]:
# Stack the single-seat and multi-seat results and renumber the rows from 0.
to_combine = [merge_wseats, merge_2]
full_ref = pd.concat(to_combine).reset_index(drop=True)


In [85]:


# Add seat information to the Ballotpedia keys and stack both frames into ref_back.
leg_keys_wseats, leg_keys_dupes_wseats = add_seats(df = leg_keys, df_duplicates = leg_keys_dupes, keep_names = True)
ref_back = pd.concat([leg_keys_wseats, leg_keys_dupes_wseats])
ref_back.reset_index(inplace = True, drop = True)


#reference for checking vacancies

# Final lookup: key columns plus legislator names, renamed to snake_case.
leg_lookup_df = full_ref.loc[:,['full_pk', 'primary_key','First Name', 'Last Name']]
leg_lookup_df = leg_lookup_df.rename(columns={"First Name": "first_name", "Last Name": "last_name"})


# From the initial seat data: keep only the full key and the state abbreviation.
# ref_back is only referenced by the commented-out merge below in this cell.
ref_back = ref_back.loc[:,['full_pk', 'state_abbreviation']]
# full_ref = full_ref.drop(['state_abbreviation_x', 'state_abbreviation_y'], axis = 1)


# test = pd.merge(ref_back, full_ref, how='left', left_on='full_pk', right_on='full_pk')
# print(*leg_lookup_df.columns, sep=', ')

leg_lookup_df

Unnamed: 0,full_pk,primary_key,first_name,last_name
0,10100100,101001,Tim,Melson
1,10100200,101002,Tom,Butler
2,10100300,101003,Arthur,Orr
3,10100400,101004,Garlan,Gudger
4,10100500,101005,Greg,Reed
...,...,...,...,...
1823,57101500,571015,Thomas,Willis
1824,57101601,571016,Jason,Barrett
1825,57101602,571016,Patricia,Rucker
1826,57101701,571017,Eric,Nelson


## Export Key file

In [86]:
#export Final Data
from datetime import date

# Date stamp in YYYY_MM_DD form. strftime gives the same text as
# str(date.today()).replace('-', '_') without reusing the f-string's own quote
# character inside the braces, which is a SyntaxError before Python 3.12.
file_name = f"leg_lookup_{date.today().strftime('%Y_%m_%d')}.csv"
leg_lookup_df.to_csv(fr'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup\{file_name}', index = False)


### Seat info
Pulls in the seat data without removing unfilled seats.

In [38]:
# seat info
# Built directly from the Ballotpedia keys. The earlier copy of
# leg_lookup_for_merge was overwritten by the concat below before it was ever
# read, so it only added a dependency on another cell and has been dropped.

#add seats
leg_seats_info_unique, leg_seats_info_dupes = add_seats(df = leg_keys, df_duplicates = leg_keys_dupes)
leg_seats_info = pd.concat([leg_seats_info_unique, leg_seats_info_dupes])
# written to the current working directory under the bare name 'all_seats' (no extension)
leg_seats_info.to_csv('all_seats')


In [42]:

#export seat keys
from datetime import date

# One YYYY_MM_DD stamp shared by both files. strftime avoids reusing the
# f-string's own quote character inside the braces (a SyntaxError before
# Python 3.12), and computing it once keeps the two names in step.
date_stamp = date.today().strftime('%Y_%m_%d')
ex_file_name = f'leg_seats_info_{date_stamp}.xlsx'
csv_file_name = f'leg_seats_info_{date_stamp}.csv'
leg_seats_info.to_excel(fr'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\{ex_file_name}', index = False)
leg_seats_info.to_csv(fr'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\{csv_file_name}', index = False)


# END

In [None]:

# for i,j in enumerate(leg_seats_info['full_pk']):
#     j_pk = re.findall(r'^\d{6}', str(j))[0]
#     seat_num = re.findall(r'\d{2}$', str(j))[0]
#     # print(f'seat num is {seat_num}')
#     # print(f'type is {type(seat_num)}')
    
#     if seat_num == '00':
#         # print('its a single seat')
#         seat_num_v = np.nan
#     else:
#         seat_num_v = 'Seat ' + seat_num

    
#     state_match = re.findall(r'^\d{2}', str(j_pk))
#     state = state_coding_r.get(int(state_match[0]))
#     chamber = int(re.findall(r'(?<=^\d{2})\d{1}(?=\d{3})', str(j_pk))[0])
    
#     if chamber == 0:
#         chamber_v = 'House'
#     else:
#         chamber_v = 'Senate'
    
#     district = int(re.findall(r'(?<=^\d{3})\d{3}$', str(j_pk))[0].lstrip('0'))
#     district_v = f'District {district}'
    
    
    
#     if str(seat_num_v) != 'nan':
#         leg_seats_info.loc[i,'seat_num'] = seat_num_v
        


#     # print('################')
#     # print(f'state is {state}')
#     # print(f'chamber is {chamber_v}')
#     # print(f'district is {district_v}')
#     # print(seat_num_v)
    

#     leg_seats_info.loc[i,'state'] = state
#     leg_seats_info.loc[i,'chamber'] = chamber_v
#     leg_seats_info.loc[i,'district'] = district_v

# leg_lookup_for_merge
    

### END

%%splitting up district numbers from rest of string

In [None]:

def create_pk(df,column):
    """Add state_code, chamber_code, district and primary_key columns to ``df``.

    NOTE(review): this two-argument definition shadows the ``create_pk``
    imported from cprl_functions.defined_functions at the top of the notebook,
    which the main cells call with three arguments and unpack into two frames.
    Running this cell breaks those calls until the import cell is re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a 'state_abbreviation' column. It is
        modified in place and also returned.
    column : str
        Column holding the district description text (for example
        "Alabama State Senate District 1").

    Returns
    -------
    pandas.DataFrame
        The same ``df``. primary_key is state code + chamber code ('0' when
        the text contains "house", '1' when it contains "senate") + the
        district number left-padded with zeros to three characters.

    Raises
    ------
    IndexError
        When a value has no whitespace-preceded number to use as the district.

    Notes
    -----
    Processing stops at the first value naming neither chamber (a message is
    printed); that row and every later row keep NaN in the four new columns.
    """
    n_rows = len(df)
    state_codes = [np.nan] * n_rows
    chamber_codes = [np.nan] * n_rows
    districts = [np.nan] * n_rows
    primary_keys = [np.nan] * n_rows

    # Rows are paired by position rather than looked up with df.loc[i, ...]:
    # an enumerate counter is only a valid label on a fresh 0..n-1 index, and on
    # any other index .loc would read the wrong row or append new ones.
    row_pairs = zip(df[column], df['state_abbreviation'])
    for pos, (raw_value, ext_state) in enumerate(row_pairs):
        text = str(raw_value)
        district_raw = re.split(r'\s(?=District)', text)
        match = re.findall(r'\s\d+', str(district_raw))[0].strip()
        district_code = match.zfill(3)  # '7' -> '007', '42' -> '042'
        state_code = state_coding.get(ext_state)

        lowered = text.lower()
        if 'house' in lowered:
            chamber_code = '0'
        elif 'senate' in lowered:
            chamber_code = '1'
        else:
            print(f'unknown chamber: {text}')
            break

        # display_markdown(f'#### {ext_state} - {chamber_code} - {district_raw}', raw=True)
        state_codes[pos] = state_code
        chamber_codes[pos] = chamber_code
        districts[pos] = match
        primary_keys[pos] = f'{state_code}{chamber_code}{district_code}'

    # Whole-column assignment lets pandas choose a dtype that fits the string
    # codes, instead of writing strings one by one into float NaN columns.
    df['state_code'] = state_codes
    df['chamber_code'] = chamber_codes
    df['district'] = districts
    df['primary_key'] = primary_keys
    return df

In [None]:
# Original inline version of the key-building loop (author's note: don't touch).
# It writes state_code, chamber_code, district and primary_key onto thi_state_districts.

lengths = []  # digit count of every district number seen
thi_state_districts['state_code'] = np.nan
thi_state_districts['chamber_code'] = np.nan
thi_state_districts['district'] = np.nan
thi_state_districts['primary_key'] = np.nan
for i,j in enumerate(thi_state_districts['district_string']):
    # print(str(j))
    # print(row)
    district_raw = re.split(r'\s(?=District)', str(j))
    # first whitespace-preceded number in the text; [0] raises IndexError when there is none
    match = re.findall(r'\s\d+', str(district_raw))[0]
    match = match.strip()
    # left-pad the district number with zeros to three characters
    if len(match) == 2:
        district_code = '0' + str(match)
    elif len(match) == 1:
        district_code = '00'+str(match)
    else:
        district_code = str(match)
    district_len = len(match)
    lengths.append(district_len)
    # i is the enumerate position used as a .loc label -- only lines up on a 0..n-1 index
    ext_state = thi_state_districts.loc[i,'state_abbreviation']
    state_code = state_coding.get(ext_state)
    if 'house' in str(j).lower():
        chamber_code = '0'
    elif 'senate' in str(j).lower():
        chamber_code = '1'
    else:
        # stops the whole loop; this row and all later rows keep NaN
        print(f'unknown chamber: {str(j)}')
        break
    
    # display_markdown(f'#### {ext_state} - {chamber_code} - {district_raw}', raw=True)
    # key layout: state code, then chamber code, then zero-padded district
    key_code = f'{state_code}{chamber_code}{district_code}'
    
    
    thi_state_districts.loc[i,'state_code'] = state_code
    thi_state_districts.loc[i,'chamber_code'] = chamber_code
    thi_state_districts.loc[i,'district'] = match
    thi_state_districts.loc[i,'primary_key'] = key_code

In [None]:
    
    
# %% takes duplicates of primary keys and assigns a seat num

A seat num of 00 means there is only one seat (no multi-member districts).

In [None]:
# Number the seats inside each primary_key.
# 'count' is a temporary column: how many rows share the row's primary_key.
thi_state_districts['count'] = thi_state_districts['primary_key'].map(thi_state_districts['primary_key'].value_counts())
# running 1, 2, 3, ... within each primary_key
thi_state_districts['seat_num'] = thi_state_districts.groupby('primary_key').cumcount() + 1
# a key that occurs once gets seat number 0
thi_state_districts.loc[thi_state_districts['count'] == 1, 'seat_num'] = 0
thi_state_districts = thi_state_districts.drop(columns='count')
# two-digit, zero-padded text: 0 -> '00', 1 -> '01'
thi_state_districts['seat_num'] = thi_state_districts['seat_num'].apply(lambda x: f'{x:02d}')
# `+` joins the two columns; presumably primary_key holds strings here -- confirm
thi_state_districts['final_primary_key'] = thi_state_districts['primary_key'] + thi_state_districts['seat_num']
# %%

In [None]:
# %%

In [None]:
# Rows whose primary_key repeats an earlier row's key (the first occurrence is not flagged).
duplicate = thi_state_districts[thi_state_districts.duplicated('primary_key')]

    # print(str(j))
    # print(district_len)
    

# `lengths` is filled by the key-building loop in an earlier scratch cell
print(max(lengths))

In [None]:
    # print(line.split('District')[-1])

%%

In [None]:
# Print every scraped district name, one per line.
print(*all_districts, sep='\n')

In [None]:
# Print each column label on its own line (iterating a DataFrame yields its
# column labels). A plain print call replaces the list comprehension, which
# was used only for its side effect and echoed a list of None as cell output.
print(*thi_state_districts, sep='\n')

    # states.append(str(state_match))
# %%

In [None]:
# De-duplicate and sort the collected state names, then show the count and the list.
# `states` must already exist in the kernel; it is not assigned in any live cell here.
states = sorted(set(states))
print(len(states))
print(*states, sep="\n")

In [None]:
# Print a one-row table per state with its 1-based position as a
# zero-padded two-character code.
for position, state_name in enumerate(states, start=1):
    code = str(position).zfill(2)
    df = pd.DataFrame({'state': [state_name], 'code':[code]})
    print(df.to_string())

%%

In [None]:
    
    # NOTE(review): detached debugging fragment -- no enclosing loop is visible in this
    # cell, so it cannot run as written; `page` is only assigned in the commented-out scrape loop.
    break
    print(page.status_code)
    print(page.content[:500])  # Preview the content
    
    # df_list = pd.read_html(page_soup.prettify())
    # print(len(df_list))
    # print(type(df_list))

In [None]:
    # print(df)

    # dis_soup = BeautifulSoup(response.content, 'html.parser')
    # ditricts_tags = dis_soup.find_all("td")
    # for d in ditricts_tags:
    #     print(d)
    
	

%%