In [1]:
import os, sys, json, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key, get_recent_file

# All Legs Per Year

In [2]:
# What Year?
year = "2025"


In [3]:
#get all leg files


#gather all legislator files from done folder
#committee data doesnt need to be pulled for this

#grab directory for data
dir_path = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\leg_data_by_state'

full_path = os.path.join(dir_path,year)
# print(full_path)


#gets most recent file in each folder
os.chdir(full_path)
leg_files = [get_recent_file(f'{f}_legislators*', full_path) for f in next(os.walk('.'))[1]]


## trouble shooting print lines
# print((leg_files))
# [print(x.split('\\')[-1]) for x in leg_files]

In [4]:
#file validation

#quick check to make sure files are found and are excel files
import os
for file in leg_files:
    

    if not os.path.isfile(file):
        raise FileNotFoundError(f"File not found: {file}")

    if not file.endswith('.xlsx'):
        raise ValueError(f"Invalid file format. Expected an .xlsx file, got: {file.split('.')[-1]}")
    


In [5]:

#file compilation

"""compiles legislator files into one file
goes through each sheet and retrieves sheet as dataframe"""

dfs = {}
for i,file in enumerate(leg_files):
    
    ##uncomment the lines below for troubleshooting
    # print('working on file:' + str(file))
    # file = legislator_files[0]
    # xls = pd.ExcelFile(file)
    ##

    sheets_dict = pd.read_excel(file, engine="openpyxl", sheet_name=None)
    sheet_names = list(sheets_dict.keys())
    for s in sheet_names:
        df = pd.read_excel(file, engine="openpyxl", sheet_name=s)
        # print(*df.columns, sep = " | ")
        
        filename =  f'{s}'
        dfs[filename] = df


## Legislator Data Cleaning

In [None]:
#extract and replace district
for k,v in dfs.items():
    # display_markdown(f'## {k}', raw = True)
    # print(*v.columns, sep=", ")
    print(v.head(2).to_string())
    cols = v.columns.to_list()
    for c in cols:
        if 'District' in c:
            d_col = 'District'
            d_col_alt = 'district'
            break
        elif 'district' in c:
            d_col = 'district'
            break
        else:
            continue
            
    
    v[d_col] = v[d_col].astype(str)
    v[d_col] = v[d_col].str.extractall(r"(\d+)").unstack().fillna('').apply(' '.join, 1)

    if d_col == "District":
        v.rename(columns={d_col:d_col_alt}, inplace=True)
    # print(v.columns)
    dfs[k] = v





In [None]:
#create pk with no seats
dfs_w_pk = {}
emptys = {}

for k,v in dfs.items():
    # display_markdown(f'## {k}', raw = True)
    # print(v.columns)
    # print(type(v))
    if v.empty:
        print('1')
        print('intitial is empty?')
        break
    # v = v.drop(['Committee List', 'Date Assumed Office'], axis=1)
    
    #call funciton to get pk
    cleaned_df, dupes = create_pk(v,'district', 'Chamber')
    # print(type(cleaned_df))
    # print(type(dupes))
    dfs_to_review = [cleaned_df,dupes]
    
    for d in dfs_to_review:
        if cleaned_df.empty:
            dfs_to_review = [df for df in dfs_to_review if not df.equals(cleaned_df)]
        if dupes.empty:
            dfs_to_review = [df for df in dfs_to_review if not df.equals(dupes)]

    
    df = pd.concat(dfs_to_review).reset_index(drop = True)
    df.to_dict(orient='index')
    print(df.head(2))

    dfs_w_pk[k] = df
    
    
    


In [8]:
#save all dfs for use in influence calculation script

if year != "2024":
    for k,v in dfs_w_pk.items():
        v.to_csv(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\all_leg_dfs_w_pk\{k}.csv', index=False)

In [9]:
#creates standard leg files without extra info
#trims files to not include committee data
compiling = {}
for k,v in dfs_w_pk.items():
    print(k)
    print(v.head(2))
    # print(v.columns)
    # print(len(v))
    try:
        df = v.drop(['Committee List'], axis=1)
        
        compiling[k] = df
        #print(k," is in")
    except:
        df = v
        compiling[k] = df
        



AL_house
  primary_key district_code State Abbreviation Chamber  \
0      100063           063                 AL   House   
1      100066           066                 AL   House   

                    Title First Name Last Name       Party district  \
0  Alabama Representative    Cynthia    Almond  Republican       63   
1  Alabama Representative       Alan     Baker  Republican       66   

   Date Assumed Office                                     Committee List  \
0                 2021  Ethics and Campaign Finance (Vice Chair), Rule...   
1                 2006  Local Legislation (Chair), Education Policy (V...   

                                Name  tenure leader state_code chamber_code  
0  AL Rep. Cynthia Almond (R-AL-063)       4    NaN         10            0  
1      AL Rep. Alan Baker (R-AL-066)      19    NaN         10            0  
AL_senate
  primary_key district_code State Abbreviation Chamber            Title  \
0      101022           022                 AL  Sen

In [None]:
#testing print statements
for k,v in compiling.items():
    print('############')
    print(k)
    print(v.head(2))

In [11]:
#pull togther all newly trimmed df's

all_legs_files = pd.concat(list(compiling.values())).reset_index(drop = True)
# all_legs_files = all_legs_files.drop_duplicates().reset_index(drop = True)
# all_legs_files


In [12]:
#save all legislators file for the year to folder

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files')
legs_path = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\all_leg_files'


save_path = os.path.join(legs_path, year)
file_name = f'{year}_all_legs_files_{str(date.today()).replace('-','_')}.xlsx'
full_path = os.path.join(save_path, file_name)



try:
    all_legs_files.to_excel(full_path, sheet_name="all_legs", index=False)
except:
    print(full_path)
    



# print(*all_legs_files.columns, sep = " | ")
# all_legs_files


# All Legs Records

In [13]:


#pull together all years all_legs data


path_24 = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\2024"
path_25 = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\2025"

file_24 = get_recent_file(r'*.xlsx',path_24)
file_25 = get_recent_file(r'*.xlsx',path_25)

print(file_24)
print(file_25)

df_24 = pd.read_excel(file_24)
df_25 = pd.read_excel(file_25)

# # print(*df_24.columns, sep = " | ")
# print(*df_25.columns, sep = " | ")

# print('##########')


col_to_move = df_25.pop('Title')
df_25.insert(df_25.shape[1], 'Title', col_to_move)


col_to_move = df_25.pop('Date Assumed Office')
df_25.insert(df_25.shape[1], 'Date Assumed Office', col_to_move)




df_24 = df_24.drop(['state_code', 'chamber_code', 'full title'], axis=1)
df_25 = df_25.drop(['state_code', 'chamber_code', 'Name'], axis=1)

# print('##########')

print(*df_24.columns, sep = " | ")
print(*df_25.columns, sep = " | ")

# print(df_24.head(2))
# print(df_25.head(2))

# df_25

#final edits
df_24['recorded_year'] = "2024"
df_25['recorded_year'] = "2025"

all_leg_records = pd.concat([df_24, df_25]).reset_index(drop = False)


#export
filename = f'all_leg_records_{str(date.today()).replace('-','_')}.xlsx'
path = fr"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\all_leg_records\{filename}"
all_leg_records.to_excel(path, index=False)
all_leg_records


C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\2024\2024_all_legs_files_2025_02_04.xlsx
C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\2025\2025_all_legs_files_2025_02_04.xlsx


primary_key | district_code | State Abbreviation | Chamber | First Name | Last Name | Party | district | tenure | leader | Title | Date Assumed Office
primary_key | district_code | State Abbreviation | Chamber | First Name | Last Name | Party | district | tenure | leader | Title | Date Assumed Office


Unnamed: 0,index,primary_key,district_code,State Abbreviation,Chamber,First Name,Last Name,Party,district,tenure,leader,Title,Date Assumed Office,recorded_year
0,0,100063,63.0,AL,House,Cynthia,Almond,Republican,63.0,3,,,,2024
1,1,100066,66.0,AL,House,Alan,Baker,Republican,66.0,18,,,,2024
2,2,100049,49.0,AL,House,Russell,Bedsole,Republican,49.0,4,,,,2024
3,3,100080,80.0,AL,House,Chris,Blackshear,Republican,80.0,8,,,,2024
4,4,100061,61.0,AL,House,Ronald,Bolton,Republican,61.0,2,,,,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3967,1980,571015,15.0,WV,Senate,Darren,Thorne,Republican,15.0,0,,West Virginia Senator,2025.0,2025
3968,1981,571001,1.0,WV,Senate,Ryan,Weld,Republican,1.0,9,,West Virginia Senator,2016.0,2025
3969,1982,571015,15.0,WV,Senate,Thomas,Willis,Republican,15.0,1,,West Virginia Senator,2024.0,2025
3970,1983,571005,5.0,WV,Senate,Michael,Woelfel,Democrat,5.0,1,,West Virginia Senator,2024.0,2025
