## Setup

In [None]:
#imports
import os, sys, json, datetime, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time

from tqdm import tqdm
from functools import reduce
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown
from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.defined_functions import get_recent_file


In [3]:

# %%
# Make the project's "build files" folder the current working directory.
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files')

In [4]:
# Find the legislator lookup file. Whatever get_recent_file returns is passed
# straight to pd.read_csv below; it is kept under its own name so that
# legislators_df only ever refers to a DataFrame.
key_path = r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors'
legislators_file = get_recent_file("leg_lookup_*.csv", key_path)

# Find the activity and influence score files.
dir_path = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files'
activity_file = get_recent_file("activity*.csv", dir_path)
influence_file = get_recent_file("*infl*.csv", dir_path)

# Load each file into a DataFrame.
legislators_df = pd.read_csv(legislators_file)
activities_df = pd.read_csv(activity_file)
influence_df = pd.read_csv(influence_file)

# Blank out influence keys whose text contains 'nan'.
# Casting to str first keeps this working when full_pk was parsed as a numeric
# column (which has no .str accessor), and na=False guarantees a pure boolean
# mask even when the column already holds missing values.
bad_pk = influence_df['full_pk'].astype(str).str.contains('nan', na=False)
if bad_pk.any():
    influence_df.loc[bad_pk, 'full_pk'] = np.nan

print(influence_df.head())

# Rows without a usable key cannot be merged, so drop them and renumber.
influence_df = influence_df.dropna(subset=['full_pk']).reset_index(drop=True)


    full_pk first_name   last_name       party  tenure  influence_score
0  10006300    Cynthia      Almond  Republican     4.0             11.0
1  10006600       Alan       Baker  Republican    19.0             13.0
2  10004900    Russell     Bedsole  Republican     5.0             11.0
3  10008000      Chris  Blackshear  Republican     9.0             12.0
4  10006100     Ronald      Bolton  Republican     3.0             11.0


In [5]:
# Display the activities DataFrame for a visual check of its columns and rows.
activities_df

Unnamed: 0,full_pk,primary_key,first_name,last_name,activities_score,events
0,43000101,430001,Patrick,Hatlestad,15,ND SLR 2023
1,43000102,430001,David,Richter,30,ND SLR 2024 (Legislator)|ND SLR 2023 (Opening ...
2,43000201,430002,Dick,Anderson,15,ND SLR 2023
3,43000202,430002,Donald,Longmuir,15,ND SLR 2024 (Legislator)
4,43000601,430006,Dick,Anderson,15,ND SLR 2024 (Legislator)
...,...,...,...,...,...,...
250,45006900,450069,Mark,Tedford,15,OK SLR 2023
251,16009300,160093,Toni,Walker,10,ECLS 2024
252,34100900,341009,Barbara,Washington,10,ECLS 2024
253,45104800,451048,George,Young,15,OK SLR 2023


In [None]:

# Diagnostic only: echo every influence full_pk whose text form still contains
# 'nan'. Nothing in influence_df is modified here.
for pk_value in influence_df['full_pk']:
    if 'nan' in str(pk_value):
        print(pk_value)


In [4]:

# Normalise the join key: give full_pk an integer dtype in all three frames so
# the merges on 'full_pk' compare like with like.
all_dfs = [legislators_df, activities_df, influence_df]
for df in all_dfs:
    # Checking for "not already an integer dtype" (rather than "== object")
    # also converts keys that were parsed as floats or as a dedicated string
    # dtype, which the object-only check would silently leave unconverted.
    if not pd.api.types.is_integer_dtype(df['full_pk']):
        # Values that cannot be parsed become <NA>; the nullable Int64 dtype
        # can hold those alongside real integers.
        # The column is assigned on the frame object itself, so the change is
        # visible through legislators_df / activities_df / influence_df too.
        df['full_pk'] = pd.to_numeric(df['full_pk'], errors='coerce').astype('Int64')


In [5]:

    
# Print the dtype that full_pk holds in each of the collected frames.
for frame in all_dfs:
    print(frame['full_pk'].dtypes)


int64
int64
Int64


In [4]:
#narrow down scores dfs
# activities_df = activities_df.loc[:,['full_pk', 'activities_score']]
# influence_df = influence_df.loc[:,['full_pk', 'influence_score']]

In [None]:
# Merge step 1: attach the influence columns to the legislator lookup.
# A left join keeps every lookup row. Columns present in both frames keep the
# lookup's version: the right-hand copies receive a '_y' suffix and are dropped.
first_merge = legislators_df.merge(influence_df, how="left", on='full_pk', suffixes=('', '_y'))
duplicate_cols = first_merge.filter(regex='_y$').columns
first_merge = first_merge.drop(columns=duplicate_cols)

# Column inventory of the merged frame and of the next frame to be merged.
print(", ".join(map(str, first_merge.columns)))
print(", ".join(map(str, activities_df.columns)))
first_merge


In [None]:

# Merge step 2: attach the activity columns to the result of the first merge.
# Same pattern as before: left join on full_pk, then discard the '_y'-suffixed
# duplicates that came from the right-hand frame.
second_merge = first_merge.merge(activities_df, how="left", on='full_pk', suffixes=('', '_y'))
second_merge = second_merge.drop(columns=second_merge.filter(regex='_y$').columns)

second_merge

In [8]:

def count_events(event_str):
    """Return how many non-blank entries a '|'-delimited event string holds.

    Missing values (anything ``pd.isna`` reports as missing) count as zero.
    Pieces that are empty or whitespace-only after splitting are not counted.
    """
    if pd.isna(event_str):
        return 0
    return sum(1 for piece in event_str.split('|') if piece.strip())

# Store, for every row, the number of events listed in its 'events' value.
second_merge['event_count'] = second_merge['events'].map(count_events)
# print(second_merge.columns)
# print(second_merge.to_string())



In [9]:
from datetime import date

# Start the export frame as an independent copy of the merged data, then list
# its columns.
final_df = second_merge.copy(deep=True)
print(', '.join(map(str, final_df.columns)))


full_pk, primary_key, first_name, last_name, party, influence_score, activities_score, events, event_count


In [None]:

# Decode the fields packed into each full_pk. The patterns below read the key
# as: a 2-digit state code, a 1-digit chamber flag, a 3-digit district and a
# trailing 2-digit seat number.
#
# The decoding is done column-wise rather than row by row, which
#   * does not depend on final_df having a 0..n-1 index (enumerate() positions
#     used as .loc labels only line up on such an index),
#   * always creates state, chamber, district, seat_num in this order, instead
#     of an order that depends on whether the first row has a seat number,
#   * avoids one .loc write per cell.
SCORE_YEAR = "2025"  # year label written to every exported row

pk_text = final_df['full_pk'].astype(str)
pk_prefix = pk_text.str.extract(r'^(\d{2})(\d)(\d{3})')  # 0=state, 1=chamber, 2=district
seat_code = pk_text.str.extract(r'(\d{2})$')[0]

# Fail with a readable message when a key does not have the expected digits.
undecodable = pk_prefix.isna().any(axis=1) | seat_code.isna()
if undecodable.any():
    bad_examples = pk_text[undecodable].tolist()[:10]
    raise ValueError(f"full_pk values that cannot be decoded: {bad_examples}")

# State name looked up from the numeric code; codes missing from
# state_coding_r yield None, as .get() does.
final_df['state'] = [state_coding_r.get(int(code)) for code in pk_prefix[0]]

# Chamber flag 0 means House; any other digit is treated as Senate.
final_df['chamber'] = np.where(pk_prefix[1].astype(int) == 0, 'House', 'Senate')

# int() already ignores leading zeros, so '063' -> 63. Stripping the zeros by
# hand first would turn '000' into an empty string and raise.
final_df['district'] = 'District ' + pk_prefix[2].astype(int).astype(str)

# Seat '00' marks a single-seat district: leave seat_num missing for those.
final_df['seat_num'] = ('Seat ' + seat_code).where(seat_code != '00')

print(*final_df.columns, sep=', ')

# Plain column assignment creates the column if needed.
final_df['year'] = SCORE_YEAR
   

full_pk, primary_key, first_name, last_name, party, influence_score, activities_score, events, event_count, state, chamber, district, seat_num


In [None]:

# Export the compiled scores as Excel and CSV, stamped with today's date.
# The stamp is built once, outside the f-strings: reusing the same quote
# character inside an f-string replacement field is a SyntaxError before
# Python 3.12, and a single date.today() call keeps both file names in sync.
export_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\compiled scores\2025'
date_stamp = date.today().isoformat().replace('-', '_')
file_name = f'compiled_scores{date_stamp}.xlsx'
csv_file_name = f'compiled_scores{date_stamp}.csv'
final_df.to_excel(fr'{export_dir}\{file_name}', index=False)
final_df.to_csv(fr'{export_dir}\{csv_file_name}', index=False)
final_df

## Pulling in bills

In [14]:

# Read the bills workbook into a DataFrame.
bills_file = (
    r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents"
    r"\Development (formerly Grants Management)\!Administrative\Christian"
    r"\Legislators Data\leg_data_update_10_2024\build files\bills\leg_bills_info.xlsx"
)
bills = pd.read_excel(io=bills_file)


In [None]:

# Left-join the bills table onto every row of final_df, matching on primary_key.
compiled_plus_bills = final_df.merge(bills, on='primary_key', how="left")
compiled_plus_bills


In [23]:

# Export the scores-plus-bills table as Excel and CSV, stamped with today's date.
# The stamp is built once, outside the f-strings: reusing the same quote
# character inside an f-string replacement field is a SyntaxError before
# Python 3.12, and a single date.today() call keeps both file names in sync.
export_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\compiled scores\2025'
date_stamp = date.today().isoformat().replace('-', '_')
file_name_cb = f'compiled_plus_bills{date_stamp}.xlsx'
csv_file_name_cb = f'compiled_plus_bills{date_stamp}.csv'
# The worksheet is named after the workbook, minus its extension.
sheet_name_cb = file_name_cb.replace(".xlsx", "")
compiled_plus_bills.to_excel(fr'{export_dir}\{file_name_cb}', sheet_name=sheet_name_cb, index=False)
compiled_plus_bills.to_csv(fr'{export_dir}\{csv_file_name_cb}', index=False)


In [17]:
# One output row per bill: split each '|'-delimited bill_labels string into a
# list, store the lists in a new 'bills' column, then explode that column so
# every list element gets its own row.
bill_lists = compiled_plus_bills['bill_labels'].str.split('|')
bills_and_legislators = compiled_plus_bills.assign(bills=bill_lists).explode('bills')

# Optionally, clean up the brackets
# bills_and_legislators['Values'] = df_expanded['Values'].str.strip('[]')
# print(compiled_plus_bills.columns)
# bills_and_legislators

In [21]:

# Export the one-row-per-bill table as Excel and CSV, stamped with today's date.
# The stamp is built once, outside the f-strings: reusing the same quote
# character inside an f-string replacement field is a SyntaxError before
# Python 3.12, and a single date.today() call keeps both file names in sync.
export_dir = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\compiled scores\2025'
date_stamp = date.today().isoformat().replace('-', '_')
file_name_bl = f'bills_and_legislators{date_stamp}.xlsx'
csv_file_name_bl = f'bills_and_legislators{date_stamp}.csv'
# The worksheet is named after the workbook, minus its extension.
sheet_name_bl = file_name_bl.replace(".xlsx", "")
bills_and_legislators.to_excel(fr'{export_dir}\{file_name_bl}', sheet_name=sheet_name_bl, index=False)
bills_and_legislators.to_csv(fr'{export_dir}\{csv_file_name_bl}', index=False)


In [19]:

# , suffixes=('', '_y'))
# second_merge.drop(second_merge.filter(regex='_y$').columns, axis=1, inplace=True)

In [20]:



# print(final_df.columns)

# for i,j in enumerate(final_df['full_pk']):
#     print('#############')
#     print(j)




# final_df