Needed Data:


In [3]:
import os, sys, json, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key, get_recent_file

# Definitions

In [4]:
def bordered(text):
    """Return ``text`` framed in a box of Unicode line-drawing characters.

    NOTE: this definition replaces the ``bordered`` name imported from
    ``cprl_functions.text_printing`` in the import cell.

    Parameters
    ----------
    text : any
        Value to display. It is converted with ``str()``, so numbers, ``None``
        and other objects are boxed using their string form. Multi-line
        strings produce one boxed row per line.

    Returns
    -------
    str
        The boxed text, rows joined with newlines. Every row is padded with
        spaces to the width of the longest line.
    """
    # An empty string has no lines at all; box a single empty row instead.
    lines = str(text).splitlines() or ['']
    width = max(len(line) for line in lines)

    box = ['┌' + '─' * width + '┐']
    box.extend('│' + line.ljust(width) + '│' for line in lines)
    box.append('└' + '─' * width + '┘')
    return '\n'.join(box)



In [5]:
def is_majority_party(list, x):
    """Tell whether party ``x`` is the larger of the two major parties in ``list``.

    Parameters
    ----------
    list : iterable
        Party labels, one per legislator. An entry counts as Republican when
        its string form contains "Republican" and as Democrat when it contains
        "Democrat"; anything else (including missing values) is ignored.
    x : str
        Party label to test. It is compared for equality against the literal
        "Republican" or "Democrat".

    Returns
    -------
    bool
        True when ``x`` equals the party with more entries. False otherwise,
        including when the two counts are tied (no majority exists).
    """
    labels = [str(entry) for entry in list]
    rep_count = sum("Republican" in label for label in labels)
    dem_count = sum("Democrat" in label for label in labels)

    if rep_count > dem_count:
        maj_party = "Republican"
    elif dem_count > rep_count:
        maj_party = "Democrat"
    else:
        # A tie (or an empty list) has no majority party; previously this path
        # left maj_party unassigned and the comparison below raised.
        print('somehow they are equal')
        return False

    return bool(maj_party == x)
    


# Influence Scores

## Key Lookup from "Key_Creation.py"
Pull in the reference key from the main key file (built in "Key_Creation.py"); this notebook only reads that key.

In [8]:
## Locate the legislator reference (lookup) file; it is maintained outside this notebook.
# get_recent_file comes from cprl_functions.defined_functions. Presumably it returns the
# path of the most recent file in the folder that matches the pattern -- TODO confirm.
# The "*" pattern matches any file type, so the result is not guaranteed to be a CSV.
leg_lookup = get_recent_file("*", r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup")
# Print the chosen path so the run records which lookup file was used.
print(leg_lookup)


C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup\leg_lookup_2025_02_03.xlsx


In [None]:

# Load the legislator lookup table found by the previous cell.
# That cell searches with the "*" pattern, and the recorded run picked up an Excel
# workbook (leg_lookup_2025_02_03.xlsx). pd.read_csv cannot parse a workbook, so the
# reader is chosen from the file extension.
if str(leg_lookup).lower().endswith(('.xlsx', '.xls')):
    leg_lookup_ref = pd.read_excel(leg_lookup)
else:
    leg_lookup_ref = pd.read_csv(leg_lookup)

##troubleshoot line to look at columns
# print(*leg_lookup_ref.columns, sep = " | ")

##create leg_lookup dict: full_pk -> last_name for every row (not used anywhere)
leg_lookup_dict = leg_lookup_ref.set_index('full_pk')['last_name'].to_dict()

##keep only the rows whose full_pk does NOT end in '00' and build the same dict for them
# (judging by the variable names these are the multi-seat legislators -- TODO confirm)
multis_legs = leg_lookup_ref[~leg_lookup_ref['full_pk'].astype(str).str.endswith('00')]
print(*multis_legs.columns, sep = " | ")
ms_legs_lookup = multis_legs.set_index('full_pk')['last_name'].to_dict()


##pks of all rows kept above
ms_pks = multis_legs['full_pk'].to_list()

leg_lookup_ref


## Data Gathering
Gather data and clean for legislator data

## Manual Leadership Files
pulls in data from manually curated leadership positions

In [13]:
#manual leadership file for upload
leadership_positions_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\all_legs_files_w_rankings.csv"
leaders_lookup = pd.read_csv(leadership_positions_file)

#create primary key for leadership file
# create_pk (cprl_functions.defined_functions) returns two frames. The second one is
# treated below as the rows that still need a seat-specific key -- TODO confirm against the helper.
infl_rankings, rankings_dupes = create_pk(leaders_lookup, 'district', 'Chamber')

# rows without a district cannot be keyed
infl_rankings = infl_rankings.dropna(axis = 0, subset=['district']).reset_index(drop = True)

#fill in dupes seats and full pk
# The loop below reads rows by position (.iloc[i]) and writes them by label (.loc[i]).
# Renumbering the index 0..n-1 guarantees both address the same row.
rankings_dupes = rankings_dupes.reset_index(drop = True)
# object dtype: the column is filled with string keys, not floats
rankings_dupes['full_pk'] = pd.Series(np.nan, index=rankings_dupes.index, dtype=object)
if 'seat_num' not in rankings_dupes.columns:
    # make sure the column exists even when no row finds a match
    rankings_dupes['seat_num'] = pd.Series(np.nan, index=rankings_dupes.index, dtype=object)

for i,j in enumerate(rankings_dupes['district_code']):
    name = rankings_dupes['Last Name'].iloc[i]
    # candidate keys for this last name, taken from the lookup built earlier
    # (presumably every full_pk whose last_name equals `name` -- TODO confirm get_key)
    full_pks = get_key(name, ms_legs_lookup)

    for p in full_pks:
        p_str = str(p).strip()

        # characters 4-6 of the key: three digits preceded by three leading digits
        # and followed by at least two more digits
        match = re.findall(r'(?<=^\d{3})\d{3}(?=\d{2})', p_str)
        # `match` is empty when the key does not have that shape; skip it instead of
        # failing on match[0].
        # NOTE(review): the comparison assumes district_code holds the same zero-padded
        # text as the key; a numeric district_code would never be equal -- TODO confirm.
        if match and j == match[0]:
            rankings_dupes.loc[i,'full_pk'] = p_str
            # last two digits of the key are stored as the seat number
            seat_match = re.findall(r'\d{2}$', p_str)
            rankings_dupes.loc[i,'seat_num'] = seat_match[0]
            break


infl_non_dupes = add_seats(df = infl_rankings)
infl_non_dupes = infl_non_dupes.loc[:,['full_pk', 'primary_key', 'seat_num','First Name', 'Last Name', 'leader']]
rankings_dupes = rankings_dupes.loc[:,['full_pk', 'primary_key', 'seat_num', 'First Name', 'Last Name','leader']]
leadership_files = pd.concat([infl_non_dupes, rankings_dupes])
leadership_files.reset_index(inplace=True, drop=True)
# leaders_lookup = leaders_lookup.loc[:, ['helper', "leader"]]

# full_pk -> leader text, used by the influence score cell
leadership_dict = (leadership_files.loc[:,['full_pk', 'leader']]).set_index('full_pk')['leader'].to_dict()




issues with the district match
State Abbreviation                          WV
Chamber                                  House
full title            Lt. Governor Craig Blair
First Name                               Craig
Last Name                                Blair
Party                               Republican
district                                   NaN
tenure                                       3
leader                                        
state_code                                 NaN
chamber_code                               NaN
district_code                              NaN


## Influence Score calculation
Pulls in committee data, leadership values, and tenure to calculate the influence score.

In [None]:
#Pulling in DF's w Pk from all_legs files
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\bridges\dfs_w_pks')
# glob returns files in no guaranteed order; sorting keeps the order of dfs_w_pk
# (and therefore the row order of everything built from it) the same on every run.
pk_files = sorted(glob.glob('*.csv'))

# file name without its extension -> DataFrame
dfs_w_pk = {}
for p in pk_files:
    df = pd.read_csv(p)
    # splitext removes only the trailing extension, whatever its letter case
    dfs_w_pk[os.path.splitext(p)[0]] = df


In [19]:
#pulling in data from legislator files and scoring every legislator
from collections import Counter

in_process = []
influence_scores = []   # one trimmed score table per chamber frame
full_dfs = {}           # file key -> scored frame with all original columns
for k,v in dfs_w_pk.items():
    df = v

    #Connecticut is all in one file since their committees are all joint.
    #This splits it by chamber; every other file is put into a list of one.
    if re.search(r'^CT', str(k)):
        house = df[df['Chamber'] == "House"].reset_index(drop=True)
        senate = df[df['Chamber'] == "Senate"].reset_index(drop=True)
        dfs_temp = [house, senate]
    else:
        dfs_temp = [df]

    scored_frames = []  # scored copy of every frame belonging to this key

    #going through each sheet after compiling them
    for d in dfs_temp:
        # fresh 0..n-1 index (and a new object, so dfs_w_pk is not modified)
        d = d.reset_index(drop = True)

        #committee columns: everything after the first column whose name starts with
        #'leader', up to but not including the last column; column 0 is kept alongside
        col_list = d.columns.to_list()
        index_start = next(
            (ic + 1 for ic, col in enumerate(col_list) if re.search(r'^leader', str(col))),
            None,
        )
        if index_start is None:
            # no 'leader*' column in this file: there is nothing to slice from
            d_coms = d.iloc[:, [0]]
        else:
            d_coms = d.iloc[:, [0] + list(range(index_start, (d.shape[1]-1)))]

        #getting list of committee memberships, list would include a collection of "none, Member, Vice Chair, Chair, or even Minority Chair"

        ##***taking out until more committee data comes out***##

        # comm_dict = {}
        # for i,dc in enumerate(d_coms['full_pk']):
        #     coms_list = d_coms.iloc[i,1:].to_list()
        #     comm_dict[dc] = coms_list

        #party labels of this chamber frame, used to decide the majority party
        party_list_uc = d['Party'].to_list()
        majority_cache = {}   # party label -> is_majority_party result for this frame
        d['influence_score'] = np.nan
        # created up front so the column exists even when every row is skipped
        d['leaders'] = pd.Series(np.nan, index=d.index, dtype=object)

        for i,hv in enumerate(d['full_pk']):

            #variable declaration
            score = 1
            first_tier = False
            second_tier = False
            other_tier = False
            in_maj_party = False
            # the committee flags stay False while the committee parsing below is disabled
            is_chair = False
            is_vice = False
            member = False
            minority_mem = False

            tenure = d['tenure'].iloc[i]

            #if they are new then they shouldn't be looked up (their score stays empty)
            if tenure == 1:
                continue

            #leadership tier, read from the "[n]" marker in the leadership text
            try:
                value = leadership_dict.get(hv)
                d.loc[i,'leaders'] = value
                if re.search(r'\[\d\]', str(value)):
                    if re.search(r'\[1\]', str(value)):
                        first_tier = True
                    elif re.search(r'\[2\]',str(value)):
                        second_tier = True
                    # markers other than [1] and [2] set no tier flag
                else:
                    # NOTE(review): this branch also runs when `value` is None or blank,
                    # i.e. for legislators with no leadership entry at all, so every
                    # returning member is scored as "other leadership". Kept as is so
                    # published scores do not change -- confirm whether it is intended.
                    other_tier = True
            except Exception as exc:
                # best effort: report which row failed and score it without a tier
                print(f'something didnt work for {k} / {hv}: {exc!r}')

            #get the majority party (worked out once per party label and frame)
            party = str(d['Party'].iloc[i])
            if party not in majority_cache:
                majority_cache[party] = is_majority_party(party_list_uc, party)
            if majority_cache[party]:
                in_maj_party = True

            #get comms list (taking out until committees are established)
            # leg_comms = comm_dict.get(hv)
            # for leg in leg_comms:
            #     if isinstance(leg, float):
            #         continue
            #     elif re.search(r'^[Cc]hair', str(leg)):
            #         is_chair = True
            #     elif re.search(r'[Vv]ice-?\s?[Cc]hair', str(leg)):
            #         is_vice = True
            #     elif re.search(r'[Mm]ember', str(leg)):
            #         member = True
            #     elif re.search(r'[Mm]inority', str(leg)):
            #         minority_mem = True

            #scoring: the first matching rule wins
            if in_maj_party:
                if first_tier:
                    score = 20      # speaker
                elif second_tier:
                    score = 15      # other majority leaders
                elif is_chair:
                    score = 15      # chair of a committee
                elif is_vice:
                    score = 10      # vice chair of a committee
                elif other_tier:
                    score = 10      # other majority leadership
                elif member:
                    score = 5
            else:
                if first_tier:
                    score = 15      # minority leader
                elif is_chair:
                    score = 15      # chair of a committee
                elif second_tier:
                    score = 10
                elif is_vice:
                    score = 10      # vice chair of a committee
                elif minority_mem:
                    score = 5       # minority ranking member in committee
                elif member:
                    score = 5       # committee member
                elif other_tier:
                    score = 5       # other minority leadership

            #tenure modifier
            if tenure > 10:
                score += 3
            elif tenure > 6:
                score += 2
            elif tenure > 2:
                score += 1

            #make sure 20 is max score
            if score > 20:
                score = 20

            #a score still at the starting value of 1 is left empty
            if score == 1:
                continue

            #assign score to influence score column
            d.loc[i,'influence_score'] = score

        #df creation and appending to list of dfs
        final_df = d.loc[:,['full_pk', 'First Name', 'Last Name', 'Party', 'tenure', 'influence_score']]
        influence_scores.append(final_df)
        scored_frames.append(d.copy())

    #store the scored frame(s) once per key; a file split by chamber keeps every
    #chamber instead of only the last one processed
    if len(scored_frames) == 1:
        full_df = scored_frames[0]
    else:
        full_df = pd.concat(scored_frames, ignore_index=True)
    full_dfs[k] = full_df



In [20]:
# Stack the per-chamber score tables, drop rows that have no full_pk,
# renumber the rows and switch the name/party columns to snake_case.
leg_infl_df = (
    pd.concat(influence_scores)
    .dropna(subset='full_pk')
    .reset_index(drop=True)
    .rename(columns={"First Name": "first_name", "Last Name": "last_name", "Party": 'party'})
)

# Column names on one line, as a quick check of the final layout.
print(', '.join(str(column) for column in leg_infl_df.columns))


leg_infl_df

full_pk, first_name, last_name, party, tenure, influence_score


Unnamed: 0,full_pk,first_name,last_name,party,tenure,influence_score
0,10006300,Cynthia,Almond,Republican,4,11.0
1,10006600,Alan,Baker,Republican,19,13.0
2,10004900,Russell,Bedsole,Republican,5,11.0
3,10008000,Chris,Blackshear,Republican,9,12.0
4,10006100,Ronald,Bolton,Republican,3,11.0
...,...,...,...,...,...,...
1980,57100402,Eric,Tarr,Republican,13,13.0
1981,57101402,Jay,Taylor,Republican,3,11.0
1982,57100102,Ryan,Weld,Republican,9,17.0
1983,57100502,Michael,Woelfel,Democrat,1,


In [22]:
#export the influence scores as CSV and Excel into the folder for `year`
from datetime import date
year = 2025
os.chdir(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\influence scores\{year}')

# Today's date as YYYY_MM_DD, computed once so both files always carry the same stamp.
date_stamp = str(date.today()).replace('-','_')
leg_infl_df.to_csv(f"leg_infl_df{date_stamp}.csv", index=False)
leg_infl_df.to_excel(f"leg_infl_df{date_stamp}.xlsx", index=False)

leg_infl_df


Unnamed: 0,full_pk,first_name,last_name,party,tenure,influence_score
0,10006300,Cynthia,Almond,Republican,4,11.0
1,10006600,Alan,Baker,Republican,19,13.0
2,10004900,Russell,Bedsole,Republican,5,11.0
3,10008000,Chris,Blackshear,Republican,9,12.0
4,10006100,Ronald,Bolton,Republican,3,11.0
...,...,...,...,...,...,...
1980,57100402,Eric,Tarr,Republican,13,13.0
1981,57101402,Jay,Taylor,Republican,3,11.0
1982,57100102,Ryan,Weld,Republican,9,17.0
1983,57100502,Michael,Woelfel,Democrat,1,


In [2]:
# Reload a previously exported influence score file instead of recomputing the scores.
# NOTE(review): this replaces the leg_infl_df built and exported in the cells above with
# the contents of leg_infl_df2024_12_11.csv. The recorded execution count (In [2]) is lower
# than that of the earlier cells, so it looks like a shortcut run on a fresh kernel.
# Under "Run All" it silently swaps the new scores for the older file -- confirm whether
# this cell should still run.
infl_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\influence scores\2025\leg_infl_df2024_12_11.csv"
influence = pd.read_csv(infl_file)
# work on a copy so `influence` keeps the file contents as loaded
leg_infl_df = influence.copy()

# Pulling committee info for priority analysis

In [None]:
# this is to get information on committee membership for priority analysis:
# one row per (state, chamber, committee) with the member keys, then one row per member
# NOTE(review): every column after the first is treated as a committee, so full_dfs
# should already be trimmed to the key column plus committee columns when this runs.
comms_dfs = []
for k,v in full_dfs.items():
    # state and chamber come from the dictionary key, so they are worked out once per frame
    state_match = re.match(r'(^[A-Z]{2})', str(k))
    if state_match is None:
        raise ValueError(f'cannot read a two-letter state code from key {k!r}')
    state = state_match.group(1)

    # everything after the first underscore of the key
    chamber = str(k).split('_', 1)[-1]

    for i,j in enumerate(v.columns):
        # column 0 is the key column, not a committee
        if i == 0:
            continue
        # copy: the committee column is overwritten below and v must stay untouched
        subset = v.iloc[:,[0,i]].copy()
        col = subset.columns[-1]

        # 1 where the legislator has any entry for this committee, then keep only those rows
        subset[col] = subset[col].notnull().astype('int')
        subset = subset[subset[col]==1]

        com = str(col)
        pk_list = subset['full_pk'].to_list()
        # the keys may be numbers; str.join only accepts strings
        pks = "|".join(str(pk) for pk in pk_list)
        com_df = pd.DataFrame({"state": [state], "chamber":[chamber], "committee": [com], "pks":[pks]})
        comms_dfs.append(com_df)
comms_summary = pd.concat(comms_dfs)

# comms_summary

comms_summary_final = comms_summary.assign(
    pks=comms_summary['pks'].str.split('|')  # Split the string into a list
).explode('pks')  # Create a new row for each list element
# comms_summary_final



In [None]:
# Trim every frame in full_dfs down to the key column plus the committee columns.
# NOTE(review): this overwrites full_dfs in place. On a second run the two dropped
# columns are already gone, so the drop below is expected to raise (pandas reports
# missing labels by default). Restart and run from the top rather than re-running this cell.
for k,v in full_dfs.items():
    print('############')
    print(k)
    # print(v.head(2))
    
    # print('_______________')
    # remove the two columns added by the scoring cell
    v = v.drop(['leaders', 'influence_score'], axis = 1).reset_index(drop = True)
    # print(v.head(2))

    # print('_______________')
    # keep column 0 and positions 15 up to (not including) the last column
    # NOTE(review): 15 is a fixed position -- presumably where the committee columns start
    # in every file; verify against the input layout
    col_select = np.r_[0:1, 15:(len(v.columns)-1)]
    v = v.iloc[:,col_select]
    # print(v.head(2))
    # preview: first two rows that are not empty in every column
    print((v.dropna(how = 'all', axis = 0)).head(2))

    column_to_keep = 'full_pk'

    # Drop rows where all columns except 'full_pk' are NaN
    filtered_v = v.dropna(subset=[col for col in v.columns if col != column_to_keep], how='all').reset_index(drop = True)
    # NOTE(review): this prints the unfiltered frame (v), not filtered_v
    print(v.head())
    # replace the stored frame with the filtered one
    full_dfs[k] = filtered_v
    # for i,dc in enumerate(v.columns):
    #     if 'influence' in str(dc):
    #         i
    #     print(f'[{i}]: {dc}')


In [87]:
from datetime import date

# Write the committee summary to an Excel file named with today's date (YYYY_MM_DD).
# The f-strings use double quotes outside and single quotes inside: reusing the same
# quote type inside a replacement field is a syntax error before Python 3.12.
comm_sum_name = f"comms_summary{str(date.today()).replace('-','_')}.xlsx"
# the sheet is named after the file, without the extension
comms_summary_final.to_excel(fr"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\comm info\{comm_sum_name}", sheet_name=comm_sum_name.replace('.xlsx', ''), index=False)
# comms_summary_final
        

### Defunct
The chunks below are vestigial: they are left over from when the rankings list was taken from the NCSL website.

The first cell below is an older chunk that looked through the raw legislator files; the cells above contain the same information.

In [31]:
# leader_dfs = []
# for i,j in enumerate(leader_rankings_df['position']):
#     if re.search(r'[Ss]peaker', str(j)):
#         continue
#     elif re.search(r'[Mm]ajority|[Mm]inority', str(j)):
#         # #print(j)
#         continue
#     else:
#         # #print('***not found***')
#         # #print(j)
#         # #print("**************")
#         # #print(leader_rankings_df.iloc[i,:].to_string())
#         df2 = pd.DataFrame(columns=['state', 'position', 'chamber'])
#         df2 = df2._append(leader_rankings_df.iloc[i], ignore_index=True)
#         # #print(type(df))
#         leader_dfs.append(df2)
#         # #print('\n')

# outliers = pd.concat(leader_dfs)


In [None]:
# ranking_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leader_rankings.csv"
# rankings = pd.read_csv(ranking_file)


# file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leadership_ranking.xlsx"
# leader_rankings_df = pd.read_excel(file)
# #print(*leader_rankings_df.columns)
# leader_rankings_df['state'] = leader_rankings_df['state'].fillna(method="ffill")

# n = len(leader_rankings_df)
# break_point = False

# for i,j in enumerate(leader_rankings_df['state']):
#     if "Wyoming" in str(j) and "Alabama" in leader_rankings_df['state'].iloc[i+1]:
#         index_stop = i + 1
#         break_point = True


#     else:
#         continue

#     if break_point == True:
#         house_list = ['House']*index_stop
#         senate_list = ['Senate']*(n-index_stop)
#         full_list = house_list + senate_list
#         leader_rankings_df['chamber'] = full_list
#         leader_rankings_df.dropna(inplace=True)
#         break

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# leader_rankings_df.to_csv('leader_rankings.csv', index_label= False, index=False)

# #print(leader_rankings_df[leader_rankings_df['state'].str.contains('Connecticut')].to_string())

