In [1]:
import os, sys, json, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key, get_recent_file

AL 10
AK 11
AZ 12
AR 13
CA 14
CO 15
CT 16
DE 17
FL 18
GA 19
HI 20
ID 21
IL 22
IN 23
IA 24
KS 25
KY 26
LA 27
ME 28
MD 29
MA 30
MI 31
MN 32
MS 33
MO 34
MT 35
NE 36
NV 37
NH 38
NJ 39
NM 40
NY 41
NC 42
ND 43
OH 44
OK 45
OR 46
PA 47
RI 48
SC 49
SD 50
TN 51
TX 52
UT 53
VT 54
VA 55
WA 56
WV 57
WI 58
WY 59
DC 60


### Definitions

In [2]:
def bordered(text):
    """Return ``text`` framed in a box drawn with Unicode line characters.

    Note: this definition shadows the ``bordered`` imported from
    ``cprl_functions.text_printing`` in the import cell.

    Args:
        text: Value to frame. Anything that is not already a string is
            converted with ``str()``. Multi-line strings produce one boxed
            row per line, each padded to the width of the longest line.

    Returns:
        str: The box as one newline-joined string.
    """
    # Coerce up front so floats, None, lists, ... take the same path as
    # strings. The previous fallback branch concatenated the raw object with
    # a string and raised TypeError for exactly those inputs.
    lines = str(text).splitlines() or ['']  # empty text still gets one (empty) row
    width = max(len(line) for line in lines)

    box = ['┌' + '─' * width + '┐']
    box.extend('│' + line.ljust(width) + '│' for line in lines)
    box.append('└' + '─' * width + '┘')
    return '\n'.join(box)



In [3]:
def is_majority_party(list, x) :
    """Report whether party ``x`` is the majority party among ``list``.

    Args:
        list: Iterable of party labels (one per legislator). Each entry is
            converted with ``str()`` and counted as Republican if it contains
            "Republican" and as Democrat if it contains "Democrat".
        x: Party label to test. It must equal exactly "Republican" or
            "Democrat" to be reported as the majority.

    Returns:
        bool: True when ``x`` is the party with the strictly larger count.
        False otherwise, including when the two counts are equal (no
        majority exists; previously this case raised UnboundLocalError).
    """
    labels = [str(party) for party in list]
    rep_count = sum("Republican" in label for label in labels)
    dem_count = sum("Democrat" in label for label in labels)

    if rep_count > dem_count:
        maj_party = "Republican"
    elif dem_count > rep_count:
        maj_party = "Democrat"
    else:
        # A tie means nobody holds the majority.
        print('somehow they are equal')
        return False

    return bool(maj_party == x)
    


## Key Lookup from "Key_Creation.py"
Pull in ref key 

In [4]:
# Load the legislator reference (lookup) file, which is produced outside this notebook.
# get_recent_file is a project helper; presumably it returns the newest file
# matching the pattern in the given folder -- confirm against its definition.
leg_lookup_dir = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup"
leg_lookup = get_recent_file("*", leg_lookup_dir)
print(leg_lookup)

leg_lookup_ref = pd.read_csv(leg_lookup)
leg_lookup_ref

C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup\leg_lookup_2024_12_10.csv


Unnamed: 0,full_pk,primary_key,first_name,last_name
0,10100100,101001,Tim,Melson
1,10100200,101002,Tom,Butler
2,10100300,101003,Arthur,Orr
3,10100400,101004,Garlan,Gudger
4,10100500,101005,Greg,Reed
...,...,...,...,...
1823,57101500,571015,Thomas,Willis
1824,57101601,571016,Jason,Barrett
1825,57101602,571016,Patricia,Rucker
1826,57101701,571017,Eric,Nelson


In [82]:


# print(*leg_lookup_ref.columns, sep = " | ")

# Lookup of full primary key -> last name for every legislator in the reference file.
leg_lookup_dict = leg_lookup_ref.set_index('full_pk')['last_name'].to_dict()

# Keep only the rows whose full_pk does NOT end in "00"
# (presumably the legislators in multi-seat districts -- confirm the key layout).
not_suffix_00 = ~leg_lookup_ref['full_pk'].astype(str).str.endswith('00')
multis_legs = leg_lookup_ref[not_suffix_00]
print(*multis_legs.columns, sep = " | ")

# Same key -> last name lookup, restricted to those rows, plus the bare key list.
ms_legs_lookup = multis_legs.set_index('full_pk')['last_name'].to_dict()
ms_pks = multis_legs['full_pk'].to_list()



full_pk | primary_key | first_name | last_name


### Data Gathering
Gather and clean the legislator data.

In [83]:

# Collect the legislator workbooks for this run from the 2025 data folder.
# Committee data should be updated before this cell is run.
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\2025')
legislator_files = glob.glob('**/*_info*.xlsx')

# Skip superseded "_old" workbooks and keep only files whose name contains 12_11.
leg_files = [
    path for path in legislator_files
    if not path.endswith('_old.xlsx') and re.search(r'12_11', str(path))
]

print(leg_files)



['AL\\AL_legislators_info_2024_12_11.xlsx', 'CT\\CT_legislators_info_2024_12_11.xlsx', 'IL\\IL_legislators_info_2024_12_11.xlsx', 'IN\\IN_legislators_info_2024_12_11.xlsx', 'KS\\KS_legislators_info_2024_12_11.xlsx', 'MO\\MO_legislators_info_2024_12_11.xlsx', 'NC\\NC_legislators_info_2024_12_11.xlsx', 'ND\\ND_legislators_info_2024_12_11.xlsx', 'NM\\NM_legislators_info_2024_12_11.xlsx', 'OH\\OH_legislators_info_2024_12_11.xlsx', 'OK\\OK_legislators_info_2024_12_11.xlsx', 'VA\\VA_legislators_info_2024_12_11.xlsx', 'WV\\WV_legislators_info_2024_12_11.xlsx']


In [84]:
# Fail fast if a selected workbook is missing on disk or is not an .xlsx file.
for workbook_path in leg_files:
    exists_on_disk = os.path.isfile(workbook_path)
    if not exists_on_disk:
        raise FileNotFoundError(f"File not found: {workbook_path}")

    is_xlsx = workbook_path.endswith('.xlsx')
    if not is_xlsx:
        raise ValueError(f"Invalid file format. Expected an .xlsx file, got: {workbook_path.split('.')[-1]}")

In [85]:
# Load every sheet of every selected workbook into `dfs`, keyed by sheet name.
dfs = {}
for file in leg_files:
    print('working on file:' + str(file))

    # sheet_name=None parses the whole workbook once and returns
    # {sheet name: DataFrame}; re-reading the file per sheet is unnecessary.
    sheets_dict = pd.read_excel(file, engine="openpyxl", sheet_name=None)

    for sheet_name, sheet_df in sheets_dict.items():
        sheet_key = f'{sheet_name}'
        if sheet_key in dfs:
            # Keys are sheet names only, so two workbooks that share a sheet
            # name would silently overwrite each other -- make that visible.
            print(f'warning: sheet "{sheet_key}" in {file} replaces an earlier sheet with the same name')
        dfs[sheet_key] = sheet_df




working on file:AL\AL_legislators_info_2024_12_11.xlsx
working on file:CT\CT_legislators_info_2024_12_11.xlsx


working on file:IL\IL_legislators_info_2024_12_11.xlsx
working on file:IN\IN_legislators_info_2024_12_11.xlsx
working on file:KS\KS_legislators_info_2024_12_11.xlsx
working on file:MO\MO_legislators_info_2024_12_11.xlsx
working on file:NC\NC_legislators_info_2024_12_11.xlsx
working on file:ND\ND_legislators_info_2024_12_11.xlsx
working on file:NM\NM_legislators_info_2024_12_11.xlsx
working on file:OH\OH_legislators_info_2024_12_11.xlsx
working on file:OK\OK_legislators_info_2024_12_11.xlsx
working on file:VA\VA_legislators_info_2024_12_11.xlsx
working on file:WV\WV_legislators_info_2024_12_11.xlsx


In [86]:
# DEPRECATED
# # #resigned or older seat
# multi_seats_df = multis_legs.copy()
# # print(multi_seats_df.columns)
# multi_seats_df["district"] = multi_seats_df["primary_key"].astype(str).str.extract(r"(\d{3}$)")[0].str.lstrip('0')
# multi_seats_df["state_code"] = multi_seats_df["primary_key"].astype(str).str.extract(r"(^\d{2})")[0].str.strip().astype("float").astype("Int64").apply(lambda x: state_coding_r.get(x))

# # multi_seats_df["district"] = multi_seats_df["district"].astype('float').astype('Int64')
# multi_seats_df["dupekey"] = multi_seats_df["state_code"] + "-" + multi_seats_df["district"]

# ms_pks = multi_seats_df["dupekey"].to_list()
# ms_pks = list(set(ms_pks))

# print(ms_pks)

# multi_seats_df


### Pulling all files together

In [97]:
# Print each loaded sheet's column names on one line so the layouts can be compared.
for sheet_df in dfs.values():
    print(" | ".join(map(str, sheet_df.columns)))


primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | Committee List | tenure | le

In [98]:
# Stack every sheet (minus its committee column) into one table and save a dated snapshot.
compiling = []
for k, v in dfs.items():
    # drop() returns a new frame, so the sheets held in `dfs` keep 'Committee List'.
    df = v.drop(['Committee List'], axis=1)
    print(*df.columns, sep = " | ")
    compiling.append(df)

# Pull together all the trimmed frames.
all_legs_files = pd.concat(compiling).reset_index(drop = True)

os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files')
# Build the date stamp outside the f-string: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
date_stamp = str(date.today()).replace('-', '_')
all_legs_files.to_csv(f'all_legs_files_{date_stamp}.csv', index=False)

all_legs_files


primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber | Title | First Name | Last Name | Party | tenure | leader | district | state_code | chamber_code
primary_key | district_code | State Abbreviation | Chamber

Unnamed: 0,primary_key,district_code,State Abbreviation,Chamber,Title,First Name,Last Name,Party,tenure,leader,district,state_code,chamber_code
0,100063,063,AL,House,Alabama Representative,Cynthia,Almond,Republican,4.0,,63,10,0
1,100066,066,AL,House,Alabama Representative,Alan,Baker,Republican,19.0,,66,10,0
2,100049,049,AL,House,Alabama Representative,Russell,Bedsole,Republican,5.0,,49,10,0
3,100080,080,AL,House,Alabama Representative,Chris,Blackshear,Republican,9.0,,80,10,0
4,100061,061,AL,House,Alabama Representative,Ronald,Bolton,Republican,3.0,,61,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955,571004,004,WV,Senate,West Virginia Senator,Eric,Tarr,Republican,13.0,,4,57,1
1956,571014,014,WV,Senate,West Virginia Senator,Jay,Taylor,Republican,3.0,,14,57,1
1957,571001,001,WV,Senate,West Virginia Senator,Ryan,Weld,Republican,9.0,,1,57,1
1958,571015,015,WV,Senate,West Virginia Senator,Thomas,Willis,Republican,1.0,,15,57,1


 ## Legislator Data Cleaning

In [89]:
# Reduce the raw "District" text to its digits and drop the raw columns.
for k, v in dfs.items():
    # A sheet cleaned by an earlier run of this cell no longer has "District";
    # skip it so re-running the cell does not raise KeyError.
    if "District" not in v.columns:
        continue

    # Collect every run of digits in the district text; when a value contains
    # several runs they are joined with a single space.
    v["district"] = v["District"].str.extractall(r"(\d+)").unstack().fillna('').apply(' '.join, 1)

    # errors='ignore' tolerates sheets that never had 'Date Assumed Office'.
    dfs[k] = v.drop(['District', 'Date Assumed Office'], axis = 1, errors = 'ignore')



In [90]:

# Build the full primary key (including seat number) for every sheet.
dfs_w_pk = {}  # sheet name -> rows whose seat number was resolved
emptys = {}    # sheet name -> rows whose seat number is still missing

for k, v in dfs.items():
    print(type(v))
    if v.empty:
        print('1')
        print('intitial is empty?')
        # Skip only this sheet. The previous `break` silently dropped every
        # sheet that came after the first empty one.
        continue

    # create_pk is a project helper; as used here it returns the regular rows
    # and the rows it flagged as duplicates (two frames).
    cleaned_df, dupes = create_pk(v, 'district', 'Chamber')

    # Frames to stack for this sheet (regular rows and/or duplicate rows).
    dfs_to_concat = []

    if cleaned_df.empty:
        print('normals are empty (from loop)')
    else:
        # NOTE(review): cleaned_df (not the return value) is appended, so this
        # relies on add_seats adding full_pk/seat_num to cleaned_df in place -- confirm.
        non_dupes = add_seats(df = cleaned_df)
        dfs_to_concat.append(cleaned_df)

    if dupes.empty:
        print('dupes are empty (from loop)')
        print(k)
    else:
        # Start both columns empty with object dtype so the string keys
        # assigned below do not have to upcast a float column.
        for new_col in ('full_pk', 'seat_num'):
            dupes[new_col] = np.nan
            dupes[new_col] = dupes[new_col].astype(object)
        print('*****************')

        # Address rows by position: the index returned by create_pk is not
        # guaranteed to be 0..n-1, so enumerate() counters are not valid labels.
        name_col = dupes.columns.get_loc('Last Name')
        pk_col = dupes.columns.get_loc('full_pk')
        seat_col = dupes.columns.get_loc('seat_num')

        for row_pos, district_code in enumerate(dupes['district_code']):
            name = dupes.iloc[row_pos, name_col]
            # Candidate full keys whose lookup value is this last name.
            for candidate in get_key(name, ms_legs_lookup):
                p_str = str(candidate).strip()
                # District code = three digits after the first three digits of the key;
                # seat number = the key's last two digits.
                district_match = re.findall(r'(?<=^\d{3})\d{3}(?=\d{2})', p_str)
                seat_match = re.findall(r'\d{2}$', p_str)
                if district_match and seat_match and district_code == district_match[0]:
                    dupes.iloc[row_pos, pk_col] = p_str
                    dupes.iloc[row_pos, seat_col] = seat_match[0]
                    break

        dfs_to_concat.append(dupes)

    # Move the key columns to fixed positions. All four are removed first and
    # then re-inserted, so the target positions refer to the remaining columns.
    for frame in dfs_to_concat:
        moved = {col: frame.pop(col) for col in ('full_pk', 'state_code', 'chamber_code', 'seat_num')}
        for position, col in ((0, 'full_pk'), (2, 'state_code'), (3, 'chamber_code'), (5, 'seat_num')):
            frame.insert(position, col, moved[col])

    if not dfs_to_concat:
        print('all are empty')
        # Nothing to store for this sheet. Falling through used to reuse the
        # previous sheet's frame (or fail with NameError on the first sheet).
        continue
    d_new = pd.concat(dfs_to_concat) if len(dfs_to_concat) == 2 else dfs_to_concat[0]

    # Split on whether a seat number was resolved.
    missing_seat = d_new['seat_num'].isna()
    emptys[k] = d_new[missing_seat]
    dfs_w_pk[k] = d_new[~missing_seat]


# print(leg_files_fpk.iloc[:3,:4].to_string)



<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
AL_house
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
AL_senate
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
CT_house
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
CT_senate
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
IL_house
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
IL_senate
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
IN_house
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty (from loop)
IN_senate
<class 'pandas.core.frame.DataFrame'>
duplicate df is empty (from function)
dupes are empty 

### Manual Leadership Files

In [91]:
# Load the manually maintained leadership file and build full_pk -> leader label.
leadership_positions_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\all_legs_files_w_rankings.csv"
leaders_lookup = pd.read_csv(leadership_positions_file)

# Create the primary key for the leadership file; create_pk (project helper)
# returns the regular rows and the rows it flagged as duplicates.
infl_rankings, rankings_dupes = create_pk(leaders_lookup, 'district', 'Chamber')

# Rows without a district cannot be keyed.
infl_rankings = infl_rankings.dropna(axis = 0, subset='district').reset_index(drop = True)

# Prepare the columns filled in below. object dtype lets them hold the string
# keys; seat_num is created only if create_pk did not already supply it, so the
# column selection further down cannot fail when no duplicate row is matched.
rankings_dupes['full_pk'] = np.nan
rankings_dupes['full_pk'] = rankings_dupes['full_pk'].astype(object)
if 'seat_num' not in rankings_dupes.columns:
    rankings_dupes['seat_num'] = np.nan
rankings_dupes['seat_num'] = rankings_dupes['seat_num'].astype(object)

# Address rows by position for both reads and writes: the index returned by
# create_pk is not guaranteed to be 0..n-1, so enumerate() counters are not labels.
name_col = rankings_dupes.columns.get_loc('Last Name')
pk_col = rankings_dupes.columns.get_loc('full_pk')
seat_col = rankings_dupes.columns.get_loc('seat_num')

for row_pos, district_code in enumerate(rankings_dupes['district_code']):
    name = rankings_dupes.iloc[row_pos, name_col]
    # Candidate full keys whose lookup value is this last name.
    for candidate in get_key(name, ms_legs_lookup):
        p_str = str(candidate).strip()
        # District code = three digits after the first three digits of the key;
        # seat number = the key's last two digits.
        district_match = re.findall(r'(?<=^\d{3})\d{3}(?=\d{2})', p_str)
        seat_match = re.findall(r'\d{2}$', p_str)
        if district_match and seat_match and district_code == district_match[0]:
            rankings_dupes.iloc[row_pos, pk_col] = p_str
            rankings_dupes.iloc[row_pos, seat_col] = seat_match[0]
            break

infl_non_dupes = add_seats(df = infl_rankings)

keep_cols = ['full_pk', 'primary_key', 'seat_num', 'First Name', 'Last Name', 'leader']
infl_non_dupes = infl_non_dupes.loc[:, keep_cols]
rankings_dupes = rankings_dupes.loc[:, keep_cols]

leadership_files = pd.concat([infl_non_dupes, rankings_dupes]).reset_index(drop = True)

# NOTE(review): keys for the duplicate rows are strings, while the type of the
# keys produced by add_seats is not visible here -- confirm they match the
# full_pk values used for the lookup in the influence score cell.
leadership_dict = leadership_files.set_index('full_pk')['leader'].to_dict()




issues with the district match
State Abbreviation                          WV
Chamber                                  House
full title            Lt. Governor Craig Blair
First Name                               Craig
Last Name                                Blair
Party                               Republican
district                                   NaN
tenure                                       3
leader                                        
state_code                                 NaN
chamber_code                               NaN
district_code                              NaN


### Influence Score calculation
Pulls in committee data, leadership values, and tenure to calculate the influence score.

In [92]:

# Score every legislator from leadership rank, committee roles, majority status and tenure.
from collections import Counter


def _leadership_tiers(leader_value):
    """Return (first_tier, second_tier, other_tier) for a raw leader label.

    A label containing a bracketed digit is a ranked leader: "[1]" is first
    tier, "[2]" second tier, any other digit sets no flag. Every label without
    a bracketed digit (including a missing one) is "other tier".
    """
    label = str(leader_value)
    if re.search(r'\[\d\]', label):
        if re.search(r'\[1\]', label):
            return True, False, False
        if re.search(r'\[2\]', label):
            return False, True, False
        return False, False, False
    return False, False, True


def _committee_roles(assignments):
    """Return (is_chair, is_vice, member, minority_mem) over one legislator's committee cells."""
    is_chair = is_vice = member = minority_mem = False
    for role in assignments:
        if isinstance(role, float):
            continue  # NaN: no entry in this committee column
        role = str(role)
        if re.search(r'^[Cc]hair', role):
            is_chair = True
        elif re.search(r'[Vv]ice-?\s?[Cc]hair', role):
            is_vice = True
        elif re.search(r'[Mm]ember', role):
            member = True
        elif re.search(r'[Mm]inority', role):
            minority_mem = True
    return is_chair, is_vice, member, minority_mem


def _base_score(in_maj_party, first_tier, second_tier, other_tier,
                is_chair, is_vice, member, minority_mem):
    """Return the score before the tenure bonus; the first matching rule wins, default 1."""
    if in_maj_party:
        # NOTE(review): other_tier is True for every legislator without a
        # bracketed rank, so the `member` rule below is only reachable for
        # labels like "[3]" -- confirm this ordering is intended.
        if first_tier:
            return 20  # speaker
        if second_tier:
            return 15  # other majority leaders
        if is_chair:
            return 15  # chair of a committee
        if is_vice:
            return 10  # vice chair of a committee
        if other_tier:
            return 10  # other majority leadership
        if member:
            return 5
        return 1

    if first_tier:
        return 15  # minority leader
    if is_chair:
        return 15  # chair of a committee
    if second_tier:
        return 10
    if is_vice:
        return 10  # vice chair of a committee
    if minority_mem:
        return 5   # minority ranking member in a committee
    if member:
        return 5   # committee member
    if other_tier:
        return 5   # other minority leadership
    return 1


def _tenure_bonus(tenure):
    """Return the tenure modifier: +3 above 10, +2 above 6, +1 above 2, else 0."""
    if tenure > 10:
        return 3
    if tenure > 6:
        return 2
    if tenure > 2:
        return 1
    return 0


in_process = []
influence_scores = []
full_dfs = {}
for k, v in dfs_w_pk.items():
    df = v

    # Connecticut is handled per chamber: sheets whose name starts with "CT"
    # are split into House and Senate rows; every other sheet is one piece.
    if re.search(r'^CT', str(k)):
        dfs_temp = [df[df['Chamber'] == "House"], df[df['Chamber'] == "Senate"]]
    else:
        dfs_temp = [df]

    scored_parts = []
    for d in dfs_temp:
        # Work on a fresh 0..n-1 index: the row loop mixes positional access
        # (.iloc[i]) with label access (.loc[i]), which only agree on such an
        # index. This also keeps the frames in dfs_w_pk unmodified on re-runs.
        d = d.reset_index(drop = True)

        # Committee columns are those after the first column starting with
        # "leader", excluding the frame's last column.
        col_list = d.columns.to_list()
        index_start = next(
            (ic + 1 for ic, col in enumerate(col_list) if re.search(r'^leader', str(col))),
            None,
        )
        if index_start is None:
            raise ValueError(f"sheet {k!r} has no column starting with 'leader'")
        d_coms = d.iloc[:, [0] + list(range(index_start, (d.shape[1]-1)))]

        # Majority status is the same for every row with the same party, so
        # ask once per distinct party instead of once per legislator.
        party_list_uc = d['Party'].to_list()
        majority_flags = {
            party: is_majority_party(party_list_uc, party)
            for party in {str(party) for party in party_list_uc}
        }

        d['influence_score'] = np.nan
        for i, hv in enumerate(d['full_pk']):
            # Leadership label for this legislator (None when not listed).
            value = leadership_dict.get(hv)
            d.loc[i,'leaders'] = value
            first_tier, second_tier, other_tier = _leadership_tiers(value)

            in_maj_party = majority_flags[str(d['Party'].iloc[i])]

            # Committee cells hold values such as none, Member, Vice Chair,
            # Chair, or Minority Chair; read them from this legislator's own row.
            is_chair, is_vice, member, minority_mem = _committee_roles(d_coms.iloc[i, 1:].to_list())

            score = _base_score(in_maj_party, first_tier, second_tier, other_tier,
                                is_chair, is_vice, member, minority_mem)
            score += _tenure_bonus(d['tenure'].iloc[i])

            # 20 is the maximum score.
            score = min(score, 20)

            # A bare default score of 1 is left as NaN.
            if score == 1:
                continue

            d.loc[i,'influence_score'] = score

        final_df = d.loc[:,['full_pk', 'First Name', 'Last Name', 'Party', 'tenure', 'influence_score']]
        influence_scores.append(final_df)
        scored_parts.append(d)

    # Keep every scored piece of the sheet. Assigning inside the loop let the
    # last Connecticut chamber overwrite the first one.
    kept_parts = [part for part in scored_parts if not part.empty] or scored_parts[-1:]
    if len(kept_parts) == 1:
        full_dfs[k] = kept_parts[0]
    else:
        full_dfs[k] = pd.concat(kept_parts).reset_index(drop = True)



In [93]:

# Combine the per-sheet score tables, drop rows without a key, and switch to snake_case names.
leg_infl_df = (
    pd.concat(influence_scores)
    .dropna(subset='full_pk')
    .reset_index(drop = True)
    .rename(columns={"First Name": "first_name", "Last Name": "last_name", "Party": 'party'})
)
print(*leg_infl_df.columns, sep=', ')

leg_infl_df

full_pk, first_name, last_name, party, tenure, influence_score


Unnamed: 0,full_pk,first_name,last_name,party,tenure,influence_score
0,10006300,Cynthia,Almond,Republican,4.0,11.0
1,10006600,Alan,Baker,Republican,19.0,13.0
2,10004900,Russell,Bedsole,Republican,5.0,11.0
3,10008000,Chris,Blackshear,Republican,9.0,12.0
4,10006100,Ronald,Bolton,Republican,3.0,11.0
...,...,...,...,...,...,...
1955,57101702,Tom,Takubo,Republican,11.0,12.0
1956,57100402,Eric,Tarr,Republican,13.0,18.0
1957,57101402,Jay,Taylor,Republican,3.0,13.0
1958,57100102,Ryan,Weld,Republican,9.0,11.0


In [95]:
# Write the influence scores to the build folder for `year` as a dated CSV.
year = 2025
influence_scores_dir = fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\influence scores\{year}'
os.chdir(influence_scores_dir)

date_stamp = str(date.today()).replace('-','_')
leg_infl_df.to_csv(f"leg_infl_df{date_stamp}.csv", index=False)

# leg_infl_df


## Pulling committee info for priority analysis

In [None]:
# Committee membership summary for the priority analysis: for every column
# after the first in each scored sheet, list the full_pk of each row that has
# an entry in that column.
comms_records = []
for k, v in full_dfs.items():
    # Sheet names look like "<two-letter state>_<chamber>".
    state_match = re.match(r'(^[A-Z]{2})', str(k))
    if state_match is None:
        raise ValueError(f"sheet name {k!r} does not start with a two-letter state code")
    state = state_match.group(1)
    chamber = str(k).split('_', 1)[-1]

    for i, col in enumerate(v.columns):
        if i == 0:
            continue  # column 0 holds the key itself

        has_entry = v.iloc[:, i].notnull().to_numpy()
        pk_list = v['full_pk'][has_entry].to_list()

        # str() each key: "|".join raises TypeError when the keys are numeric.
        pks = "|".join(str(pk) for pk in pk_list)
        comms_records.append({"state": state, "chamber": chamber, "committee": str(col), "pks": pks})

# One DataFrame built from all records instead of concatenating one-row frames.
comms_summary = pd.DataFrame(comms_records, columns=["state", "chamber", "committee", "pks"])

# Long format: one row per (sheet column, member key). A column without any
# entries keeps a single row with an empty key.
comms_summary_final = comms_summary.assign(
    pks=comms_summary['pks'].str.split('|')  # Split the string into a list
).explode('pks')  # Create a new row for each list element
# comms_summary_final



In [None]:
# Trim each scored sheet down to its key plus the committee columns, and drop
# the rows that have no committee entry at all.
column_to_keep = 'full_pk'
for k, scored in full_dfs.items():
    print('############')
    print(k)

    trimmed = scored.drop(columns=['leaders', 'influence_score']).reset_index(drop = True)

    # Keep column 0 plus columns 15 up to (but not including) the last one.
    col_select = [0] + list(range(15, len(trimmed.columns) - 1))
    trimmed = trimmed.iloc[:, col_select]
    print(trimmed.dropna(how = 'all', axis = 0).head(2))

    # Drop rows where every column other than 'full_pk' is NaN.
    committee_cols = [col for col in trimmed.columns if col != column_to_keep]
    filtered_v = trimmed.dropna(subset=committee_cols, how='all').reset_index(drop = True)
    print(trimmed.head())
    full_dfs[k] = filtered_v
    # for i,dc in enumerate(v.columns):
    #     if 'influence' in str(dc):
    #         i
    #     print(f'[{i}]: {dc}')


In [87]:
# Export the long-format committee summary to a dated Excel workbook; the
# sheet is named after the file (without the extension).
# The name is assembled in steps because reusing an f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
date_stamp = str(date.today()).replace('-', '_')
comm_sum_stem = f'comms_summary{date_stamp}'
comm_sum_name = f'{comm_sum_stem}.xlsx'
comms_summary_final.to_excel(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\comm info\{comm_sum_name}', sheet_name=comm_sum_stem, index=False)
# comms_summary_final
        

### Defunct
The chunk below is a vestige of using the rankings list from the ncls website.

The cell below is an older chunk that looked through the raw legislator files; the cell above contains the same information.

In [31]:
# leader_dfs = []
# for i,j in enumerate(leader_rankings_df['position']):
#     if re.search(r'[Ss]peaker', str(j)):
#         continue
#     elif re.search(r'[Mm]ajority|[Mm]inority', str(j)):
#         # #print(j)
#         continue
#     else:
#         # #print('***not found***')
#         # #print(j)
#         # #print("**************")
#         # #print(leader_rankings_df.iloc[i,:].to_string())
#         df2 = pd.DataFrame(columns=['state', 'position', 'chamber'])
#         df2 = df2._append(leader_rankings_df.iloc[i], ignore_index=True)
#         # #print(type(df))
#         leader_dfs.append(df2)
#         # #print('\n')

# outliers = pd.concat(leader_dfs)


In [None]:
# ranking_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leader_rankings.csv"
# rankings = pd.read_csv(ranking_file)


# file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leadership_ranking.xlsx"
# leader_rankings_df = pd.read_excel(file)
# #print(*leader_rankings_df.columns)
# leader_rankings_df['state'] = leader_rankings_df['state'].fillna(method="ffill")

# n = len(leader_rankings_df)
# break_point = False

# for i,j in enumerate(leader_rankings_df['state']):
#     if "Wyoming" in str(j) and "Alabama" in leader_rankings_df['state'].iloc[i+1]:
#         index_stop = i + 1
#         break_point = True


#     else:
#         continue

#     if break_point == True:
#         house_list = ['House']*index_stop
#         senate_list = ['Senate']*(n-index_stop)
#         full_list = house_list + senate_list
#         leader_rankings_df['chamber'] = full_list
#         leader_rankings_df.dropna(inplace=True)
#         break

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# leader_rankings_df.to_csv('leader_rankings.csv', index_label= False, index=False)

# #print(leader_rankings_df[leader_rankings_df['state'].str.contains('Connecticut')].to_string())

