Needed Data:


# Imports and Definitions

## Imports

In [1]:
import os, sys, json, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key, get_recent_file

## Definitions

In [2]:
def bordered(text):
    
    if isinstance(text, int) or isinstance(text, str):
        text = str(text)
    try:
        lines = text.splitlines()
        width = max(len(s) for s in lines)
        res = ['┌' + '─' * width + '┐']
        for s in lines:
            res.append('│' + (s + ' ' * width)[:width] + '│')
        res.append('└' + '─' * width + '┘')
        return '\n'.join(res)
    except:
        lines = [text]
        width = len(str(lines[0]))
        res = ['┌' + '─' * width + '┐']
        for s in lines:
            res.append('│' + (s + ' ' * width)[:width] + '│')
        res.append('└' + '─' * width + '┘')
        return '\n'.join(res)



In [3]:
def is_majority_party(list, x) :
    rep = [x for x in list if "Republican" in str(x)]
    dem = [x for x in list if "Democrat" in str(x)]

    rep_count = len(rep)
    dem_count = len(dem)

    if rep_count > dem_count:
        maj_party = "Republican"
    elif dem_count > rep_count:
        maj_party = "Democrat"
    else:
        print('somehow they are equal')

    if maj_party == x:
        return True
    else: 
        return False
    


# Influence Scores

## Data Gathering
Gather data and clean for legislator data

### Key Lookup from "Key_Creation.py"
Pull in ref key from main key file, this file purely

In [4]:
##Pulling in Legislator reference file comes from outside file and creating df
leg_lookup = get_recent_file("*.csv", r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025")
# print(leg_lookup)


In [5]:
#leg_lookup formatting
leg_lookup_ref = pd.read_csv(leg_lookup)
leg_lookup_ref.columns
leg_lookup_ref = leg_lookup_ref.dropna(subset='full_pk')
name_cols = [x for x in leg_lookup_ref.columns if re.search(r'[Nn]ame', str(x))]
# print(name_cols)
for n in name_cols:
    if 'first' in n:
        f_name = str(n)
    elif 'last' in n:
        l_name = str(n)


# leg_lookup_ref

In [6]:
#change data types
leg_lookup_ref['full_pk'] = leg_lookup_ref['full_pk'].astype(int)
leg_lookup_ref['primary_key'] = leg_lookup_ref['primary_key'].astype(int)

In [7]:

leg_lookup_ref.columns
##troubleshoot line to look at columns
# print(*leg_lookup_ref.columns, sep = " | ")

##create leg_lookup dict (not used anywhere)
leg_lookup_dict = (leg_lookup_ref.loc[:,['full_pk', l_name]]).set_index('full_pk')[l_name].to_dict() 

##take out multiseat legislature and create dict
multis_legs = leg_lookup_ref[~leg_lookup_ref['full_pk'].astype(str).str.endswith('00')]
print(*multis_legs.columns, sep = " | ")
ms_legs_lookup = (multis_legs.loc[:,['full_pk', l_name]]).set_index('full_pk')[l_name].to_dict()


##pks of all multiseat legislatures
ms_pks = multis_legs['full_pk'].to_list()

# leg_lookup_ref


full_pk | primary_key | district_code | state abbreviation | chamber | title | first name | last name | party | district | date assumed office | name | tenure | leader | state_code | chamber_code


## Manual Leadership Files
pulls in data from manually curated leadership positions

In [8]:
#manual leadership file for upload
leadership_positions_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\all_legs_files_w_rankings.csv"
leaders_lookup = pd.read_csv(leadership_positions_file)

#create primary key for leadership file
infl_rankings, rankings_dupes = create_pk(leaders_lookup, 'district', 'Chamber')

# for i,j in enumerate(infl_rankings['primary_key']):
#     print(type(j))
#     print(j)
# rankings_dupes.columns

infl_rankings = infl_rankings.dropna(axis = 0, subset='district')
infl_rankings.reset_index(inplace = True, drop = True)

#fill in dupes seats and full pk
rankings_dupes['full_pk'] = np.nan

# print(rankings_dupes.columns)
for i,j in enumerate(rankings_dupes['district_code']):
    district_code = j
    print(j)
    name = rankings_dupes['Last Name'].iloc[i]
    full_pks = get_key(name, ms_legs_lookup)
    # print(full_pks)
    # print(rankings_dupes.columns)
    
    for ip,p in enumerate(full_pks):
        # print(p)
        # print(type(p))
        p_str = str(int(p)).strip()
        # print(f' p = {p}, type:{type(p)}')
        # print(f' j = {j}, type:{type(j)}')

        match = re.findall(r'(?<=^\d{3})\d{3}(?=\d{2})', p_str)
        # print(match)
        if j == match[0]:
            # print('finally found it')
            # print(type(p))
            # print(f'p_str: {p_str}')
            rankings_dupes.loc[i,'full_pk'] = p_str
            seat_match = re.findall(r'\d{2}$', p_str)
            # try:
            rankings_dupes.loc[i,'seat_num'] = seat_match[0]
            # except:
                # print(f'something wrong with match: {seat_match}')
            break


issues with the district match
State Abbreviation                          WV
Chamber                                  House
full title            Lt. Governor Craig Blair
First Name                               Craig
Last Name                                Blair
Party                               Republican
district                                   NaN
tenure                                       3
leader                                        
state_code                                 NaN
chamber_code                               NaN
district_code                              NaN
001
001
001
001
002
002
002
002
003
003
003
003
004
004
004
004
005
005
005
005
006
006
006
006
007
007
007
007
008
008
008
008
009
009
009
009
010
010
010
010
011
011
011
011
012
012
012
012
013
013
013
013
014
014
014
014
015
015
016
016
016
016
017
017
017
017
018
018
019
019
020
020
021
021
022
022
023
023
024
024
025
025
026
026
027
027
028
028
029
029
030
030
031
031
032
032
033
033
034
034
035
0

In [9]:

#format data
infl_non_dupes = add_seats(df = infl_rankings)
# print(infl_non_dupes.columns)
# print(rankings_dupes.columns)
infl_non_dupes = infl_non_dupes.loc[:,['full_pk', 'primary_key', 'seat_num','First Name', 'Last Name', 'leader']]
rankings_dupes = rankings_dupes.loc[:,['full_pk', 'primary_key', 'seat_num', 'First Name', 'Last Name','leader']]
# print(infl_non_dupes.columns)
# print(rankings_dupes.columns)
leadership_files = pd.concat([infl_non_dupes, rankings_dupes])
leadership_files.reset_index(inplace=True, drop=True)
# leaders_lookup = leaders_lookup.loc[:, ['helper', "leader"]]

leadership_dict = (leadership_files.loc[:,['full_pk', 'leader']]).set_index('full_pk')['leader'].to_dict()



### Legislator DF's
these are the full compiled files with all legislature info

In [10]:
#Pulling in DF's w Pk from all_legs files
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\all_leg_dfs_w_pk')
pk_files = glob.glob('*.csv')

dfs_w_pk = {}
for p in pk_files:
    df = pd.read_csv(p)
    dfs_w_pk[p.replace('.csv', '')] = df
    # print(df.head().to_string())


## Get Influence Score
Pulls in committee data, leadership values, and tenure to calculate tenure score

In [11]:

pk_name = (leg_lookup_ref.loc[:,['primary_key', "name"]]).set_index('primary_key')["name"].to_dict() 
name_fpk = (leg_lookup_ref.loc[:,['name', "full_pk"]]).set_index('name')["full_pk"].to_dict() 


for i,j in leg_lookup_ref.items():
    print(i)
    print(j)


full_pk
0       10006300
1       10006600
2       10004900
3       10008000
4       10006100
          ...   
1980    57101501
1981    57100102
1982    57101502
1983    57100502
1984    57101002
Name: full_pk, Length: 1984, dtype: int64
primary_key
0       100063
1       100066
2       100049
3       100080
4       100061
         ...  
1980    571015
1981    571001
1982    571015
1983    571005
1984    571010
Name: primary_key, Length: 1984, dtype: int64
district_code
0       63.0
1       66.0
2       49.0
3       80.0
4       61.0
        ... 
1980    15.0
1981     1.0
1982    15.0
1983     5.0
1984    10.0
Name: district_code, Length: 1984, dtype: float64
state abbreviation
0       AL
1       AL
2       AL
3       AL
4       AL
        ..
1980    WV
1981    WV
1982    WV
1983    WV
1984    WV
Name: state abbreviation, Length: 1984, dtype: object
chamber
0        House
1        House
2        House
3        House
4        House
         ...  
1980    Senate
1981    Senate
1982    Sen

In [12]:

for k,v in name_fpk.items():
    if re.search(r'^WV|^ND', str(k)):
        print(k)
        print(v)

ND Rep. Bert Anderson (R-ND-002)
43000201
ND Rep. Dick Anderson (R-ND-006)
43000601
ND Rep. Karen Anderson (R-ND-019)
43001901
ND Rep. Landon Bahl (R-ND-017)
43001701
ND Rep. Mike Beltz (R-ND-020)
43002001
ND Rep. Mike Berg (R-ND-008)
43000801
ND Rep. Macy Bolinske (R-ND-040)
43004001
ND Rep. Glenn Bosch (R-ND-030)
43003001
ND Rep. Michael "Mike" Brandenburg (R-ND-028)
43002801
ND Rep. Collette Brown (D-ND-009)
43000901
ND Rep. Nels Christianson (R-ND-018)
43001801
ND Rep. Josh Christy (R-ND-027)
43002701
ND Rep. Liz Conmy (D-ND-011)
43001101
ND Rep. Jayme Davis (D-ND-009)
43000902
ND Rep. Gretchen Dobervich (D-ND-011)
43001102
ND Rep. Jason Dockter (R-ND-007)
43000701
ND Rep. Ty Dressler (R-ND-036)
43003601
ND Rep. Clayton Fegley (R-ND-004B)
43000401
ND Rep. Lisa Finley-DeVille (D-ND-004A)
43000402
ND Rep. Jay Fisher (R-ND-005)
43000501
ND Rep. Austin Foss (D-ND-044)
43004401
ND Rep. Kathy Frelich (R-ND-015)
43001501
ND Rep. Karen Grindberg (R-ND-041)
43004101
ND Rep. Jim Grueneich (R

In [13]:
#dfs_w_pk to dfs_w_fpk 

leg_lookup_ref
trbl_pk = []
infl_dfs = {}
for k,v in dfs_w_pk.items():
    # print(v.columns)
    state = list(set(v['State Abbreviation'].to_list()))[0]
    print(state)
    v['full_pk'] = np.nan
    if state == "ND" or state == "WV":
        
        for i,(a,b) in enumerate(zip(v['primary_key'], v['Name'])):
            print('$$$$$$$$$$$$$$$$$')
            print(b)
            # print(type(a))
            leg_per_a = leg_lookup_ref[leg_lookup_ref['primary_key'] == a].reset_index(drop = True)
            if leg_per_a.empty:
                print("the results came back empty")
                break
            else:
                print(f'length is {len(leg_per_a)}')
                print(leg_per_a.to_string())
            for ii,rl in enumerate(leg_per_a['last name']):
                if str(rl).lower() in str(b).lower():
                    print(leg_per_a.loc[ii,['full_pk']])
                    full_pk_to_assign = leg_per_a.loc[ii,['full_pk']]
                    break
            v.loc[i,['full_pk']] = full_pk_to_assign
    else:
        v['full_pk'] = v['primary_key'].astype(str) + '00'

            # print(str(a))
            # print(str(b))
            # print('#######ref value')

            # ref_val = name_fpk.get(str(b))
            # print('################')
            # if ref_val == str(b):
            #     print(True)
            # else: 
                # trbl_pk.append(str(a))
        # continue
    infl_dfs[k] = v
    # for i,(a,b) in enumerate(zip(v['primary_key'], v['Name'])):
        # print(j)
        # print(type(j))
        
        # print(str(b))
    # for i,j in enumerate(v):
# print(len(trbl_pk))

AL
AL
CT
CT
IL
IL
IN
IN
KS
KS
MO
MO
NC
NC
ND
$$$$$$$$$$$$$$$$$
ND Rep. Bert Anderson (R-ND-002)
length is 2
    full_pk  primary_key  district_code state abbreviation chamber                        title first name last name       party  district  date assumed office                                      name  tenure leader  state_code  chamber_code
0  43000201       430002            2.0                 ND   House  North Dakota Representative       Bert  Anderson  Republican       2.0                 2014          ND Rep. Bert Anderson (R-ND-002)      11    NaN        43.0           0.0
1  43000202       430002            2.0                 ND   House  North Dakota Representative     Donald  Longmuir  Republican       2.0                 2016  ND Rep. Donald "Don" Longmuir (R-ND-002)       9    NaN        43.0           0.0
full_pk    43000201
Name: 0, dtype: object
$$$$$$$$$$$$$$$$$
ND Rep. Dick Anderson (R-ND-006)
length is 2
    full_pk  primary_key  district_code state abbreviatio

In [22]:
#create coms file for reference
coms_file = r'c:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\committee info\committee update files\aggregated\comms_info.xlsx'
comms_df = pd.read_excel(coms_file)
# comms_df = comm
comms_df.head(3)

Unnamed: 0,fpk,com_name,role,state
0,22006600,Adoption and Child Welfare,Chairperson,IL
1,22004200,Adoption and Child Welfare,Vice-Chairperson,IL
2,22006300,Adoption and Child Welfare,Republican Spokesperson,IL


In [15]:
for k,v in infl_dfs.items():
    print(f'{k}: {v.head(2).to_string()}')

AL_house:    primary_key  district_code State Abbreviation Chamber                   Title First Name Last Name       Party  district  Date Assumed Office                                                                                                                           Committee List                               Name  tenure leader  state_code  chamber_code   full_pk
0       100063             63                 AL   House  Alabama Representative    Cynthia    Almond  Republican        63                 2021                                    Ethics and Campaign Finance (Vice Chair), Rules, Judiciary, Ways and Means Education, Reapportionment  AL Rep. Cynthia Almond (R-AL-063)       4    NaN          10             0  10006300
1       100066             66                 AL   House  Alabama Representative       Alan     Baker  Republican        66                 2006  Local Legislation (Chair), Education Policy (Vice Chair), Ways and Means Education, Baldwin County Legislati

In [None]:
#scoring influence



#pulling in data from legislator files, pulls in 
from collections import Counter

in_process = []
influence_scores = []
full_dfs = {}
# iterating through dfs (one for each chamber of each state execept CT)
for k,v in infl_dfs.items():


    ####################
    # whole state iteration
    ####################
    # display_markdown(f' # {k}', raw = True)
    # v = dfs.get('AL_house')
    df = v
    # print(*df.columns, sep = " | ")
    
    #Conneticut is all in one file since there committies are all joint
    #This splits them up and puts them into a list, otherwise single files get put into a list of one
    if re.search(r'^CT', str(k)):
        house = df[df['Chamber'] == "House"]
        house.reset_index(inplace=True, drop=True)

        
        senate = df[df['Chamber'] == "Senate"]
        senate.reset_index(inplace=True, drop=True)
        # #print(house.to_string())
        # #print(senate.to_string())
        dfs_temp = [house, senate]
        # print(k)
    
    else:
        dfs_temp = [df]
        # print(k)

    
    ####################
    # individual df iteration
    ####################

    #still going through each df for each chamber of each state
    for d in dfs_temp:
        d = d.reset_index(drop = True)
        # print('###########')
        # print('non-pk one')
        # print(d.columns)
        print(d.head(2))
        # break
        # print('working on: ' + str(d))
    

        
        


        # print(d.to_string())
        # #getting majority party and splitting up by dems and repubs
        party_list_uc = d['Party'].to_list()   
        d['influence_score'] = np.nan
        #going through legislators in file
        for i,hv in enumerate(d['full_pk']):
            
            
            #variable declaration
            score = 1
            first_tier = False
            second_tier = False
            other_tier = False
            in_maj_party = False
            is_chair = False
            is_vice = False
            member = False
            minority_mem = False
            



            ###################
            #get tenure and leader data 
            ###################
            try: #try looking for tenure and looking at leaders
                if d.loc[i,'tenure'] != 1: #if they are new then they shouldn't be looked up
                    # print(d.loc[i,'tenure'])
                    value = leadership_dict.get(hv)
                    # print(value)
                    d.loc[i,'leaders'] = value
                    if re.search(r'\[\d\]', str(value)):
                        #print('found a top leader')
                        if re.search(r'\[1\]', str(value)):
                            first_tier = True
                        elif re.search(r'\[2\]',str(value)):
                            second_tier = True                
                    else:
                        other_tier = True
            except:
                print('something didnt work')
                
            ###################
            #get the majority party
            ###################
            if is_majority_party(party_list_uc, str(d['Party'].iloc[i])):
                in_maj_party = True


            ###################
            #get comm info
            ###################
            state_initial = list(set(d['State Abbreviation'].to_list()))[0]
            
            com_states = ['NC', 'OH', 'IL', 'IN', 'KS', 'ND', 'OK', 'VA']
            if state_initial in com_states:
                print(f'state abbv is {state_initial}')
                #get comms list (taking out until committees are established)
                # print(type(hv))
                # print(type(comms_df.loc[0,'fpk']))
                print(f'local: {type(hv)}')
                print(f'ref: {type(comms_df.loc[0,'fpk'])}')
                
                #find all values in the comm df and find list of roles for legislator
                results = comms_df[comms_df['fpk'] == int(hv)]
                role_check = list(set(results.loc[:,'role'].to_list()))
                role_check = [x for x in role_check if not isinstance(x, float)]
                #check list of roles
                
                if len(role_check) != 0:        #checks if value returned
                    print(f"role check has type of: {type(role_check)}")
                    # print(f'role check is {len(role_check)} long')
                    print(role_check)
                    # print(f'FPK IS {hv}')

                    #only one value
                    if len(role_check) == 1: 
                        highest_role = role_check[0].strip().lower()

                    elif len(role_check) > 1:
                        
                        #take members out and recheck
                        role_check = [y for y in role_check if 'member' not in str(y).lower().strip()]
                        
                        if len(role_check) == 0:
                            highest_role = 'member'
                    
                        
            
                        
                #                
                    if 'member' in highest_role:
                        member = True

                    else:
                        for leg in role_check:  #look through each value in the role check
                            
                            if isinstance(leg, float):
                                continue
                            elif re.search(r'^[Cc]hair', str(leg)):
                                is_chair = True
                                break
                            elif re.search(r'[Vv]ice-?\s?[Cc]hair', str(leg)):
                                is_vice = True
                                break
                            elif re.search(r'[Mm]ember', str(leg)):
                                member = True
                                break
                            elif re.search(r'[Mm]inority', str(leg)):
                                minority_mem = True
                                break
                            else:
                                print("something else")
                

            ##############
            #Final scoring
            ##############

            if in_maj_party == True:
                #print('in majority party')
                if first_tier == True:
                    score = 20
                    #print("speaker")
                elif second_tier == True:
                    score = 15
                    #print("other majority leaders")

                elif is_chair == True:
                    score = 15
                    #print('chair of a committee')
                elif is_vice == True:
                    score = 10
                    #print('vice chair of a committe')
                elif other_tier == True:
                    score = 10
                    #print('other majority leadership')
                elif member == True:
                    score = 5   
            elif in_maj_party == False:
                #print('not in majority party')
                if first_tier == True:
                    score = 15
                    #print('minority leader')
                elif is_chair == True:
                    score = 15
                    #print('chair of a committee')

                elif second_tier == True:
                    score = 10
                elif is_vice == True:
                    score = 10
                    #print('vice chair of a committe')

                elif minority_mem == True:
                    score = 5
                    #print('is minority ranking mem in committee')
                elif member == True:
                    score = 5
                    #print('is a committee member')
                elif other_tier == True:
                    score = 5
                    #print('other minority leadership')


            #pull out tenure modifier
            tenure = d['tenure'].iloc[i]
            if tenure > 10:
                score += 3
            elif tenure > 6:
                score += 2
            elif tenure > 2:
                score += 1


            #make sure 20 is max score
            if score > 20:
                score = 20
            
            #if senate base score is 2
            if score == 1:
                chamber = str(d.loc[i,'Chamber'])
                if chamber == 'Senate':
                    score = 2
                



            #assign score to influence score column
            d.loc[i,'influence_score'] = score
        
        #df creation and appending to list of dfs
        full_df = d.copy()
        # print(full_df.head(2))
        final_df = d.loc[:,['full_pk', 'First Name', 'Last Name', 'Party', 'tenure', 'influence_score']]
        influence_scores.append(final_df)
        full_dfs[k] = full_df



In [24]:
#export full_dfs

# Convert DataFrames to dictionaries
full_dfs_json = {key: df.to_dict(orient="records") for key, df in full_dfs.items()}


# # Save data to JSON file
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\committee_info')
with open(f"full_dfs_w_comms_{str(date.today()).replace('-', '_')}.json", "w") as f:
    json.dump(full_dfs_json, f)
    save_file_name = f.name
    print(save_file_name)

full_dfs_w_comms_2025_03_31.json


In [25]:
#pull together all dfs
leg_infl_df = pd.concat(influence_scores)
leg_infl_df = leg_infl_df.dropna(subset='full_pk')
leg_infl_df.reset_index(drop = True, inplace= True)


leg_infl_df = leg_infl_df.rename(columns={"First Name": "first_name", "Last Name": "last_name", "Party": 'party'})
print(*leg_infl_df.columns, sep=', ')


no_influence = leg_infl_df[leg_infl_df['influence_score'].isna()]
no_influence

full_pk, first_name, last_name, party, tenure, influence_score


Unnamed: 0,full_pk,first_name,last_name,party,tenure,influence_score


In [26]:
#export
from datetime import date
year = 2025
os.chdir(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\compiling_calcs\{year}')
leg_infl_df.to_csv(f"leg_infl_df{str(date.today()).replace('-','_')}.csv", index=False)
leg_infl_df.to_excel(f"leg_infl_df{str(date.today()).replace('-','_')}.xlsx", index=False)

# leg_infl_df


In [6]:
infl_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\influence scores\2025\leg_infl_df2024_12_11.csv"
influence = pd.read_csv(infl_file)
leg_infl_df = influence.copy()

In [None]:
leg_infl_df


Unnamed: 0,full_pk,first_name,last_name,party,tenure,influence_score
0,10006300,Cynthia,Almond,Republican,4,11.0
1,10006600,Alan,Baker,Republican,19,13.0
2,10004900,Russell,Bedsole,Republican,5,11.0
3,10008000,Chris,Blackshear,Republican,9,12.0
4,10006100,Ronald,Bolton,Republican,3,11.0
...,...,...,...,...,...,...
1881,57101501.0,Darren,Thorne,Republican,0,10.0
1882,57100102.0,Ryan,Weld,Republican,9,12.0
1883,57101502.0,Thomas,Willis,Republican,1,
1884,57100502.0,Michael,Woelfel,Democrat,1,


# Pulling committee info for priority analysis

In [None]:
import json
import os

# Define the path where the file was saved
file_path = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\committee_info'

# Construct the file name dynamically if needed
from datetime import date
file = get_recent_file('full_dfs_w_comms*.json', file_path)
full_file_path = os.path.join(file_path, file)
print(full_file_path)

# Load the JSON file
with open(full_file_path, "r") as f:
    full_dfs_json = json.load(f)

# Convert dictionaries back to DataFrames
import pandas as pd
full_dfs = {key: pd.DataFrame(value) for key, value in full_dfs_json.items()}

# # Check the loaded data
print(full_dfs.keys())  # Prints the names of the DataFrames loaded

for k,v in full_dfs.items():
    print(v.head())


In [None]:
# this is to get information on committee scores for priority analaysis
comms_dfs = []
for k,v in full_dfs.items():
    # print('#############################')
    # print(k)

    # print(v.to_string())
    for i,j in enumerate(v.columns):
        # print(i)
        # print(j)
        if i == 0:
            continue
        subset = v.iloc[:,[0,i]]
        col = subset.columns[-1]

        subset[col] = subset[col].notnull().astype('int')
        subset = subset[subset[col]==1]
        
        state_match = re.match(r'(^[A-Z]{2})', str(k))
        state = state_match.group(1)

        chamber = str(k).split('_', 1)[-1]

        com = str(col)
        pk_list = subset['full_pk'].to_list()
        pks = "|".join(pk_list)
        com_df = pd.DataFrame({"state": [state], "chamber":[chamber], "committee": [com], "pks":[pks]})
        comms_dfs.append(com_df)
comms_summary = pd.concat(comms_dfs)

# comms_summary

comms_summary_final = comms_summary.assign(
    pks=comms_summary['pks'].str.split('|')  # Split the string into a list
).explode('pks')  # Create a new row for each list element
# comms_summary_final



In [None]:
#pull committee files
for k,v in full_dfs.items():
    print('############')
    print(k)
    # print(v.head(2))
    
    # print('_______________')
    v = v.drop(['leaders', 'influence_score'], axis = 1).reset_index(drop = True)
    # print(v.head(2))

    # print('_______________')
    col_select = np.r_[0:1, 15:(len(v.columns)-1)]
    v = v.iloc[:,col_select]
    # print(v.head(2))
    print((v.dropna(how = 'all', axis = 0)).head(2))

    column_to_keep = 'full_pk'

    # Drop rows where all columns except 'A' are NaN
    filtered_v = v.dropna(subset=[col for col in v.columns if col != column_to_keep], how='all').reset_index(drop = True)
    print(v.head())
    full_dfs[k] = filtered_v
    # for i,dc in enumerate(v.columns):
    #     if 'influence' in str(dc):
    #         i
    #     print(f'[{i}]: {dc}')


In [None]:
from datetime import date

comm_sum_name = f'comms_summary{str(date.today()).replace('-','_')}.xlsx'
comms_summary_final.to_excel(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\committee_info\{comm_sum_name}', sheet_name=f'{comm_sum_name.replace('.xlsx',"")}', index=False)
# comms_summary_final
        