In [2]:
import os, sys, json, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
from datetime import date
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key

### Definitions

In [3]:
def bordered(text):
    """Return ``text`` wrapped in a box drawn with Unicode line characters.

    Parameters
    ----------
    text : Any
        Value to display. It is converted with ``str()``, so numbers, ``None``
        and other objects are accepted as well as strings. Multi-line strings
        produce one boxed row per line.

    Returns
    -------
    str
        The boxed text, rows joined with newlines. Every row is padded with
        spaces to the width of the longest line.
    """
    # Always work on a string: previously non-string, non-int input fell into a
    # bare ``except`` whose fallback then failed with a TypeError.
    # An empty string has no lines; render it as a single empty row.
    lines = str(text).splitlines() or ['']
    width = max(len(line) for line in lines)

    top = '┌' + '─' * width + '┐'
    bottom = '└' + '─' * width + '┘'
    rows = ['│' + line.ljust(width) + '│' for line in lines]
    return '\n'.join([top, *rows, bottom])



In [4]:
def is_majority_party(list, x) :
    """Tell whether party ``x`` is the majority party among ``list``.

    Parameters
    ----------
    list : iterable
        Party labels, one per legislator. Each entry is converted with
        ``str()``; an entry counts as Republican if it contains "Republican"
        and as Democrat if it contains "Democrat" (so "Democratic" counts too).
        The parameter name shadows the builtin and is kept only for
        backward compatibility.
    x : str
        Party to test, expected to be "Republican" or "Democrat".

    Returns
    -------
    bool
        True when ``x`` equals the party with the strictly larger count.
        False otherwise, including when the two counts are tied (a message is
        printed in that case, since no majority exists).
    """
    labels = [str(entry) for entry in list]
    rep_count = sum("Republican" in label for label in labels)
    dem_count = sum("Democrat" in label for label in labels)

    if rep_count > dem_count:
        maj_party = "Republican"
    elif dem_count > rep_count:
        maj_party = "Democrat"
    else:
        # Previously this branch left maj_party unbound and the comparison
        # below raised UnboundLocalError. With a tie nobody is in the majority.
        print('somehow they are equal')
        return False

    return maj_party == x
    


### Data Gathering
Gather and clean the legislator data.

In [5]:

# Gather all legislator workbooks from the "done" folder.
# Committee data should be updated before pulling this.
# NOTE(review): absolute, user-specific path; consider moving it to a config cell.
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\done')

# Skip Excel's "~$name.xlsx" lock files (present while a workbook is open):
# they match *.xlsx but are not readable workbooks. Sort because glob returns
# names in arbitrary order and later cells key sheets by name, so the order
# decides which sheet wins when two workbooks share a sheet name.
legislator_files = sorted(
    name for name in glob.glob('*.xlsx') if not name.startswith('~$')
)



In [6]:
# Compile every sheet of every legislator workbook into one dict of
# DataFrames keyed by sheet name.
# sheet_name=None already returns {sheet name: DataFrame} for the whole
# workbook, so each file is parsed once instead of once per sheet.
# NOTE: a sheet name that appears in more than one workbook overwrites the
# earlier entry.
dfs = {}
for file in legislator_files:
    sheets_dict = pd.read_excel(file, engine="openpyxl", sheet_name=None)
    for sheet_name, sheet_df in sheets_dict.items():
        dfs[f'{sheet_name}'] = sheet_df




In [7]:

# NOTE: this combined frame may not be needed by the rest of the notebook.
# Keep only the first nine columns of every sheet (drops the committee
# columns that follow them), then stack all sheets into a single frame
# with a fresh 0..n-1 index.
compiling = [sheet_df.iloc[:, :9] for sheet_df in dfs.values()]

all_legs_files = pd.concat(compiling).reset_index(drop=True)

# os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024')
# all_legs_files.to_csv(f'all_legs_files_{str(date.today()).replace('-','_')}.csv', index=False)




In [8]:
# Load the legislator reference file and build full_pk -> last-name lookups.
leg_lookup = r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\leg_lookup_df.csv'
leg_lookup_ref = pd.read_csv(leg_lookup)

# Lookup over every row of the reference file.
leg_lookup_dict = leg_lookup_ref.set_index('full_pk')['Last Name'].to_dict()

# Restrict to rows whose full_pk does not end in '00'.
# NOTE(review): presumably these are the multi-seat districts, where the last
# two digits carry a seat number; confirm against create_pk / add_seats.
pk_not_ending_00 = ~leg_lookup_ref['full_pk'].astype(str).str.endswith('00')
ms_legs = leg_lookup_ref[pk_not_ending_00]

ms_legs_lookup = ms_legs.set_index('full_pk')['Last Name'].to_dict()

# for k,v in ms_legs_lookup.items():
#     print(f'{k} - type: {type(k)}')
#     print(f'{v} - type: {type(v)}')

# leg_lookup_ref_noo = leg_lookup_ref[~leg_lookup_ref['full_pk'].astype(str).str.endswith('00')]
# leg_lookup_ref_noo = (leg_lookup_ref_noo.loc[:,['full_pk', 'Last Name']]).set_index('full_pk')['Last Name'].to_dict()
# leg_lookup_ref_noo


In [9]:

# Build primary keys for the combined legislator file. create_pk returns the
# rows it could key cleanly plus the rows it flagged as duplicates.
cleaned_df, duplicates = create_pk(all_legs_files, 'district', 'Chamber')

# The loop below addresses rows with .loc[i, ...], i.e. by index *label*,
# using the enumerate position. That is only correct on a 0..n-1 index, so
# reset it first (the scoring cell does the same before its identical loop).
# Without this, a filtered index makes .loc write to the wrong rows or append
# new ones.
duplicates = duplicates.reset_index(drop=True)
duplicates['full_pk'] = np.nan
for i, last_name in enumerate(duplicates['Last Name']):
    # NOTE(review): get_key is assumed to return the full_pk stored against
    # this last name in ms_legs_lookup; a last name shared by two legislators
    # would be ambiguous. Confirm against cprl_functions.defined_functions.
    full_pk = get_key(last_name, ms_legs_lookup)
    duplicates.loc[i, 'full_pk'] = int(full_pk)

duplicates = duplicates.loc[:, ['full_pk', 'primary_key', 'First Name', 'Last Name']]

# Rows without key collisions get their full_pk from add_seats.
non_dupes = add_seats(df=cleaned_df)
non_dupes = non_dupes.loc[:, ['full_pk', 'primary_key', 'First Name', 'Last Name']]
leg_files_fpk = pd.concat([non_dupes, duplicates])


keepnames didnt work
full_pk,primary_key,State Abbreviation,Chamber,full title,First Name,Last Name,Party,district,tenure,leader,seat,state_code,chamber_code,district_code


In [10]:


# cleaned_df.loc[:,['full_pk']] = np.nan
# for i,j in enumerate(cleaned_df['Last Name']):
#     # print(j)
#     test_fpk = int(str(cleaned_df.loc[i,'primary_key'])+'00')
#     # print(test_fpk)
#     # print(type(test_fpk))
#     # break
#     results = leg_lookup_dict.get(test_fpk)
#     if str(results) == 'None':
#         print(results)
    

    # full_pk = get_key(j, leg_lookup_ref_noo)
    # print(full_pk)
    # duplicates.loc[i,['full_pk']] = int(full_pk)
    # print(full_pk)

# cleaned_df.loc['full_pk', 'primary_key', 'First Name', 'Last Name']

### Importing manually edited leadership positions file

In [11]:
# Location of the hand-edited legislator file that carries the leadership
# ('leader') column.
leadership_positions_file = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\all_legs_files_w_rankings.csv"
leaders_lookup = pd.read_csv(filepath_or_buffer=leadership_positions_file)

# Key the leadership rows the same way as the legislator rows; the second
# frame holds the rows create_pk set aside as duplicates.
(infl_rankings, rankings_dupes) = create_pk(leaders_lookup, 'district', 'Chamber')

# for i,j in enumerate(infl_rankings['primary_key']):
#     print(type(j))
#     print(j)
    

issues with the district match
primary_key                                NaN
State Abbreviation                          WV
Chamber                                  House
full title            Lt. Governor Craig Blair
First Name                               Craig
Last Name                                Blair
Party                               Republican
district                                   NaN
tenure                                       3
leader                                        
state_code                                 NaN
chamber_code                               NaN
district_code                              NaN


In [12]:

# Rows with no district cannot be keyed (the captured output shows a
# Lt. Governor row with a NaN district), so drop them before adding seats.
infl_rankings = infl_rankings.dropna(axis=0, subset='district').reset_index(drop=True)

# Fill in full_pk for the duplicate rows of the *leadership* file.
# Fix: the loop used to iterate duplicates['Last Name'], a frame that belongs
# to the legislator file and is reassigned by other cells, so rankings_dupes
# was filled from the wrong names (or a wrong number of them).
# The index is reset first because .loc[i, ...] addresses rows by label.
rankings_dupes = rankings_dupes.reset_index(drop=True)
rankings_dupes['full_pk'] = np.nan
for i, last_name in enumerate(rankings_dupes['Last Name']):
    # NOTE(review): get_key is assumed to return the full_pk stored against
    # this last name in ms_legs_lookup. Confirm against cprl_functions.
    full_pk = get_key(last_name, ms_legs_lookup)
    rankings_dupes.loc[i, 'full_pk'] = int(full_pk)

infl_non_dupes = add_seats(df=infl_rankings)
print(infl_non_dupes.columns)
print(rankings_dupes.columns)

# Keep the same five columns from both frames, then stack them.
leadership_columns = ['full_pk', 'primary_key', 'First Name', 'Last Name', 'leader']
infl_non_dupes = infl_non_dupes.loc[:, leadership_columns]
rankings_dupes = rankings_dupes.loc[:, leadership_columns]
leadership_files = pd.concat([infl_non_dupes, rankings_dupes]).reset_index(drop=True)

# full_pk -> leadership text, used by the influence-score cell.
leadership_dict = leadership_files.set_index('full_pk')['leader'].to_dict()

# for k,v in leadership_dict.items():
#     print(k,v)
# ms_legs_lookup = (ms_legs.loc[:,['full_pk', 'Last Name']]).set_index('full_pk')['Last Name'].to_dict()

keepnames didnt work
full_pk,primary_key,State Abbreviation,Chamber,full title,First Name,Last Name,Party,district,tenure,leader,state_code,chamber_code,district_code
Index(['full_pk', 'primary_key', 'State Abbreviation', 'Chamber', 'full title',
       'First Name', 'Last Name', 'Party', 'district', 'tenure', 'leader',
       'state_code', 'chamber_code', 'district_code'],
      dtype='object')
Index(['primary_key', 'State Abbreviation', 'Chamber', 'full title',
       'First Name', 'Last Name', 'Party', 'district', 'tenure', 'leader',
       'state_code', 'chamber_code', 'district_code', 'full_pk'],
      dtype='object')


In [13]:

#check for leaks (commented out otherwise)

# infl_rankings.loc[:,['full_pk']] = np.nan
# for i,j in enumerate(cleaned_df['Last Name']):
#     # print(j)
#     test_fpk = int(str(cleaned_df.loc[i,'primary_key'])+'00')
#     # print(test_fpk)
#     # print(type(test_fpk))
#     # break
#     results = leg_lookup_dict.get(test_fpk)
#     if str(results) == 'None':
#         print(results)

In [14]:

# leaders_lookup['helper'] = leaders_lookup['State Abbreviation'].astype(str)+ "-"+ leaders_lookup['Chamber'].astype(str)+ "-"+leaders_lookup['district'].astype(str)

# leaders_lookup.loc[leaders_lookup['helper'].str.contains(r'^ND-House'), 'helper'] = leaders_lookup['State Abbreviation'].astype(str)+ "-"+ leaders_lookup['Chamber'].astype(str)+ "-"+leaders_lookup['district'].astype(str)+"-"+leaders_lookup['Last Name'].astype(str)


# leaders_lookup = leaders_lookup.loc[:, ['helper', "leader"]]
# leaders_lookup.dropna(subset='leader', inplace=True)
# leaders_lookup.reset_index(inplace=True, drop=True)
# leaders_dict = leaders_lookup.to_dict()


#print(k,v,"\n") for k,v in leaders_dict.items()]

### Influence Score calculation
Pulls in committee data, leadership values, and tenure to calculate the influence score

In [None]:

# Compute an influence score for every legislator in every sheet of `dfs`,
# combining leadership tier, committee roles, majority-party membership and
# tenure, then stack the per-chamber results into `leg_infl_df`.
from collections import Counter  # NOTE(review): not used in this cell; kept in case other cells rely on it


def _committee_flags(roles):
    """Summarise one legislator's committee cells.

    Returns the tuple (is_chair, is_vice, member, minority_mem). Float cells
    (NaN, i.e. no entry for that committee) are ignored. Each remaining cell
    sets at most one flag, tested in this order: text starting with "Chair",
    text containing "Vice Chair"/"Vice-Chair"/"ViceChair", text containing
    "Member", text containing "Minority".
    """
    is_chair = is_vice = member = minority_mem = False
    for role in roles:
        if isinstance(role, float):
            continue
        role_text = str(role)
        if re.search(r'^[Cc]hair', role_text):
            is_chair = True
        elif re.search(r'[Vv]ice-?\s?[Cc]hair', role_text):
            is_vice = True
        elif re.search(r'[Mm]ember', role_text):
            member = True
        elif re.search(r'[Mm]inority', role_text):
            minority_mem = True
    return is_chair, is_vice, member, minority_mem


def _base_score(in_maj_party, first_tier, second_tier, other_tier,
                is_chair, is_vice, member, minority_mem):
    """Return the score of the first matching role, or 1 when none matches.

    The two ladders are checked top to bottom; the order decides which role
    wins when a legislator holds several.
    """
    if in_maj_party:
        ladder = [
            (first_tier, 20),    # speaker / top leader
            (second_tier, 15),   # other majority leaders
            (is_chair, 15),      # chair of a committee
            (is_vice, 10),       # vice chair of a committee
            (other_tier, 10),    # other majority leadership
            (member, 5),         # committee member
        ]
    else:
        ladder = [
            (first_tier, 15),    # minority leader
            (is_chair, 15),      # chair of a committee
            (second_tier, 10),
            (is_vice, 10),       # vice chair of a committee
            (minority_mem, 5),   # minority ranking member in a committee
            (member, 5),         # committee member
            (other_tier, 5),     # other minority leadership
        ]
    for holds_role, points in ladder:
        if holds_role:
            return points
    return 1


def _tenure_bonus(tenure):
    """Return the tenure modifier: +3 above 10, +2 above 6, +1 above 2, else 0."""
    if tenure > 10:
        return 3
    if tenure > 6:
        return 2
    if tenure > 2:
        return 1
    return 0


influence_scores = []
for k, v in dfs.items():
    df = v
    # Connecticut is all in one sheet since its committees are all joint.
    # Split it into House and Senate; every other sheet is a list of one.
    if re.search(r'^CT', str(k)):
        house = df[df['Chamber'] == "House"].reset_index(drop=True)
        senate = df[df['Chamber'] == "Senate"].reset_index(drop=True)
        dfs_temp = [house, senate]
    else:
        dfs_temp = [df]

    for chamber_df in dfs_temp:
        cleaned_df, duplicates = create_pk(chamber_df, 'district', 'Chamber', drop_extra_codes=True)
        cleaned_df = cleaned_df.reset_index(drop=True)
        duplicates = duplicates.reset_index(drop=True)

        # Duplicate-key rows get their full_pk by last-name lookup.
        # NOTE(review): get_key is assumed to return the full_pk stored
        # against this last name in ms_legs_lookup. Confirm.
        duplicates['full_pk'] = np.nan
        for i, last_name in enumerate(duplicates['Last Name']):
            full_pk = get_key(last_name, ms_legs_lookup)
            duplicates.loc[i, 'full_pk'] = int(full_pk)

        non_dupes = add_seats(df=cleaned_df)

        # Move full_pk to the front of both frames so it is column 0 after concat.
        non_dupes.insert(0, 'full_pk', non_dupes.pop('full_pk'))
        duplicates.insert(0, 'full_pk', duplicates.pop('full_pk'))

        d = pd.concat([non_dupes, duplicates]).reset_index(drop=True)

        # Committee columns start right after the first column whose name
        # begins with "leader". Fail loudly when it is missing: previously
        # index_start silently kept the value from the previous sheet (or
        # raised NameError on the first one).
        col_list = d.columns.to_list()
        index_start = next(
            (pos + 1 for pos, col in enumerate(col_list) if re.search(r'^leader', str(col))),
            None,
        )
        if index_start is None:
            raise ValueError(
                f"sheet {k!r}: no column starting with 'leader'; cannot locate committee columns"
            )

        # The last column is deliberately left out, as in the original selection.
        # NOTE(review): confirm the final column is never a committee column.
        committee_cells = d.iloc[:, index_start:d.shape[1] - 1]

        party_list_uc = d['Party'].to_list()
        d['influence_score'] = np.nan
        for i, hv in enumerate(d['full_pk']):
            # Leadership text for this legislator, e.g. tagged "[1]" or "[2]".
            value = leadership_dict.get(hv)
            d.loc[i, 'leaders'] = value
            leader_text = str(value)

            first_tier = False
            second_tier = False
            other_tier = False
            if re.search(r'\[\d\]', leader_text):
                if re.search(r'\[1\]', leader_text):
                    first_tier = True
                elif re.search(r'\[2\]', leader_text):
                    second_tier = True
            elif pd.notna(value) and leader_text.strip():
                # Fix: "other leadership" now requires an actual leadership
                # entry. Before, a missing value (None/NaN) also landed here,
                # so every rank-and-file legislator was scored as a leader.
                other_tier = True

            in_maj_party = is_majority_party(party_list_uc, str(d['Party'].iloc[i]))

            # Read this row's own committee cells (positional), rather than
            # going through a dict keyed by full_pk, which collapses rows when
            # a key repeats or is NaN.
            is_chair, is_vice, member, minority_mem = _committee_flags(
                committee_cells.iloc[i].to_list()
            )

            score = _base_score(in_maj_party, first_tier, second_tier, other_tier,
                                is_chair, is_vice, member, minority_mem)
            score += _tenure_bonus(d['tenure'].iloc[i])

            # make sure 20 is max score
            score = min(score, 20)

            # No role and no tenure bonus: leave influence_score as NaN.
            if score == 1:
                continue

            d.loc[i, 'influence_score'] = score

        # df creation and appending to list of dfs
        final_df = d.loc[:, ['full_pk', 'First Name', 'Last Name', 'influence_score']]
        influence_scores.append(final_df)


#pull together all dfs and export
leg_infl_df = pd.concat(influence_scores).reset_index(drop=True)
# os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files')
# leg_infl_df.to_csv("leg_infl_df.csv", index=False)

leg_infl_df

### Defunct
The chunk below is vestigial, left over from using the rankings list from the ncls website.

The cell below is an older chunk that looked through the raw legislator files; the cell above contains the same information.

In [31]:
# leader_dfs = []
# for i,j in enumerate(leader_rankings_df['position']):
#     if re.search(r'[Ss]peaker', str(j)):
#         continue
#     elif re.search(r'[Mm]ajority|[Mm]inority', str(j)):
#         # #print(j)
#         continue
#     else:
#         # #print('***not found***')
#         # #print(j)
#         # #print("**************")
#         # #print(leader_rankings_df.iloc[i,:].to_string())
#         df2 = pd.DataFrame(columns=['state', 'position', 'chamber'])
#         df2 = df2._append(leader_rankings_df.iloc[i], ignore_index=True)
#         # #print(type(df))
#         leader_dfs.append(df2)
#         # #print('\n')

# outliers = pd.concat(leader_dfs)


In [None]:
# ranking_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leader_rankings.csv"
# rankings = pd.read_csv(ranking_file)


# file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\leadership_ranking.xlsx"
# leader_rankings_df = pd.read_excel(file)
# #print(*leader_rankings_df.columns)
# leader_rankings_df['state'] = leader_rankings_df['state'].fillna(method="ffill")

# n = len(leader_rankings_df)
# break_point = False

# for i,j in enumerate(leader_rankings_df['state']):
#     if "Wyoming" in str(j) and "Alabama" in leader_rankings_df['state'].iloc[i+1]:
#         index_stop = i + 1
#         break_point = True


#     else:
#         continue

#     if break_point == True:
#         house_list = ['House']*index_stop
#         senate_list = ['Senate']*(n-index_stop)
#         full_list = house_list + senate_list
#         leader_rankings_df['chamber'] = full_list
#         leader_rankings_df.dropna(inplace=True)
#         break

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data')
# leader_rankings_df.to_csv('leader_rankings.csv', index_label= False, index=False)

# #print(leader_rankings_df[leader_rankings_df['state'].str.contains('Connecticut')].to_string())

