### Imports

In [129]:

import os, sys, json, datetime, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

### Definitions

In [130]:
def bordered(text):
    
    if isinstance(text, int):
        text = str(text)
    try:
        lines = text.splitlines()
        width = max(len(s) for s in lines)
        res = ['┌' + '─' * width + '┐']
        for s in lines:
            res.append('│' + (s + ' ' * width)[:width] + '│')
        res.append('└' + '─' * width + '┘')
        return '\n'.join(res)
    except:
        lines = [text]
        width = len(str(lines[0]))
        res = ['┌' + '─' * width + '┐']
        for s in lines:
            res.append('│' + (s + ' ' * width)[:width] + '│')
        res.append('└' + '─' * width + '┘')
        return '\n'.join(res)



    

### gathering leg files

In [131]:
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\done')
legislator_files = glob.glob('*.xlsx') 

for i,file in enumerate(legislator_files):
    if '_legislators' not in str(file).lower():
        print("deleting: " + str(legislator_files[i]))
        del legislator_files[i] 

In [132]:

#compiles legislator files into one file
dfs = []
for i,file in enumerate(legislator_files):
    print('working on file:' + str(file))
    # file = legislator_files[0]
    # xls = pd.ExcelFile(file)
    sheets_dict = pd.read_excel(file, engine="openpyxl", sheet_name=None)
    sheet_names = list(sheets_dict.keys())
    for s in sheet_names:
        df = pd.read_excel(file, engine="openpyxl", sheet_name=s)
        df = df.iloc[:,:9]
        dfs.append(df)

    df = pd.concat(dfs)

working on file:AL_legislators.xlsx
working on file:CT_legislators.xlsx
working on file:IL_legislators.xlsx
working on file:IN_Legislators.xlsx
working on file:KS_legislators.xlsx
working on file:MO_legislators.xlsx
working on file:NC_legislators.xlsx
working on file:ND_legislators.xlsx
working on file:NM_legislators.xlsx
working on file:OH_legislators.xlsx
working on file:OK_legislators.xlsx
working on file:VA_legislators.xlsx
working on file:WV_legislators.xlsx


In [133]:
all_legs = pd.concat(dfs)
# print(all_legs.columns)

all_legs['helper'] = all_legs['State Abbreviation'].astype(str)+ "-"+ all_legs['Chamber'].astype(str)+ "-"+all_legs['district'].astype(str)
all_legs.reset_index(inplace = True, drop = True)
# duplicates = all_legs.index.duplicated()
# all_legs[duplicates])

all_legs.loc[all_legs['helper'].str.contains(r'^ND-House'), 'helper'] = all_legs['State Abbreviation'].astype(str)+ "-"+ all_legs['Chamber'].astype(str)+ "-"+all_legs['district'].astype(str)+"-"+all_legs['Last Name'].astype(str)
first_column = all_legs.pop('helper')
all_legs.insert(0,'helper', first_column) 


# export
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files')
all_legs.to_csv('list_of_legislators_11_15_2024.csv', index=False)

### gathering attendance data

In [156]:
# import files
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data')
events = glob.glob("*.xlsx")
print(events)


['DE_LEG_ED_Dinner_2023.xlsx', 'ECLS_2024.xlsx', 'ElevateNC_C4_M3.xlsx', 'ElevateNC_C4_M4.xlsx', 'HKF_C10_S1.xlsx', 'HKF_Regional_Visit_(FAU).xlsx', 'HSPF_C4_M1.xlsx', 'HSPF_C4_M2.xlsx', 'HSPF_C4_M3.xlsx', 'MO_SLR_2023.xlsx', 'NCCCS_M4.xlsx', 'NC_EC_Roundtable_2024.xlsx', 'NC_HLR_2024.xlsx', 'ND_Literacy_taskforce_2024.xlsx', 'ND_SLR_2023.xlsx', 'ND_SLR_2024.xlsx', 'ND_TRR_M1.xlsx', 'ND_TRR_m2.xlsx', 'ND_TRR_m3.xlsx', 'OH_SLR_2023.xlsx', 'OH_SLR_2024.xlsx', 'OK_SLR_2023.xlsx', 'OK_SLR_2024.xlsx', 'SC_Leg_Ed_Dinner_2023.xlsx', 'The_Path_Forward_2024.xlsx', 'WV_SLR_2023.xlsx', 'WV_SLR_2024.xlsx']


In [157]:
#regex list set up
state_list = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", 
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", 
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", 
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", 
    "New Hampshire", "New Jersey", "New Mexico", "New York", 
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", 
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", 
    "West Virginia", "Wisconsin", "Wyoming", "District of Columbia"
]
state_abbreviations = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "DC"
]
#editing for finding state intitals at beginning of string only
state_abbreviations_reg = []
for abv in state_abbreviations:
    for_regex = f'^{abv}'
    state_abbreviations_reg.append(for_regex)
#compiling regex patterns for looking for states
state_pat = re.compile("|".join(state_list))
state_abv_pat = re.compile("|".join(state_abbreviations_reg))
# print(state_abv_pat)
state_ref = dict(zip(state_list, state_abbreviations))

### Filling in State info

looks for state names in the title, org, and state fields

In [158]:
dfs = []
vals_changed = 0
for event in events:
    df = pd.read_excel(event)
    # print('######################')
    # print(bordered(event))
    
    # print(*df.columns)
    event_name = str(event).split('.')[0].strip().replace('_', ' ')
    df = df.iloc[:,:8]
    df.loc[:,'event name'] = event_name
    
    break_all = False
    # #print(df)
    # continue
    for i,state in enumerate(df['state']):
       
        # #print('----------------------------')
        # if isinstance(state, float):
        # if re.search(r'[Rr]epresentative|[Ss]enator|[Ll]egislator',str(df['title'].iloc[i])) or re.search(r'[Ss]enate|[Hh]ouse of ([Rr]epresentatives)?(Delegates)?|[Dd]istrict|[Ss]tate [Hh]ouse', str(df['org'].iloc[i])):
    
    
        # display_markdown(f'### title', raw=True)
        # print(df.loc[i,'title'])
        # display_markdown(f'### org', raw=True)
        # print(df.loc[i,'org'])
    
    
    
        # continue
        #print("^^^^^^^^^^^")
        #print("found a match")
        # #print(df.loc[i,['first_name', 'last_name']])
        
        testing_string = str(df['title'].iloc[i]) + " " + str(df['org'].iloc[i])
        # #print(testing_string)
        testing_string = testing_string.lstrip('nan').lstrip().strip()
        # #print(re.match(r'[Rr]epresentative|[Ss]enator|[Ll]egislator|[Ss]enate|[Hh]ouse of ([Rr]epresentatives)?(Delegates)?|[Dd]istrict|[Ss]tate [Hh]ouse',str(testing_string)))
        # continue
        # #print('###########')
        # #print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
        # #print('\n')
        state_match_uc = re.findall(state_pat, str(df['org'].iloc[i]))
        state_match = [x for x in state_match_uc if len(x) > 0]
        
        # First match test
        if len(state_match) == 0:
            #print('no regular state match')
            #print(state_match_uc)
            state_abv_match_uc = re.findall(state_abv_pat, str(df['org'].iloc[i]))
            state_abv_match = [x for x in state_abv_match_uc if len(x) > 0]
            # Second match test
            if len(state_abv_match) == 0:
                #print('no state abbreviation match')
                #print(state_abv_match_uc)
                state_abv_event_match_uc = re.findall(state_abv_pat, str(df['event name'].iloc[i]))
                state_abv_event_match = [x for x in state_abv_event_match_uc if len(x) > 0]
                # Third match test
                if len(state_abv_event_match) == 0:
                    #print('no state abv event match')
                    #print(state_abv_event_match_uc)
                    break
                elif len(state_abv_event_match) > 1:
                    #print('more than one match?')
                    break_all = True
                    break
                else:
                    #print("abv in event match")
                    state_val = str(state_abv_event_match[0])
                    # df.loc[i,'state'] = None
                    df.loc[i,'state'] = str(df.loc[i,'state'])
                    df.loc[i,'state'] = state_val
                    #print(state_val)
                    vals_changed += 1
            elif len(state_abv_match) > 1:
                #print('more than one match?')
                #print(state_abv_match)
                #print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
                break_all = True
                break
            else:
                #print("regular abreviation match")
                
                state_val = str(state_abv_match[0])
                # df.loc[i,'state'] = None
                df.loc[i,'state'] = str(df.loc[i,'state'])
                df.loc[i,'state'] = state_val
                #print(state_val)
                vals_changed += 1

            # #print('###########')
            # #print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
            # #print('\n')
            # break
        elif len(state_match) > 1:
            #print("more than one match?")
            break_all = True
            break
        else:
            #print("normal state match")
            state_val_dirty = str(state_match[0])
            state_val = state_ref.get(state_val_dirty)
            df.loc[i,'state'] = str(df.loc[i,'state'])
            # df.loc[i,'state'] = None
            df.loc[i,'state'] = state_val
            #print(state_val)
            vals_changed += 1
        # else:
        #     # #print('#########################')
        #     # #print('NOT A REP OR SEN')
        #     # #print(df.loc[i,['first_name','last_name','title', 'org']])
        #     continue
            # #print(df.loc[i, list(df.columns[3:5]) + [df.columns[-1]]])
            # #print('\n')
    if break_all == True:
        break
    dfs.append(df)
event_data = pd.concat(dfs)
event_data.reset_index(inplace=True, drop = True)

In [159]:

#looks for state names and replaces them with state initials
for i,j in enumerate(event_data['state']):
    
    if isinstance(j, float):
        continue
    elif re.search(r'[A-Z]{2}', str(j)):
        continue
    else:
        val = state_ref.get(str(j))
        event_data.loc[i,'state'] = str(val)

In [160]:
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\exports')
# event_data.to_csv("event_data_export_11_7_2024.csv", index=False)

### refining data

This section refines the data done into only legislators and then split by whether districts are found or not

Later on hopefully we wont need to split up by no districts

In [161]:
#pattern set up
title_pattern = r'[Rr]epresentative|[Ss]enator|[Ll]egislator'
org_pattern = r'[Ss]enate|[Hh]ouse of ([Rr]epresentatives)?(Delegates)?|(?<!School )(?:House District|District)|[Ss]tate [Hh]ouse'

In [162]:

#filter for state legislators
filtered_df = event_data[event_data['title'].astype(str).apply(lambda x: bool(re.search(title_pattern, x))) |
                 event_data['org'].astype(str).apply(lambda x: bool(re.search(org_pattern, x)))]


In [163]:

#filter out where can't find district number
no_districts = filtered_df[~(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}|[Dd](-|\s)?\d{2,3}', regex=True) | 
                filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}|[Dd](-|\s)?\d{2,3}', regex=True))]



  no_districts = filtered_df[~(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}|[Dd](-|\s)?\d{2,3}', regex=True) |
  filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}|[Dd](-|\s)?\d{2,3}', regex=True))]


In [164]:

#Data export
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\exports')
# no_districts.to_csv('no_districts.csv', index=False)


#regine values with districts
w_districts = filtered_df[(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) | 
                filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True))]

w_districts.reset_index(inplace=True, drop=True)
w_districts['chamber'] = ""

  w_districts = filtered_df[(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) |
  filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True))]


In [168]:
i = 0
for a,b in zip(w_districts.title, w_districts.org):
    # print('#######################')
    # print('***********')
    # print(a)
    # print('***********')
    # print(b)
    # continue
    has_a = False
    has_b = False
    if 'district' in str(a).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a)):
        match_a = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a))
        match_a = [x for x in match_a if len(x) > 0]
        if len(match_a) == 0:
            print('no results for title')
            print(a)
            
        else:
            has_a = True
            match = match_a[0]
            # print("a match: " + match)
            # print('################')
            # print(match_a)
        # print(str(dis))
    
    if 'district' in str(b).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b)):
        match_b = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b))
        match_b = [x for x in match_b if len(x) > 0]
        if len(match_b) == 0:
            print('no results for org')
            print(b)
        
        else:
            has_b = True
            match = match_b[0]
            # print("b match: " + match)
            # print('################')
            # print(match_b)
    # else:
    #     print('no results')
    #     print(a)
    #     print(b)


    if has_b == True or has_a == True:
        match_final = re.findall(r'\d+[A-Za-z]?', str(match))
        # print("final match: " + str(match_final[0]))
        # print("putting it on row: " + str(i))
        w_districts.loc[i, 'district'] = str(match_final[0]).strip().lstrip('0')

    i +=1

### Fix for no districts
The chunk below brings in a manually edited file that incorporates districts from match where available
missing info mostly comes from states where we have not pulled legislator data

In [169]:
patch_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\archive\no_districts_attendance_patch.xlsx"
districts_patch = pd.read_excel(patch_file)

patched_df = pd.concat([w_districts,districts_patch])
patched_df.reset_index(inplace=True, drop=True)
i = 0
for a,b in zip(patched_df.title, patched_df.org):
    if re.search(r'[Hh]ouse|[Ss]enate', str(b)):
        if re.search(r'[Hh]ouse', str(b)):
            chamber = "House"
        elif re.search(r'[Ss]enate', str(b)):
            chamber = "Senate"
    elif re.search(r'[Rr]epresentative|[Ss]enator|[Dd]elegate', str(a)):
        if re.search(r'[Rr]epresentative|[Dd]elegate', str(a)):
            chamber = "House"
        elif re.search(r'[Ss]enator', str(a)):
            chamber = "Senate"

    try:
        patched_df.loc[i,'chamber'] = str(chamber)
        i += 1
    except:
        i += 1
        continue



### compiling events

takes all of the events 

In [170]:



#creating helper column
patched_df['helper'] = patched_df['state'].astype(str)+ "-"+ patched_df['chamber'].astype(str)+ "-"+patched_df['district'].astype(str)
patched_df.loc[patched_df['helper'].str.contains(r'^CT-Senate', regex=True), 'helper'] = patched_df['state'].astype(str)+ "-"+ patched_df['chamber'].astype(str)+ "-S"+patched_df['district'].astype(str)
patched_df.loc[patched_df['helper'].str.contains(r'^ND-House'), 'helper'] = patched_df['state'].astype(str)+ "-"+ patched_df['chamber'].astype(str)+ "-"+patched_df['district'].astype(str)+"-"+patched_df['last_name'].astype(str)
patched_df.loc[patched_df['state'].isna() | (patched_df['state'] == "") | (patched_df['district'].isna()), 'helper'] = None
patch_minus_nan = patched_df[~(patched_df['helper'].isna())]
# patched_df.loc[patched_df['helper'].str.contains(r'^ND-House'), 'helper'] = patched_df['state'].astype(str)+ "-"+ patched_df['chamber'].astype(str)+ "-"+patched_df['district'].astype(str)+"-"+patched_df['last_name'].astype(str)

# print(patch_minus_nan.columns)
thi_states_df = patch_minus_nan.loc[:,['helper','first_name', 'last_name', 'honorific', 'title', 'org', 'district',
       'role', 'state', 'event name']]

grouped_df = thi_states_df.groupby('helper').agg({
    'first_name': 'first',
    'last_name': 'first',
    'honorific': 'first',
    'title': 'first',
    'org': 'first',
    'district': 'first',
    'state': 'first',
    'event name': lambda x: '|'.join(
        f"{sc} ({ac})" if not pd.isna(ac) else f"{sc}"
        for sc, ac in zip(thi_states_df.loc[x.index, 'event name'], thi_states_df.loc[x.index, 'role'])),

}).reset_index()
# grouped_df.reset_index()
grouped_df.rename(columns={'event name': 'events'}, inplace=True)





## Scoring

Cell below calculates the activities score from the attendance data

In [None]:

grouped_df.loc[:, 'activities_score'] = 0
for i,j in enumerate(grouped_df['events']):
    
    
    # split up events
    event_split = str(j).split('|')
    events = ";".join(event_split)
    # if len(event_split) < 2:
    #     continue


    # print(grouped_df.loc[i,['first_name','last_name' ]])
    fname = grouped_df.at[i,'first_name']
    lname = grouped_df.at[i,'last_name']


    names = [fname, lname]
    name = " ".join(names)
    
    
    
    
    # display_markdown(f' ## {name}', raw=True)
    # print(bordered(events))


    scores = []
    for event in event_split:

        # display_markdown(f' ## {event}', raw=True)
        # #print(name)
        score = 0
        speaker = False
        is_hkf = False
        dev_program = False
        in_state = False
        out_state = False
        is_slr = False
        dinner_or_lunch = False
        # non_slr = False
        speaker = False
        # #print('#################')
        # #print(*grouped_df.loc[i,['helper','first_name', 'last_name', 'events']], sep=" \ ")
        
        
        # #print(bordered(event))
        if re.search(r'\(.+\)', str(event)):
            match = re.findall(r'\(.+\)', str(event))
            match_refine = [x for x in match if len(x) != 0]
            #print("match refine results", match_refine)
            if len(match_refine) != 0:
                for m in match_refine:
                    if re.search('speaker|presenter', str(m).lower()):
                        print('found a speaker')
                        speaker = True
                    elif 'HKF' in str(m):
                        #print('THERE IS HKF IN THE RESULTS')
                        is_hkf = True
                
        
        
        if re.search(r'[Dd]inner|[Ll]unch', str(event)):
            dinner_or_lunch = True


        state = str(grouped_df.loc[i,'helper']).split('-')[0].strip()
        
        # #print(re.match('ECLS', str(event)))
        
        if 'ECLS' not in str(event) or "HKF" not in str(event):
            #print("no ecls or hKF")
            try:
                event_state = re.findall(state_abv_pat, str(event))[0].strip()
                if event_state == state:
                #print("states match")
                    in_state = True
                else:
                    out_state = True
            except:
                
                print(str(event))
                print('no state match')
                

            

        else:
            out_state = True
            
            

        if 'HSPF' in str(event) or 'Elevate' in str(event):
            dev_program = True

        if re.search(r'SLR|HLR',str(event)):
            is_slr = True

        if re.search(r'\s[Mm]\d', str(event)):
            non_slr = True
        
        variables = [
        speaker,
        is_hkf,
        dev_program,
        in_state,
        out_state,
        is_slr,
        dinner_or_lunch
        ]
        # #print('quick look at logic')
        # for var_name, var_value in zip(['speaker', 'is_hfk', 'dev_program', 'in_state', 'out_state', 'is_slr', 'dinner_or_lunch', 'non_slr', 'out_of_state'], variables):
        #     #print(bordered(f"{var_name}: {var_value}"))
        

        #Event data
        if is_slr == True:
            score += 15
            # print(f'adding 15 for {name} due to being an slr')
        # else:
        #     score += 10
        #     #print(f'adding 10 for {name}')

        elif dev_program == True:
            score += 15
            # print(f'adding 15 for {name} due to being in an dev program')
        elif dinner_or_lunch == True:
            score += 5
            # print(f'adding 5 for {name} due to being a lunch or dinner')
        else:
            score += 10
            # print("adding 10 for full day event with no other attributes")

        
        #check for speaker
        if speaker == True:
            if in_state == True:
                score += 0
                #if in state no additional points
                # print(f'adding 0 for {name} for being in state speaker')
            elif out_state == True:
                #if out of state add 5 more points for speakers
                # print(f'adding 5 for {name} due ot being a speaker at an out of state event')
                score += 5
        
        # check for hkf
        if is_hkf == True:
            score += 20
            print(f'adding 20 for {name} due to being hkf')


        
        # print(bordered(score))
        scores.append(score)

    # display_markdown(f' ### {name}', raw=True)
    # print(scores)
    total = sum(scores)
    # print("total: ",total)
    
    
    grouped_df.loc[i, 'activities_score'] = total


            
print(grouped_df.to_string())

    # continue

        # print("%%%%%%%%%%%%%%%")
        # print(*match_refine, sep=' - ')
        # print('%%%%%%%%%%%%%%%')

    # continue

    # speaker = False
    # for event in event_split:
    #     if re.search(r'\(.+\)', str(event)):
    #         match = re.findall(r'\(.+\)', str(event))
    #         match = match[0]
    #         if 'speaker' in str(match).lower():
    #             speaker = True
            
    # if len(event_split) == 1

In [172]:
relationship_scores = grouped_df.loc[:,['helper','first_name','last_name','activities_score', 'events']]
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files')
relationship_scores.to_csv('relationship_scores.csv', index = False)