## Setup

### Imports

In [2]:
#importing modules
import os, sys, json, datetime, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

import sys
# print(sys.executable)

from setuptools import find_packages
# print(find_packages())


from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file

from datetime import date


### Definitions



In [3]:
def clean_str(str):
    cl = str.lower().replace("-","").replace(" ","").strip()
    return cl


In [4]:
def difference(string1, string2):
      # Split both strings into list items
  string1 = string1.split()
  string2 = string2.split()

  A = set(string1) # Store all string1 list items in set A
  B = set(string2) # Store all string2 list items in set B
 
  str_diff = A.symmetric_difference(B)
  isEmpty = (len(str_diff) == 0)
 
  if isEmpty:
    print("No Difference. Both Strings Are Same")
  else:
    print("The Difference Between Two Strings: ")
    print(str_diff)
  
  print('The programs runs successfully.')

# Driver code to call a function
usr_str1 = 'Educative is good'
usr_str2 = 'Educative is bad'
output = difference(usr_str1, usr_str2)


The Difference Between Two Strings: 
{'good', 'bad'}
The programs runs successfully.


## Data Gathering

### Leg Lookup

In [5]:
#pull in recent year leg lookup

leg_path = fr"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\key_creation\2025"
leg_lookup_file = get_recent_file('*.xlsx', leg_path)
# print(leg_lookup_file)
leg_lookup_ref = pd.read_excel(leg_lookup_file)


In [6]:
#cleans up any na's
leg_lookup_ref = leg_lookup_ref.dropna(how='any', subset=['district_code'])
# leg_lookup_ref_2024 = leg_lookup_ref[~leg_lookup_ref['recorded_year'] == 2025]
# leg_lookup_ref = leg_lookup_ref[leg_lookup_ref['recorded_year'] == 2025]

leg_lookup_ref = leg_lookup_ref.reset_index(drop= True)
leg_lookup_ref.head()


# dupes = leg_lookup_ref[leg_lookup_ref['full_pk'].duplicated(keep=False)]
# dupes


Unnamed: 0,full_pk,primary_key,district_code,state abbreviation,chamber,title,first name,last name,party,district,date assumed office,name,tenure,leader,state_code,chamber_code
0,10006300.0,100063,63.0,AL,House,Alabama Representative,Cynthia,Almond,Republican,63.0,2021,AL Rep. Cynthia Almond (R-AL-063),4,,10.0,0.0
1,10006600.0,100066,66.0,AL,House,Alabama Representative,Alan,Baker,Republican,66.0,2006,AL Rep. Alan Baker (R-AL-066),19,,10.0,0.0
2,10004900.0,100049,49.0,AL,House,Alabama Representative,Russell,Bedsole,Republican,49.0,2020,AL Rep. Russell Bedsole (R-AL-049),5,,10.0,0.0
3,10008000.0,100080,80.0,AL,House,Alabama Representative,Chris,Blackshear,Republican,80.0,2016,AL Rep. Chris Blackshear (R-AL-080),9,,10.0,0.0
4,10006100.0,100061,61.0,AL,House,Alabama Representative,Ronald,Bolton,Republican,61.0,2022,"AL Rep. Ronald ""Ron"" Bolton (R-AL-061)",3,,10.0,0.0


In [7]:
#loop_group creation (last_names associated with key)

#groupby data to get primary key and last names associated with it
loop_group = leg_lookup_ref.groupby(['primary_key']).agg({
    'last name': lambda x: '|'.join(
        list(set(f"{sc}" if not pd.isna(sc) else "not found"
        for sc in leg_lookup_ref.loc[x.index, 'last name'])))
}).reset_index()
loop_group['primary_key'] = loop_group['primary_key'].astype(int)
#set up dict for lookup
loop_dict = loop_group.set_index('primary_key')['last name'].to_dict()
for name, data in loop_group.items():
    print(data)

0       100001
1       100002
2       100003
3       100004
4       100005
         ...  
1917    571013
1918    571014
1919    571015
1920    571016
1921    571017
Name: primary_key, Length: 1922, dtype: int64
0                Pettus
1              Harrison
2             Underwood
3                 Moore
4              Crawford
             ...       
1917    Oliverio|Garcia
1918             Taylor
1919      Willis|Thorne
1920     Barrett|Rucker
1921      Takubo|Nelson
Name: last name, Length: 1922, dtype: object


### Attendance data

#### Pull attendance Data

In [8]:
# import all attendance data files
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\attendance\attendance data')
events = glob.glob("*.xlsx")
print(*events, sep='\n')

2024_WSELR_attendance_data.xlsx
2025_GAR.xlsx
2025_HLR.xlsx
2025_HLR_bootcamp.xlsx
DE_LEG_ED_Dinner_2023.xlsx
ECLS_2024.xlsx
ECLS_2024_v2.xlsx
ElevateNC_C4_M3.xlsx
ElevateNC_C4_M4.xlsx
ElevateNC_C5_M2_2024.xlsx
HBCU_Caucus_2024.xlsx
HKF_C10_S1.xlsx
HKF_Regional_Visit_FAU.xlsx
HSPF_C4_M1.xlsx
HSPF_C4_M2.xlsx
HSPF_C4_M3.xlsx
IL_SLR_2023.xlsx
MO_SLR_2023.xlsx
NCCCS_M4.xlsx
NC_EC_Roundtable_2024.xlsx
NC_HLR_2024.xlsx
ND_Literacy_taskforce_2024.xlsx
ND_SLR_2023.xlsx
ND_SLR_2024.xlsx
ND_TRR_M1.xlsx
ND_TRR_m2.xlsx
ND_TRR_m3.xlsx
OH_SLR_2023.xlsx
OH_SLR_2024.xlsx
OK_SLR_2023.xlsx
OK_SLR_2024.xlsx
SC_Leg_Ed_Dinner_2023.xlsx
The_Path_Forward_2024.xlsx
WV_SLR_2023.xlsx
WV_SLR_2024.xlsx


#### Filling in State info

looks for state names in the title, org, and state fields

In [9]:
#clean event data
event_dfs = []
vals_changed = 0
for event in events:
    df = pd.read_excel(event)
    # print('######################')
    print(bordered(event))
    subset = df[df['last_name'].str.contains("brown", case=False, na=False)]
    print(subset)
    # print(*df.columns)
    #get event_name
    event_name = str(event).split('.')[0].strip().replace('_', ' ')
    df = df.iloc[:,:8]
    df.loc[:,'event name'] = event_name
    
    break_all = False
    # #print(df)
    # continue
    # display_markdown(f'## {event_name}', raw=True)
    for i,state in enumerate(df['state']):

        
        testing_string = str(df['title'].iloc[i]) + " " + str(df['org'].iloc[i])
        # #print(testing_string)
        testing_string = testing_string.lstrip('nan').lstrip().strip()
   
        state_match_uc = re.findall(state_pat, str(df.loc[i,'org']))
        state_match = [x for x in state_match_uc if len(x) > 0]
        
       

        # First match test
        if len(state_match) == 0:
            ##print('no regular state match')
            ##print(state_match_uc)
            state_abv_match_uc = re.findall(state_abv_pat, str(df['org'].iloc[i]))
            state_abv_match = [x for x in state_abv_match_uc if len(x) > 0]
            # Second match test
            if len(state_abv_match) == 0:
                ##print('no state abbreviation match')
                ##print(state_abv_match_uc)
                state_abv_event_match_uc = re.findall(state_abv_pat, str(df['event name'].iloc[i]))
                state_abv_event_match = [x for x in state_abv_event_match_uc if len(x) > 0]
                # Third match test
                if len(state_abv_event_match) == 0:
                    #print('no state abv event match')
                    #print(state_abv_event_match_uc)
                    #print(f'first name: {str(df.loc[i,'first_name'])}')
                    #print(f'last name: {str(df.loc[i,'last_name'])}')
                    #print(f'role: {str(df.loc[i,'role'])}')
                    #print(f'org: {str(df.loc[i,'org'])}')
                    #print(f'title: {str(df.loc[i,'title'])}')
                    continue
                elif len(state_abv_event_match) > 1:
                    #print('more than one match?')
                    # break_all = True
                    break
                else:
                    ##print("abv in event match")
                    state_val = str(state_abv_event_match[0])
                    # df.loc[i,'state'] = None
                    df.loc[i,'state'] = str(df.loc[i,'state'])
                    df.loc[i,'state'] = state_val
                    ##print(state_val)
                    vals_changed += 1
            elif len(state_abv_match) > 1:
                #print('more than one match?')
                #print(state_abv_match)
                ##print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
                break_all = True
                break
            else:
                #print("regular abreviation match")
                
                state_val = str(state_abv_match[0])
                # df.loc[i,'state'] = None
                df.loc[i,'state'] = str(df.loc[i,'state'])
                df.loc[i,'state'] = state_val
                ##print(state_val)
                vals_changed += 1

            # ##print('###########')
            # ##print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
            # ##print('\n')
            # break
        elif len(state_match) > 1:
            #print("more than one match?")
            # break_all = True
            break
        else:
            # #print("normal state match")
            state_val_dirty = str(state_match[0])
            state_val = state_ref.get(state_val_dirty)
            # df.loc[i,'state'] = str(df.loc[i,'state'])
            # df.loc[i,'state'] = None
            df.loc[i,'state'] = str(state_val)
            #print(state_val)
            vals_changed += 1

    event_dfs.append(df)
event_data = pd.concat(event_dfs)
event_data.reset_index(inplace=True, drop = True)

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\all attendees testing')
# event_data.to_excel('event_data.xlsx', index=False)
# event_data

┌───────────────────────────────┐
│2024_WSELR_attendance_data.xlsx│
└───────────────────────────────┘
Empty DataFrame
Columns: [first_name, last_name, honorific, title, org, district, role, state, short_convening]
Index: []
┌─────────────┐
│2025_GAR.xlsx│
└─────────────┘
Empty DataFrame
Columns: [first_name, last_name, honorific, title, org, district, role, state, short_convening]
Index: []
┌─────────────┐
│2025_HLR.xlsx│
└─────────────┘
    first_name last_name       honorific                    title  \
15  Gloristine     Brown  Representative  NC House Representative   

                 org  district        role state  short_convening  
15  House District 8       8.0  Legislator    NC              NaN  
┌──────────────────────┐
│2025_HLR_bootcamp.xlsx│
└──────────────────────┘
Empty DataFrame
Columns: [first_name, last_name, honorific, title, org, district, role, state, short_convening]
Index: []
┌──────────────────────────┐
│DE_LEG_ED_Dinner_2023.xlsx│
└──────────────────────────┘

In [10]:
event_data

Unnamed: 0,first_name,last_name,honorific,title,org,district,role,state,event name,attendee_type
0,Elsie,Arntzen,Superintendent Arntzen,Superintendent Elsie Arntzen,Office of Public Instruction,,State Education Executive,,2024 WSELR attendance data,
1,June,Atkinson,Superintendent Atkinson,CEO,Emerald Education,,Speaker | Resource Expert,,2024 WSELR attendance data,
2,Kirsten,Baesler,Superintendent Baesler,Superintendent,North Dakota Department of Public Instruction,,State Education Executive,ND,2024 WSELR attendance data,
3,Maggie,Cicco,Maggie,Research Fellow,Edunomics Lab at Georgetown University,,Speaker | Resource Expert,,2024 WSELR attendance data,
4,Lisa,Coons,Superintendent Coons,Superintendent of Public Instruction,Virginia Department of Education,,Speaker | Resource Expert,VA,2024 WSELR attendance data,
...,...,...,...,...,...,...,...,...,...,...
2061,Hank,Hager,,Chief Counsel,West Virginia State Senate Education Committee,,Invited Guests,WV,WV SLR 2024,
2062,Jeff,Kelley,,Assistant Superintendent of District & School ...,West Virginia Department of Education,,Invited Guests,WV,WV SLR 2024,
2063,JB,McCuskey,Auditor,West Virginia State Auditor,West Virginia State Auditor's Office,,Invited Guests,WV,WV SLR 2024,
2064,Mike,Queen,,Deputy Secretary of State,West Virginia Secretary of State's Office,,Invited Guests,WV,WV SLR 2024,


In [11]:
#looks for state names and replaces them with state initials
for i,j in enumerate(event_data['state']):
    
    if isinstance(j, float):
        continue
    elif re.search(r'[A-Z]{2}', str(j)):
        continue
    else:
        val = state_ref.get(str(j))
        event_data.loc[i,'state'] = str(val)

In [12]:
event_data[event_data['last_name'].str.contains('Haas', na=False)]

Unnamed: 0,first_name,last_name,honorific,title,org,district,role,state,event name,attendee_type
426,Melanie,Haas,,Chair,Kansas State Board of Education,,,KS,ECLS 2024,
1469,Jackie,Haas,Representative,State Representative,Illinois House,79.0,Legislator,IL,IL SLR 2023,


##### Defunct

In [13]:
# # export data summary
# #outputs event name with total attendance
# print(list(set(event_data['event name'].to_list())))
# event_data_summary = event_data.groupby(['event name']).size().reset_index(name='Count')

# event_data_summary.to_excel(r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\metadata\event_sum_info.xlsx", index=False)

In [14]:
# #event data

# from datetime import date
# os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\attendance\bulk data')
# event_data.to_csv(f"all_event_data_export_{str(date.today()).replace('-','_')}.csv", index=False)


#### District Cleaning

##### Filtering for Legislators

In [15]:
#Legislator search pattern set up
title_pattern = r'[Rr]epresentative|[Ss]enator|[Ll]egislator'
org_pattern = r'[Ss]enate|[Hh]ouse of ([Rr]epresentatives)?(Delegates)?|(?<!School )(?:House District|District)|[Ss]tate [Hh]ouse'
exclude_pattern = r'[Aa]id(e)?|[Aa]ssistant|[Ss]taff|[Cc]ounsel'

In [16]:
# Filter for state legislators
filtered_df = event_data[
    (
        event_data['title'].astype(str).apply(lambda x: bool(re.search(title_pattern, x))) |
        event_data['org'].astype(str).apply(lambda x: bool(re.search(org_pattern, x)))
    ) &
    ~(
        event_data['org'].astype(str).apply(lambda x: bool(re.search(exclude_pattern, x))) |
        event_data['title'].astype(str).apply(lambda x: bool(re.search(exclude_pattern, x)))
    )
]
filtered_df[filtered_df['last_name'].str.contains('Haas', na = False)]


Unnamed: 0,first_name,last_name,honorific,title,org,district,role,state,event name,attendee_type
1469,Jackie,Haas,Representative,State Representative,Illinois House,79.0,Legislator,IL,IL SLR 2023,


In [17]:
for i,j in enumerate(filtered_df['district']):
    if isinstance(j, float):
        if str(j) != 'nan': 
            # print(type(j))
            print(j)

29.0
49.0
11.0
17.0
88.0
42.0
3.0
8.0
12.0
83.0
95.0
41.0
15.0
22.0
57.0
32.0
42.0
29.0
107.0
82.0
25.0
13.0
100.0
31.0
45.0
30.0
33.0
31.0
38.0
7.0
21.0
109.0
101.0
34.0
112.0
99.0
20.0
4.0
27.0
24.0
115.0
50.0
58.0
97.0
64.0
63.0
35.0
44.0
5.0
41.0
116.0
43.0
26.0
23.0
68.0
36.0
17.0
42.0
3.0
83.0
95.0
22.0
42.0
82.0
100.0
31.0
33.0
31.0
7.0
112.0
27.0
24.0
41.0
116.0
68.0
52.0
59.0
56.0
4.0
31.0
3.0
30.0
85.0
111.0
79.0
19.0
24.0
60.0
89.0
56.0
66.0
32.0
1.0
108.0
71.0
90.0
54.0
62.0
74.0
61.0
51.0
105.0
84.0
16.0


##### Split

In [18]:
##Split those with districts in them and those not
##Find values with districts
districts_assigned = filtered_df[filtered_df['district'].isna() == False]
districts_not_assigned = filtered_df[filtered_df['district'].isna() == True]

w_districts = districts_not_assigned[(districts_not_assigned['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) | 
                districts_not_assigned['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) )&
                ~(districts_not_assigned['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)

##find no districts
n_districts = districts_not_assigned[~(districts_not_assigned['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) | 
                districts_not_assigned['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
                ~(districts_not_assigned['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)


  w_districts = districts_not_assigned[(districts_not_assigned['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) |
  districts_not_assigned['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) )&
  ~(districts_not_assigned['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)
  n_districts = districts_not_assigned[~(districts_not_assigned['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) |
  districts_not_assigned['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
  ~(districts_not_assigned['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)


In [19]:
w_districts[w_districts['last_name'].str.contains('Haas')]

Unnamed: 0,first_name,last_name,honorific,title,org,district,role,state,event name,attendee_type


###### W/ Districts

In [20]:
#Looking for districts in title and org field
i = 0
for a,b in zip(w_districts.title, w_districts.org):


    has_a = False
    has_b = False
    if 'district' in str(a).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a)):
        match_a = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a))
        match_a = [x for x in match_a if len(x) > 0]
        if len(match_a) == 0:
            print('no results for title')
            print(a)
            
        else:
            has_a = True
            match = re.sub(r'[A-Za-z]','',str(match_a[0]))

    
    if 'district' in str(b).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b)):
        match_b = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b))
        match_b = [x for x in match_b if len(x) > 0]
        if len(match_b) == 0:
            print('no results for org')
            print(b)
        
        else:
            has_b = True
            match = re.sub(r'[A-Za-z]','',str(match_b[0]))


    if has_b == True or has_a == True:
        match_final = re.findall(r'\d+[A-Za-z]?', str(match))
        # print("final match: " + str(match_final[0]))
        # print("putting it on row: " + str(i))
        w_districts.loc[i, 'district'] = str(match_final[0]).strip().lstrip('0')

    i +=1
# w_districts

###### w/o Districts

In [21]:
#Data export to create patch file
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\exports')
# n_districts.to_csv('no_districts.csv', index=False)

In [22]:
# Process patch file 
patch_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\patch files\no_districts_patch.csv"
districts_patch = pd.read_csv(patch_file)
# districts_patch

###### Patch

In [23]:
#pull together all data
patched_df = pd.concat([w_districts,districts_patch, districts_assigned]).reset_index(drop=True)


#### Final Clean-ups

In [24]:
#looks through and assigns chamber column to either house or senate
patched_df['chamber'] = ""
i = 0
for a,b in zip(patched_df.title, patched_df.org):
    if re.search(r'[Hh]ouse|[Ss]enate', str(b)):
        if re.search(r'[Hh]ouse', str(b)):
            chamber = "House"
        elif re.search(r'[Ss]enate', str(b)):
            chamber = "Senate"
    elif re.search(r'[Rr]epresentative|[Ss]enator|[Dd]elegate', str(a)):
        if re.search(r'[Rr]epresentative|[Dd]elegate', str(a)):
            chamber = "House"
        elif re.search(r'[Ss]enator', str(a)):
            chamber = "Senate"

    try:
        patched_df.loc[i,'chamber'] = str(chamber)
        i += 1
    except:
        i += 1
        continue


#find only thi states
patched_df = patched_df[patched_df['state'].isin(thi_states)]
patched_df = patched_df[~patched_df['district'].isna()]
patched_df.reset_index(inplace=True, drop=True)

#name edits
patched_df['first_name'] = patched_df['first_name'].str.strip().str.title()
patched_df['last_name'] = patched_df['last_name'].str.strip().str.title().str.replace("' ", "'")
# condition = patched_df['last_name'].str.contains(r'(?!\w+)\s(?<!\w)', regex = True)
# patched_df.loc[condition, 'last_name'] = (patched_df['last_name'].str.split(r'\s*,\s*(?=[A-Z])').str[0])
# Identify rows where 'last_name' has two words separated by whitespace
# patched_df


#### Pull in Event Metadata

In [25]:
#grab event dae metadata file
metadata_file = glob.glob(r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\attendance\metadata\*date*")[0]


event_dates_df = pd.read_excel(metadata_file)

event_dates_df = event_dates_df[['event', 'end_date']].drop_duplicates()
# event_dates_df


event_dates_df['recorded_year'] = event_dates_df['end_date'].dt.year.astype(str)
patched_df = pd.merge(patched_df, event_dates_df, how = "left", left_on="event name", right_on='event')
patched_df = patched_df.drop(['event', 'end_date'], axis = 'columns')


#### Create Primary Keys

In [26]:
#function call to create
cleaned_df,duplicates = create_pk(patched_df,'district', 'chamber')
# cleaned_df

In [27]:
#testing chunk
#checking lengths
# clean_dfs = [cleaned_df,duplicates]
# clean_df = pd.concat(clean_dfs)

# lengths = [patched_df, cleaned_df, duplicates]

# for l in lengths:
#     print(len(l))
#     print(l.shape[0])
# grouped_df.reset_index()


In [28]:
#clean non_dupes and rename columns
non_dupe_df = cleaned_df.loc[:,['primary_key', 'first_name','last_name', 'state','event name', 'recorded_year']]
non_dupe_df.rename(columns={'event name': 'events'}, inplace=True)



## Event Summary Data detour

#### Set up

In [29]:
#set upevent summary file
# primary purpose of this is to catch this info for general event information like attendance counts, etc

#pull single event goers
event_data_single = non_dupe_df.copy()
#clean up duplicates data
event_data_multiple = duplicates.loc[:,['primary_key', 'first_name','last_name', 'state','event name', 'recorded_year']]
event_data_multiple.rename(columns={'event name': 'events'}, inplace=True)


#pull event data back together
merged_dfs = [event_data_single, event_data_multiple]
event_sum_df = pd.concat(merged_dfs)
event_sum_df = event_sum_df.reset_index(drop=True)
# event_sum_df

In [30]:
#resplitting (single/non msl vs dupes/msl)

#reconfigure columns
event_sum_df_cl = event_sum_df.loc[:,['primary_key', 'first_name', 'last_name', 'events', 'state']]
event_sum_df_cl['primary_key'] = event_sum_df_cl['primary_key'].astype(str)


# #get single and non msl states
event_sum_df_nd = event_sum_df_cl[~event_sum_df_cl['primary_key'].duplicated()].reset_index(drop = True)
# event_sum_df_dp = event_sum_df_cl[event_sum_df_cl['primary_key'].duplicated()].reset_index(drop = True)
event_sum_df_nd = event_sum_df_nd[~event_sum_df_nd['primary_key'].str.startswith(('430','571'))].reset_index(drop = True)

# #get dupes and msl states
event_sum_df_dp_og = event_sum_df_cl[event_sum_df_cl['primary_key'].duplicated(keep=False)].reset_index(drop = True)
event_sum_df_dp_add = event_sum_df_cl[event_sum_df_cl['primary_key'].str.startswith(('430','571'))].reset_index(drop = True)

#pull the separated dupes back together
event_sum_df_dp = pd.concat([event_sum_df_dp_og,event_sum_df_dp_add]).reset_index(drop = True)
# event_sum_df_cl
# print(*event_sum_df_dp.columns, sep=" | ")

In [31]:
#create helper dict for fpk and last name
leg_lookup_ref['full_pk'] = leg_lookup_ref['full_pk'].astype(int).astype(str)
leg_dict = dict(zip(leg_lookup_ref['full_pk'],leg_lookup_ref['last name']))

In [32]:
#dupe full pk assignment

# Maintain a set to track processed primary keys
# processed_keys = set()
pks_processed = []
event_sum_df_dp['full_pk'] = np.nan
# Iterate over the primary keys
assigned = False
for i, j in enumerate(event_sum_df_dp['primary_key']):
    assignment_single = False
    alt_assignment = False
    continue_all = False
    change_primary_key = False
    full_pk = "blank"

    
    # display_markdown(fr'## {j}', raw = True)
    print('\n')
    print('\n')
    print(fr'^^^^^^^^^^^^^^^^^Iteration {i}^^^^^^^^^^^^^^^^^')
    # print(fr'{{{{{{{{{{{{{{{{{{{{{{{{Iteration {i}}}}}}}}}}}}}}}}}}}}}}}}}')
    print(fr'## {j}')
    row_last_name = event_sum_df_dp.loc[i,['last_name']][0]
    print(f"row's last_name: {row_last_name}")
    print(type(row_last_name))
    # Filter rows with the current key and get lists to check
    new_df = event_sum_df_dp[event_sum_df_dp['primary_key'] == j]
    print(new_df.to_string())
    


    int_j = int(j)
    last_names_pre = new_df['last_name'].to_list()
    last_names_post = list(set(last_names_pre))
    print(f'last names list: {len(last_names_post)}')
    #check if unique list is greater than one if so find the correct names for 2025
    if len(last_names_post) > 1 or str(int(j)).startswith(('43','57')):
        route = "multi"
        # print('###########')
        print('list of last names')
        print(last_names_post)
        # print(j)
        loop_results = loop_dict.get(int_j)

        try:
            loop_results = loop_results.split('|')
        except:
            print(int(j))
            continue
        loop_results = [clean_str(loop) for loop in loop_results]
        print(f'loop_results: {loop_results}')
        for li, l in enumerate(last_names_post): #goes through names returned from subset
            if clean_str(l) not in loop_results:
                #look in the other chamber
                
                
                print(f'{clean_str(l)} was not in any of {loop_results}')
                #get the chamber to find out what to switch
                chamber = re.match(r'^\d{2}(\d{1})\d+', str(j))
                print(chamber.group(1))
                
                if chamber.group(1) == "0":
                    alt = 1
                else:
                    alt = 0
                # print(alt)
                alt = str(alt)

                #new_j for looking up in the other branch of the state
                new_j = re.sub(r"(^\d{2})\d(\d+)", lambda m: f"{m.group(1)}{alt}{m.group(2)}", str(j))
                # print(f'new_j type: {type(new_j)}')

                #repull data from leg lookup
                loop_results_inner = loop_dict.get(int(new_j))
                print(f'loop_results_inner: {loop_results_inner}')
                # print(f'type: {type(loop_results_inner)}')
                
                #if loop results are valid
                if loop_results_inner is not None:
                    if not re.search(r'^57', str(new_j)):
                        loop_inner_list = loop_results_inner.split('|')
                        if l in loop_inner_list:
                            if clean_str(l) == clean_str(row_last_name):
                                full_pk = new_j + "00"
                                result_l_name = l
                                # event_sum_df_dp.loc[i,'primary_key'] = int(new_j)
                                change_primary_key = True
                                # print('#####ASSIGNING######')
                                # print(f'value being assigned: {str(int(full_pk))}')
                                # print('to this row values:')
                                # print(event_sum_df_dp.loc[i,:])
                                # event_sum_df_dp.loc[i,'full_pk'] = str(int(full_pk))
                                continue_all = True
                                alt_assignment = True
                                break
                            else:
                                continue
                        else: #last straw/check
                            #(to make it here it didnt come up in the list of legs for the current chamber and year nor other chamber)
                            last_names_post.remove(l)

                        # continue
                    # else:
                        # event_sum_df_dp.loc[i,'primary_key'] = int(new_j)
                        # continue
                else:
                    print('not in: ')
                    print(loop_results)
                    last_names_post.remove(l)
                        


            else: 
                if clean_str(l) == clean_str(row_last_name):
                    if not str(j).startswith(('430', '571')):
                        assignment_single = True
                        full_pk = str(int(j)) + "00"
                        print("it's in there")
                        break
                    else:
                        for num in ['01','02']:
                            j_lu = str(j)+num
                            print(f'fpk for lookup: {j_lu}')
                            look_result = leg_dict.get(str(j_lu))
                            look_result_lname = clean_str(look_result)
                            print(f'type: {type(look_result)}')
                            if look_result is None:
                                continue
                            elif look_result is None and num == '02':
                                print(f'couldnt find one for {l} at {str(j)}')
                                break
                            elif isinstance(look_result, str) and len(look_result)>0:
                                if clean_str(row_last_name) == look_result_lname:
                                    print(f'look result: {look_result}')
                                    full_pk = j_lu
                                    assignment_single = True
                                    break
                            
                    break

                else:
                    print(f'{l} didnt match row name ({row_last_name})')
                    continue
                    
        

                

                
        # print('final list')
        # print(last_names_post)
    #if only one, just make sure its a current legislator by checking the leg ref
    elif len(last_names_post) == 1:
        route = "single"
        l_name = last_names_post[0]

        print(f'the only last name initially: {l_name}')
        l_name_lu = leg_lookup_ref[leg_lookup_ref['primary_key']== str(int(j))]
        print('[[[[[[[[[]]]]]]]]]')
        print('compare the names below')
        print(l_name)
        print(l_name_lu['last name'].to_list())
        print('[[[[[[[[[]]]]]]]]]')
        print('df results') 
        
        print(l_name_lu.to_string())
        print('[[[[[[[[[]]]]]]]]]')
        
        try:
            l_name_res = l_name_lu['last name'].to_list()
            print(f'l_name_res: {l_name_res[0]}')
            l=l_name_res[0]
            if l_name_res[0].lower().replace('-', '').replace(' ', '') == l_name.lower().replace('-', '').replace(' ', ''):
                if clean_str(l) == clean_str(row_last_name):
                    print('names match')
                    result_l_name = l_name
                    full_pk = j + "00"
                    # print('#####ASSIGNING######')
                    # event_sum_df_dp.loc[i,'full_pk'] = full_pk
                    print('got assignment')
                    assignment_single = True
                    # continue
                else:
                    print('didnt match row name')
                    # continue
            else:
                print('not a current legislator')
            

        except:
            print('one of the l names isnt right')
            print(l_name_res)
            print(l_name)
            print('breaking now')
            break
    else:
        print('this list is 0?')
        print(last_names_post)
        print('breaking now')
        break
    # print(have_assignment)
    
    # Breaking point for assignments or to keep going
    print('___________assignment break_____________')

    print(f'              {route}                  ')
    print('________________________________________')
    if 'blank' not in str(full_pk) or (full_pk).startswith(r'430'):
        if assignment_single == True:
            event_sum_df_dp.loc[i,'full_pk'] = str(int(full_pk))
            print('#####ASSIGNING value###### (from single assignment)')
            print(f'Setting full_pk to: {str(int(full_pk))}')
            print(f'row name: {row_last_name}')
            print(f'original pk: {str(j)}')




        if alt_assignment == True:
            print(f'result name: {result_l_name}')
            print(f'row name: {row_last_name}')
            
            
            print('#####ASSIGNING updated value######')
            print(f'Setting full_pk to: {str(int(full_pk))}')
            print(f'result name: {result_l_name}')
            print(f'row name: {row_last_name}')
            event_sum_df_dp.loc[i,'full_pk'] = str(int(full_pk))

        if change_primary_key == True:
            print('#####Changing key######')
            print(f'primary key from [{str(int(j))}] to [{str(int(new_j))}] ')
            print(f'result name: {result_l_name}')
            print(f'row name: {row_last_name}')
            event_sum_df_dp.loc[i,'primary_key'] = int(new_j)

        if continue_all == True:
            continue
        
        if alt_assignment == True or assignment_single == True:
            continue
    else:
        # print('using j as primary key to look up ')
        lookup_df = leg_lookup_ref.loc[:,['last name', 'full_pk', 'primary_key']]
        lookup_df = lookup_df[lookup_df['primary_key']== j]
        lookup_seat = dict(zip(lookup_df['last name'], lookup_df['full_pk']))
        lookup_df = lookup_df.set_index('last name')

        #break for outputs
        print("############")
        print('###Lookup###')
        print("############")
        print(lookup_df.to_string())
        print("_________")

        print('\n')
        print('___________________')
        print('Final Assignment loop for remaining')
        print('___________________')
        print('\n')

        #last check for names by going through lookup_seat
        for k in last_names_post:
            print(k)
            # t = lookup_df.loc[k, 'full_pk']
            t = lookup_seat.get(k)
            print(t)
            if str(t) == "None":
                print('-----------------')
                print('none')
                print(f'last name: {k}')
                print('-----------------')
                continue
                # print(j)
                # print(v2_df.to_string())
            print('this should be the full pk')
            print(int(t))
            last_name_at_value = event_sum_df_dp.loc[i,'last_name']
            if k == last_name_at_value:
                print('#####ASSIGNING######')
                event_sum_df_dp.loc[i,'full_pk'] = int(t)







^^^^^^^^^^^^^^^^^Iteration 0^^^^^^^^^^^^^^^^^
## 401005
row's last_name: Jaramillo
<class 'str'>
  primary_key first_name  last_name                  events state  full_pk
0      401005        Leo  Jaramillo               ECLS 2024    NM      NaN
1      401005        Leo  Jaramillo  HKF Regional Visit FAU    NM      NaN
last names list: 1
the only last name initially: Jaramillo
[[[[[[[[[]]]]]]]]]
compare the names below
Jaramillo
['Jaramillo']
[[[[[[[[[]]]]]]]]]
df results
       full_pk primary_key  district_code state abbreviation chamber               title first name  last name     party  district  date assumed office                              name  tenure leader  state_code  chamber_code
1408  40100500      401005            5.0                 NM  Senate  New Mexico Senator        Leo  Jaramillo  Democrat       5.0                 2020  NM Sen. Leo Jaramillo (D-NM-005)       5    NaN        40.0           1.0
[[[[[[[[[]]]]]]]]]
l_name_res: Jaramillo
names match
got assignm

[[[[[[[[[]]]]]]]]]
compare the names below
Ross
['Ross']
[[[[[[[[[]]]]]]]]]
df results
       full_pk primary_key  district_code state abbreviation chamber                          title first name last name       party  district  date assumed office                             name  tenure leader  state_code  chamber_code
1111  42006300      420063           63.0                 NC   House  North Carolina Representative    Stephen      Ross  Republican      63.0                 2022  NC Rep. Stephen Ross (R-NC-063)       3    NaN        42.0           0.0
[[[[[[[[[]]]]]]]]]
l_name_res: Ross
names match
got assignment
___________assignment break_____________
              single                  
________________________________________
#####ASSIGNING value###### (from single assignment)
Setting full_pk to: 42006300
row name: Ross
original pk: 420063




^^^^^^^^^^^^^^^^^Iteration 30^^^^^^^^^^^^^^^^^
## 420044
row's last_name: Smith
<class 'str'>
    primary_key first_name last_name   

In [33]:
#view event_sum_df_dp df
event_sum_df_dp

Unnamed: 0,primary_key,first_name,last_name,events,state,full_pk
0,401005,Leo,Jaramillo,ECLS 2024,NM,40100500
1,401005,Leo,Jaramillo,HKF Regional Visit FAU,NM,40100500
2,571013,Michael,Oliverio,HKF Regional Visit FAU,WV,57101302
3,420057,Ashton,Clemmons,HSPF C4 M1,NC,
4,420109,Donnie,Loftis,HSPF C4 M1,NC,42010900
...,...,...,...,...,...,...
333,430025,Cindy,Schreiber-Beck,ND SLR 2023,ND,43002502
334,430037,Vicky,Steiner,ND SLR 2023,ND,43003702
335,431006,Paul,Thomas,ND SLR 2023,ND,43100600
336,430033,Bill,Tveit,ND SLR 2023,ND,43003302


In [34]:
#final clean ups
#add full pk to nondupes
event_sum_df_nd['full_pk'] = event_sum_df_nd['primary_key'] + "00"
event_sum_df_nd['source'] = 'normal'
event_sum_df_dp['source'] = "dupe"
event_sum_dfs = [event_sum_df_nd,event_sum_df_dp]

event_summary_df = pd.concat(event_sum_dfs).reset_index(drop=True)
event_summary_df['events'] = event_summary_df['events'].str.replace('(', '').str.replace(')', '').str.strip()
event_summary_df.columns
event_summary_df

#move full_pk column
first_column = event_summary_df.pop('full_pk')
event_summary_df.insert(0, 'full_pk', first_column)


#### Party Data

In [35]:
#divide by missing to get parties
missing = event_summary_df[event_summary_df['full_pk'].isna()].reset_index(drop=True)
norms = event_summary_df[~event_summary_df['full_pk'].isna()].reset_index(drop=True)
norms['full_pk'] = norms['full_pk'].astype(int).astype(str)


#set up dict lookup for norms 
party_dict = dict(zip(leg_lookup_ref['full_pk'], leg_lookup_ref['party']))
party_lookup = leg_lookup_ref.loc[:,['full_pk','party']]
party_lookup['full_pk'] = party_lookup['full_pk'].astype(int).astype(str)

#merge to get parties
norms_w_party = pd.merge(norms, party_lookup, how = "left",on='full_pk').reset_index(drop=True)


In [36]:
#last year info lookup setup 

#file and data import
legs_path_2024 = r"C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\legislator data\all_leg_files\2024"
legs_2024 = pd.read_excel(get_recent_file(r"*",legs_path_2024))


#dict setup
leg_dict_24_name = dict(zip(legs_2024['primary_key'], legs_2024['Last Name']))
leg_dict_24_party = dict(zip(legs_2024['primary_key'], legs_2024['Party']))



In [37]:
#pulls in party for older non active leg
missing['party']  = np.nan  
for i,j in enumerate(missing['primary_key']):
    legs = leg_dict_24_name.get(j)
    
    if isinstance(legs, str):
        party = leg_dict_24_party.get(j)
        missing.loc[i,['party']] = party
        # print(party)

    # print(legs)
    # print(type(legs))
    if legs is None:
        continue
        # print(missing.loc[i,:])

In [38]:
#pull party df's back together

dfs = [missing,norms_w_party]
event_summary_df = pd.concat(dfs).reset_index(drop =True)
# event_summary_df

In [39]:
#*testing*

prim_keys_after = list(set(event_sum_df['primary_key'].to_list()))
prim_keys_before = event_sum_df['primary_key'].to_list()
test = [prim_keys_before,prim_keys_after]
for t in test:
    print(len(t))




462
310


In [40]:
# create full name column
event_summary_df['full_name'] = event_summary_df['first_name'] + " " + event_summary_df['last_name']

move_full_name = event_summary_df

# event_summary_df.head(2)

### Final Cleaning

In [41]:
#eliminate generated duplicates between first name, last name, and events
event_summary_df = event_summary_df.drop_duplicates(subset=['first_name', 'last_name', 'events'])


In [42]:
#assigning seats
pk_list = list(set((event_summary_df['primary_key'].to_list())))

for p in pk_list:
    # if "420057" in p:
    print(str(p))
    
    ## return rows matching pk
    p_rows = event_summary_df[event_summary_df['primary_key'] == p]
    # print(p_rows.to_string())
    name_list = list(set(p_rows['full_name'].to_list()))
    
    
    #skip logic
    if len(name_list) == 1:
        continue 
    else:
        print(name_list)
        print(p_rows.to_string())

    #
    has_nan = False
    fpk_ls = p_rows['full_pk'].to_list()
    if str(p).startswith('430'):
        new_ls = list(set(fpk_ls))
        print(f'new_ls: {new_ls}')
        have_seat_one = False
        have_seat_two = False
        for n in new_ls:
            if str(n)[:-2] == "01":
                if have_seat_one == True:
                    print("already have one")
                    break
                else:
                    have_seat_one = True
            if str(n)[:-2] == "02":
                if have_seat_one == True:
                    print("already have one")
                    break
                else:
                    have_seat_two = True
   

    

        
    for xi, x in enumerate(fpk_ls):
        if str(x) == 'nan':
            has_nan = True 
        elif xi == (len(fpk_ls)-1 ):
            break
    if has_nan == True:
        print('__________')
        print(p_rows.to_string())
        print('__________')
        print('\n')
        # to_change = p_rows.index.to_list()
        # for t in to_change:
        #     event_summary_df.loc[t, ['full_pk']] = np.nan


        
event_summary_df

340004
440099
420057
['Ashton Clemmons', 'Tracy Clark']
      full_pk primary_key first_name last_name                 events state source     party        full_name
0         NaN      420057     Ashton  Clemmons             HSPF C4 M1    NC   dupe  Democrat  Ashton Clemmons
2         NaN      420057     Ashton  Clemmons  NC EC Roundtable 2024    NC   dupe  Democrat  Ashton Clemmons
482  42005700      420057      Tracy     Clark               2025 HLR    NC   dupe  Democrat      Tracy Clark
__________
      full_pk primary_key first_name last_name                 events state source     party        full_name
0         NaN      420057     Ashton  Clemmons             HSPF C4 M1    NC   dupe  Democrat  Ashton Clemmons
2         NaN      420057     Ashton  Clemmons  NC EC Roundtable 2024    NC   dupe  Democrat  Ashton Clemmons
482  42005700      420057      Tracy     Clark               2025 HLR    NC   dupe  Democrat      Tracy Clark
__________


341020
440024
420099
441019
160065
45001

Unnamed: 0,full_pk,primary_key,first_name,last_name,events,state,source,party,full_name
0,,420057,Ashton,Clemmons,HSPF C4 M1,NC,dupe,Democrat,Ashton Clemmons
1,,420083,Kevin,Crutchfield,NC EC Roundtable 2024,NC,dupe,Republican,Kevin Crutchfield
2,,420057,Ashton,Clemmons,NC EC Roundtable 2024,NC,dupe,Democrat,Ashton Clemmons
3,,420032,Frank,Sossamon,NC EC Roundtable 2024,NC,dupe,Republican,Frank Sossamon
4,,421042,Rachel,Hunt,NC HLR 2024,NC,dupe,Democrat,Rachel Hunt
...,...,...,...,...,...,...,...,...,...
547,57101401,571014,Jay,Taylor,WV SLR 2024,WV,dupe,Republican,Jay Taylor
548,43000902,430009,Jayme,Davis,ND SLR 2023,ND,dupe,Democrat,Jayme Davis
549,43000501,430005,Jay,Fisher,ND SLR 2023,ND,dupe,Republican,Jay Fisher
550,43002602,430026,Jeremy,Olson,ND SLR 2023,ND,dupe,Republican,Jeremy Olson


In [43]:
#testing print statements
# for i,j in enumerate(event_summary_df['full_pk']):
#     pk = event_summary_df.loc[i,['primary_key']]
#     # print(pk[0])
#     # continue
#     if str(j) == 'nan':
#         continue
#     elif str(j)[:-2] == str(pk[0]):
#         continue
#     else:
#         print('________')
#         print(str(j)[:-2])
#         print(str(pk[0]))
#         print('________')

In [44]:
#print out for event_summary columns
print(event_summary_df.columns)
event_summary_df['full_pk'] = event_summary_df['full_pk'].astype('Int64')


Index(['full_pk', 'primary_key', 'first_name', 'last_name', 'events', 'state',
       'source', 'party', 'full_name'],
      dtype='object')


In [45]:
#export event summary
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\Events')
es_file_name = fr'event_summary_{str(date.today()).replace('-','_')}.xlsx'
es_file_name_csv = fr'event_summary_{str(date.today()).replace('-','_')}.csv'
event_summary_df.to_excel(es_file_name,sheet_name='event_summary')
event_summary_df.to_csv(es_file_name_csv, index=False)
# bills_and_legislators.to_excel(, sheet_name=f'{file_name_bl.replace(".xlsx", "")}', index=False)


## Back to main quest

In [46]:
# set up lookup for primary keys
#grouping data together and getting list of events per legislator
grouped_df = duplicates.groupby(['primary_key','last_name']).agg({
    'state': 'first',
    'first_name': 'first',
    'event name': lambda x: '|'.join(
        list(set(f"{sc} ({ac})" if not pd.isna(ac) else f"{sc}"
        for sc, ac in zip(duplicates.loc[x.index, 'event name'], duplicates.loc[x.index, 'role']))))

}).reset_index()
# grouped_df.reset_index()
grouped_df.rename(columns={'event name': 'events'}, inplace=True)
# state_coding
# grouped_df
# print(cleaned_df.columns)



In [47]:
#pull event data back together
merged_dfs = [grouped_df, non_dupe_df]
leg_events_df = pd.concat(merged_dfs)
leg_events_df = leg_events_df.reset_index(drop=True)
# leg_events_df


In [48]:
# Handle single seat legislature
'''
Description:
- takes in legislators found in events
- handles unique values and adds "00" for the seat
'''

#find none dupes and add seat
leg_events_df_nodupes = leg_events_df[~leg_events_df.duplicated(subset='primary_key', keep = False)]
leg_events_df_nodupes['seat'] = '00'
leg_events_df_nodupes_dict = dict(zip(leg_events_df_nodupes['primary_key'], leg_events_df_nodupes['last_name']))



In [49]:
# Handle multi seat legislatures
#find dupes
leg_events_df_dupes = leg_events_df[leg_events_df.duplicated(subset='primary_key', keep = False)]
leg_events_df_dupes.reset_index(inplace = True, drop = True)

#create seat for dupes
leg_events_df_dupes['seat'] = np.nan
for i,j in enumerate(leg_events_df_dupes['primary_key']):
    #ensure j is an integer
    j = int(j)
    
    #get name we are looking for
    name_to_check = leg_events_df_dupes.loc[i,'last_name']
    trunc_name = str(name_to_check)[:3]
    # print(f"checking for {trunc_name}")
    
    #look for the key, get results, and split up the names into a list
    results = loop_dict.get(j)
    names = results.split('|')
    
    #get length of names
    n = len(names)-1
    
    #go through and check if the name matches either of the one in the list and return the seat
    for ik,name in enumerate(names):
        # print(f"going through {ik}")
        # print(type(name))
        # print(type(trunc_name))
        if re.search(f'^{trunc_name.strip()}', str(name)):
            # print('found it')
            seat = ik + 1
            leg_events_df_dupes.loc[i,'seat'] = f'0{seat}'
            break
        #stops if we are on the last iteration and still no seat
        elif ik == n:

            print(leg_events_df_dupes.loc[i,['state']])
            print(leg_events_df_dupes.loc[i,['primary_key']])
            temp_pk = leg_events_df_dupes.loc[i,['primary_key']]
            try:
                if re.findall(r'(?<=^\d\d)\d', str(temp_pk))[0] == 0:
                    chamber = "lower"
            except:
                print('idk')
                print('_____________')
                print(str(temp_pk))
                test = re.findall(r'(?<=^\d\d)\d', str(temp_pk))
                print(test)
                print('_____________')
            else:
                chamber = "higher"
            if chamber != "lower":
                continue



            new_pk = re.findall(r'^\d{2}', str(temp_pk))[0]+ "1" + re.findall(r'(?<=^\d\d\d)\d+', str(temp_pk))[0]
            print(leg_events_df_nodupes_dict.get(new_pk))
            
         
            print(f"couldn't find one for '{trunc_name}' aka '{name_to_check}' in {names} with length for it being {len(trunc_name)}")

# leg_lookup_ref

state    NC
Name: 1, dtype: object
primary_key    420032
Name: 1, dtype: object
idk
_____________
primary_key    420032
Name: 1, dtype: object
[]
_____________
state    NC
Name: 3, dtype: object
primary_key    420057
Name: 3, dtype: object
idk
_____________
primary_key    420057
Name: 3, dtype: object
[]
_____________
state    NC
Name: 7, dtype: object
primary_key    420083
Name: 7, dtype: object
idk
_____________
primary_key    420083
Name: 7, dtype: object
[]
_____________
state    NC
Name: 9, dtype: object
primary_key    421042
Name: 9, dtype: object
idk
_____________
primary_key    421042
Name: 9, dtype: object
[]
_____________
state    ND
Name: 13, dtype: object
primary_key    430006
Name: 13, dtype: object
idk
_____________
primary_key    430006
Name: 13, dtype: object
[]
_____________
state    ND
Name: 32, dtype: object
primary_key    430042
Name: 32, dtype: object
idk
_____________
primary_key    430042
Name: 32, dtype: object
[]
_____________
state    ND
Name: 34, dtype: objec

In [50]:
#pull all event data back together
events_dfs = [leg_events_df_dupes, leg_events_df_nodupes]
events_df = pd.concat(events_dfs)
events_df.dropna(subset=['seat'], inplace=True)
events_df.reset_index(inplace=True, drop=True)
# events_df


# Scoring

Cell below calculates the activities score from the attendance data

In [51]:
#Calculating Score for loop


# print(events_df.columns)
#For loop description: goes through events column and gathers information for activities scoring
events_df.loc[:, 'activities_score'] = 0
for i,j in enumerate(events_df['events']):
    
    # split up events
    event_split = str(j).split('|')
    events = ";".join(event_split)
    # if len(event_split) < 2:
    #     continue


    #compile name for print statements

    # fname = grouped_df.at[i,'first_name']
    lname = events_df.at[i,'last_name']

    # names = [fname, lname]
    # name = " ".join(names)
    
    
    # display_markdown(f' ## {name}', raw=True)
    # print(bordered(events))

    #For loop description: go through each event and score 
    scores = []
    for event in event_split:

        #intializing boolean values for scoring    
        score = 0
        speaker = False
        is_hkf = False
        dev_program = False
        in_state = False
        out_state = False
        is_slr = False
        dinner_or_lunch = False
        # non_slr = False
        speaker = False
        # #print('#################')
        # #print(*grouped_df.loc[i,['helper','first_name', 'last_name', 'events']], sep=" \ ")
        
        
        # #print(bordered(event))

        #Look through for roles in events
        if re.search(r'\(.+\)', str(event)):
            match = re.findall(r'\(.+\)', str(event))
            match_refine = [x for x in match if len(x) != 0]
            #print("match refine results", match_refine)
            if len(match_refine) != 0:
                for m in match_refine:
                    if re.search('speaker|presenter', str(m).lower()):
                        # print('found a speaker')
                        speaker = True
                    elif 'HKF' in str(m):
                        #print('THERE IS HKF IN THE RESULTS')
                        is_hkf = True
                
        
        #is it just a short engagment such as a dinner or lunch?
        if re.search(r'[Dd]inner|[Ll]unch', str(event)):
            dinner_or_lunch = True

        #get state
        state = events_df.loc[i,'state']
        
        #looking for whether events where in state or out of state
        if 'ECLS' not in str(event) or "HKF" not in str(event):
            #print("no ecls or hKF")
            try:
                event_state = re.findall(state_abv_pat, str(event))[0].strip()
                if event_state == state:
                #print("states match")
                    in_state = True
                else:
                    out_state = True
            except:
                out_state = True
                # print(str(event))
                # print('no state match')
        else:
            out_state = True
            
            
        #lower dev program?
        if 'HSPF' in str(event) or 'Elevate' in str(event):
            dev_program = True

        #State Legislator event?
        if re.search(r'SLR|HLR',str(event)):
            is_slr = True


        # if re.search(r'\s[Mm]\d', str(event)):
        #     non_slr = True
        
        variables = [
        speaker,
        is_hkf,
        dev_program,
        in_state,
        out_state,
        is_slr,
        dinner_or_lunch
        ]

        #Trouble shooting print statement to make sure logic is working
        # #print('quick look at logic')
        # for var_name, var_value in zip(['speaker', 'is_hfk', 'dev_program', 'in_state', 'out_state', 'is_slr', 'dinner_or_lunch', 'non_slr', 'out_of_state'], variables):
        #     #print(bordered(f"{var_name}: {var_value}"))
        

        #Event data scoring 
        if is_slr == True:
            score += 15
            # print(f'adding 15 for {name} due to being an slr')
        # else:
        #     score += 10
        #     #print(f'adding 10 for {name}')

        elif dev_program == True:
            score += 15
            # print(f'adding 15 for {name} due to being in an dev program')
        elif dinner_or_lunch == True:
            score += 5
            # print(f'adding 5 for {name} due to being a lunch or dinner')
        else:
            score += 10
            # print("adding 10 for full day event with no other attributes")

        
        #check for speaker
        if speaker == True:
            if in_state == True:
                score += 0
                #if in state no additional points
                # print(f'adding 0 for {name} for being in state speaker')
            elif out_state == True:
                #if out of state add 5 more points for speakers
                # print(f'adding 5 for {name} due ot being a speaker at an out of state event')
                score += 5
        
        # check for hkf
        if is_hkf == True:
            score += 20
            # print(f'adding 20 for {lname} due to being hkf')


        
        # print(bordered(score))
        scores.append(score)

    # display_markdown(f' ### {name}', raw=True)
    # print(scores)
    total = sum(scores)
    # print("total: ",total)
    
    
    events_df.loc[i, 'activities_score'] = total

In [52]:
#df cleanup
#export activity scores df
activity_scores = events_df.loc[:,['primary_key','seat','first_name','last_name','activities_score', 'events']]

#make full_pk and convert to int
activity_scores['full_pk'] = activity_scores['primary_key'].astype(str) + activity_scores['seat'].astype(str)
activity_scores['full_pk'] = activity_scores['full_pk'].astype(int)

#put full_pk to front
activity_scores = activity_scores.drop('seat', axis=1)
first_column = activity_scores.pop('full_pk')
activity_scores.insert(0, 'full_pk', first_column)


# print(*activity_scores.columns, sep = ' , ')
activity_scores

Unnamed: 0,full_pk,primary_key,first_name,last_name,activities_score,events
0,42003201,420032,Bryan,Cohn,15,2025 HLR (Legislator)
1,42005701,420057,Tracy,Clark,15,2025 HLR (Legislator)
2,42005801,420058,Amos,Quick,15,2025 HLR (Legislator)
3,42005801,420058,Amos,Quick Iii,15,NC HLR 2024 (Legislator)
4,42008301,420083,Grant,Campbell,30,2025 HLR (Legislator)|2025 HLR bootcamp (Legis...
...,...,...,...,...,...,...
315,44006700,440067,Melanie,Miller,15,OH SLR 2024
316,44102100,441021,Kent,Smith,15,OH SLR 2024
317,44004900,440049,Jim,Thomas,15,OH SLR 2024
318,44003600,440036,Andrea,White,15,OH SLR 2024


# Export

In [53]:
#export activity scores
year = 2025
os.chdir(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\THII_build files\2025\bridges\compiling_calcs\{year}')
activity_scores.to_csv(f'activity_scores{str(date.today()).replace('-','_')}.csv', index = False)



# activity_scores