## Setup

### Imports

In [1]:
#importing modules
import os, sys, json, datetime, re, xlrd  # Provides OS-dependent functionality, system-specific parameters, JSON handling, and date/time manipulation
import pandas as pd             # Provides data structures and data analysis tools
from openpyxl import Workbook
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import glob
import time
from tqdm import tqdm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown

import sys
# print(sys.executable)

from setuptools import find_packages
# print(find_packages())


from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, extract_title_and_name, get_recent_file

from datetime import date


## Data Gathering

## gathering leg files

## get all leg files

In [2]:
#pull in key lookup

leg_path = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\connectors\legislator lookup"
leg_lookup_file = get_recent_file('*.xlsx', leg_path)



In [3]:


leg_lookup_ref = pd.read_excel(leg_lookup_file)



leg_lookup_ref = leg_lookup_ref.dropna(how='any', subset=['district_code'])
leg_lookup_ref_2024 = leg_lookup_ref[~leg_lookup_ref['recorded_year'] == 2025]
leg_lookup_ref = leg_lookup_ref[leg_lookup_ref['recorded_year'] == 2025]

leg_lookup_ref = leg_lookup_ref.reset_index(drop= True)
leg_lookup_ref.head()


leg_lookup_ref.drop(["Unnamed: 0"], axis = 1)
# len(leg_lookup_ref)

leg_lookup_ref.to_excel(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\legislator data\all_legs_files\all_leg_records.xlsx', index = False)

dupes = leg_lookup_ref[leg_lookup_ref['full_pk'].duplicated(keep=False)]
dupes


KeyError: 'recorded_year'

In [None]:


#groupby data to get primary key and last names associated with it
loop_group = leg_lookup_ref.groupby(['primary_key']).agg({
    'Last Name': lambda x: '|'.join(
        list(set(f"{sc}" if not pd.isna(sc) else "not found"
        for sc in leg_lookup_ref.loc[x.index, 'Last Name'])))
}).reset_index()
loop_group['primary_key'] = loop_group['primary_key'].astype(int)
#set up dict for lookup
loop_dict = loop_group.set_index('primary_key')['Last Name'].to_dict()
for name, data in loop_group.items():
    print(data)

0       100001
1       100002
2       100003
3       100004
4       100005
         ...  
1916    571012
1917    571013
1918    571014
1919    571016
1920    571017
Name: primary_key, Length: 1921, dtype: int64
0                Pettus
1              Harrison
2             Underwood
3                 Moore
4              Crawford
             ...       
1916       Martin|Queen
1917    Garcia|Oliverio
1918             Taylor
1919     Rucker|Barrett
1920      Nelson|Takubo
Name: Last Name, Length: 1921, dtype: object


: 

## Attendance Data

### gathering attendance data

In [None]:
# import all attendance data files
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data')
events = glob.glob("*.xlsx")
print(*events, sep='\n')

: 

### Filling in State info

looks for state names in the title, org, and state fields

In [None]:
#clean event data
event_dfs = []
vals_changed = 0
for event in events:
    df = pd.read_excel(event)
    # print('######################')
    # print(bordered(event))
    
    # print(*df.columns)
    #get event_name
    event_name = str(event).split('.')[0].strip().replace('_', ' ')
    df = df.iloc[:,:8]
    df.loc[:,'event name'] = event_name
    
    break_all = False
    # #print(df)
    # continue
    # display_markdown(f'## {event_name}', raw=True)
    for i,state in enumerate(df['state']):

        
        testing_string = str(df['title'].iloc[i]) + " " + str(df['org'].iloc[i])
        # #print(testing_string)
        testing_string = testing_string.lstrip('nan').lstrip().strip()
   
        state_match_uc = re.findall(state_pat, str(df.loc[i,'org']))
        state_match = [x for x in state_match_uc if len(x) > 0]
        
       

        # First match test
        if len(state_match) == 0:
            ##print('no regular state match')
            ##print(state_match_uc)
            state_abv_match_uc = re.findall(state_abv_pat, str(df['org'].iloc[i]))
            state_abv_match = [x for x in state_abv_match_uc if len(x) > 0]
            # Second match test
            if len(state_abv_match) == 0:
                ##print('no state abbreviation match')
                ##print(state_abv_match_uc)
                state_abv_event_match_uc = re.findall(state_abv_pat, str(df['event name'].iloc[i]))
                state_abv_event_match = [x for x in state_abv_event_match_uc if len(x) > 0]
                # Third match test
                if len(state_abv_event_match) == 0:
                    #print('no state abv event match')
                    #print(state_abv_event_match_uc)
                    #print(f'first name: {str(df.loc[i,'first_name'])}')
                    #print(f'last name: {str(df.loc[i,'last_name'])}')
                    #print(f'role: {str(df.loc[i,'role'])}')
                    #print(f'org: {str(df.loc[i,'org'])}')
                    #print(f'title: {str(df.loc[i,'title'])}')
                    continue
                elif len(state_abv_event_match) > 1:
                    #print('more than one match?')
                    # break_all = True
                    break
                else:
                    ##print("abv in event match")
                    state_val = str(state_abv_event_match[0])
                    # df.loc[i,'state'] = None
                    df.loc[i,'state'] = str(df.loc[i,'state'])
                    df.loc[i,'state'] = state_val
                    ##print(state_val)
                    vals_changed += 1
            elif len(state_abv_match) > 1:
                #print('more than one match?')
                #print(state_abv_match)
                ##print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
                break_all = True
                break
            else:
                #print("regular abreviation match")
                
                state_val = str(state_abv_match[0])
                # df.loc[i,'state'] = None
                df.loc[i,'state'] = str(df.loc[i,'state'])
                df.loc[i,'state'] = state_val
                ##print(state_val)
                vals_changed += 1

            # ##print('###########')
            # ##print(df.loc[i, list(df.columns[:5]) + [df.columns[-1]]])
            # ##print('\n')
            # break
        elif len(state_match) > 1:
            #print("more than one match?")
            # break_all = True
            break
        else:
            # #print("normal state match")
            state_val_dirty = str(state_match[0])
            state_val = state_ref.get(state_val_dirty)
            # df.loc[i,'state'] = str(df.loc[i,'state'])
            # df.loc[i,'state'] = None
            df.loc[i,'state'] = str(state_val)
            #print(state_val)
            vals_changed += 1

    event_dfs.append(df)
event_data = pd.concat(event_dfs)
event_data.reset_index(inplace=True, drop = True)

# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\all attendees testing')
# event_data.to_excel('event_data.xlsx', index=False)
# event_data

: 

In [None]:
#looks for state names and replaces them with state initials
for i,j in enumerate(event_data['state']):
    
    if isinstance(j, float):
        continue
    elif re.search(r'[A-Z]{2}', str(j)):
        continue
    else:
        val = state_ref.get(str(j))
        event_data.loc[i,'state'] = str(val)

: 

In [None]:
# export data summary
print(list(set(event_data['event name'].to_list())))
event_data_summary = event_data.groupby(['event name']).size().reset_index(name='Count')


event_data_summary.to_excel(r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\metadata\event_sum_info.xlsx", index=False)

['SC Leg Ed Dinner 2023', 'OH SLR 2023', 'HSPF C4 M2', 'ND SLR 2023', 'ECLS 2024 v2', 'WV SLR 2023', 'ECLS 2024', '2024 WSELR attendance data', 'MO SLR 2023', 'OK SLR 2023', 'NC HLR 2024', 'ND Literacy taskforce 2024', 'ElevateNC C4 M3', 'ND TRR m2', 'OK SLR 2024', 'ND TRR M1', 'ND SLR 2024', 'ElevateNC C5 M2 2024', 'HKF C10 S1', 'OH SLR 2024', 'NCCCS M4', 'HSPF C4 M3', 'DE LEG ED Dinner 2023', 'NC EC Roundtable 2024', 'ElevateNC C4 M4', 'HBCU Caucus 2024', 'WV SLR 2024', 'ND TRR m3', 'HKF Regional Visit FAU', 'The Path Forward 2024', 'HSPF C4 M1']


: 

In [None]:
#event data

from datetime import date
os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\exports')

event_data.to_csv(f"event_data_export_{str(date.today()).replace('-','_')}.csv", index=False)


: 

### District Cleaning

In [None]:
#Legislator search pattern set up
title_pattern = r'[Rr]epresentative|[Ss]enator|[Ll]egislator'
org_pattern = r'[Ss]enate|[Hh]ouse of ([Rr]epresentatives)?(Delegates)?|(?<!School )(?:House District|District)|[Ss]tate [Hh]ouse'
exclude_pattern = r'[Aa]id(e)?|[Aa]ssistant|[Ss]taff'

: 

In [None]:
# Filter for state legislators
filtered_df = event_data[
    (
        event_data['title'].astype(str).apply(lambda x: bool(re.search(title_pattern, x))) |
        event_data['org'].astype(str).apply(lambda x: bool(re.search(org_pattern, x)))
    ) &
    ~(
        event_data['org'].astype(str).apply(lambda x: bool(re.search(exclude_pattern, x)))
    )
]

NameError: name 'event_data' is not defined

: 

In [None]:
##Split those with districts in them and those not
##Find values with districts
w_districts = filtered_df[(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) | 
                filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
                ~(filtered_df['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)

##find no districts
n_districts = filtered_df[~(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) | 
                filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
                ~(filtered_df['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)


  w_districts = filtered_df[(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) |
  filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
  ~(filtered_df['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)
  n_districts = filtered_df[~(filtered_df['org'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True) |
  filtered_df['title'].str.contains(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd](-|\s)?\d{2,3}[A-Za-z]?', regex=True)) &
  ~(filtered_df['title'].str.contains(r'[Aa]ssistant|[Aa]id(e)?|[Ss]taff', regex=True, na=False))].reset_index(drop=True)


Unnamed: 0,first_name,last_name,honorific,title,org,district,role,state,event name,attendee_type
0,Sonia,Galaviz,Representative Galaviz,Idaho Representative District 16,Idaho House of Representatives,,Guest | Other,ID,2024 WSELR attendance data,
1,Kyra,Hoffner,Senator,District 014 Senator,Deleware Senate,,,DE,DE LEG ED Dinner 2023,
2,Russell,Huxtable,Senator,District 006 Senator,Deleware Senate,,,DE,DE LEG ED Dinner 2023,
3,Laura,Sturgeon,Senator,District 004 Senator,Deleware Senate,,,DE,DE LEG ED Dinner 2023,
4,Jeff,Hilovsky,Representative,District 004 Representative,Deleware House of Representatives,,,DE,DE LEG ED Dinner 2023,
...,...,...,...,...,...,...,...,...,...,...
254,Ben,Queen,Senator,Senator,Senate District 12,,Legislator,WV,WV SLR 2024,
255,Patricia,Rucker,Senator,Senator,Senate District 16,,Legislator,WV,WV SLR 2024,
256,Jay,Taylor,Senator,Senator,Senate District 14,,Legislator,WV,WV SLR 2024,
257,Darren,Thorne,Delegate,Delegate,House District 89,,Legislator,WV,WV SLR 2024,


: 

In [None]:
#Looking for districts in title and org field
i = 0
for a,b in zip(w_districts.title, w_districts.org):


    has_a = False
    has_b = False
    if 'district' in str(a).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a)):
        match_a = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(a))
        match_a = [x for x in match_a if len(x) > 0]
        if len(match_a) == 0:
            print('no results for title')
            print(a)
            
        else:
            has_a = True
            match = re.sub(r'[A-Za-z]','',str(match_a[0]))

    
    if 'district' in str(b).lower() or re.search(r'[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b)):
        match_b = re.findall(r'[Dd]istrict\s?\d{1,3}[A-Za-z]?|[Dd]-?\s?\d{1,3}[A-Za-z]?', str(b))
        match_b = [x for x in match_b if len(x) > 0]
        if len(match_b) == 0:
            print('no results for org')
            print(b)
        
        else:
            has_b = True
            match = re.sub(r'[A-Za-z]','',str(match_b[0]))


    if has_b == True or has_a == True:
        match_final = re.findall(r'\d+[A-Za-z]?', str(match))
        # print("final match: " + str(match_final[0]))
        # print("putting it on row: " + str(i))
        w_districts.loc[i, 'district'] = str(match_final[0]).strip().lstrip('0')

    i +=1
# w_districts

: 

In [None]:
#Data export to create patch file
# os.chdir(r'C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\exports')
# n_districts.to_csv('no_districts.csv', index=False)

: 

#### Patching Districts

In [None]:
# Process patch file 
patch_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\patch files\no_districts_patch.csv"
districts_patch = pd.read_csv(patch_file)

#pull together all data
patched_df = pd.concat([w_districts,districts_patch]).reset_index(drop=True)
patched_df



: 

In [None]:

#looks through and assigns chamber column to either house or senate
patched_df['chamber'] = ""
i = 0
for a,b in zip(patched_df.title, patched_df.org):
    if re.search(r'[Hh]ouse|[Ss]enate', str(b)):
        if re.search(r'[Hh]ouse', str(b)):
            chamber = "House"
        elif re.search(r'[Ss]enate', str(b)):
            chamber = "Senate"
    elif re.search(r'[Rr]epresentative|[Ss]enator|[Dd]elegate', str(a)):
        if re.search(r'[Rr]epresentative|[Dd]elegate', str(a)):
            chamber = "House"
        elif re.search(r'[Ss]enator', str(a)):
            chamber = "Senate"

    try:
        patched_df.loc[i,'chamber'] = str(chamber)
        i += 1
    except:
        i += 1
        continue


#find only thi states
patched_df = patched_df[patched_df['state'].isin(thi_states)]
patched_df = patched_df[~patched_df['district'].isna()]
patched_df.reset_index(inplace=True, drop=True)

#name edits
patched_df['first_name'] = patched_df['first_name'].str.strip().str.title()
patched_df['last_name'] = patched_df['last_name'].str.strip().str.title().str.replace("' ", "'")
# condition = patched_df['last_name'].str.contains(r'(?!\w+)\s(?<!\w)', regex = True)
# patched_df.loc[condition, 'last_name'] = (patched_df['last_name'].str.split(r'\s*,\s*(?=[A-Z])').str[0])
# Identify rows where 'last_name' has two words separated by whitespace
# patched_df


: 

#### Pull in Supplemental info

In [None]:
#add in year of event
event_dates_file = r"C:\Users\clutz\OneDrive - THE HUNT INSTITUTE\Documents\Data\attendance data\metadata\event_date_data_v2.xlsx"
event_dates_df = pd.read_excel(event_dates_file)

event_dates_df = event_dates_df[['event', 'end_date']].drop_duplicates()
event_dates_df


event_dates_df['recorded_year'] = event_dates_df['end_date'].dt.year.astype(str)
patched_df = pd.merge(patched_df, event_dates_df, how = "left", left_on="event name", right_on='event')
patched_df = patched_df.drop(['event', 'end_date'], axis = 'columns')


: 

## Create Primary Keys

In [None]:
cleaned_df,duplicates = create_pk(patched_df,'district', 'chamber')


: 

In [None]:
#checking lengths
# clean_dfs = [cleaned_df,duplicates]
# clean_df = pd.concat(clean_dfs)

# lengths = [patched_df, cleaned_df, duplicates]

# for l in lengths:
#     print(len(l))
#     print(l.shape[0])
# grouped_df.reset_index()


: 

In [None]:

#clean non_dupes and rename columns
non_dupe_df = cleaned_df.loc[:,['primary_key', 'first_name','last_name', 'state','event name', 'recorded_year']]
non_dupe_df.rename(columns={'event name': 'events'}, inplace=True)


: 

## Event Summary Data detour

In [None]:

#create event summary file
# primary purpose of this is to catch this info for general event information like attendance counts, etc

#pull single event goers
event_data_single = non_dupe_df.copy()
#clean up duplicates data
event_data_multiple = duplicates.loc[:,['primary_key', 'first_name','last_name', 'state','event name', 'recorded_year']]
event_data_multiple.rename(columns={'event name': 'events'}, inplace=True)


#pull event data back together
merged_dfs = [event_data_single, event_data_multiple]
event_sum_df = pd.concat(merged_dfs)
event_sum_df = event_sum_df.reset_index(drop=True)
event_sum_df

: 

In [None]:
#testing for event summary


event_sum_df_cl = event_sum_df.loc[:,['primary_key', 'first_name', 'last_name', 'events']]
event_sum_df_cl['primary_key'] = event_sum_df_cl['primary_key'].astype(str)
#get single and non msl states
event_sum_df_nd = event_sum_df_cl[~event_sum_df_cl['primary_key'].duplicated()].reset_index(drop = True)
event_sum_df_nd = event_sum_df_nd[~event_sum_df_nd['primary_key'].str.startswith(('430','571'))].reset_index(drop = True)
#get dupes and msl states
event_sum_df_dp_og = event_sum_df_cl[event_sum_df_cl['primary_key'].duplicated(keep=False)].reset_index(drop = True)
event_sum_df_dp_add = event_sum_df_cl[event_sum_df_cl['primary_key'].str.startswith(('430','571'))].reset_index(drop = True)

event_sum_df_dp = pd.concat([event_sum_df_dp_og,event_sum_df_dp_add]).reset_index(drop = True)


: 

In [None]:


# Maintain a set to track processed primary keys
# processed_keys = set()

event_sum_df_dp['full_pk'] = np.nan
# Iterate over the primary keys
for i, j in enumerate(event_sum_df_dp['primary_key']):
    
    # Filter rows with the current key and get lists to check
    new_df = event_sum_df_dp[event_sum_df_dp['primary_key'] == j]
    int_j = int(j)
    last_names_pre = new_df['last_name'].to_list()
    last_names_post = list(set(last_names_pre))

    #check if unique list is greater than one if so find the correct names for 2025
    if len(last_names_post) > 1 or j.startswith(('43','57')):
        
        # print('###########')
        # print('these have more than one after')
        # print(last_names_post)
        # print(j)
        loop_results = loop_dict.get(int_j)

        try:
            loop_results = loop_results.split('|')
        except:
            print(j)
            continue
        for l in last_names_post:
            if l not in loop_results:
                print('deleting: '+ l)
                last_names_post.remove(l)
                print('not in:')
                print(loop_results)
        print('final list')
        print(last_names_post)
    else:
        full_pk = j + "00"
        event_sum_df_dp.loc[i,'full_pk'] = full_pk


        continue
    
    # print('using j as primary key to look up ')
    lookup_df = leg_lookup_ref.loc[:,['Last Name', 'full_pk', 'primary_key']]
    
    lookup_df = lookup_df[lookup_df['primary_key']== j]
    
    lookup_df = lookup_df.set_index('Last Name')
    # print(lookup_df['Last'])


    for k in last_names_post:
        t = lookup_df.loc[k, 'full_pk']
        print(t)
        if str(t) == "None":
            print('-----------------')
            print('none')
            print('-----------------')
            # print(j)
            # print(v2_df.to_string())
        print('this should be the full pk')
        print(t)
        last_name_at_value = event_sum_df_dp.loc[i,'last_name']
        if k == last_name_at_value:
            event_sum_df_dp.loc[i,'full_pk'] = t


    
# leg_lookup_ref
    

    # Update the processed keys set
    # processed_keys.add(j)

    # Print the filtered DataFrame
    # print(new_df.to_string())
    

final list
['Oliverio']
57101302
this should be the full pk
57101302
deleting: Thomas
not in:
['Anderson', 'Vollmer']
final list
['Anderson']
43000601
this should be the full pk
43000601
final list
['Beltz']
43002001
this should be the full pk
43002001
deleting: Boschee
not in:
['Foss', 'Hanson']
final list
[]
final list
['Braunberger']
43101000
this should be the full pk
43101000
final list
['Conley']
43101200
this should be the full pk
43101200
deleting: Cory
not in:
['Osowski', "O'Brien"]
final list
["O'Brien"]
43004201
this should be the full pk
43004201
final list
['Davison']
43104100
this should be the full pk
43104100
deleting: Estenson
not in:
['Weston']
final list
[]
final list
['Hager']
43002101
this should be the full pk
43002101
final list
['Meier', 'Heinert']
43003202
this should be the full pk
43003202
43003201
this should be the full pk
43003201
final list
['Jonas', 'Schauer']
43001301
this should be the full pk
43001301
43001302
this should be the full pk
43001302
final

: 

In [None]:
event_sum_df_nd['full_pk'] = event_sum_df_nd['primary_key'] + "00"
event_sum_dfs = [event_sum_df_nd,event_sum_df_dp]
event_summary_df = pd.concat(event_sum_dfs).reset_index(drop=True)
event_summary_df['events'] = event_summary_df['events'].str.replace('(', '').str.replace(')', '').str.strip()
event_summary_df.columns
event_summary_df

Unnamed: 0,primary_key,first_name,last_name,events,full_pk
0,421028,Gladys,Robinson,HSPF C4 M3,42102800
1,340086,Joe,Adams,MO SLR 2023,34008600
2,340113,Phil,Amato,MO SLR 2023,34011300
3,340071,Ladonna,Appelbaum,MO SLR 2023,34007100
4,340119,Brad,Banderman,MO SLR 2023,34011900
...,...,...,...,...,...
459,430025,Cindy,Schreiber-Beck,ND SLR 2023,43002502
460,430037,Vicky,Steiner,ND SLR 2023,43003702
461,430006,Paul,Thomas,ND SLR 2023,
462,430033,Bill,Tveit,ND SLR 2023,43003302


: 

In [None]:

#*testing*

#test chunk to make sure all full_pks are matched up correctly
# test_df = event_summary_df.loc[:,['full_pk', 'first_name', 'last_name']]
# # dupe_testing = event_summary_df[event_summary_df.duplicated(subset=['last_name', 'full_pk'], keep=False)]
# # dupe_testing

# test_df['group_help'] = test_df['full_pk'].str.extract(r'(^\d{3})')
# test_df = test_df[test_df.duplicated(subset = ['last_name'], keep= False)]
# test_df


# test_group = test_df.groupby(['last_name', 'group_help']).agg({
#     'first_name': lambda x: '|'.join(set(x.dropna().astype(str))),
#     'full_pk': lambda x: '|'.join(set(x.dropna().astype(str)))
# })


#groupby data to get primary key and last names associated with it
loop_test_group = leg_lookup_ref.groupby(['full_pk']).agg({
    'Last Name': lambda x: '|'.join(
        list(set(f"{sc}" if not pd.isna(sc) else "not found"
        for sc in leg_lookup_ref.loc[x.index, 'Last Name'])))
}).reset_index()

loop_test_group_2024 = leg_lookup_ref_2024.groupby(['full_pk']).agg({
    'Last Name': lambda x: '|'.join(
        list(set(f"{sc}" if not pd.isna(sc) else "not found"
        for sc in leg_lookup_ref_2024.loc[x.index, 'Last Name'])))
}).reset_index()

loop_test_group['full_pk'] = loop_test_group['full_pk'].astype(int)
loop_test_dict = loop_test_group.set_index('full_pk')['Last Name'].to_dict()


loop_test_group_2024['full_pk'] = loop_test_group_2024['full_pk'].astype(int)
loop_test_dict_2024 = loop_test_group_2024.set_index('full_pk')['Last Name'].to_dict()

for i,e in enumerate(event_summary_df['full_pk']):
    # print(e)
    try:
        int_e = int(e)
        # print('succeeded')
    except:
        print('#############')
        print(f'the full_pk is {e}')
        print(event_summary_df.iloc[i,:])

        continue
    # print(type(int_e))
    result = loop_test_dict.get(int_e)
    actual = str(event_summary_df.loc[i,'last_name'])
    if result is None:
        continue
    
    result_cl = result.lower().strip().replace(' ', '').replace('-', '')
    actual_cl = actual.lower().strip().replace(' ', '').replace('-', '')

    if result == actual:
        continue
    elif result.lower().strip().replace(' ', '').replace('-', '') == actual.lower().strip().replace(' ', '').replace('-', ''):
        continue
    else:
        
        
        result_2024 = loop_test_dict_2024.get(int_e)
        
        if result_2024 is None:
            continue
        
        result_2024_cl = result_2024.lower().strip().replace(' ', '').replace('-', '')
        if result_2024 == actual or result_2024_cl == actual_cl:
            print('previous legislator')
            continue
        else:
            print(f'{result} | {actual}')
            event = str(event_summary_df.loc[i,'events'])
            print(f'{event}')
            print('\n')
            continue


    # print(actual)


# test_group

: 

In [None]:
#*testing*

prim_keys_after = list(set(event_sum_df['primary_key'].to_list()))
prim_keys_before = event_sum_df['primary_key'].to_list()
test = [prim_keys_before,prim_keys_after]
for t in test:
    print(len(t))




: 

In [None]:
#export event summary
os.chdir(r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\events')
es_file_name = fr'event_summary_{str(date.today()).replace('-','_')}.xlsx'
es_file_name_csv = fr'event_summary_{str(date.today()).replace('-','_')}.csv'
event_summary_df.to_excel(es_file_name, index=False)
event_summary_df.to_csv(es_file_name_csv, index=False)


: 

## Back to main quest

In [None]:
# Handling multi event legislators



#grouping data together and getting list of events per legislator
grouped_df = duplicates.groupby(['primary_key','last_name']).agg({
    'state': 'first',
    'first_name': 'first',
    'event name': lambda x: '|'.join(
        list(set(f"{sc} ({ac})" if not pd.isna(ac) else f"{sc}"
        for sc, ac in zip(duplicates.loc[x.index, 'event name'], duplicates.loc[x.index, 'role']))))

}).reset_index()
# grouped_df.reset_index()
grouped_df.rename(columns={'event name': 'events'}, inplace=True)
# state_coding
# grouped_df
# print(cleaned_df.columns)


#pull event data back together
merged_dfs = [grouped_df, non_dupe_df]
leg_events_df = pd.concat(merged_dfs)
leg_events_df = leg_events_df.reset_index(drop=True)
leg_events_df


: 

In [None]:

# Handle single seat legislature
'''
Description:
- takes in legislators found in events
- handles unique values and adds "00" for the seat
'''

#find none dupes and add seat
leg_events_df_nodupes = leg_events_df[~leg_events_df.duplicated(subset='primary_key', keep = False)]
leg_events_df_nodupes['seat'] = '00'
leg_events_df_nodupes_dict = dict(zip(leg_events_df_nodupes['primary_key'], leg_events_df_nodupes['last_name']))



: 

In [None]:
# Handle multi seat legislatures


#find dupes
leg_events_df_dupes = leg_events_df[leg_events_df.duplicated(subset='primary_key', keep = False)]
leg_events_df_dupes.reset_index(inplace = True, drop = True)


#create seat for dupes
leg_events_df_dupes['seat'] = np.nan
for i,j in enumerate(leg_events_df_dupes['primary_key']):
    #ensure j is an integer
    j = int(j)
    
    #get name we are looking for
    name_to_check = leg_events_df_dupes.loc[i,'last_name']
    trunc_name = str(name_to_check)[:3]
    # print(f"checking for {trunc_name}")
    
    #look for the key, get results, and split up the names into a list
    results = loop_dict.get(j)
    names = results.split('|')
    
    #get length of names
    n = len(names)-1
    
    #go through and check if the name matches either of the one in the list and return the seat
    for ik,name in enumerate(names):
        # print(f"going through {ik}")
        # print(type(name))
        # print(type(trunc_name))
        if re.search(f'^{trunc_name.strip()}', str(name)):
            # print('found it')
            seat = ik + 1
            leg_events_df_dupes.loc[i,'seat'] = f'0{seat}'
            break
        #stops if we are on the last iteration and still no seat
        elif ik == n:

            print(leg_events_df_dupes.loc[i,['state']])
            print(leg_events_df_dupes.loc[i,['primary_key']])
            temp_pk = leg_events_df_dupes.loc[i,['primary_key']]
            try:
                if re.findall(r'(?<=^\d\d)\d', str(temp_pk))[0] == 0:
                    chamber = "lower"
            except:
                print('idk')
                print('_____________')
                print(str(temp_pk))
                test = re.findall(r'(?<=^\d\d)\d', str(temp_pk))
                print(test)
                print('_____________')
            else:
                chamber = "higher"
            if chamber != "lower":
                continue



            new_pk = re.findall(r'^\d{2}', str(temp_pk))[0]+ "1" + re.findall(r'(?<=^\d\d\d)\d+', str(temp_pk))[0]
            print(leg_events_df_nodupes_dict.get(new_pk))
            
         
            print(f"couldn't find one for '{trunc_name}' aka '{name_to_check}' in {names} with length for it being {len(trunc_name)}")

leg_lookup_ref

: 

In [None]:
# print(leg_events_df_nodupes.columns)
# print(leg_events_df_dupes.columns)

#pull all event data back together
events_dfs = [leg_events_df_dupes, leg_events_df_nodupes]
events_df = pd.concat(events_dfs)
events_df.dropna(subset=['seat'], inplace=True)
events_df.reset_index(inplace=True, drop=True)
events_df


Unnamed: 0,primary_key,last_name,state,first_name,events,seat
0,430001,Hatlestad,ND,Patrick,ND SLR 2023,02
1,430001,Richter,ND,David,ND SLR 2023 (Opening Remarks)|ND SLR 2024 (Leg...,01
2,430002,Anderson,ND,Dick,ND SLR 2023,01
3,430002,Longmuir,ND,Donald,ND SLR 2024 (Legislator),02
4,430006,Anderson,ND,Dick,ND SLR 2024 (Legislator),01
...,...,...,...,...,...,...
248,450069,Tedford,OK,Mark,OK SLR 2023,00
249,160093,Walker,CT,Toni,ECLS 2024,00
250,341009,Washington,MO,Barbara,ECLS 2024,00
251,451048,Young,OK,George,OK SLR 2023,00


: 

## Scoring

Cell below calculates the activities score from the attendance data

In [None]:
#Calculating Score for loop


# print(events_df.columns)
#For loop description: goes through events column and gathers information for activities scoring
events_df.loc[:, 'activities_score'] = 0
for i,j in enumerate(events_df['events']):
    
    
    # split up events
    event_split = str(j).split('|')
    events = ";".join(event_split)
    # if len(event_split) < 2:
    #     continue


    #compile name for print statements

    # fname = grouped_df.at[i,'first_name']
    lname = events_df.at[i,'last_name']

    # names = [fname, lname]
    # name = " ".join(names)
    
    
    # display_markdown(f' ## {name}', raw=True)
    # print(bordered(events))

    #For loop description: go through each event and score 
    scores = []
    for event in event_split:

        #intializing boolean values for scoring    
        score = 0
        speaker = False
        is_hkf = False
        dev_program = False
        in_state = False
        out_state = False
        is_slr = False
        dinner_or_lunch = False
        # non_slr = False
        speaker = False
        # #print('#################')
        # #print(*grouped_df.loc[i,['helper','first_name', 'last_name', 'events']], sep=" \ ")
        
        
        # #print(bordered(event))

        #Look through for roles in events
        if re.search(r'\(.+\)', str(event)):
            match = re.findall(r'\(.+\)', str(event))
            match_refine = [x for x in match if len(x) != 0]
            #print("match refine results", match_refine)
            if len(match_refine) != 0:
                for m in match_refine:
                    if re.search('speaker|presenter', str(m).lower()):
                        # print('found a speaker')
                        speaker = True
                    elif 'HKF' in str(m):
                        #print('THERE IS HKF IN THE RESULTS')
                        is_hkf = True
                
        
        #is it just a short engagment such as a dinner or lunch?
        if re.search(r'[Dd]inner|[Ll]unch', str(event)):
            dinner_or_lunch = True

        #get state
        state = events_df.loc[i,'state']
        
        #looking for whether events where in state or out of state
        if 'ECLS' not in str(event) or "HKF" not in str(event):
            #print("no ecls or hKF")
            try:
                event_state = re.findall(state_abv_pat, str(event))[0].strip()
                if event_state == state:
                #print("states match")
                    in_state = True
                else:
                    out_state = True
            except:
                out_state = True
                # print(str(event))
                # print('no state match')
        else:
            out_state = True
            
            
        #lower dev program?
        if 'HSPF' in str(event) or 'Elevate' in str(event):
            dev_program = True

        #State Legislator event?
        if re.search(r'SLR|HLR',str(event)):
            is_slr = True


        # if re.search(r'\s[Mm]\d', str(event)):
        #     non_slr = True
        
        variables = [
        speaker,
        is_hkf,
        dev_program,
        in_state,
        out_state,
        is_slr,
        dinner_or_lunch
        ]

        #Trouble shooting print statement to make sure logic is working
        # #print('quick look at logic')
        # for var_name, var_value in zip(['speaker', 'is_hfk', 'dev_program', 'in_state', 'out_state', 'is_slr', 'dinner_or_lunch', 'non_slr', 'out_of_state'], variables):
        #     #print(bordered(f"{var_name}: {var_value}"))
        

        #Event data scoring 
        if is_slr == True:
            score += 15
            # print(f'adding 15 for {name} due to being an slr')
        # else:
        #     score += 10
        #     #print(f'adding 10 for {name}')

        elif dev_program == True:
            score += 15
            # print(f'adding 15 for {name} due to being in an dev program')
        elif dinner_or_lunch == True:
            score += 5
            # print(f'adding 5 for {name} due to being a lunch or dinner')
        else:
            score += 10
            # print("adding 10 for full day event with no other attributes")

        
        #check for speaker
        if speaker == True:
            if in_state == True:
                score += 0
                #if in state no additional points
                # print(f'adding 0 for {name} for being in state speaker')
            elif out_state == True:
                #if out of state add 5 more points for speakers
                # print(f'adding 5 for {name} due ot being a speaker at an out of state event')
                score += 5
        
        # check for hkf
        if is_hkf == True:
            score += 20
            # print(f'adding 20 for {lname} due to being hkf')


        
        # print(bordered(score))
        scores.append(score)

    # display_markdown(f' ### {name}', raw=True)
    # print(scores)
    total = sum(scores)
    # print("total: ",total)
    
    
    events_df.loc[i, 'activities_score'] = total

: 

In [None]:
#export activity scores df
activity_scores = events_df.loc[:,['primary_key','seat','first_name','last_name','activities_score', 'events']]

#make full_pk and convert to int
activity_scores['full_pk'] = activity_scores['primary_key'].astype(str) + activity_scores['seat'].astype(str)
activity_scores['full_pk'] = activity_scores['full_pk'].astype(int)

#put full_pk to front
activity_scores = activity_scores.drop('seat', axis=1)
first_column = activity_scores.pop('full_pk')
activity_scores.insert(0, 'full_pk', first_column)
print(*activity_scores.columns, sep = ' , ')
activity_scores

full_pk , primary_key , first_name , last_name , activities_score , events


Unnamed: 0,full_pk,primary_key,first_name,last_name,activities_score,events
0,43000102,430001,Patrick,Hatlestad,15,ND SLR 2023
1,43000101,430001,David,Richter,30,ND SLR 2023 (Opening Remarks)|ND SLR 2024 (Leg...
2,43000201,430002,Dick,Anderson,15,ND SLR 2023
3,43000202,430002,Donald,Longmuir,15,ND SLR 2024 (Legislator)
4,43000601,430006,Dick,Anderson,15,ND SLR 2024 (Legislator)
...,...,...,...,...,...,...
248,45006900,450069,Mark,Tedford,15,OK SLR 2023
249,16009300,160093,Toni,Walker,10,ECLS 2024
250,34100900,341009,Barbara,Washington,10,ECLS 2024
251,45104800,451048,George,Young,15,OK SLR 2023


In [38]:


year = 2025
os.chdir(fr'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\Legislators Data\leg_data_update_10_2024\build files\activity scores\{year}')
activity_scores.to_csv(f'activity_scores{str(date.today()).replace('-','_')}.csv', index = False)



# activity_scores