In [1]:
#imports
import os, sys, json, re  # Provides OS-dependent functionality, system-specific parameters, JSON handling
import pandas as pd             # Provides data structures and data analysis tools
import numpy as np              # Supports large, multi-dimensional arrays and matrices
import requests
import time
import xlsxwriter
from tqdm import tqdm
from datetime import date #date/time manipulation
import glob as glob

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import display_markdown


from cprl_functions.state_capture import thi_states,state_ref, state_coding, state_coding_r, state_pat, state_abv_pat
from cprl_functions.text_printing import bordered
from cprl_functions.defined_functions import create_pk, add_seats, get_key, get_recent_file


In [2]:
state_pat

re.compile(r'Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming|District of Columbia',
           re.UNICODE)

In [3]:
def clean_rows(df, f_row_pat,l_col_pat):
    #get start row and cut df
    for row in df.itertuples(index=True):
        # print(len(row))
        # print( type(row))
        # print(row.values)
        row_string = "|".join(map(str, row))
        # print(row_string)

        if re.search(f_row_pat, row_string):
            # print(row.Index)
            # print(row_string)
            start_index = row.Index
            continue


    #implement row narrowing
    event_data_cl = df.iloc[start_index:,:].reset_index(drop=True)
    event_data_cl.columns = event_data_cl.iloc[0]
    event_data_cl = event_data_cl.iloc[1:,:]


    #get col subset
    for name,value in event_data_cl.items():
        print(name)
        if re.search(l_col_pat, str(name)):
            stop_col = name
            print(f'stop_col: {stop_col}')
            break

    #implement col narrowing
    attendance_tracking_df = event_data_cl.loc[:,:stop_col]
    return attendance_tracking_df
# attendance_tracking_df
# print(event_data_cl.columns)
# print(event_data.head(2).to_string())


In [4]:
#data setup
file = r'c:\Users\clutz\Downloads\THI_Events_Post_Event_1743700931.xlsx'
data = pd.read_excel(file)
df = clean_rows(data, r'Name', r'^[Cc]ontent Team')
df.columns

Name
Event Days - Start
Event Days - End
Mo.
Yr.
Planner
Backup Planner
Status
City/State
Content Team
stop_col: Content Team


Index(['Name', 'Event Days - Start', 'Event Days - End', 'Mo.', 'Yr.',
       'Planner', 'Backup Planner', 'Status', 'City/State', 'Content Team'],
      dtype='object', name=0)

In [5]:
df['Event Days - Start'] = pd.to_datetime(df['Event Days - Start']).dt.strftime('%x')
df['Event Days - End'] = pd.to_datetime(df['Event Days - End']).dt.strftime('%x')

In [6]:
attendance_folder = r'C:\Users\clutz\THE HUNT INSTITUTE\The Hunt Institute Team Site - Documents\Development (formerly Grants Management)\!Administrative\Christian\THII\attendance\attendance data'
attendance_files = glob.glob(os.path.join(attendance_folder, '*.xlsx'))

In [None]:
for file in attendance_files:
    filename = file.split('\\')[-1]
    print(filename)
    takeout  = ['[Dd]ata', '[Aa]ttendance','\.xlsx']
    takeout_pat = re.compile('|'.join(takeout))
    filename_components = [x for x in re.sub(takeout_pat, '', filename).split('_') if len(x)!=0]

2024_WSELR_attendance_data.xlsx
2025_GAR.xlsx
2025_HLR.xlsx
2025_HLR_bootcamp.xlsx
DE_LEG_ED_Dinner_2023.xlsx
ECLS_2024.xlsx
ECLS_2024_v2.xlsx
ElevateNC_C4_M3.xlsx
ElevateNC_C4_M4.xlsx
ElevateNC_C5_M2_2024.xlsx
HBCU_Caucus_2024.xlsx
HKF_C10_S1.xlsx
HKF_Regional_Visit_FAU.xlsx
HSPF_C4_M1.xlsx
HSPF_C4_M2.xlsx
HSPF_C4_M3.xlsx
IL_SLR_2023.xlsx
MO_SLR_2023.xlsx
NCCCS_M4.xlsx
NC_EC_Roundtable_2024.xlsx
NC_HLR_2024.xlsx
ND_Literacy_taskforce_2024.xlsx
ND_SLR_2023.xlsx
ND_SLR_2024.xlsx
ND_TRR_M1.xlsx
ND_TRR_m2.xlsx
ND_TRR_m3.xlsx
OH_SLR_2023.xlsx
OH_SLR_2024.xlsx
OK_SLR_2023.xlsx
OK_SLR_2024.xlsx
SC_Leg_Ed_Dinner_2023.xlsx
The_Path_Forward_2024.xlsx
WV_SLR_2023.xlsx
WV_SLR_2024.xlsx


In [None]:
for file in attendance_files:
    is_cohort = False
    is_slr = False

    
    filename = file.split('\\')[-1]
    takeout  = ['[Dd]ata', '[Aa]ttendance','\.xlsx']
    takeout_pat = re.compile('|'.join(takeout))
    filename_components = [x for x in re.sub(takeout_pat, '', filename).split('_') if len(x)!=0]

    #get year
    year = [x for x in filename_components if re.search(r'\d{4}', x)]
    filename_components = [x for x in filename_components if x not in year]


    #look for leg retreats
    leg_retreats = [x for x in filename_components if re.search(r'^[HWSE]{1,3}LR', x)]
    if len(leg_retreats) != 0: is_slr = True
    
    #look for cohor
    leg_retreats = [x for x in filename_components if re.search(r'C\d', x)]
    if len(leg_retreats) != 0: is_cohort = True


    
    status_dict = {'slr':is_slr,'cohort':is_cohort}
    # filename_components = [x for x in filename_components if x not in year]
    
    print(bordered(f'{filename_components}'))
    for k,v in status_dict.items():
        if v == True:
            print(k)
    print('___________________')
    
        # print(filename_components)
        # pr
    
    
    # print(year)
    # filename.replace('data', ()).split('_')
    # print(file.split('\\')[-1])




┌─────────┐
│['WSELR']│
└─────────┘
slr
___________________
┌───────┐
│['GAR']│
└───────┘
___________________
┌───────┐
│['HLR']│
└───────┘
slr
___________________
┌───────────────────┐
│['HLR', 'bootcamp']│
└───────────────────┘
slr
___________________
┌─────────────────────────────┐
│['DE', 'LEG', 'ED', 'Dinner']│
└─────────────────────────────┘
___________________
┌────────┐
│['ECLS']│
└────────┘
___________________
┌──────────────┐
│['ECLS', 'v2']│
└──────────────┘
___________________
┌─────────────────────────┐
│['ElevateNC', 'C4', 'M3']│
└─────────────────────────┘
cohort
___________________
┌─────────────────────────┐
│['ElevateNC', 'C4', 'M4']│
└─────────────────────────┘
cohort
___________________
┌─────────────────────────┐
│['ElevateNC', 'C5', 'M2']│
└─────────────────────────┘
cohort
___________________
┌──────────────────┐
│['HBCU', 'Caucus']│
└──────────────────┘
___________________
┌────────────────────┐
│['HKF', 'C10', 'S1']│
└────────────────────┘
cohort
______________

  takeout  = ['[Dd]ata', '[Aa]ttendance','\.xlsx']
