In [189]:
import pandas as pd
import altair as alt 
import numpy as np
import datetime as dt
from datetime import timedelta

alt.data_transformers.enable("vegafusion")
alt.themes.enable("fivethirtyeight")

ThemeRegistry.enable('fivethirtyeight')

In [106]:
# Read in survey df 
def read_survey_df(): 
    df = pd.read_csv('/Users/ghazalin/NH_inspections_static/data/NH_SurveyDates_Sep2024.csv')
    df = df.rename(columns={'CMS Certification Number (CCN)':'CCN', 'Survey Date': 'survey_date','Type of Survey':'survey_type', 'Survey Cycle':'survey_cycle', 'Processing Date':'process_date'})
    
    return df 
survey_df = read_survey_df()

      CCN survey_date           survey_type  survey_cycle process_date
0  015009  2023-03-02      Health Complaint             2   2024-09-01
1  015009  2023-03-02       Health Standard             1   2024-09-01
2  015009  2023-02-15  Fire Safety Standard             1   2024-09-01
3  015009  2019-08-21  Fire Safety Standard             2   2024-09-01
4  015009  2019-08-21       Health Standard             2   2024-09-01
5  015009  2018-08-09  Fire Safety Standard             3   2024-09-01
6  015009  2018-08-01       Health Standard             3   2024-09-01
7  015010  2023-08-30     Infection Control             2   2024-09-01
8  015010  2022-04-13  Fire Safety Standard             1   2024-09-01
9  015010  2022-04-09       Health Standard             1   2024-09-01


In [107]:
def standard_surveys_only(df): 
    print(df['survey_type'].value_counts())
    df = df.loc[df['survey_type']=='Health Standard'].copy()
    print(df.head(10))
    print(df.shape[0])
    return df
survey_df = standard_surveys_only(survey_df)

survey_type
Health Complaint         54456
Health Standard          44249
Fire Safety Standard     44216
Infection Control        17878
Fire Safety Complaint      570
Name: count, dtype: int64
       CCN survey_date      survey_type  survey_cycle process_date
1   015009  2023-03-02  Health Standard             1   2024-09-01
4   015009  2019-08-21  Health Standard             2   2024-09-01
6   015009  2018-08-01  Health Standard             3   2024-09-01
9   015010  2022-04-09  Health Standard             1   2024-09-01
11  015010  2019-06-13  Health Standard             2   2024-09-01
13  015010  2018-06-07  Health Standard             3   2024-09-01
15  015012  2022-03-24  Health Standard             1   2024-09-01
17  015012  2019-06-06  Health Standard             2   2024-09-01
19  015012  2018-05-03  Health Standard             3   2024-09-01
22  015014  2023-09-13  Health Standard             1   2024-09-01
44249


In [108]:
def remove_surveys_after_date(df, cutoff_date): 
    print(df.shape[0])
    df['cutoff_date'] = pd.to_datetime(cutoff_date)
    df['survey_date'] = pd.to_datetime(df['survey_date'])
    df = df.loc[df['survey_date']<df['cutoff_date']].copy()
    df['year'] = df['survey_date'].dt.year
    print(df['year'].value_counts())
    print(df.shape[0])
    print(df.head(5))
    return df
    
pre_apr2023_surveys_df = remove_surveys_after_date(survey_df, '03-31-2023')



44249
year
2022    9425
2021    6844
2019    6340
2023    2609
2018    2443
2020    2286
2017     547
2016      33
Name: count, dtype: int64
30527
       CCN survey_date      survey_type  survey_cycle process_date  \
1   015009  2023-03-02  Health Standard             1   2024-09-01   
4   015009  2019-08-21  Health Standard             2   2024-09-01   
6   015009  2018-08-01  Health Standard             3   2024-09-01   
9   015010  2022-04-09  Health Standard             1   2024-09-01   
11  015010  2019-06-13  Health Standard             2   2024-09-01   

   cutoff_date  year  
1   2023-03-31  2023  
4   2023-03-31  2019  
6   2023-03-31  2018  
9   2023-03-31  2022  
11  2023-03-31  2019  


In [109]:
# calculate time between input date and last survey 
def identify_last_standard_survey(df): 
    """Identify the last standard survey for each nursing home"""
    df = df.loc[df['survey_type']=='Health Standard']
    df = df.sort_values(by=['CCN','survey_date'], ascending=False)
    print(df.shape[0])
    df = df.drop_duplicates(subset=['CCN'], keep='first')
    print(df.shape[0]) 
    return df 
    
last_standard_survey_df = identify_last_standard_survey(pre_apr2023_surveys_df)
# 14730 nursing homes

30527
14730


In [110]:
# Read in provider df 
def read_provider_df(): 
    df = pd.read_csv('/Users/ghazalin/NH_inspections_static/data/NH_ProviderInfo_Sep2024.csv', dtype={'ZIP Code':str})
    df = df.rename(columns={'Date First Approved to Provide Medicare and Medicaid Services':'date_approved','CMS Certification Number (CCN)':'CCN','Provider Name':'provider_name','Provider Address':'provider_address','ZIP Code':'zip'})
    # print(df.columns.to_list())
    df = df[['CCN','date_approved','provider_name','State','zip']]
    df['zip'] = df['zip'].astype(str)

    return df

provider_df = read_provider_df()


In [111]:
def provider_active_cutoff(df,date):
    df['date_approved'] = pd.to_datetime(df['date_approved'])
    new_col_name = f'active_on_{date}'
    date = pd.to_datetime(date)
    df[new_col_name] = df['date_approved'] <= date
    return df

provider_df = provider_active_cutoff(provider_df,'01-01-2022')

In [131]:
def merge_provider_survey_df(provider_df, survey_df): 
    print('survey_df rows: ' + str(survey_df.shape[0]))
    print('provider_df rows: ' + str(provider_df.shape[0]))
    merge_df = pd.merge(provider_df, survey_df, how='left',on='CCN', indicator=True)
    print('survey_provider_df rows: ' + str(merge_df.shape[0]))
    
    print(merge_df['_merge'].value_counts())
    return merge_df

provider_survey_df = merge_survey_provider(provider_df, last_standard_survey_df)

# CCN found for all providers

survey_df rows: 14817
provider_df rows: 14730
survey_provider_df rows: 14817
_merge
both          14730
left_only        87
right_only        0
Name: count, dtype: int64


In [137]:
def NH_months_since_last_standard_survey(df, cutoff_date): 
    df['cutoff_date'] = pd.to_datetime(cutoff_date)
    df['survey_date'] = pd.to_datetime(df['survey_date'])
    df['date_diff'] = df['cutoff_date'] - df['survey_date']
    df['months_since_survey'] = (df['cutoff_date'] - df['survey_date']).dt.days/30
    df['months_overdue'] = ((((df['cutoff_date'] - df['survey_date']).dt.days)/30)-15)

    df['overdue_ind'] = np.where(df['months_overdue'].isna()==True, 1, 
                                (np.where(df['months_overdue']>1, 1, 0)))
    

    print(df['months_overdue'].describe())
    print(df.head(10))
    return df
    
NH_months_since_last_standard_survey = NH_months_since_last_standard_survey(provider_survey_df, '03-31-2023')
# NH_months_since_last_standard_survey.to_csv('/Users/ghazalin/NH_inspections_static/OutputData/_033123_NH_months_since_last_standard_survey.csv')


count    14730.000000
mean        -3.253517
std         10.751142
min        -14.966667
25%        -10.466667
50%         -5.866667
75%         -0.300000
max         48.033333
Name: months_overdue, dtype: float64
      CCN date_approved                                     provider_name  \
0  015009    1969-09-01                          BURNS NURSING HOME, INC.   
1  015010    1967-01-01                    COOSA VALLEY HEALTHCARE CENTER   
2  015012    1967-01-01                        HIGHLANDS HEALTH AND REHAB   
3  015014    1967-01-01       EASTVIEW REHABILITATION & HEALTHCARE CENTER   
4  015015    1971-07-01                     PLANTATION MANOR NURSING HOME   
5  015016    1967-01-01              ATHENS HEALTH AND REHABILITATION LLC   
6  015019    1974-01-01                                  MERRY WOOD LODGE   
7  015023    1967-01-01                            HATLEY HEALTH CARE INC   
8  015024    1979-01-01  LIMESTONE NURSING AND REHABILITATION CENTER, LLC   
9  015027    1967

In [145]:
def prop_overdue_surveys(df): 
    df['NH_count'] = 1
    df_groupby = df.groupby(['State']).agg({'overdue_ind':'sum','NH_count':'sum'}).reset_index()
    df_groupby['prop_overdue'] = df_groupby['overdue_ind']/df_groupby['NH_count']
    df_groupby = df_groupby.sort_values(by=['prop_overdue'])
    print(df_groupby)
    return df_groupby
    
prop_overdue_surveys_apr_2023_df = prop_overdue_surveys(NH_months_since_last_standard_survey)

   State  overdue_ind  NH_count  prop_overdue
11    GU            0         1      0.000000
27    MT            0        59      0.000000
40    PR            0         6      0.000000
41    RI            0        74      0.000000
48    VT            0        34      0.000000
52    WY            0        35      0.000000
50    WI            2       326      0.006135
3     AZ            1       142      0.007042
19    LA            2       267      0.007491
45    TX            9      1185      0.007595
39    PA            6       668      0.008982
15    IL            7       680      0.010294
12    HI            1        42      0.023810
31    NH            2        74      0.027027
23    MI           12       424      0.028302
33    NM            2        68      0.029412
51    WV            4       123      0.032520
29    ND            3        75      0.040000
30    NE            8       184      0.043478
13    IA           19       404      0.047030
0     AK            1        20   

In [150]:
# read in surveyor vacancy rate 
def merge_with_surveyor_vacancy_df(state_df):
    vacancy_df = pd.read_csv('/Users/ghazalin/NH_inspections_static/data/state_prop_surveyor_vacancy.csv')
    merge_df = pd.merge(vacancy_df, state_df, how='left', left_on='state_code', right_on='State', indicator=True)
    print(merge_df['_merge'].value_counts())
    merge_df = merge_df.rename(columns={'State_x':'State'})
    merge_df = merge_df.drop(columns=['State_y'])
    return merge_df 

vacancy_overdue_df = merge_with_surveyor_vacancy_df(prop_overdue_surveys_apr_2023_df)


_merge
both          50
left_only      0
right_only     0
Name: count, dtype: int64


In [151]:
print(vacancy_overdue_df.columns)

Index(['State', 'state_code', '2002_prop_vacancy', '2022_prop_vacancy',
       'overdue_ind', 'NH_count', 'prop_overdue', '_merge'],
      dtype='object')


In [199]:
vacancy_overdue_chart = alt.Chart(vacancy_overdue_df).mark_circle(color='#74372F').encode(alt.X('2022_prop_vacancy', title='2022 State Surveyor Vacancy Rates', axis=alt.Axis(format='%')), 
                                                                          alt.Y('prop_overdue',title='State Overdue Survey Rate as of April 1, 2023', axis=alt.Axis(format='%')
                                                                               ))
# state_labels = vacancy_overdue_chart.mark_text(fontSize=7).encode(text='state_code')
vacancy_overdue_chart = vacancy_overdue_chart + vacancy_overdue_chart.transform_regression('2022_prop_vacancy','prop_overdue').mark_line(color='#74372F')
vacancy_overdue_chart = vacancy_overdue_chart.properties(background='white', title=alt.TitleParams(text='Surveyor Vacancy Rates by Overdue Survey Rates', fontSize=15))
vacancy_overdue_chart.save('/Users/ghazalin/NH_inspections_static/images/surveyor_vacancy_by_overdue_survey_scatter.svg')
display(vacancy_overdue_chart)
