In [151]:
import pandas as pd
import altair as alt 
import numpy as np
import datetime as dt
from datetime import timedelta

alt.data_transformers.enable("vegafusion")
alt.themes.enable("fivethirtyeight")

ThemeRegistry.enable('fivethirtyeight')

In [2]:
# Read in survey df 
def read_survey_df(path='/Users/ghazalin/NH_inspections_static/data/NH_SurveyDates_Sep2024.csv'):
    """Load the survey-dates CSV and shorten its column names.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the survey-dates CSV. Defaults to the path this
        notebook was originally run against, so existing no-argument calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        The file contents with columns renamed to ``CCN``, ``survey_date``,
        ``survey_type``, ``survey_cycle`` and ``process_date``.
    """
    # CCN is an identifier, not a number: read it as text so values such as
    # '015009' keep their leading zero regardless of what pandas would infer.
    df = pd.read_csv(path, dtype={'CMS Certification Number (CCN)': str})
    df = df.rename(columns={
        'CMS Certification Number (CCN)': 'CCN',
        'Survey Date': 'survey_date',
        'Type of Survey': 'survey_type',
        'Survey Cycle': 'survey_cycle',
        'Processing Date': 'process_date',
    })
    return df
survey_df = read_survey_df()

In [3]:
print(survey_df.head(5))

      CCN survey_date           survey_type  survey_cycle process_date
0  015009  2023-03-02      Health Complaint             2   2024-09-01
1  015009  2023-03-02       Health Standard             1   2024-09-01
2  015009  2023-02-15  Fire Safety Standard             1   2024-09-01
3  015009  2019-08-21  Fire Safety Standard             2   2024-09-01
4  015009  2019-08-21       Health Standard             2   2024-09-01


In [4]:
# Read in provider df 
def read_provider_df(path='/Users/ghazalin/NH_inspections_static/data/NH_ProviderInfo_Sep2024.csv'):
    """Load the provider-info CSV and keep only the identifying columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the provider-info CSV. Defaults to the path this
        notebook was originally run against, so existing no-argument calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row with columns ``CCN``, ``provider_name``,
        ``State`` and ``zip`` (``CCN`` and ``zip`` as strings).
    """
    # Both identifiers are read as text so leading zeros survive.
    df = pd.read_csv(
        path,
        dtype={'CMS Certification Number (CCN)': str, 'ZIP Code': str},
    )
    df = df.rename(columns={
        'CMS Certification Number (CCN)': 'CCN',
        'Provider Name': 'provider_name',
        'Provider Address': 'provider_address',
        'ZIP Code': 'zip',
    })
    # Explicit copy: the column assignment below must target an independent
    # frame, not a subset of the full provider table.
    df = df[['CCN', 'provider_name', 'State', 'zip']].copy()
    # Retained from the original; note that a missing ZIP becomes the
    # literal string 'nan' here.
    df['zip'] = df['zip'].astype(str)

    return df

provider_df = read_provider_df()


In [5]:
print(provider_df['zip'].str.len().value_counts())

zip
5    14817
Name: count, dtype: int64


In [6]:
def merge_survey_provider(survey_df, provider_df):
    """Left-join provider details onto every survey row, keyed on CCN.

    Row counts before and after the join, plus the breakdown of the
    ``_merge`` indicator column, are printed so unmatched surveys
    (``left_only``) are easy to spot. The indicator column is kept in the
    returned frame.
    """
    print(f'survey_df rows: {len(survey_df)}')
    print(f'provider_df rows: {len(provider_df)}')

    joined = survey_df.merge(provider_df, on='CCN', how='left', indicator=True)

    print(f'survey_provider_df rows: {len(joined)}')
    match_counts = joined['_merge'].value_counts()
    print(match_counts)
    return joined

survey_provider_df = merge_survey_provider(survey_df, provider_df)

# Every survey row matched a provider record on CCN (0 left_only rows in the merge indicator)

survey_df rows: 161369
provider_df rows: 14817
survey_provider_df rows: 161369
_merge
both          161369
left_only          0
right_only         0
Name: count, dtype: int64


In [7]:
def drop_fire_surveys(df):
    """Return the rows of ``df`` whose survey_type is not a fire-safety survey.

    Prints the remaining survey_type counts and the remaining row count.
    """
    fire_types = ['Fire Safety Standard', 'Fire Safety Complaint']
    is_fire = df['survey_type'].isin(fire_types)
    non_fire = df[~is_fire]
    print(non_fire['survey_type'].value_counts())
    print(len(non_fire))
    return non_fire

# Inspection only: the filtered frame is displayed but not assigned, so
# survey_provider_df itself still contains the fire-safety rows.
drop_fire_surveys(survey_provider_df)

survey_type
Health Complaint     54456
Health Standard      44249
Infection Control    17878
Name: count, dtype: int64
116583


Unnamed: 0,CCN,survey_date,survey_type,survey_cycle,process_date,provider_name,State,zip,_merge
0,015009,2023-03-02,Health Complaint,2,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
1,015009,2023-03-02,Health Standard,1,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
4,015009,2019-08-21,Health Standard,2,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
6,015009,2018-08-01,Health Standard,3,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
7,015010,2023-08-30,Infection Control,2,2024-09-01,COOSA VALLEY HEALTHCARE CENTER,AL,35150,both
...,...,...,...,...,...,...,...,...,...
161361,745039,2023-06-20,Health Standard,1,2024-09-01,MIDTOWNE MEADOWS HEALTH AND REHAB,TX,76065,both
161362,745040,2023-10-13,Health Complaint,1,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both
161363,745040,2023-10-13,Health Standard,1,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both
161365,745040,2022-08-05,Health Standard,2,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both


In [8]:
# calculate time between September 1st 2024 (time of data collection) and last survey 
def identify_last_standard_survey(df):
    """Keep, for each CCN, only its most recent 'Health Standard' survey row.

    Rows are ordered by CCN and survey_date, both descending, so the first
    row seen for each CCN is its latest standard survey. Prints the number
    of standard-survey rows and then the number of rows kept.
    """
    is_standard = df['survey_type'] == 'Health Standard'
    newest_first = df.loc[is_standard].sort_values(by=['CCN', 'survey_date'], ascending=False)
    print(len(newest_first))  # total number of health standard surveys

    latest_per_home = newest_first.drop_duplicates(subset=['CCN'], keep='first')
    print(len(latest_per_home))  # one row per nursing home with a standard survey
    return latest_per_home
    
last_standard_survey_df = identify_last_standard_survey(survey_provider_df)
    

44249
14817


In [9]:
test = last_standard_survey_df.sort_values(by=['survey_date'])
print(test.head(10))

          CCN survey_date      survey_type  survey_cycle process_date  \
54214  185169  2019-02-15  Health Standard             1   2024-09-01   
54524  185237  2019-02-28  Health Standard             1   2024-09-01   
53819  185029  2019-03-15  Health Standard             1   2024-09-01   
53925  185087  2019-03-26  Health Standard             1   2024-09-01   
55513  185449  2019-03-28  Health Standard             1   2024-09-01   
55136  185348  2019-04-05  Health Standard             1   2024-09-01   
55247  185378  2019-04-12  Health Standard             1   2024-09-01   
54687  185264  2019-04-18  Health Standard             1   2024-09-01   
53837  185042  2019-04-25  Health Standard             1   2024-09-01   
54573  185246  2019-04-25  Health Standard             1   2024-09-01   

                                           provider_name State    zip _merge  
54214  SIGNATURE HEALTHCARE AT JEFFERSON MANOR REHAB ...    KY  40222   both  
54524                       CHEROKEE P

In [10]:
def NH_months_since_last_standard_survey(df, output_path='/Users/ghazalin/NH_inspections_static/OutputData/NH_months_since_last_standard_survey.csv'):
    """Add elapsed-time and overdue columns to a one-row-per-facility survey frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``survey_date`` and ``process_date`` columns parseable
        by ``pd.to_datetime``. The input frame is not modified.
    output_path : str, path-like or None, optional
        Where to write the result as CSV. Defaults to the location the
        notebook originally wrote to; pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with both date columns converted to datetimes and
        these columns added:

        * ``date_diff`` - ``process_date - survey_date`` as a timedelta
        * ``months_since_survey`` - whole days elapsed divided by 30
        * ``months_overdue`` - ``months_since_survey`` minus 15
        * ``overdue_ind`` - 1 where ``months_overdue`` exceeds 1, else 0
    """
    # Work on a copy so the caller's frame (typically a filtered slice of a
    # larger frame) is neither mutated nor a source of SettingWithCopy warnings.
    df = df.copy()

    df['survey_date'] = pd.to_datetime(df['survey_date'])
    df['process_date'] = pd.to_datetime(df['process_date'])

    # Compute the difference once and derive the other columns from it.
    df['date_diff'] = df['process_date'] - df['survey_date']
    df['months_since_survey'] = df['date_diff'].dt.days / 30  # 30-day "months"
    df['months_overdue'] = df['months_since_survey'] - 15
    # A facility is flagged only when it is more than one month past the 15.
    df['overdue_ind'] = np.where(df['months_overdue'] > 1, 1, 0)

    print(df['months_overdue'].describe())
    print(df.head(10))

    if output_path is not None:
        df.to_csv(output_path)
    return df
    
# NOTE: this assignment rebinds the function's own name to the DataFrame it
# returns. From here on the name refers to the DataFrame (later cells read it
# as data); the function is only callable again after its `def` is re-executed.
NH_months_since_last_standard_survey = NH_months_since_last_standard_survey(last_standard_survey_df)

count    14817.000000
mean        -2.485679
std         10.779902
min        -14.666667
25%         -9.466667
50%         -5.066667
75%          0.066667
max         52.500000
Name: months_overdue, dtype: float64
           CCN survey_date      survey_type  survey_cycle process_date  \
161368  745049  2024-06-21  Health Standard             1   2024-09-01   
161363  745040  2023-10-13  Health Standard             1   2024-09-01   
161361  745039  2023-06-20  Health Standard             1   2024-09-01   
161359  745038  2023-05-19  Health Standard             1   2024-09-01   
161349  745022  2023-10-25  Health Standard             1   2024-09-01   
161342  745021  2023-09-26  Health Standard             1   2024-09-01   
161327  745019  2024-04-18  Health Standard             1   2024-09-01   
161315  745017  2023-08-17  Health Standard             1   2024-09-01   
161305  745007  2023-11-10  Health Standard             1   2024-09-01   
161281  745006  2024-04-19  Health Standard    

In [11]:
print(NH_months_since_last_standard_survey.shape[0])

14817


In [140]:
def prop_overdue_surveys(df):
    """Summarise, per State, the percentage of facilities flagged as overdue.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per facility with ``State`` and a 0/1 ``overdue_ind``
        column. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per State whose truncated percentage is at least 2, sorted
        by ascending ``prop_overdue``, with columns ``State``,
        ``overdue_ind`` (number overdue), ``NH_count`` (number of
        facilities), ``prop_overdue`` (integer percent, truncated),
        ``prop_overdue_%`` (same value as text with a '%' suffix) and
        ``prop_<2%`` (always False in the returned rows).

    The States with a percentage below 2 are printed as a list, followed by
    the number of States kept.
    """
    # Count facilities with the group size rather than by adding a helper
    # column of ones to the caller's frame.
    df_groupby = (
        df.groupby(['State'])
        .agg(overdue_ind=('overdue_ind', 'sum'), NH_count=('overdue_ind', 'size'))
        .reset_index()
    )
    # astype(int) truncates toward zero, e.g. 1.9 -> 1.
    df_groupby['prop_overdue'] = (
        (df_groupby['overdue_ind'] / df_groupby['NH_count']) * 100
    ).astype(int)
    df_groupby = df_groupby.sort_values(by=['prop_overdue'])
    df_groupby['prop_overdue_%'] = df_groupby['prop_overdue'].astype(str) + '%'
    df_groupby['prop_<2%'] = df_groupby['prop_overdue'] < 2

    below_two = df_groupby['prop_<2%']
    print(df_groupby.loc[below_two, 'State'].to_list())
    df_groupby = df_groupby.loc[~below_two].copy()
    print(df_groupby.shape[0])
    return df_groupby
    
prop_overdue_surveys_df = prop_overdue_surveys(NH_months_since_last_standard_survey)

['AR', 'GU', 'IL', 'ND', 'MN', 'MI', 'LA', 'IN', 'NE', 'MT', 'OK', 'PA', 'PR', 'NM', 'NH', 'IA', 'NV', 'RI']
35


In [134]:
print(prop_overdue_surveys_df)

   State  overdue_ind  NH_count  prop_overdue prop_overdue_%  prop_<2%
2     AR            1       220             0             0%      True
11    GU            0         1             0             0%      True
15    IL            1       680             0             0%      True
29    ND            0        75             0             0%      True
24    MN            1       343             0             0%      True
23    MI            2       424             0             0%      True
19    LA            1       267             0             0%      True
16    IN            1       511             0             0%      True
30    NE            0       184             0             0%      True
27    MT            0        59             0             0%      True
37    OK            1       287             0             0%      True
39    PA            0       668             0             0%      True
40    PR            0         6             0             0%      True
33    

In [158]:
# create a chart of proportion overdue surveys 
def prop_overdue_chart(data=None):
    """Build the layered bar chart of percentage overdue by State.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``State`` and ``prop_overdue`` columns. When omitted, the
        notebook-level ``prop_overdue_surveys_df`` is used, so existing
        no-argument calls behave as before.

    Returns
    -------
    A layered Altair chart: one bar per State (sorted by descending x value,
    x axis hidden) with the ``prop_overdue`` value drawn as white text on
    each bar, and the view border removed.
    """
    if data is None:
        data = prop_overdue_surveys_df

    # Named `bars` rather than reusing the function's own name for a local.
    bars = (
        alt.Chart(data)
        .mark_bar(color='#74372F')
        .encode(
            alt.X('prop_overdue', title='', axis=None),
            alt.Y(
                'State',
                title='',
                axis=alt.Axis(grid=False, labelFontSize=12, labelColor='black'),
            ).sort('-x'),
        )
        .properties(
            title=alt.TitleParams(
                text='Percentage of Facilities Overdue for Inspection by State',
                fontSize=15,
            ),
            width=400,
        )
    )
    # dx=-14 shifts each label 14px left of its anchor so it sits inside the bar.
    percent_labels = bars.mark_text(dx=-14, color='white', fontSize=14).encode(text='prop_overdue')
    # TODO(review): the original left an unfinished `low_prop_states =` stub here;
    # presumably a note listing the States under 2% was planned - confirm.
    layered = bars + percent_labels
    return layered.configure_view(stroke=None)
# Build the chart once, save it, then leave it as the cell's last expression
# so it is also displayed (a bare call that is not the last line shows nothing).
overdue_barchart = prop_overdue_chart()
overdue_barchart.save('/Users/ghazalin/NH_inspections_static/images/prop_overdue_barchart.svg')
overdue_barchart

In [69]:
def average_months_overdue_by_state(df):
    """Compute the average ``months_overdue`` per State.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per facility with ``State`` and ``months_overdue`` columns.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per State with columns ``State``, ``months_overdue`` (sum
        over the State's facilities), ``NH_count`` (number of facilities)
        and ``avg_months_overdue`` (sum divided by count).
    """
    # Count facilities with the group size rather than by adding a helper
    # column of ones to the caller's frame.
    state_groupby = (
        df.groupby(['State'])
        .agg(months_overdue=('months_overdue', 'sum'), NH_count=('months_overdue', 'size'))
        .reset_index()
    )
    state_groupby['avg_months_overdue'] = state_groupby['months_overdue'] / state_groupby['NH_count']
    return state_groupby
average_months_overdue_by_state(NH_months_since_last_standard_survey)
    

Unnamed: 0,State,months_overdue,NH_count,avg_months_overdue
0,AK,-99.4,20,-4.97
1,AL,3976.8,224,17.753571
2,AR,-1681.0,220,-7.640909
3,AZ,-163.933333,142,-1.15446
4,CA,-1783.566667,1164,-1.532274
5,CO,-1081.533333,211,-5.12575
6,CT,782.533333,194,4.033677
7,DC,-11.5,17,-0.676471
8,DE,-245.866667,44,-5.587879
9,FL,-3127.3,696,-4.493247


In [None]:
# How does proportion of NH overdue relate to 