In [104]:
import pandas as pd
import altair as alt 
import numpy as np
import datetime as dt
from datetime import timedelta

# Make "vegafusion" the active Altair data transformer for every chart in this notebook.
# NOTE(review): presumably enabled so charts can be built from frames larger than
# Altair's default row limit — confirm this is the intent.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [80]:
# Read in survey df 
def read_survey_df(path='/Users/ghazalin/NH_inspections_static/data/NH_SurveyDates_Sep2024.csv'):
    """Load the nursing-home survey-dates CSV and shorten its column names.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the survey-dates CSV. Defaults to the original local file
        so that existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with these columns renamed:
        'CMS Certification Number (CCN)' -> 'CCN', 'Survey Date' -> 'survey_date',
        'Type of Survey' -> 'survey_type', 'Survey Cycle' -> 'survey_cycle',
        'Processing Date' -> 'process_date'. Any other columns are kept as-is.
        'CCN' is always read as text.
    """
    ccn_header = 'CMS Certification Number (CCN)'
    # The CCN is an identifier, not a number: force it to text so that dtype
    # inference can never strip leading zeros (e.g. '015009' -> 15009), which
    # would silently break joins on CCN.
    df = pd.read_csv(path, dtype={ccn_header: str})
    df = df.rename(columns={
        ccn_header: 'CCN',
        'Survey Date': 'survey_date',
        'Type of Survey': 'survey_type',
        'Survey Cycle': 'survey_cycle',
        'Processing Date': 'process_date',
    })

    return df
# Load the survey table into `survey_df`; the cells below read from this name.
survey_df = read_survey_df()

In [81]:
# Preview the first five rows as plain text to check the renamed columns.
print(survey_df.head(5))

      CCN survey_date           survey_type  survey_cycle process_date
0  015009  2023-03-02      Health Complaint             2   2024-09-01
1  015009  2023-03-02       Health Standard             1   2024-09-01
2  015009  2023-02-15  Fire Safety Standard             1   2024-09-01
3  015009  2019-08-21  Fire Safety Standard             2   2024-09-01
4  015009  2019-08-21       Health Standard             2   2024-09-01


In [82]:
# Read in provider df 
def read_provider_df(path='/Users/ghazalin/NH_inspections_static/data/NH_ProviderInfo_Sep2024.csv'):
    """Load the provider-info CSV and keep only the identifying columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the provider-info CSV. Defaults to the original local file
        so that existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly the columns 'CCN', 'provider_name', 'State' and
        'zip', in that order. 'CCN' and 'zip' are read as text.

    Raises
    ------
    KeyError
        If any of the four retained columns is missing after renaming.

    Notes
    -----
    Prints the retained column index as a side effect.
    """
    ccn_header = 'CMS Certification Number (CCN)'
    zip_header = 'ZIP Code'
    # CCN and ZIP are identifiers: read them as text so that leading zeros
    # survive (e.g. ZIP '02134' must not become the integer 2134).
    df = pd.read_csv(path, dtype={ccn_header: str, zip_header: str})
    df = df.rename(columns={
        ccn_header: 'CCN',
        'Provider Name': 'provider_name',
        'Provider Address': 'provider_address',
        zip_header: 'zip',
    })
    df = df[['CCN', 'provider_name', 'State', 'zip']]
    print(df.columns)
    return df

# Load the provider table into `provider_df`; the call also prints the retained column names.
provider_df = read_provider_df()


Index(['CCN', 'provider_name', 'State', 'zip'], dtype='object')


In [51]:
# Compare the number of distinct provider names and distinct CCNs with the row count.
# A distinct-CCN count equal to the row count means CCN identifies each provider row
# uniquely; a smaller distinct-name count means some names are shared between rows.
print(len(provider_df['provider_name'].unique()))
print(len(provider_df['CCN'].unique()))
print(provider_df.shape[0])

14626
14817
14817


In [83]:
def merge_survey_provider(survey_df, provider_df):
    """Left-join provider details onto the survey rows by 'CCN'.

    Prints the row counts of both inputs and of the merged result, followed by
    the value counts of the '_merge' indicator column, so that unmatched survey
    rows ('left_only') are visible at a glance.

    Parameters
    ----------
    survey_df : pandas.DataFrame
        Survey rows; must contain a 'CCN' column.
    provider_df : pandas.DataFrame
        Provider rows; must contain a 'CCN' column.

    Returns
    -------
    pandas.DataFrame
        Every row of `survey_df` with the provider columns attached, plus the
        '_merge' indicator column added by pandas.
    """
    print(f'survey_df rows: {len(survey_df)}')
    print(f'provider_df rows: {len(provider_df)}')

    merged = survey_df.merge(provider_df, on='CCN', how='left', indicator=True)
    print(f'survey_provider_df rows: {len(merged)}')

    match_counts = merged['_merge'].value_counts()
    print(match_counts)
    return merged

survey_provider_df = merge_survey_provider(survey_df, provider_df)

# In the output below every survey row matched a provider record by CCN
# (`_merge` is 'both' for all rows; there are no 'left_only' rows).

survey_df rows: 161369
provider_df rows: 14817
survey_provider_df rows: 161369
_merge
both          161369
left_only          0
right_only         0
Name: count, dtype: int64


In [84]:
def drop_fire_surveys(df):
    """Return the rows of `df` whose 'survey_type' is not a fire-safety survey.

    Rows typed 'Fire Safety Standard' or 'Fire Safety Complaint' are excluded.
    The remaining survey-type counts and the remaining row count are printed.
    The input frame is not modified; the original index labels are kept.
    """
    fire_types = ['Fire Safety Standard', 'Fire Safety Complaint']
    is_fire = df['survey_type'].isin(fire_types)
    kept = df.loc[~is_fire]

    print(kept['survey_type'].value_counts())
    print(len(kept))
    return kept

# NOTE(review): the filtered frame returned here is not assigned to a name, so it is
# only shown as cell output; `survey_provider_df` itself still contains the
# fire-safety rows for the cells that follow.
drop_fire_surveys(survey_provider_df)

survey_type
Health Complaint     54456
Health Standard      44249
Infection Control    17878
Name: count, dtype: int64
116583


Unnamed: 0,CCN,survey_date,survey_type,survey_cycle,process_date,provider_name,State,zip,_merge
0,015009,2023-03-02,Health Complaint,2,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
1,015009,2023-03-02,Health Standard,1,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
4,015009,2019-08-21,Health Standard,2,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
6,015009,2018-08-01,Health Standard,3,2024-09-01,"BURNS NURSING HOME, INC.",AL,35653,both
7,015010,2023-08-30,Infection Control,2,2024-09-01,COOSA VALLEY HEALTHCARE CENTER,AL,35150,both
...,...,...,...,...,...,...,...,...,...
161361,745039,2023-06-20,Health Standard,1,2024-09-01,MIDTOWNE MEADOWS HEALTH AND REHAB,TX,76065,both
161362,745040,2023-10-13,Health Complaint,1,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both
161363,745040,2023-10-13,Health Standard,1,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both
161365,745040,2022-08-05,Health Standard,2,2024-09-01,THE SARAH ROBERTS FRENCH HOME,TX,78201,both


In [100]:
# calculate time between September 1st 2024 (time of data collection) and last survey 
def identify_last_standard_survey(df):
    """Keep, for each CCN, the 'Health Standard' survey row with the greatest survey_date.

    Only rows whose 'survey_type' equals 'Health Standard' are considered. They
    are sorted by 'CCN' and 'survey_date' in descending order and the first row
    per CCN is kept. 'survey_date' is sorted as stored (no date parsing happens
    here), so the result is chronological only if the stored values order
    chronologically, as ISO 'YYYY-MM-DD' strings do.

    Prints the number of standard-survey rows, then the number of rows kept.
    """
    is_standard = df['survey_type'] == 'Health Standard'
    standard = df.loc[is_standard].sort_values(by=['CCN', 'survey_date'], ascending=False)
    print(len(standard))  # number of Health Standard survey rows

    # After the descending sort, the first row seen for a CCN is its latest survey.
    latest = standard.drop_duplicates(subset=['CCN'], keep='first')
    print(len(latest))  # number of distinct CCNs with at least one standard survey
    return latest
    
# The unfiltered merged frame is passed in; the function selects the
# 'Health Standard' rows itself.
last_standard_survey_df = identify_last_standard_survey(survey_provider_df)
    

44249
14817


In [101]:
def NH_months_since_last_standard_survey(df):
    """Add the time, in months, between each row's survey date and processing date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'survey_date' and 'process_date' columns holding values
        that `pandas.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which 'survey_date' and 'process_date' are datetimes
        and two columns are added:

        * 'months_since_last_survey' - fractional months elapsed between
          'survey_date' and 'process_date' (whole days divided by the mean
          month length of 365.25 / 12 days).
        * 'months_overdue' - currently a constant 0 placeholder.

    The input frame is left untouched.
    """
    days_per_month = 365.25 / 12

    # Work on a copy: the incoming frame is usually a filtered slice of a larger
    # one, and assigning columns onto it would mutate the caller's data and can
    # trigger pandas' SettingWithCopyWarning.
    out = df.copy()
    out['survey_date'] = pd.to_datetime(out['survey_date'])
    out['process_date'] = pd.to_datetime(out['process_date'])

    # The raw difference is a Timedelta (days); convert it so the column really
    # holds months, as its name says.
    elapsed = out['process_date'] - out['survey_date']
    out['months_since_last_survey'] = elapsed.dt.days / days_per_month

    # TODO: placeholder only - no overdue calculation is implemented yet.
    out['months_overdue'] = 0
    return out
    
# NOTE(review): this assignment rebinds the function's own name to the DataFrame it
# returns. After this line the function can no longer be called by name (a call
# raises TypeError) until the `def` above is executed again. Consider binding the
# result to a distinct name.
NH_months_since_last_standard_survey = NH_months_since_last_standard_survey(last_standard_survey_df)

In [102]:
# The name now refers to the DataFrame assigned in the previous cell, not to the
# function, so this prints that frame.
print(NH_months_since_last_standard_survey)

           CCN survey_date      survey_type  survey_cycle process_date  \
161368  745049  2024-06-21  Health Standard             1   2024-09-01   
161363  745040  2023-10-13  Health Standard             1   2024-09-01   
161361  745039  2023-06-20  Health Standard             1   2024-09-01   
161359  745038  2023-05-19  Health Standard             1   2024-09-01   
161349  745022  2023-10-25  Health Standard             1   2024-09-01   
...        ...         ...              ...           ...          ...   
30      015015  2020-03-05  Health Standard             1   2024-09-01   
22      015014  2023-09-13  Health Standard             1   2024-09-01   
15      015012  2022-03-24  Health Standard             1   2024-09-01   
9       015010  2022-04-09  Health Standard             1   2024-09-01   
1       015009  2023-03-02  Health Standard             1   2024-09-01   

                                            provider_name State    zip _merge  \
161368  LAS ALTURAS NURSING & 

In [None]:




# TODO: calculate the time since each of the last surveys

# TODO: group by state and sum each column