In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
# identify local path to data files
local_path = '/Users/katialopes-gilbert/repos/springboard-projects/capstone-project-fqhc-model/data/'

In [7]:
# read in pandas dataframes for service area, health centers, and health center ops/finance
service_area = pd.read_csv(local_path + 'service_area_df.csv')
health_centers = pd.read_csv(local_path + 'health_center_combined.csv')
health_center_ops_finance = pd.read_csv(local_path + 'health_center_ops_finance.csv')

## Service Area dataframe
I want to take a look at the service area dataframe to determine what values I'll keep for further analysis. 

In [8]:
service_area.head()

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,ZipCode,ZipCodeType,None_UninsuredPatients,Medicaid_CHIP_OtherPublicPatients,MedicarePatients,PrivatePatients,TotalNumberofPatients
0,10030,H80CS00803,2022,1011,ZipCode,,,0.0,,
1,10030,H80CS00803,2022,1013,ZipCode,61.0,1346.0,385.0,182.0,1974.0
2,10030,H80CS00803,2022,1014,ZipCode,0.0,,,0.0,21.0
3,10030,H80CS00803,2022,1104,ZipCode,26.0,791.0,181.0,97.0,1095.0
4,10030,H80CS00803,2022,1108,ZipCode,25.0,640.0,79.0,61.0,805.0


In [16]:
service_area.isnull().sum()

BHCMISID                                 0
GrantNumber                              0
ReportingYear                            0
ZipCode                                  0
ZipCodeType                              0
None_UninsuredPatients               50399
Medicaid_CHIP_OtherPublicPatients    37127
MedicarePatients                     50594
PrivatePatients                      47475
TotalNumberofPatients                19381
dtype: int64

In [27]:
service_area.shape

(97142, 10)

My initial thoughts for this dataset are to count the total zip codes in each health center's defined service area for 2022. I may also want to see how many organizaions serve patients across states and counties. This will require joining this dataframe with another data source. I found US Zip Code and other geographical data at this [website](https://data.opendatasoft.com/explore/dataset/georef-united-states-of-america-zc-point%40public/table/?flg=en-us). I would need to call the API to get the state, county, zip code, and potentially other data as relevant.

In [23]:
health_center_zipcode_count = service_area.groupby('BHCMISID')['ZipCode'].count().reset_index(name='ZipCode_Count')\
                                .sort_values('ZipCode_Count', ascending=False)
health_center_zipcode_count.head(20)

Unnamed: 0,BHCMISID,ZipCode_Count
430,044310,771
194,02E00868,638
1228,09E01087,590
141,021510,584
448,0452150,544
1236,09E01141,533
401,042440,496
1159,094890,475
1145,093110,433
357,041020,409


In [24]:
health_center_zipcode_count.tail(20)

Unnamed: 0,BHCMISID,ZipCode_Count
1091,0912120,6
1012,082490,6
1023,084620,6
1338,106210,6
1339,106220,6
13,010450,5
1354,10E00170,5
1266,09E01294,4
1063,090730,3
1368,11E01249,3


In [33]:
total_patients_zero = service_area[(service_area['ZipCode'] == 0) & (service_area['TotalNumberofPatients'] == 0)]
total_patients_zero

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,ZipCode,ZipCodeType,None_UninsuredPatients,Medicaid_CHIP_OtherPublicPatients,MedicarePatients,PrivatePatients,TotalNumberofPatients
7,010030,H80CS00803,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
64,010040,H80CS00443,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
138,010060,H80CS00741,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
277,010130,H80CS00002,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
297,010150,H80CS00311,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
97103,10E01202,H80CS29023,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
97120,10E01221,H80CS29030,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0
97137,11E01249,H80CS30720,2022,0,Other ZIP Codes,0.0,0.0,0.0,0.0,0.0
97138,11E01249,H80CS30720,2022,0,Unknown Residence,0.0,0.0,0.0,0.0,0.0


After importing data, the zip code table had several rows where the zip code was missing, so I imputed these with 0 instead of the '-'. There are no single digit zip codes in general so they would be easy enough to find. 

I will remove rows for the health centers that did not list a zip code and served no patients so they won't influence the service area count table. 

I also want to look at which rows represent an unknown zip code but patients were served in that unknown area(s).

In [34]:
total_patients_zero = service_area[(service_area['ZipCode'] == 0) & (service_area['TotalNumberofPatients'] != 0)]
total_patients_zero

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,ZipCode,ZipCodeType,None_UninsuredPatients,Medicaid_CHIP_OtherPublicPatients,MedicarePatients,PrivatePatients,TotalNumberofPatients
32,010030,H80CS00803,2022,0,Other ZIP Codes,18.0,234.0,43.0,40.0,335.0
53,010040,H80CS00443,2022,0,Other ZIP Codes,129.0,18.0,0.0,,155.0
98,010060,H80CS00741,2022,0,Other ZIP Codes,66.0,252.0,22.0,89.0,429.0
144,010070,H80CS00312,2022,0,Other ZIP Codes,69.0,322.0,51.0,231.0,673.0
224,010070,H80CS00312,2022,0,Unknown Residence,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...
97094,10E01196,H80CS28997,2022,0,Unknown Residence,,0.0,0.0,0.0,
97100,10E01196,H80CS28997,2022,0,Other ZIP Codes,21.0,22.0,48.0,119.0,210.0
97112,10E01202,H80CS29023,2022,0,Other ZIP Codes,70.0,46.0,76.0,126.0,318.0
97133,10E01221,H80CS29030,2022,0,Other ZIP Codes,47.0,20.0,20.0,68.0,155.0


In [35]:
service_area['ZipCodeType'].unique()

array(['ZipCode', 'Unknown Residence', 'Other ZIP Codes'], dtype=object)

In [36]:
health_centers.isnull().sum()

BHCMISID                        0
GrantNumber                     0
HealthCenterName                0
HealthCenterCity                0
HealthCenterState               0
HealthCenterZIPCode             0
FundingCHC                      0
FundingMHC                      0
FundingHO                       0
FundingPH                       0
UrbanRuralFlag                  0
mhc_amount                      0
chc_amount                      0
ho_amount                       0
ph_amount                       0
total_hc_funding                0
bphc_total                      0
total_other_federal_grants      0
total_state_grants              0
total_private_grants            0
total_other_revenue             0
total_revenue                   0
total_male                      0
total_female                    0
asian                         156
native_hawaiian               681
pac_isl                       542
black                          63
am_ind                        332
white         

In [37]:
health_center_ops_finance.isnull().sum()

BHCMISID                     0
GrantNumber                  0
HealthCenterName             0
HealthCenterCity             0
HealthCenterState            0
HealthCenterZIPCode          0
FundingCHC                   0
FundingMHC                   0
FundingHO                    0
FundingPH                    0
UrbanRuralFlag               0
total_costs_medical        629
total_costs_dental         629
total_costs_mh             629
total_costs_enabling       629
total_costs_quality        629
total_cost_care            629
total_charges              630
total_collected            630
total_sfs                  630
medical_clinic_visits      579
medical_virtual_visits     579
medical_patients           579
dental_clinic_visits       579
dental_virtual_visits      579
dental_patients            579
mh_clinic_visits           579
mh_virtual_visits          579
mh_patients                579
enabling_clinic_visits     579
enabling_virtual_visits    579
enabling_patients          579
total_cl