# Importing Data

Data is being fetched as an excel file directly from HRSA's data warehouse for federally qualified health centers (FQHCs). I'll start by importing 2022 data.

In [27]:
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.insert(0, parent_dir)

In [28]:
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from configuration.config import DATA_PATH

In [29]:
url = 'https://www.hrsa.gov/sites/default/files/hrsa/foia/h80-2022.xlsx'
file_name = '2022-h80-data.xlsx'

# save file locally
urlretrieve(url, DATA_PATH + file_name)

('/Users/katialopes-gilbert/repos/springboard-projects/capstone-project-fqhc-model/data/2022-h80-data.xlsx',
 <http.client.HTTPMessage at 0x13fede4d0>)

In [30]:
# load file into dictionary
df = pd.read_excel(DATA_PATH + file_name, sheet_name=None)

## Some key information about the 2022 UDS dataset:

**How this data is collected:** 
Data is collected through HRSA's Uniform Data System report that health center grantees must fill out annually. In 2022, there were 1,370 entities that filled out the UDS report.

**Missing Values Representation**
1. "-" represents no data entry by health center
2. "--" represents suppressed patient counts between 1-15 to protect patient privacy
3. "---" represents suppressed health center confidential data  

In [31]:
df.keys()



In [32]:
# load sheets of interest into separate dataframes
health_centers = df['HealthCenterInfo']
health_center_sites = df['HealthCenterSiteInfo']
health_center_funding = df['Table9E']
health_center_zipcodes = df['HealthCenterZipCodes']
health_center_costs = df['Table8A']
personnel_and_visits = df['Table5']
patients_age = df['Table3A']
patients_race = df['Table3B']
patients_other_demographics = df['Table4']
patient_services_revenue = df['Table9D']

# show all columns
pd.set_option('display.max_columns', None)

In [33]:
health_centers.head(3)

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,HealthCenterName,HealthCenterStreetAddress,HealthCenterOtherAddress,HealthCenterCity,HealthCenterState,HealthCenterZIPCode,ProjectDirector,ProjectDirectorPhone,ProjectDirectorPhoneExt,ProjectDirectorFax,ProjectDirectorEmail,FundingCHC,FundingMHC,FundingHO,FundingPH,UrbanRuralFlag
0,10030,H80CS00803,2022,"HOLYOKE HEALTH CENTER, INC.",230 MAPLE ST,-,Holyoke,MA,1040,Alejandro Esparza Perez,(413)420-2175,-,-,alejandro.esparza@hhcinc.org,True,False,False,False,Urban
1,10040,H80CS00443,2022,MAINE MOBILE HEALTH PROGRAM INC.,9 GREEN ST STE 1,-,Augusta,ME,4330,Carol Murphy,(917)209-3777,-,-,cmurphy@mainemobile.org,False,True,False,False,Rural
2,10060,H80CS00741,2022,"FAIR HAVEN COMMUNITY HEALTH CLINIC, INC.",374 GRAND AVE,-,New Haven,CT,6513,Suzanne Lagarde,(203)752-5129,-,(203)777-8506,s.lagarde@fhchc.org,True,False,False,False,Urban


In [34]:
health_center_sites.head(3)

Unnamed: 0,BHCMISID,GrantNumber,HealthCenterName,SiteName,SiteType,SiteStatus,LocationType,LocationSetting,OperationalSchedule,CalendarSchedule,TotalWeeklyHoursOfOperation,ServiceAreaPopulation,SiteOperatedBy,SiteStreetAddress,SiteCity,SiteState,SiteZIPCode,MailingStreetAddress,MailingCity,MailingState,MailingZIPCode,MedicaidNumber,MedicaidPharmNumber,DataAsof
0,10030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",CHICOPEE HEALTH CENTER,Service Delivery Site,Active,Permanent,All Other Clinic Types,Full-Time,Year-Round,47.0,Urban,Health Center/Applicant,505 Front St,Chicopee,MA,01013-3140,505-Front St,Chicopee,MA,01013-3140,1320874,401480,12/31/2022 11:59 PM EST
1,10030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.","HOLYOKE HEALTH CENTER, INC.",Service Delivery Site,Active,Permanent,All Other Clinic Types,Full-Time,Year-Round,48.0,Urban,Health Center/Applicant,230 Maple St,Holyoke,MA,01040-5144,230-Maple St,Holyoke,MA,01040-5144,1300237,401480,12/31/2022 11:59 PM EST
2,10030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Holyoke Soldier Home,Service Delivery Site,Active,Permanent,All Other Clinic Types,Full-Time,Year-Round,40.0,Urban,Health Center/Applicant,110 Cherry St,Holyoke,MA,01040-7002,-,-,-,-,1300237,401480,12/31/2022 11:59 PM EST


In [35]:
health_center_funding.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T9E_L1a_Ca,T9E_L1b_Ca,T9E_L1c_Ca,T9E_L1e_Ca,T9E_L1g_Ca,T9E_L1k_Ca,T9e_L1l_Ca,T9e_L1m_Ca,T9e_L1n_Ca,T9e_L1o_Ca,T9E_L1p_Other,T9e_L1p_Ca,T9e_L1q_Ca,T9E_L1_Ca,T9E_L2_Ca,T9E_L3_Other,T9E_L3_Ca,T9E_L3a_Ca,T9E_L3b_Other,T9e_L3b_Ca,T9E_L5_Ca,T9E_L6_Other,T9E_L6_Ca,T9E_L6a_Other,T9E_L6a_Ca,T9E_L7_Other,T9E_L7_Ca,T9E_L8_Other,T9E_L8_Ca,T9E_L9_Ca,T9E_L10_Other,T9E_L10_Ca,T9E_L11_Ca
0,,,Migrant Health Center-Amount (a),Community Health Center-Amount (a),Health Care for the Homeless-Amount (a),Public Housing Primary Care-Amount (a),Total Health Center (Sum of Lines 1a through 1...,"Capital Development Grants, including School-B...",Coronavirus Preparedness and Response Suppleme...,"Coronavirus Aid, Relief, and Economic Security...",Expanding Capacity for Coronavirus Testing (EC...,"American Rescue Plan (ARP) (H8F, L2C, C8E)-Amo...",Other COVID-19-Related Funding from BPHC-Source,Other COVID-19-Related Funding from BPHC-Amoun...,Total COVID-19 Supplemental (Sum of Lines 1l t...,Total BPHC Grants (Sum of Lines 1g + 1k + 1q)-...,Ryan White Part C HIV Early Intervention-Amoun...,Other Federal Grants-Source,Other Federal Grants-Amount (a),Medicare and Medicaid EHR Incentive Payments f...,Provider Relief Fund-Source,Provider Relief Fund-Amount (a),Total Other Federal Grants (Sum of Lines 2 thr...,State Government Grants and Contracts-Source,State Government Grants and Contracts-Amount (a),State/Local Indigent Care Programs-Source,State/Local Indigent Care Programs-Amount (a),Local Government Grants and Contracts-Source,Local Government Grants and Contracts-Amount (a),Foundation/Private Grants and Contracts-Source,Foundation/Private Grants and Contracts-Amount...,Total Non-Federal Grants and Contracts (Sum of...,Other Revenue (non-patient service revenue not...,Other Revenue (non-patient service revenue not...,Total Revenue (Sum of Lines 1 + 5 + 9 + 10)-Am...
1,10030.0,H80CS00803,0,5721128,0,0,5721128,0,0,0,0,1863333,-,0,1863333,7584461,795188,"SAMHSA, National Institutes of Health (NIH), H...",568943,0,-,0,1364131,"Counseling & Testing, HIV Case Management, Fam...",2915656,HSN,1442182,-,0,"MA League - CHWs, La Linda Manita, Project Bre...",764680,5122518,"Rental Income from tenants,\nInterest Income, ...",14258919,28330029
2,10040.0,H80CS00443,1758567,-,-,-,1758567,-,1256,-,1396,497431,-,-,500083,2258650,-,-,-,-,-,-,-,-,-,-,-,-,-,MeHAF Advocacy Grant,25000,25000,"Interest $703; Other Income $33,875; Donations...",36578,2320228


In [36]:
health_center_costs.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T8a_L1_Ca,T8a_L1_Cb,T8a_L1_Cc,T8a_L2_Ca,T8a_L2_Cb,T8a_L2_Cc,T8a_L3_Ca,T8a_L3_Cb,T8a_L3_Cc,T8a_L4_Ca,T8a_L4_Cb,T8a_L4_Cc,T8a_L5_Ca,T8a_L5_Cb,T8a_L5_Cc,T8a_L6_Ca,T8a_L6_Cb,T8a_L6_Cc,T8a_L7_Ca,T8a_L7_Cb,T8a_L7_Cc,T8a_L8a_Ca,T8a_L8a_Cb,T8a_L8a_Cc,T8a_L8b_Ca,T8a_L8b_Cc,T8a_L9_Other,T8a_L9_Ca,T8a_L9_Cb,T8a_L9_Cc,T8a_L9a_Ca,T8a_L9a_Cb,T8a_L9a_Cc,T8a_L10_Ca,T8a_L10_Cb,T8a_L10_Cc,T8a_L11a_Ca,T8a_L11a_Cc,T8a_L11b_Ca,T8a_L11b_Cc,T8a_L11c_Ca,T8a_L11c_Cc,T8a_L11d_Ca,T8a_L11d_Cc,T8a_L11e_Ca,T8a_L11e_Cc,T8a_L11f_Ca,T8a_L11f_Cc,T8a_L11g_Other,T8a_L11g_Ca,T8a_L11g_Cc,T8A_L11h_Ca,T8A_L11h_Cc,T8a_L11_Ca,T8a_L11_Cb,T8a_L11_Cc,T8a_L12_Other,T8a_L12_Ca,T8a_L12_Cb,T8a_L12_Cc,T8A_L12a_Ca,T8A_L12a_Cb,T8A_L12a_Cc,T8a_L13_Ca,T8a_L13_Cb,T8a_L13_Cc,T8a_L14_Ca,T8a_L15_Ca,T8a_L16_Ca,T8a_L17_Ca,T8a_L17_Cc,T8a_L18_Other,T8a_L18_Cc,T8a_L19_Cc
0,,,Medical Personnel-Accrued Cost (a),Medical Personnel-Allocation of Facility and N...,Medical Personnel-Total Cost After Allocation ...,Lab and X-ray-Accrued Cost (a),Lab and X-ray-Allocation of Facility and Non-C...,Lab and X-ray-Total Cost After Allocation of F...,Medical/Other Direct-Accrued Cost (a),Medical/Other Direct-Allocation of Facility an...,Medical/Other Direct-Total Cost After Allocati...,Total Medical Care Services (Sum of Lines 1 th...,Total Medical Care Services (Sum of Lines 1 th...,Total Medical Care Services (Sum of Lines 1 th...,Dental-Accrued Cost (a),Dental-Allocation of Facility and Non-Clinical...,Dental-Total Cost After Allocation of Facility...,Mental Health-Accrued Cost (a),Mental Health-Allocation of Facility and Non-C...,Mental Health-Total Cost After Allocation of F...,Substance Use Disorder-Accrued Cost (a),Substance Use Disorder-Allocation of Facility ...,Substance Use Disorder-Total Cost After Alloca...,Pharmacy (not including pharmaceuticals)-Accru...,Pharmacy (not including pharmaceuticals)-Alloc...,Pharmacy (not including pharmaceuticals)-Total...,Pharmaceuticals-Accrued Cost (a),Pharmaceuticals-Total Cost After Allocation of...,Other Professional-Others,Other Professional-Accrued Cost (a),Other Professional-Allocation of Facility and ...,Other Professional-Total Cost After Allocation...,Vision-Accrued Cost (a),Vision-Allocation of Facility and Non-Clinical...,Vision-Total Cost After Allocation of Facility...,Total Other Clinical Services (Sum of Lines 5 ...,Total Other Clinical Services (Sum of Lines 5 ...,Total Other Clinical Services (Sum of Lines 5 ...,Case Management-Accrued Cost (a),Case Management-Total Cost After Allocation of...,Transportation-Accrued Cost (a),Transportation-Total Cost After Allocation of ...,Outreach-Accrued Cost (a),Outreach-Total Cost After Allocation of Facili...,Patient and Community Education-Accrued Cost (a),Patient and Community Education-Total Cost Aft...,Eligibility Assistance-Accrued Cost (a),Eligibility Assistance-Total Cost After Alloca...,Interpretation Services-Accrued Cost (a),Interpretation Services-Total Cost After Alloc...,Other Enabling Services-Others,Other Enabling Services-Accrued Cost (a),Other Enabling Services-Total Cost After Alloc...,Community Health Workers-Accrued Cost (a),Community Health Workers-Total Cost After Allo...,Total Enabling Services (Sum of Lines 11a thro...,Total Enabling Services (Sum of Lines 11a thro...,Total Enabling Services (Sum of Lines 11a thro...,Other Program-Related Services-Others,Other Program-Related Services-Accrued Cost (a),Other Program-Related Services-Allocation of F...,Other Program-Related Services-Total Cost Afte...,Quality Improvement-Accrued Cost (a),Quality Improvement-Allocation of Facility and...,Quality Improvement-Total Cost After Allocatio...,Total Enabling and Other Services (Sum of Line...,Total Enabling and Other Services (Sum of Line...,Total Enabling and Other Services (Sum of Line...,Facility-Accrued Cost (a),Non-Clinical Support Services-Accrued Cost (a),Total Facility and Non-Clinical Support Servic...,Total Accrued Costs (Sum of Lines 4 + 10 + 13 ...,Total Accrued Costs (Sum of Lines 4 + 10 + 13 ...,"Value of Donated Facilities, Services, and Sup...","Value of Donated Facilities, Services, and Sup...",Total with Donations (Sum of Lines 17 and 18)-...
1,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---
2,010040,H80CS00443,405175,251125,656300,20990,11507,32497,55634,30500,86134,481799,293132,774931,17189,9424,26613,197127,116356,313483,-,-,-,1246,13665,14911,23680,23680,podiatrist,210,115,325,3049,1672,4721,242501,141232,383733,547180,547180,-,-,-,-,79276,79276,552,552,-,-,-,-,-,-,-,627008,397592,1024600,-,-,-,-,-,-,-,627008,397592,1024600,89179,742777,831956,2183264,2183264,Donated services represent the difference betw...,105231,2288495


In [37]:
health_center_zipcodes.head(3)

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,ZipCode,ZipCodeType,None_UninsuredPatients,Medicaid_CHIP_OtherPublicPatients,MedicarePatients,PrivatePatients,TotalNumberofPatients
0,10030,H80CS00803,2022,1011,ZipCode,--,--,0,--,--
1,10030,H80CS00803,2022,1013,ZipCode,61,1346,385,182,1974
2,10030,H80CS00803,2022,1014,ZipCode,0,--,--,0,21


In [38]:
personnel_and_visits.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T5_L1_Ca,T5_L1_Cb,T5_L1_Cb2,T5_L2_Ca,T5_L2_Cb,T5_L2_Cb2,T5_L3_Ca,T5_L3_Cb,T5_L3_Cb2,T5_L4_Ca,T5_L4_Cb,T5_L4_Cb2,T5_L5_Ca,T5_L5_Cb,T5_L5_Cb2,T5_L7_Ca,T5_L7_Cb,T5_L7_Cb2,T5_L8_Ca,T5_L8_Cb,T5_L8_Cb2,T5_L9a_Ca,T5_L9a_Cb,T5_L9a_Cb2,T5_L9b_Ca,T5_L9b_Cb,T5_L9b_Cb2,T5_L10_Ca,T5_L10_Cb,T5_L10_Cb2,T5_L10a_Ca,T5_L10a_Cb,T5_L10a_Cb2,T5_L11_Ca,T5_L11_Cb,T5_L11_Cb2,T5_L12_Ca,T5_L13_Ca,T5_L14_Ca,T5_L15_Ca,T5_L15_Cb,T5_L15_Cb2,T5_L15_Cc,T5_L16_Ca,T5_L16_Cb,T5_L16_Cb2,T5_L17_Ca,T5_L17_Cb,T5_L17_Cb2,T5_L17a_Ca,T5_L17a_Cb,T5_L17a_Cb2,T5_L18_Ca,T5_L19_Ca,T5_L19_Cb,T5_L19_Cb2,T5_L19_Cc,T5_L20a_Ca,T5_L20a_Cb,T5_L20a_Cb2,T5_L20a1_Ca,T5_L20a1_Cb,T5_L20a1_Cb2,T5_L20a2_Ca,T5_L20a2_Cb,T5_L20a2_Cb2,T5_L20b_Ca,T5_L20b_Cb,T5_L20b_Cb2,T5_L20c_Ca,T5_L20c_Cb,T5_L20c_Cb2,T5_L20_Ca,T5_L20_Cb,T5_L20_Cb2,T5_L20_Cc,T5_L21_Ca,T5_L21_Cb,T5_L21_Cb2,T5_L21_Cc,T5_L22a_Ca,T5_L22a_Cb,T5_L22a_Cb2,T5_L22b_Ca,T5_L22b_Cb,T5_L22b_Cb2,T5_L22c_Ca,T5_L22d_Ca,T5_L22d_Cb,T5_L22d_Cb2,T5_L22d_Cc,T5_L22_Other,T5_L22_Ca,T5_L22_Cb,T5_L22_Cb2,T5_L22_Cc,T5_L23_Ca,T5_L24_Ca,T5_L24_Cb,T5_L24_Cb2,T5_L25_Ca,T5_L25_Cb,T5_L25_Cb2,T5_L26_Ca,T5_L27_Ca,T5_L27a_Ca,T5_L27b_Ca,T5_L27c_Ca,T5_L28_Other,T5_L28_Ca,T5_L29_Ca,T5_L29_Cb,T5_L29_Cb2,T5_L29_Cc,T5_L29a_Other,T5_L29a_Ca,T5_L29b_Ca,T5_L30a_Ca,T5_L30b_Ca,T5_L30c_Ca,T5_L31_Ca,T5_L32_Ca,T5_L33_Ca,T5_L34_Ca,T5_L34_Cb,T5_L34_Cb2,T5_L20a01_Ca1,T5_L20a01_Cb,T5_L20a01_Cb2,T5_L20a01_Cc,T5_L20a02_Ca1,T5_L20a02_Cb,T5_L20a02_Cb2,T5_L20a02_Cc,T5_L20a03_Ca1,T5_L20a03_Cb,T5_L20a03_Cb2,T5_L20a03_Cc,T5_L20a04_Ca1,T5_L20a04_Cb,T5_L20a04_Cb2,T5_L20a04_Cc,T5_L21a_Ca1,T5_L21a_Cb,T5_L21a_Cb2,T5_L21a_Cc,T5_L21b_Ca1,T5_L21b_Cb,T5_L21b_Cb2,T5_L21b_Cc,T5_L21c_Ca1,T5_L21c_Cb,T5_L21c_Cb2,T5_L21c_Cc,T5_L21d_Ca1,T5_L21d_Cb,T5_L21d_Cb2,T5_L21d_Cc,T5_L21e_Ca1,T5_L21e_Cb,T5_L21e_Cb2,T5_L21e_Cc,T5_L21f_Ca1,T5_L21f_Cb,T5_L21f_Cb2,T5_L21f_Cc,T5_L21g_Ca1,T5_L21g_Cb,T5_L21g_Cb2,T5_L21g_Cc,T5_L21h_Ca1,T5_L21h_Cb,T5_L21h_Cb2,T5_L21h_Cc
0,,,Family Physicians-FTEs (a),Family Physicians-Clinic Visits (b),Family Physicians-Virtual Visits (b2),General Practitioners-FTEs (a),General Practitioners-Clinic Visits (b),General Practitioners-Virtual Visits (b2),Internists-FTEs (a),Internists-Clinic Visits (b),Internists-Virtual Visits (b2),Obstetrician/Gynecologists-FTEs (a),Obstetrician/Gynecologists-Clinic Visits (b),Obstetrician/Gynecologists-Virtual Visits (b2),Pediatricians-FTEs (a),Pediatricians-Clinic Visits (b),Pediatricians-Virtual Visits (b2),Other Specialty Physicians-FTEs (a),Other Specialty Physicians-Clinic Visits (b),Other Specialty Physicians-Virtual Visits (b2),Total Physicians (Lines 1–7)-FTEs (a),Total Physicians (Lines 1–7)-Clinic Visits (b),Total Physicians (Lines 1–7)-Virtual Visits (b2),Nurse Practitioners-FTEs (a),Nurse Practitioners-Clinic Visits (b),Nurse Practitioners-Virtual Visits (b2),Physician Assistants-FTEs (a),Physician Assistants-Clinic Visits (b),Physician Assistants-Virtual Visits (b2),Certified Nurse Midwives-FTEs (a),Certified Nurse Midwives-Clinic Visits (b),Certified Nurse Midwives-Virtual Visits (b2),"Total NPs, PAs, and CNMs (Lines 9a–10)-FTEs (a)","Total NPs, PAs, and CNMs (Lines 9a–10)-Clinic ...","Total NPs, PAs, and CNMs (Lines 9a–10)-Virtual...",Nurses-FTEs (a),Nurses-Clinic Visits (b),Nurses-Virtual Visits (b2),Other Medical Personnel-FTEs (a),Laboratory Personnel-FTEs (a),X-ray Personnel-FTEs (a),Total Medical Care Services (Lines 8 + 10a thr...,Total Medical Care Services (Lines 8 + 10a thr...,Total Medical Care Services (Lines 8 + 10a thr...,Total Medical Care Services (Lines 8 + 10a thr...,Dentists-FTEs (a),Dentists-Clinic Visits (b),Dentists-Virtual Visits (b2),Dental Hygienists-FTEs (a),Dental Hygienists-Clinic Visits (b),Dental Hygienists-Virtual Visits (b2),Dental Therapists-FTEs (a),Dental Therapists-Clinic Visits (b),Dental Therapists-Virtual Visits (b2),Other Dental Personnel-FTEs (a),Total Dental Services (Lines 16–18)-FTEs (a),Total Dental Services (Lines 16–18)-Clinic Vis...,Total Dental Services (Lines 16–18)-Virtual Vi...,Total Dental Services (Lines 16–18)-Patients (c),Psychiatrists-FTEs (a),Psychiatrists-Clinic Visits (b),Psychiatrists-Virtual Visits (b2),Licensed Clinical Psychologists-FTEs (a),Licensed Clinical Psychologists-Clinic Visits (b),Licensed Clinical Psychologists-Virtual Visits...,Licensed Clinical Social Workers-FTEs (a),Licensed Clinical Social Workers-Clinic Visits...,Licensed Clinical Social Workers-Virtual Visit...,Other Licensed Mental Health Providers-FTEs (a),Other Licensed Mental Health Providers-Clinic ...,Other Licensed Mental Health Providers-Virtual...,Other Mental Health Personnel-FTEs (a),Other Mental Health Personnel-Clinic Visits (b),Other Mental Health Personnel-Virtual Visits (b2),Total Mental Health Services (Lines 20a-c)-FTE...,Total Mental Health Services (Lines 20a-c)-Cli...,Total Mental Health Services (Lines 20a-c)-Vir...,Total Mental Health Services (Lines 20a-c)-Pat...,Substance Use Disorder Services-FTEs (a),Substance Use Disorder Services-Clinic Visits (b),Substance Use Disorder Services-Virtual Visits...,Substance Use Disorder Services-Patients (c),Ophthalmologists-FTEs (a),Ophthalmologists-Clinic Visits (b),Ophthalmologists-Virtual Visits (b2),Optometrists-FTEs (a),Optometrists-Clinic Visits (b),Optometrists-Virtual Visits (b2),Other Vision Care Personnel-FTEs (a),Total Vision Services (Lines 22a–c)-FTEs (a),Total Vision Services (Lines 22a–c)-Clinic Vis...,Total Vision Services (Lines 22a–c)-Virtual Vi...,Total Vision Services (Lines 22a–c)-Patients (c),Other Professional Services-Others,Other Professional Services-FTEs (a),Other Professional Services-Clinic Visits (b),Other Professional Services-Virtual Visits (b2),Other Professional Services-Patients (c),Pharmacy Personnel-FTEs (a),Case Managers-FTEs (a),Case Managers-Clinic Visits (b),Case Managers-Virtual Visits (b2),Patient and Community Education Specialists-FT...,Patient and Community Education Specialists-Cl...,Patient and Community Education Specialists-Vi...,Outreach Workers-FTEs (a),Transportation Personnel-FTEs (a),Eligibility Assistance Workers-FTEs (a),Interpretation Personnel-FTEs (a),Community Health Workers-FTEs (a),Other Enabling Services-Others,Other Enabling Services-FTEs (a),Total Enabling Services (Lines 24–28)-FTEs (a),Total Enabling Services (Lines 24–28)-Clinic V...,Total Enabling Services (Lines 24–28)-Virtual ...,Total Enabling Services (Lines 24–28)-Patients...,Other Programs and Services-Others,Other Programs and Services-FTEs (a),Quality Improvement Personnel-FTEs (a),Management and Support Personnel-FTEs (a),Fiscal and Billing Personnel-FTEs (a),IT Personnel-FTEs (a),Facility Personnel-FTEs (a),Patient Support Personnel-FTEs (a),Total Facility and Non-Clinical Support Person...,Grand Total (Lines 15+19+20+21+22+22d+23+29+29...,Grand Total (Lines 15+19+20+21+22+22d+23+29+29...,Grand Total (Lines 15+19+20+21+22+22d+23+29+29...,Physicians (other than Psychiatrists)-Personne...,Physicians (other than Psychiatrists)-Clinic V...,Physicians (other than Psychiatrists)-Virtual ...,Physicians (other than Psychiatrists)-Patients...,Nurse Practitioners-Personnel (a1),Nurse Practitioners-Clinic Visits (b),Nurse Practitioners-Virtual Visits (b2),Nurse Practitioners-Patients (c),Physician Assistants-Personnel (a1),Physician Assistants-Clinic Visits (b),Physician Assistants-Virtual Visits (b2),Physician Assistants-Patients (c),Certified Nurse Midwives-Personnel (a1),Certified Nurse Midwives-Clinic Visits (b),Certified Nurse Midwives-Virtual Visits (b2),Certified Nurse Midwives-Patients (c),Physicians (other than Psychiatrists)-Personne...,Physicians (other than Psychiatrists)-Clinic V...,Physicians (other than Psychiatrists)-Virtual ...,Physicians (other than Psychiatrists)-Patients...,Nurse Practitioners (Medical)-Personnel (a1),Nurse Practitioners (Medical)-Clinic Visits (b),Nurse Practitioners (Medical)-Virtual Visits (b2),Nurse Practitioners (Medical)-Patients (c),Physician Assistants-Personnel (a1),Physician Assistants-Clinic Visits (b),Physician Assistants-Virtual Visits (b2),Physician Assistants-Patients (c),Certified Nurse Midwives-Personnel (a1),Certified Nurse Midwives-Clinic Visits (b),Certified Nurse Midwives-Virtual Visits (b2),Certified Nurse Midwives-Patients (c),Psychiatrists-Personnel (a1),Psychiatrists-Clinic Visits (b),Psychiatrists-Virtual Visits (b2),Psychiatrists-Patients (c),Licensed Clinical Psychologists-Personnel (a1),Licensed Clinical Psychologists-Clinic Visits (b),Licensed Clinical Psychologists-Virtual Visits...,Licensed Clinical Psychologists-Patients (c),Licensed Clinical Social Workers-Personnel (a1),Licensed Clinical Social Workers-Clinic Visits...,Licensed Clinical Social Workers-Virtual Visit...,Licensed Clinical Social Workers-Patients (c),Other Licensed Mental Health Providers-Personn...,Other Licensed Mental Health Providers-Clinic ...,Other Licensed Mental Health Providers-Virtual...,Other Licensed Mental Health Providers-Patient...
1,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---
2,010040,H80CS00443,0.92,1013,3,0,0,0,0.1,16,0,0,0,0,0,0,0,0,8,1,1.02,1037,4,0.2,40,-,0.51,185,-,0,1,-,0.71,226,-,1.36,-,-,-,-,-,3.09,1263,4,727,-,95,-,-,-,-,-,-,-,-,-,95,-,95,-,-,0,-,-,-,2.12,536,13,-,-,-,-,-,-,2.12,536,13,108,-,4,-,4,0,2,0,0,25,0,-,0,27,0,27,podiatrists,-,2,0,1,0.17,7.76,439,174,1.2,15,1,-,-,-,-,-,-,-,8.96,454,175,263,-,-,-,3.1,1.11,-,-,0.69,4.9,19.24,2381,192,17,51,-,38,1,4,0,3,2,8,-,7,-,-,-,-,5,9,-,9,1,2,-,1,1,2,-,1,-,-,-,-,-,-,-,-,-,-,-,-,2,6,10,7,-,-,-,-


In [39]:
patients_age.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T3a_L1_Ca,T3a_L1_Cb,T3a_L2_Ca,T3a_L2_Cb,T3a_L3_Ca,T3a_L3_Cb,T3a_L4_Ca,T3a_L4_Cb,T3a_L5_Ca,T3a_L5_Cb,T3a_L6_Ca,T3a_L6_Cb,T3a_L7_Ca,T3a_L7_Cb,T3a_L8_Ca,T3a_L8_Cb,T3a_L9_Ca,T3a_L9_Cb,T3a_L10_Ca,T3a_L10_Cb,T3a_L11_Ca,T3a_L11_Cb,T3a_L12_Ca,T3a_L12_Cb,T3a_L13_Ca,T3a_L13_Cb,T3a_L14_Ca,T3a_L14_Cb,T3a_L15_Ca,T3a_L15_Cb,T3a_L16_Ca,T3a_L16_Cb,T3a_L17_Ca,T3a_L17_Cb,T3a_L18_Ca,T3a_L18_Cb,T3a_L19_Ca,T3a_L19_Cb,T3a_L20_Ca,T3a_L20_Cb,T3a_L21_Ca,T3a_L21_Cb,T3a_L22_Ca,T3a_L22_Cb,T3a_L23_Ca,T3a_L23_Cb,T3a_L24_Ca,T3a_L24_Cb,T3a_L25_Ca,T3a_L25_Cb,T3a_L26_Ca,T3a_L26_Cb,T3a_L27_Ca,T3a_L27_Cb,T3a_L28_Ca,T3a_L28_Cb,T3a_L29_Ca,T3a_L29_Cb,T3a_L30_Ca,T3a_L30_Cb,T3a_L31_Ca,T3a_L31_Cb,T3a_L32_Ca,T3a_L32_Cb,T3a_L33_Ca,T3a_L33_Cb,T3a_L34_Ca,T3a_L34_Cb,T3a_L35_Ca,T3a_L35_Cb,T3a_L36_Ca,T3a_L36_Cb,T3a_L37_Ca,T3a_L37_Cb,T3a_L38_Ca,T3a_L38_Cb,T3a_L39_Ca,T3a_L39_Cb
0,,,Under age 1-Male Patients (a),Under age 1-Female Patients (b),Age 1-Male Patients (a),Age 1-Female Patients (b),Age 2-Male Patients (a),Age 2-Female Patients (b),Age 3-Male Patients (a),Age 3-Female Patients (b),Age 4-Male Patients (a),Age 4-Female Patients (b),Age 5-Male Patients (a),Age 5-Female Patients (b),Age 6-Male Patients (a),Age 6-Female Patients (b),Age 7-Male Patients (a),Age 7-Female Patients (b),Age 8-Male Patients (a),Age 8-Female Patients (b),Age 9-Male Patients (a),Age 9-Female Patients (b),Age 10-Male Patients (a),Age 10-Female Patients (b),Age 11-Male Patients (a),Age 11-Female Patients (b),Age 12-Male Patients (a),Age 12-Female Patients (b),Age 13-Male Patients (a),Age 13-Female Patients (b),Age 14-Male Patients (a),Age 14-Female Patients (b),Age 15-Male Patients (a),Age 15-Female Patients (b),Age 16-Male Patients (a),Age 16-Female Patients (b),Age 17-Male Patients (a),Age 17-Female Patients (b),Age 18-Male Patients (a),Age 18-Female Patients (b),Age 19-Male Patients (a),Age 19-Female Patients (b),Age 20-Male Patients (a),Age 20-Female Patients (b),Age 21-Male Patients (a),Age 21-Female Patients (b),Age 22-Male Patients (a),Age 22-Female Patients (b),Age 23-Male Patients (a),Age 23-Female Patients (b),Age 24-Male Patients (a),Age 24-Female Patients (b),Ages 25–29-Male Patients (a),Ages 25–29-Female Patients (b),Ages 30–34-Male Patients (a),Ages 30–34-Female Patients (b),Ages 35–39-Male Patients (a),Ages 35–39-Female Patients (b),Ages 40–44-Male Patients (a),Ages 40–44-Female Patients (b),Ages 45–49-Male Patients (a),Ages 45–49-Female Patients (b),Ages 50–54-Male Patients (a),Ages 50–54-Female Patients (b),Ages 55–59-Male Patients (a),Ages 55–59-Female Patients (b),Ages 60–64-Male Patients (a),Ages 60–64-Female Patients (b),Ages 65–69-Male Patients (a),Ages 65–69-Female Patients (b),Ages 70–74-Male Patients (a),Ages 70–74-Female Patients (b),Ages 75–79-Male Patients (a),Ages 75–79-Female Patients (b),Ages 80–84-Male Patients (a),Ages 80–84-Female Patients (b),Age 85 and over-Male Patients (a),Age 85 and over-Female Patients (b),Total Patients (Sum of Lines 1-38)-Male Patien...,Total Patients (Sum of Lines 1-38)-Female Pati...
1,10030.0,H80CS00803,81,69,104,99,112,113,126,120,138,153,151,122,148,147,174,164,198,145,149,150,148,153,162,152,187,144,171,140,158,178,176,155,153,142,135,140,118,127,85,113,76,112,63,114,88,106,82,114,68,112,355,654,505,687,506,691,503,662,479,646,579,684,656,756,610,704,520,568,364,411,226,294,141,161,126,121,8821,10323
2,10040.0,H80CS00443,--,--,--,0,--,--,--,--,--,--,--,--,--,--,--,--,0,--,--,--,--,--,0,--,--,--,--,--,0,--,--,--,0,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,0,63,16,62,25,80,22,65,21,68,21,50,21,40,25,44,--,31,--,--,--,--,--,0,--,0,0,609,241


In [40]:
patients_race.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T3b_L1_Ca,T3b_L1_Cb,T3b_L1_Cd,T3b_L2a_Ca,T3b_L2a_Cb,T3b_L2a_Cd,T3b_L2b_Ca,T3b_L2b_Cb,T3b_L2b_Cd,T3b_L2_Ca,T3b_L2_Cb,T3b_L2_Cd,T3b_L3_Ca,T3b_L3_Cb,T3b_L3_Cd,T3b_L4_Ca,T3b_L4_Cb,T3b_L4_Cd,T3b_L5_Ca,T3b_L5_Cb,T3b_L5_Cd,T3b_L6_Ca,T3b_L6_Cb,T3b_L6_Cd,T3b_L7_Ca,T3b_L7_Cb,T3b_L7_Cc,T3b_L7_Cd,T3b_L8_Ca,T3b_L8_Cb,T3b_L8_Cc,T3b_L8_Cd,T3b_L12_Ca,T3b_L13_Ca,T3b_L14_Ca,T3b_L15_Ca,T3b_L16_Ca,T3b_L17_Ca,T3b_L18_Ca,T3b_L18a_Ca,T3b_L19_Ca,T3b_L20_Ca,T3b_L21_Ca,T3b_L22_Ca,T3b_L23_Ca,T3b_L24_Ca,T3b_L25_Ca,T3b_L25a_Ca,T3b_L26_Ca
0,,,Asian-Hispanic or Latino/a (a),Asian-Non-Hispanic or Latino/a (b),Asian-Total (d) (Sum Columns a+b+c),Native Hawaiian-Hispanic or Latino/a (a),Native Hawaiian-Non-Hispanic or Latino/a (b),Native Hawaiian-Total (d) (Sum Columns a+b+c),Other Pacific Islander-Hispanic or Latino/a (a),Other Pacific Islander-Non-Hispanic or Latino/...,Other Pacific Islander-Total (d) (Sum Columns ...,Total Native Hawaiian/Other Pacific Islander (...,Total Native Hawaiian/Other Pacific Islander (...,Total Native Hawaiian/Other Pacific Islander (...,Black/African American-Hispanic or Latino/a (a),Black/African American-Non-Hispanic or Latino/...,Black/African American-Total (d) (Sum Columns ...,American Indian/Alaska Native-Hispanic or Lati...,American Indian/Alaska Native-Non-Hispanic or ...,American Indian/Alaska Native-Total (d) (Sum C...,White-Hispanic or Latino/a (a),White-Non-Hispanic or Latino/a (b),White-Total (d) (Sum Columns a+b+c),More than one race-Hispanic or Latino/a (a),More than one race-Non-Hispanic or Latino/a (b),More than one race-Total (d) (Sum Columns a+b+c),Unreported/Chose Not to Disclose race-Hispanic...,Unreported/Chose Not to Disclose race-Non-Hisp...,Unreported/Chose Not to Disclose race-Unreport...,Unreported/Chose Not to Disclose race-Total (d...,Total Patients (Sum of Lines 1 + 2 + 3 to 7)-H...,Total Patients (Sum of Lines 1 + 2 + 3 to 7)-N...,Total Patients (Sum of Lines 1 + 2 + 3 to 7)-U...,Total Patients (Sum of Lines 1 + 2 + 3 to 7)-T...,Patients Best Served in a Language Other than ...,Lesbian or Gay-Number (a),Heterosexual (or straight)-Number (a),Bisexual-Number (a),Other-Number (a),Don’t know-Number (a),Chose not to disclose-Number (a),Unknown-Number (a),Total Patients (Sum of Lines 13 to 18a)-Number...,Male-Number (a),Female-Number (a),Transgender Man/Transgender Male/Transmasculin...,Transgender Woman/Transgender Female/Transfemi...,Other-Number (a),Chose not to disclose-Number (a),Unknown-Number (a),Total Patients (Sum of Lines 20 to 25a)-Number...
1,10030.0,H80CS00803,--,148,158,215,17,232,--,--,--,218,18,236,478,432,910,352,47,399,4297,3035,7332,797,192,989,6479,413,2228,9120,12631,4285,2228,19144,8371,225,10967,180,28,2033,5660,51,19144,7744,9138,33,28,64,2109,28,19144
2,10040.0,H80CS00443,0,0,0,0,0,0,0,--,--,0,--,--,--,332,344,51,--,61,44,--,46,0,0,0,364,--,22,398,471,357,22,850,654,--,717,--,--,25,94,0,850,609,241,0,0,0,0,0,850


In [41]:
patients_other_demographics.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T4_L1_Ca,T4_L2_Ca,T4_L3_Ca,T4_L4_Ca,T4_L5_Ca,T4_L6_Ca,T4_L7_Ca,T4_L7_Cb,T4_L8a_Ca,T4_L8a_Cb,T4_L8b_Ca,T4_L8b_Cb,T4_L8_Ca,T4_L8_Cb,T4_L9a_Ca,T4_L9a_Cb,T4_L9_Ca,T4_L9_Cb,T4_L10a_Other,T4_L10a_Ca,T4_L10a_Cb,T4_L10b_Ca,T4_L10b_Cb,T4_L10_Ca,T4_L10_Cb,T4_L11_Ca,T4_L11_Cb,T4_L12_Ca,T4_L12_Cb,T4_L13a_Ca,T4_L13a_Cb,T4_L13a_Cc,T4_L13a_Cd,T4_L13a_Ce,T4_L13b_Ca,T4_L13b_Cb,T4_L13b_Cc,T4_L13b_Cd,T4_L13b_Ce,T4_L13c_Ca,T4_L13c_Cb,T4_L13c_Cc,T4_L13c_Cd,T4_L13c_Ce,T4_L14_Ca,T4_L15_Ca,T4_L16_Ca,T4_L17_Ca,T4_L18_Ca,T4_L19_Ca,T4_L20_Ca,T4_L21a_Ca,T4_L21_Ca,T4_L22_Ca,T4_L23_Ca,T4_L24_Ca,T4_L25_Ca,T4_L26_Ca
0,,,100% and below-Number of Patients (a),101–150%-Number of Patients (a),151–200%-Number of Patients (a),Over 200%-Number of Patients (a),Unknown-Number of Patients (a),TOTAL (Sum of Lines 1–5)-Number of Patients (a),None/Uninsured-0-17 years old (a),None/Uninsured-18 and older (b),Medicaid (Title XIX)-0-17 years old (a),Medicaid (Title XIX)-18 and older (b),CHIP Medicaid-0-17 years old (a),CHIP Medicaid-18 and older (b),Total Medicaid (Line 8a + 8b)-0-17 years old (a),Total Medicaid (Line 8a + 8b)-18 and older (b),Dually Eligible (Medicare and Medicaid)-0-17 y...,Dually Eligible (Medicare and Medicaid)-18 and...,Medicare (Inclusive of dually eligible and oth...,Medicare (Inclusive of dually eligible and oth...,Other Public Insurance (Non-CHIP)-Others,Other Public Insurance (Non-CHIP)-0-17 years o...,Other Public Insurance (Non-CHIP)-18 and older...,Other Public Insurance CHIP-0-17 years old (a),Other Public Insurance CHIP-18 and older (b),Total Public Insurance (Line 10a + 10b)-0-17 y...,Total Public Insurance (Line 10a + 10b)-18 and...,Private Insurance-0-17 years old (a),Private Insurance-18 and older (b),TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-0-17 ye...,TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-18 and ...,Capitated Member Months-Medicaid (a),Capitated Member Months-Medicare (b),Capitated Member Months-Other Public Including...,Capitated Member Months-Private (d),Capitated Member Months-TOTAL (e),Fee-for-service Member Months-Medicaid (a),Fee-for-service Member Months-Medicare (b),Fee-for-service Member Months-Other Public Inc...,Fee-for-service Member Months-Private (d),Fee-for-service Member Months-TOTAL (e),Total Member Months (Sum of Lines 13a + 13b)-M...,Total Member Months (Sum of Lines 13a + 13b)-M...,Total Member Months (Sum of Lines 13a + 13b)-O...,Total Member Months (Sum of Lines 13a + 13b)-P...,Total Member Months (Sum of Lines 13a + 13b)-T...,Migratory (330g awardees only)-Number of Patie...,Seasonal (330g awardees only)-Number of Patien...,Total Agricultural Workers or Dependents (All ...,Homeless Shelter (330h awardees only)-Number o...,Transitional (330h awardees only)-Number of Pa...,Doubling Up (330h awardees only)-Number of Pat...,Street (330h awardees only)-Number of Patients...,Permanent Supportive Housing (330h awardees on...,Other (330h awardees only)-Number of Patients (a),Unknown (330h awardees only)-Number of Patient...,Total Homeless (All health centers report this...,Total School-Based Service Site Patients (All ...,Total Veterans (All health centers report this...,Total Patients Served at a Health Center Locat...
1,10030.0,H80CS00803,2706,184,103,168,15983,19144,104,609,4882,8638,0,0,4882,8638,0,1088,--,3320,Childrens Medical Security,--,--,0,0,--,--,166,1410,5157,13987,0,9877,0,1008,10885,3265,7974,0,6075,17314,3265,17851,0,7083,28199,-,-,45,-,-,-,-,-,-,-,2998,0,332,19144
2,10040.0,H80CS00443,768,68,--,--,0,850,35,734,24,36,0,0,24,36,-,--,0,--,-,-,-,-,-,-,-,--,--,64,786,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,571,168,769,-,-,-,-,-,-,-,--,0,--,0


In [42]:
patient_services_revenue.head(3)

Unnamed: 0,BHCMISID,GrantNumber,T9D_L1_Ca,T9D_L1_Cb,T9D_L1_Cc1,T9D_L1_Cc2,T9D_L1_Cc3,T9D_L1_Cc4,T9D_L1_Cd,T9D_L2a_Ca,T9D_L2a_Cb,T9D_L2a_Cc1,T9D_L2a_Cc2,T9D_L2a_Cc3,T9D_L2a_Cc4,T9D_L2a_Cd,T9D_L2b_Ca,T9D_L2b_Cb,T9D_L2b_Cc1,T9D_L2b_Cc2,T9D_L2b_Cc3,T9D_L2b_Cc4,T9D_L2b_Cd,T9D_L3_Ca,T9D_L3_Cb,T9D_L3_Cc1,T9D_L3_Cc2,T9D_L3_Cc3,T9D_L3_Cc4,T9D_L3_Cd,T9D_L4_Ca,T9D_L4_Cb,T9D_L4_Cc1,T9D_L4_Cc2,T9D_L4_Cc3,T9D_L4_Cc4,T9D_L4_Cd,T9D_L5a_Ca,T9D_L5a_Cb,T9D_L5a_Cc1,T9D_L5a_Cc2,T9D_L5a_Cc3,T9D_L5a_Cc4,T9D_L5a_Cd,T9D_L5b_Ca,T9D_L5b_Cb,T9D_L5b_Cc1,T9D_L5b_Cc2,T9D_L5b_Cc3,T9D_L5b_Cc4,T9D_L5b_Cd,T9D_L6_Ca,T9D_L6_Cb,T9D_L6_Cc1,T9D_L6_Cc2,T9D_L6_Cc3,T9D_L6_Cc4,T9D_L6_Cd,T9D_L7_Ca,T9D_L7_Cb,T9D_L7_Cc1,T9D_L7_Cc2,T9D_L7_Cc3,T9D_L7_Cc4,T9D_L7_Cd,T9D_L8a_Ca,T9D_L8a_Cb,T9D_L8a_Cc1,T9D_L8a_Cc2,T9D_L8a_Cc3,T9D_L8a_Cc4,T9D_L8a_Cd,T9D_L8b_Ca,T9D_L8b_Cb,T9D_L8b_Cc1,T9D_L8b_Cc2,T9D_L8b_Cc3,T9D_L8b_Cc4,T9D_L8b_Cd,T9d_L8c_Ca,T9d_L8c_Cb,T9d_L8c_Cc3,T9d_L8c_Cc4,T9d_L8c_Cd,T9D_L9_Ca,T9D_L9_Cb,T9D_L9_Cc1,T9D_L9_Cc2,T9D_L9_Cc3,T9D_L9_Cc4,T9D_L9_Cd,T9D_L10_Ca,T9D_L10_Cb,T9D_L10_Cc3,T9D_L10_Cc4,T9D_L10_Cd,T9D_L11a_Ca,T9D_L11a_Cb,T9D_L11a_Cc3,T9D_L11a_Cc4,T9D_L11a_Cd,T9D_L11b_Ca,T9D_L11b_Cb,T9D_L11b_Cc3,T9D_L11b_Cc4,T9D_L11b_Cd,T9D_L12_Ca,T9D_L12_Cb,T9D_L12_Cc3,T9D_L12_Cc4,T9D_L12_Cd,T9D_L13_Ca,T9D_L13_Cb,T9D_L13_Ce,T9D_L13_Cf,T9D_L14_Ca,T9D_L14_Cb,T9D_L14_Cc1,T9D_L14_Cc2,T9D_L14_Cc3,T9D_L14_Cc4,T9D_L14_Cd,T9D_L14_Ce,T9D_L14_Cf
0,,,Medicaid Non-Managed Care-Full Charges This Pe...,Medicaid Non-Managed Care-Amount Collected Thi...,Medicaid Non-Managed Care-Collection of Reconc...,Medicaid Non-Managed Care-Collection of Reconc...,Medicaid Non-Managed Care-Collection of Other ...,Medicaid Non-Managed Care-Penalty/Payback (c4),Medicaid Non-Managed Care-Adjustments (d),Medicaid Managed Care (capitated)-Full Charges...,Medicaid Managed Care (capitated)-Amount Colle...,Medicaid Managed Care (capitated)-Collection o...,Medicaid Managed Care (capitated)-Collection o...,Medicaid Managed Care (capitated)-Collection o...,Medicaid Managed Care (capitated)-Penalty/Payb...,Medicaid Managed Care (capitated)-Adjustments (d),Medicaid Managed Care (fee-for-service)-Full C...,Medicaid Managed Care (fee-for-service)-Amount...,Medicaid Managed Care (fee-for-service)-Collec...,Medicaid Managed Care (fee-for-service)-Collec...,Medicaid Managed Care (fee-for-service)-Collec...,Medicaid Managed Care (fee-for-service)-Penalt...,Medicaid Managed Care (fee-for-service)-Adjust...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Full...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Amou...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Coll...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Coll...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Coll...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Pena...,Total Medicaid (Sum of Lines 1 + 2a + 2b)-Adju...,Medicare Non-Managed Care-Full Charges This Pe...,Medicare Non-Managed Care-Amount Collected Thi...,Medicare Non-Managed Care-Collection of Reconc...,Medicare Non-Managed Care-Collection of Reconc...,Medicare Non-Managed Care-Collection of Other ...,Medicare Non-Managed Care-Penalty/Payback (c4),Medicare Non-Managed Care-Adjustments (d),Medicare Managed Care (capitated)-Full Charges...,Medicare Managed Care (capitated)-Amount Colle...,Medicare Managed Care (capitated)-Collection o...,Medicare Managed Care (capitated)-Collection o...,Medicare Managed Care (capitated)-Collection o...,Medicare Managed Care (capitated)-Penalty/Payb...,Medicare Managed Care (capitated)-Adjustments (d),Medicare Managed Care (fee-for-service)-Full C...,Medicare Managed Care (fee-for-service)-Amount...,Medicare Managed Care (fee-for-service)-Collec...,Medicare Managed Care (fee-for-service)-Collec...,Medicare Managed Care (fee-for-service)-Collec...,Medicare Managed Care (fee-for-service)-Penalt...,Medicare Managed Care (fee-for-service)-Adjust...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Full...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Amou...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Coll...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Coll...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Coll...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Pena...,Total Medicare (Sum of Lines 4 + 5a + 5b)-Adju...,"Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Non...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including Non-Medicaid CHIP, Man...","Other Public, including COVID-19 Uninsured Pro...","Other Public, including COVID-19 Uninsured Pro...","Other Public, including COVID-19 Uninsured Pro...","Other Public, including COVID-19 Uninsured Pro...","Other Public, including COVID-19 Uninsured Pro...",Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Total Other Public (Sum of Lines 7 + 8a + 8b +...,Private Non-Managed Care-Full Charges This Per...,Private Non-Managed Care-Amount Collected This...,Private Non-Managed Care-Collection of Other P...,Private Non-Managed Care-Penalty/Payback (c4),Private Non-Managed Care-Adjustments (d),Private Managed Care (capitated)-Full Charges ...,Private Managed Care (capitated)-Amount Collec...,Private Managed Care (capitated)-Collection of...,Private Managed Care (capitated)-Penalty/Payba...,Private Managed Care (capitated)-Adjustments (d),Private Managed Care (fee-for-service)-Full Ch...,Private Managed Care (fee-for-service)-Amount ...,Private Managed Care (fee-for-service)-Collect...,Private Managed Care (fee-for-service)-Penalty...,Private Managed Care (fee-for-service)-Adjustm...,Total Private (Sum of Lines 10 + 11a + 11b)-Fu...,Total Private (Sum of Lines 10 + 11a + 11b)-Am...,Total Private (Sum of Lines 10 + 11a + 11b)-Co...,Total Private (Sum of Lines 10 + 11a + 11b)-Pe...,Total Private (Sum of Lines 10 + 11a + 11b)-Ad...,Self-Pay-Full Charges This Period (a),Self-Pay-Amount Collected This Period (b),Self-Pay-Sliding Fee Discounts (e),Self-Pay-Bad Debt Write-Off (f),TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Full ...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Amoun...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Colle...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Colle...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Colle...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Penal...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Adjus...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Slidi...,TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Bad D...
1,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---
2,010040,H80CS00443,4963,151,-,-,-,-,99,-,-,-,-,-,-,-,-,-,-,-,-,-,-,4963,151,-,-,-,-,99,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,2388,0,-,-,85,-,-,-,-,-,-,-,-,-,-,2388,0,-,-,85,629518,2573,617158,1235,636869,2724,-,-,-,-,184,617158,1235


# Data Cleaning

Write up an initial overview of how data will need to be cleaned here.

I am going to need to rename the columns for several of the dataframes. The current names are abbreviations that will be too hard to understand by checking HRSA's reference for each one. I'll create a function to keep the first and second values of the original column names, and replace all other columns names for the values in the first row. 

In [43]:
def rename_columns(df):
    """
    Renames dataframe columns by preserving the first two column names
    and setting the first row as the new column names for all columns after the 
    2nd column.
    """
    
    # save the first two column names
    original_columns = df.columns[:2]
    # create new column names by combining saved columns + first row
    new_column_names = list(original_columns) + df.iloc[0, 2:].tolist()
    # rename columns
    df.columns = new_column_names
    # drop redundant first row
    df = df.drop(index=0, inplace=True)
    
    return df

In [44]:
dataframes_to_rename = [health_center_funding, health_center_costs, patient_services_revenue, personnel_and_visits, 
                        patients_age, patients_race, patients_other_demographics]

renamed_dataframes = [rename_columns(df) for df in dataframes_to_rename]

In [45]:
health_center_funding.head(2)

Unnamed: 0,BHCMISID,GrantNumber,Migrant Health Center-Amount (a),Community Health Center-Amount (a),Health Care for the Homeless-Amount (a),Public Housing Primary Care-Amount (a),Total Health Center (Sum of Lines 1a through 1e)-Amount (a),"Capital Development Grants, including School-Based Service Site Capital Grants-Amount (a)",Coronavirus Preparedness and Response Supplemental Appropriations Act (H8C)-Amount (a),"Coronavirus Aid, Relief, and Economic Security Act (CARES) (H8D)-Amount (a)",Expanding Capacity for Coronavirus Testing (ECT) (H8E and LAL ECT)-Amount (a),"American Rescue Plan (ARP) (H8F, L2C, C8E)-Amount (a)",Other COVID-19-Related Funding from BPHC-Source,Other COVID-19-Related Funding from BPHC-Amount (a),Total COVID-19 Supplemental (Sum of Lines 1l through 1p)-Amount (a),Total BPHC Grants (Sum of Lines 1g + 1k + 1q)-Amount (a),Ryan White Part C HIV Early Intervention-Amount (a),Other Federal Grants-Source,Other Federal Grants-Amount (a),Medicare and Medicaid EHR Incentive Payments for Eligible Providers-Amount (a),Provider Relief Fund-Source,Provider Relief Fund-Amount (a),Total Other Federal Grants (Sum of Lines 2 through 3b)-Amount (a),State Government Grants and Contracts-Source,State Government Grants and Contracts-Amount (a),State/Local Indigent Care Programs-Source,State/Local Indigent Care Programs-Amount (a),Local Government Grants and Contracts-Source,Local Government Grants and Contracts-Amount (a),Foundation/Private Grants and Contracts-Source,Foundation/Private Grants and Contracts-Amount (a),Total Non-Federal Grants and Contracts (Sum of Lines 6 + 6a + 7 + 8)-Amount (a),Other Revenue (non-patient service revenue not reported elsewhere)-Source,Other Revenue (non-patient service revenue not reported elsewhere)-Amount (a),Total Revenue (Sum of Lines 1 + 5 + 9 + 10)-Amount (a)
1,10030,H80CS00803,0,5721128,0,0,5721128,0,0,0,0,1863333,-,0,1863333,7584461,795188,"SAMHSA, National Institutes of Health (NIH), H...",568943,0,-,0,1364131,"Counseling & Testing, HIV Case Management, Fam...",2915656,HSN,1442182,-,0,"MA League - CHWs, La Linda Manita, Project Bre...",764680,5122518,"Rental Income from tenants,\nInterest Income, ...",14258919,28330029
2,10040,H80CS00443,1758567,-,-,-,1758567,-,1256,-,1396,497431,-,-,500083,2258650,-,-,-,-,-,-,-,-,-,-,-,-,-,MeHAF Advocacy Grant,25000,25000,"Interest $703; Other Income $33,875; Donations...",36578,2320228


In [46]:
personnel_and_visits.head(3)

Unnamed: 0,BHCMISID,GrantNumber,Family Physicians-FTEs (a),Family Physicians-Clinic Visits (b),Family Physicians-Virtual Visits (b2),General Practitioners-FTEs (a),General Practitioners-Clinic Visits (b),General Practitioners-Virtual Visits (b2),Internists-FTEs (a),Internists-Clinic Visits (b),Internists-Virtual Visits (b2),Obstetrician/Gynecologists-FTEs (a),Obstetrician/Gynecologists-Clinic Visits (b),Obstetrician/Gynecologists-Virtual Visits (b2),Pediatricians-FTEs (a),Pediatricians-Clinic Visits (b),Pediatricians-Virtual Visits (b2),Other Specialty Physicians-FTEs (a),Other Specialty Physicians-Clinic Visits (b),Other Specialty Physicians-Virtual Visits (b2),Total Physicians (Lines 1–7)-FTEs (a),Total Physicians (Lines 1–7)-Clinic Visits (b),Total Physicians (Lines 1–7)-Virtual Visits (b2),Nurse Practitioners-FTEs (a),Nurse Practitioners-Clinic Visits (b),Nurse Practitioners-Virtual Visits (b2),Physician Assistants-FTEs (a),Physician Assistants-Clinic Visits (b),Physician Assistants-Virtual Visits (b2),Certified Nurse Midwives-FTEs (a),Certified Nurse Midwives-Clinic Visits (b),Certified Nurse Midwives-Virtual Visits (b2),"Total NPs, PAs, and CNMs (Lines 9a–10)-FTEs (a)","Total NPs, PAs, and CNMs (Lines 9a–10)-Clinic Visits (b)","Total NPs, PAs, and CNMs (Lines 9a–10)-Virtual Visits (b2)",Nurses-FTEs (a),Nurses-Clinic Visits (b),Nurses-Virtual Visits (b2),Other Medical Personnel-FTEs (a),Laboratory Personnel-FTEs (a),X-ray Personnel-FTEs (a),Total Medical Care Services (Lines 8 + 10a through 14)-FTEs (a),Total Medical Care Services (Lines 8 + 10a through 14)-Clinic Visits (b),Total Medical Care Services (Lines 8 + 10a through 14)-Virtual Visits (b2),Total Medical Care Services (Lines 8 + 10a through 14)-Patients (c),Dentists-FTEs (a),Dentists-Clinic Visits (b),Dentists-Virtual Visits (b2),Dental Hygienists-FTEs (a),Dental Hygienists-Clinic Visits (b),Dental Hygienists-Virtual Visits (b2),Dental Therapists-FTEs (a),Dental Therapists-Clinic Visits (b),Dental Therapists-Virtual Visits (b2),Other Dental Personnel-FTEs (a),Total Dental Services (Lines 16–18)-FTEs (a),Total Dental Services (Lines 16–18)-Clinic Visits (b),Total Dental Services (Lines 16–18)-Virtual Visits (b2),Total Dental Services (Lines 16–18)-Patients (c),Psychiatrists-FTEs (a),Psychiatrists-Clinic Visits (b),Psychiatrists-Virtual Visits (b2),Licensed Clinical Psychologists-FTEs (a),Licensed Clinical Psychologists-Clinic Visits (b),Licensed Clinical Psychologists-Virtual Visits (b2),Licensed Clinical Social Workers-FTEs (a),Licensed Clinical Social Workers-Clinic Visits (b),Licensed Clinical Social Workers-Virtual Visits (b2),Other Licensed Mental Health Providers-FTEs (a),Other Licensed Mental Health Providers-Clinic Visits (b),Other Licensed Mental Health Providers-Virtual Visits (b2),Other Mental Health Personnel-FTEs (a),Other Mental Health Personnel-Clinic Visits (b),Other Mental Health Personnel-Virtual Visits (b2),Total Mental Health Services (Lines 20a-c)-FTEs (a),Total Mental Health Services (Lines 20a-c)-Clinic Visits (b),Total Mental Health Services (Lines 20a-c)-Virtual Visits (b2),Total Mental Health Services (Lines 20a-c)-Patients (c),Substance Use Disorder Services-FTEs (a),Substance Use Disorder Services-Clinic Visits (b),Substance Use Disorder Services-Virtual Visits (b2),Substance Use Disorder Services-Patients (c),Ophthalmologists-FTEs (a),Ophthalmologists-Clinic Visits (b),Ophthalmologists-Virtual Visits (b2),Optometrists-FTEs (a),Optometrists-Clinic Visits (b),Optometrists-Virtual Visits (b2),Other Vision Care Personnel-FTEs (a),Total Vision Services (Lines 22a–c)-FTEs (a),Total Vision Services (Lines 22a–c)-Clinic Visits (b),Total Vision Services (Lines 22a–c)-Virtual Visits (b2),Total Vision Services (Lines 22a–c)-Patients (c),Other Professional Services-Others,Other Professional Services-FTEs (a),Other Professional Services-Clinic Visits (b),Other Professional Services-Virtual Visits (b2),Other Professional Services-Patients (c),Pharmacy Personnel-FTEs (a),Case Managers-FTEs (a),Case Managers-Clinic Visits (b),Case Managers-Virtual Visits (b2),Patient and Community Education Specialists-FTEs (a),Patient and Community Education Specialists-Clinic Visits (b),Patient and Community Education Specialists-Virtual Visits (b2),Outreach Workers-FTEs (a),Transportation Personnel-FTEs (a),Eligibility Assistance Workers-FTEs (a),Interpretation Personnel-FTEs (a),Community Health Workers-FTEs (a),Other Enabling Services-Others,Other Enabling Services-FTEs (a),Total Enabling Services (Lines 24–28)-FTEs (a),Total Enabling Services (Lines 24–28)-Clinic Visits (b),Total Enabling Services (Lines 24–28)-Virtual Visits (b2),Total Enabling Services (Lines 24–28)-Patients (c),Other Programs and Services-Others,Other Programs and Services-FTEs (a),Quality Improvement Personnel-FTEs (a),Management and Support Personnel-FTEs (a),Fiscal and Billing Personnel-FTEs (a),IT Personnel-FTEs (a),Facility Personnel-FTEs (a),Patient Support Personnel-FTEs (a),Total Facility and Non-Clinical Support Personnel (Lines 30a-32)-FTEs (a),Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-FTEs (a),Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Clinic Visits (b),Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Virtual Visits (b2),Physicians (other than Psychiatrists)-Personnel (a1),Physicians (other than Psychiatrists)-Clinic Visits (b),Physicians (other than Psychiatrists)-Virtual Visits (b2),Physicians (other than Psychiatrists)-Patients (c),Nurse Practitioners-Personnel (a1),Nurse Practitioners-Clinic Visits (b).1,Nurse Practitioners-Virtual Visits (b2).1,Nurse Practitioners-Patients (c),Physician Assistants-Personnel (a1),Physician Assistants-Clinic Visits (b).1,Physician Assistants-Virtual Visits (b2).1,Physician Assistants-Patients (c),Certified Nurse Midwives-Personnel (a1),Certified Nurse Midwives-Clinic Visits (b).1,Certified Nurse Midwives-Virtual Visits (b2).1,Certified Nurse Midwives-Patients (c),Physicians (other than Psychiatrists)-Personnel (a1).1,Physicians (other than Psychiatrists)-Clinic Visits (b).1,Physicians (other than Psychiatrists)-Virtual Visits (b2).1,Physicians (other than Psychiatrists)-Patients (c).1,Nurse Practitioners (Medical)-Personnel (a1),Nurse Practitioners (Medical)-Clinic Visits (b),Nurse Practitioners (Medical)-Virtual Visits (b2),Nurse Practitioners (Medical)-Patients (c),Physician Assistants-Personnel (a1).1,Physician Assistants-Clinic Visits (b).2,Physician Assistants-Virtual Visits (b2).2,Physician Assistants-Patients (c).1,Certified Nurse Midwives-Personnel (a1).1,Certified Nurse Midwives-Clinic Visits (b).2,Certified Nurse Midwives-Virtual Visits (b2).2,Certified Nurse Midwives-Patients (c).1,Psychiatrists-Personnel (a1),Psychiatrists-Clinic Visits (b).1,Psychiatrists-Virtual Visits (b2).1,Psychiatrists-Patients (c),Licensed Clinical Psychologists-Personnel (a1),Licensed Clinical Psychologists-Clinic Visits (b).1,Licensed Clinical Psychologists-Virtual Visits (b2).1,Licensed Clinical Psychologists-Patients (c),Licensed Clinical Social Workers-Personnel (a1),Licensed Clinical Social Workers-Clinic Visits (b).1,Licensed Clinical Social Workers-Virtual Visits (b2).1,Licensed Clinical Social Workers-Patients (c),Other Licensed Mental Health Providers-Personnel (a1),Other Licensed Mental Health Providers-Clinic Visits (b).1,Other Licensed Mental Health Providers-Virtual Visits (b2).1,Other Licensed Mental Health Providers-Patients (c)
1,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---
2,010040,H80CS00443,0.92,1013,3,0,0,0,0.1,16,0,0,0,0,0,0,0,0,8,1,1.02,1037,4,0.2,40,-,0.51,185,-,0,1,-,0.71,226,-,1.36,-,-,-,-,-,3.09,1263,4,727,-,95,-,-,-,-,-,-,-,-,-,95,-,95,-,-,0,-,-,-,2.12,536,13,-,-,-,-,-,-,2.12,536,13,108,-,4,-,4,0,2,0,0,25,0,-,0,27,0,27,podiatrists,-,2,0,1,0.17,7.76,439,174,1.2,15,1,-,-,-,-,-,-,-,8.96,454,175,263,-,-,-,3.1,1.11,-,-,0.69,4.9,19.24,2381,192,17,51,-,38,1,4,0,3,2,8,-,7,-,-,-,-,5,9,-,9,1,2,-,1,1,2,-,1,-,-,-,-,-,-,-,-,-,-,-,-,2,6,10,7,-,-,-,-
3,010060,H80CS00741,4.84,7695,2700,0,0,0,4.48,7506,2557,0,0,0,22.35,31342,5168,0.8,890,321,32.47,47433,10746,12.22,17179,2519,2.66,5222,2338,3.65,6468,24,18.53,28869,4881,31.51,985,441,27.15,1,0,110.66,77287,16068,30818,1.85,3830,0,2.79,1762,0,0,0,0,2.79,7.43,5592,0,2288,0.66,615,303,3.01,1697,1550,4.82,2190,578,7.02,4678,3545,4.97,1255,757,20.48,10435,6733,2346,0,0,0,0,0,0,0,0.09,476,0,0,0.09,476,0,457,"Podiatrist, Registered Dietician",0.99,1503,35,861,0.41,6.86,1094,5057,0,0,0,0.31,0,0.34,1.95,0.8,-,0,10.26,1094,5057,3539,WIC,3.33,3.18,23.65,14.13,4.83,3.2,45.08,90.89,247.72,96387,27893,156,6214,1262,4458,18,1539,278,1186,6,278,227,379,6,93,0,88,77,1822,1014,1061,10,541,47,446,4,23,36,45,3,15,0,15,3,161,66,69,3,80,3,12,3,93,9,13,8,102,174,72


In [47]:
# create a dictionary with df names as keys and dfs as values
dataframes_dict = {'health_centers': health_centers, 
              'health_center_sites': health_center_sites, 
              'health_center_funding': health_center_funding, 
              'health_center_zipcodes': health_center_zipcodes,
              'health_center_costs': health_center_costs,
              'patient_services_revenue': patient_services_revenue, 
              'personnel_and_visits': personnel_and_visits, 
              'patients_age': patients_age, 
              'patients_race': patients_race, 
              'patients_other_demographics': patients_other_demographics}

In [48]:
def dataframe_summary(dataframe_dict, key):
    """
    A function that provides an overview of a dataframe's structure and columns.
    
    Parameters:
    - dataframes_dict: Dict[str, pd.DataFrame], a dictionary of DataFrames.
    - key: str, the key for the DataFrame to process.

    Returns:
    An overview of a dataframe's shape and column names."""
    # access the df with its key
    df = dataframe_dict[key]

    # print relevant information about the df
    print(f'The {key} dataframe has a shape of {df.shape}.')
    print()
    print(f'The {key} dataframe has the following columns and number of values: ')
    print(df.columns)
    print()
    print('-----------------------------------')

In [49]:
for key in dataframes_dict.keys():
    dataframe_summary(dataframes_dict, key)

The health_centers dataframe has a shape of (1370, 19).

The health_centers dataframe has the following columns and number of values: 
Index(['BHCMISID', 'GrantNumber', 'ReportingYear', 'HealthCenterName',
       'HealthCenterStreetAddress', 'HealthCenterOtherAddress',
       'HealthCenterCity', 'HealthCenterState', 'HealthCenterZIPCode',
       'ProjectDirector', 'ProjectDirectorPhone', 'ProjectDirectorPhoneExt',
       'ProjectDirectorFax', 'ProjectDirectorEmail', 'FundingCHC',
       'FundingMHC', 'FundingHO', 'FundingPH', 'UrbanRuralFlag'],
      dtype='object')

-----------------------------------
The health_center_sites dataframe has a shape of (14987, 24).

The health_center_sites dataframe has the following columns and number of values: 
Index(['BHCMISID', 'GrantNumber', 'HealthCenterName', 'SiteName', 'SiteType',
       'SiteStatus', 'LocationType', 'LocationSetting', 'OperationalSchedule',
       'CalendarSchedule', 'TotalWeeklyHoursOfOperation',
       'ServiceAreaPopulation

## Subsetting dataframes before further cleaning and EDA

There is a lot of useful data across these dataframes, however, I will not be able to keep all of these columns for my analysis and model development. Below I will be subsetting each dataframe to keep core components of each one. I'll provide justification as to why I am removing or keeping certain columns.

In general, I will be keeping the `BHCMISID` and the `GrantNumber` columns because this will allow me to know which records pertain to the unique health center / entity.

I will not make any changes at this point to the health_center_zipcodes df.

### Subsetting the `health_centers` df 

I'll remove the address columns and the information about the project director. Knowing the city, state, and zip codes might provide  interesting information for analysis, especially to group by specific geographical regions. The address of the orgnaizations will not provide useful information for the scope of this project. Similarly, knowing who the project director is will not benefit further analysis for this project. 

Health Centers can receive funding for special populations, as defined by HRSA, in addition to Health Center Program funding. `FundingCHC` represents the Health Center Program funding that is not for any specific population group. There are three types of special population funding:
1. Migrant Health Center Funding - represented by the `FundingMHC` column
2. Health Center funding for Homeless - represented by the `FundingHO` column
3. Public Housing Primary Care Funding - represented by the `FundingPH` column

Looking at how the various sub-funding types influence overall funding will be interesting for further analysis. Also, I will keep the `UrbanRuralFlag` column as their primary designation will offer interesting insights too.

In [50]:
# drop columns from health_centers df
health_centers_1 = health_centers.drop(columns=['ReportingYear', 'HealthCenterStreetAddress', 'HealthCenterOtherAddress', 
                                                'ProjectDirector', 'ProjectDirectorPhone', 'ProjectDirectorPhoneExt', 
                                                'ProjectDirectorFax', 'ProjectDirectorEmail'])

### Subsetting the `health_center_sites` df

Health Centers have to report information about their approved sites, which include service location sites and administrative sites. I  want to understand how things like the total number of sites per organization, operating hours, population type (urbarn/rural) for each site, and site geographical factors influence funding. All other columns will be removed for analysis.

In [51]:
# subset health_center_sites df
health_center_sites_1 = health_center_sites[['BHCMISID', 'GrantNumber', 'SiteName', 'SiteType', 'SiteStatus', 
                                             'LocationType','TotalWeeklyHoursOfOperation', 'ServiceAreaPopulation', 
                                             'SiteOperatedBy', 'SiteCity', 'SiteState', 'SiteZIPCode']]

### Subsetting the `health_center_funding` df

I want to know how much Health Center Program funding each entity receives, and the amounts they receive for special populations (MHC, HO, and PH funding). Other Bureau of Primary Health Care grant details will not be necessary, but I will preserve the column that sums up the total BPHC funding each entity receives `Total BPHC Grants`. Note that a lot of this funding was expanded due to the COVID pandemic and will likely not occur in future years. Most federal COVID funding will expire by the end of September 2024. 

It will be interesting to compare the CHC funding to each organization's total revenue as well as other government and non-government grant revenue. I'll preserve the appropriate columns. I will not need to know the specific sources of other revenue for the scope of this project, for example, which foundation an organization might receive grants from. 

Also, many of these columns have long names, I'll rename them for easier processing.

In [52]:
# subset health_center_funding df
health_center_funding_1 = health_center_funding[['BHCMISID', 
                                                 'GrantNumber', 
                                                 'Migrant Health Center-Amount (a)', 
                                                 'Community Health Center-Amount (a)', 
                                                 'Health Care for the Homeless-Amount (a)',
                                                 'Public Housing Primary Care-Amount (a)', 
                                                 'Total Health Center (Sum of Lines 1a through 1e)-Amount (a)',
                                                 'Total BPHC Grants (Sum of Lines 1g + 1k + 1q)-Amount (a)', 
                                                 'Other Federal Grants-Amount (a)',
                                                 'State Government Grants and Contracts-Amount (a)',
                                                 'Local Government Grants and Contracts-Amount (a)', 
                                                 'Foundation/Private Grants and Contracts-Amount (a)',
                                                 'Other Revenue (non-patient service revenue not reported elsewhere)-Amount (a)', 
                                                 'Total Revenue (Sum of Lines 1 + 5 + 9 + 10)-Amount (a)']]
                                                

In [53]:
# rename health_center_funding_1 columns
health_center_funding_1 = health_center_funding_1.rename(columns={
                                        'Migrant Health Center-Amount (a)':'mhc_amount', 
                                        'Community Health Center-Amount (a)':'chc_amount', 
                                        'Health Care for the Homeless-Amount (a)':'ho_amount',
                                        'Public Housing Primary Care-Amount (a)':'ph_amount', 
                                        'Total Health Center (Sum of Lines 1a through 1e)-Amount (a)':'total_hc_funding',
                                        'Total BPHC Grants (Sum of Lines 1g + 1k + 1q)-Amount (a)':'bphc_total', 
                                        'Other Federal Grants-Amount (a)':'total_other_federal_grants',
                                        'State Government Grants and Contracts-Amount (a)':'total_state_grants',
                                        'Local Government Grants and Contracts-Amount (a)': 'total_local_gov_grants', 
                                        'Foundation/Private Grants and Contracts-Amount (a)':'total_private_grants',
                                        'Other Revenue (non-patient service revenue not reported elsewhere)-Amount (a)':'total_other_revenue', 
                                        'Total Revenue (Sum of Lines 1 + 5 + 9 + 10)-Amount (a)':'total_revenue'})

### Subsetting the `health_center_costs` df

Similar to the personnel and visits dataframe, I will subset the costs of providing care by the four core service types: medical care, dental care, mental health care, and enabling services. I will also include the costs for quality improvement initiatives because I am curious if this type of programming has any impact on overall funding an entity receives. [HRSA requires](https://bphc.hrsa.gov/compliance/compliance-manual/chapter10) health center grantees to create an annual quality management and improvement plans that "support the quality assurance program and the provision of high quality patient care". 

I have chosen the columns that reflect the total cost including administrative and operational costs such as facilities and non-clinical staff to get the true cost of providing care in these categories. 

I'll rename these columns for readability.

In [54]:
health_center_costs_1 = health_center_costs[['BHCMISID',
                                             'GrantNumber',
                                             'Total Medical Care Services (Sum of Lines 1 through 3)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)',
                                             'Dental-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)',
                                             'Mental Health-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)',
                                             'Total Enabling Services (Sum of Lines 11a through 11h)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)',
                                             'Quality Improvement-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)',
                                             'Total Accrued Costs (Sum of Lines 4 + 10 + 13 + 16)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)']]

In [55]:
health_center_costs_1 = health_center_costs_1.rename(columns={
                                             'Total Medical Care Services (Sum of Lines 1 through 3)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_costs_medical',
                                             'Dental-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_costs_dental',
                                             'Mental Health-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_costs_mh',
                                             'Total Enabling Services (Sum of Lines 11a through 11h)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_costs_enabling',
                                             'Quality Improvement-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_costs_quality',
                                             'Total Accrued Costs (Sum of Lines 4 + 10 + 13 + 16)-Total Cost After Allocation of Facility and Non-Clinical Support Services (c)':'total_cost_care'})

### Subsetting the `personnel_and_visits` df
Although health centers provide a range of healthcare and support services, the most common that organizations provide are medical care, dental care, mental health care, and enabling services. Becuase of this, I will be excluding other types of specialty care from the personnel and visits dataframe. Examples of these services include substance use disorder care, psychiatry, vision, etc.

I will preserve total clinic and virtual visits which includes other visit types to get a full picture of how many clinic and virtual visits entities provided in 2022. 

Several columns from this table are also long, so I will rename them for readability.

In [56]:
# subset personnel_and_visits df
personnel_and_visits_1 = personnel_and_visits[['BHCMISID', 'GrantNumber', 
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Clinic Visits (b)',
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Virtual Visits (b2)',
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Patients (c)',
                                               'Total Dental Services (Lines 16–18)-Clinic Visits (b)',
                                               'Total Dental Services (Lines 16–18)-Virtual Visits (b2)',
                                               'Total Dental Services (Lines 16–18)-Patients (c)',
                                               'Total Mental Health Services (Lines 20a-c)-Clinic Visits (b)',
                                               'Total Mental Health Services (Lines 20a-c)-Virtual Visits (b2)',
                                               'Total Mental Health Services (Lines 20a-c)-Patients (c)',
                                               'Total Enabling Services (Lines 24–28)-Clinic Visits (b)',
                                               'Total Enabling Services (Lines 24–28)-Virtual Visits (b2)',
                                               'Total Enabling Services (Lines 24–28)-Patients (c)',
                                               'Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Clinic Visits (b)',
                                               'Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Virtual Visits (b2)']]

In [57]:
#rename personnel_and_visits_1 columns
personnel_and_visits_1 = personnel_and_visits_1.rename(columns={
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Clinic Visits (b)':'medical_clinic_visits',
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Virtual Visits (b2)':'medical_virtual_visits',
                                               'Total Medical Care Services (Lines 8 + 10a through 14)-Patients (c)':'medical_patients',
                                               'Total Dental Services (Lines 16–18)-Clinic Visits (b)':'dental_clinic_visits',
                                               'Total Dental Services (Lines 16–18)-Virtual Visits (b2)':'dental_virtual_visits',
                                               'Total Dental Services (Lines 16–18)-Patients (c)':'dental_patients',
                                               'Total Mental Health Services (Lines 20a-c)-Clinic Visits (b)':'mh_clinic_visits',
                                               'Total Mental Health Services (Lines 20a-c)-Virtual Visits (b2)':'mh_virtual_visits',
                                               'Total Mental Health Services (Lines 20a-c)-Patients (c)':'mh_patients',
                                               'Total Enabling Services (Lines 24–28)-Clinic Visits (b)':'enabling_clinic_visits',
                                               'Total Enabling Services (Lines 24–28)-Virtual Visits (b2)':'enabling_virtual_visits',
                                               'Total Enabling Services (Lines 24–28)-Patients (c)':'enabling_patients',
                                               'Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Clinic Visits (b)':'total_clinic_visits',
                                               'Grand Total (Lines 15+19+20+21+22+22d+23+29+29a+29b+33)-Virtual Visits (b2)':'total_virtual_visits'})

### Subsetting the `patient_services_revenue` df

I won't be using most of the columns from this dataframe becauase they have to do with Medicaid (MA) and Medicare (MC) reimbursement rates. MA and MC reimbursement rates can vary drastically across different geographical regions, primarily due to the volume of services delivered. According to [Dartmouth's Atlas Project](https://www.dartmouthatlas.org/interactive-apps/medicare-reimbursements/), among over 300 hospital referral regions in the U.S., price-adjusted MC reimbursements varied from 8,273 USD to 14,351 USD in 2019.

I'll preserve the total charges for each entity in 2022 and the total amount each entity collected. I will also keep the sliding fee scale discounts. SFS programs are a core operational compoenent of FQHCs as it ensures that services are provied based on patient income. The main purpose of the health center program funding is to offset losses experienced by entities for having an SFS program.

In [58]:
patient_services_revenue_1 = patient_services_revenue[['BHCMISID', 
                                                       'GrantNumber', 
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Full Charges This Period (a)',
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Amount Collected This Period (b)',
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Sliding Fee Discounts (e)']]
                                                       

In [59]:
patient_services_revenue_1 = patient_services_revenue_1.rename(columns={ 
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Full Charges This Period (a)':'total_charges',
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Amount Collected This Period (b)':'total_collected',
                                                       'TOTAL (Sum of Lines 3 + 6 + 9 + 12 + 13)-Sliding Fee Discounts (e)':'total_sfs'})

### Subsetting the `patients_age` df

I only want to keep the total male and total female patients to combine these totals into a new column to capture a new column for total patients per entity. 

In [60]:
patients_age_1 = patients_age[['BHCMISID', 'GrantNumber', 'Total Patients (Sum of Lines 1-38)-Male Patients (a)',
                               'Total Patients (Sum of Lines 1-38)-Female Patients (b)']]

In [61]:
patients_age_1 = patients_age_1.rename(columns={'Total Patients (Sum of Lines 1-38)-Male Patients (a)':'total_male',
                               'Total Patients (Sum of Lines 1-38)-Female Patients (b)':'total_female'})

### Subsetting the `patients_other_demographics` df

After reviewing the columns for this df, I realize I don't need to use the `patients_age` df to capture total patients as that will be captured in the `patients_other_demographics` df. 

Of the `patients_other_demographics` df, I will keep details about the number of patients within various federal poverty level categories. I will also keep details about general insurance coverage (MA, MC, Other Public, Private) and special populations served. 

In [62]:
patients_other_demographics_1 = patients_other_demographics[['BHCMISID', 
                                                             'GrantNumber', 
                                                             '100% and below-Number of Patients (a)',
                                                             '101–150%-Number of Patients (a)', 
                                                             '151–200%-Number of Patients (a)',
                                                             'Over 200%-Number of Patients (a)', 
                                                             'Unknown-Number of Patients (a)',
                                                             'TOTAL (Sum of Lines 1–5)-Number of Patients (a)', 
                                                             'None/Uninsured-0-17 years old (a)', 
                                                             'None/Uninsured-18 and older (b)', 
                                                             'Total Medicaid (Line 8a + 8b)-0-17 years old (a)',
                                                             'Total Medicaid (Line 8a + 8b)-18 and older (b)',
                                                             'Medicare (Inclusive of dually eligible and other Title XVIII beneficiaries)-18 and older (b)',
                                                             'Total Public Insurance (Line 10a + 10b)-0-17 years old (a)',
                                                             'Total Public Insurance (Line 10a + 10b)-18 and older (b)',
                                                             'Private Insurance-0-17 years old (a)',
                                                             'Private Insurance-18 and older (b)',
                                                             'TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-0-17 years old (a)',
                                                             'TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-18 and older (b)',
                                                             'Total Agricultural Workers or Dependents (All health centers report this line)-Number of Patients (a)',
                                                             'Total Homeless (All health centers report this line)-Number of Patients (a)',
                                                             'Total School-Based Service Site Patients (All health centers report this line)-Number of Patients (a)',
                                                             'Total Veterans (All health centers report this line)-Number of Patients (a)',
                                                             'Total Patients Served at a Health Center Located In or Immediately Accessible to a Public Housing Site (All health centers report this line)-Number of Patients (a)']]

In [63]:
patients_other_demographics_1 = patients_other_demographics_1.rename(columns={'100% and below-Number of Patients (a)':'fpl_100_below',
                                                                              '101–150%-Number of Patients (a)':'fpl_101_to_150',
                                                                              '151–200%-Number of Patients (a)':'fpl_151_to_200',
                                                                              'Over 200%-Number of Patients (a)':'fpl_over_200', 
                                                                              'Unknown-Number of Patients (a)':'fpl_unknown',
                                                                              'TOTAL (Sum of Lines 1–5)-Number of Patients (a)':'total_patients', 
                                                                              'None/Uninsured-0-17 years old (a)':'uninsured_0_17', 
                                                                              'None/Uninsured-18 and older (b)':'uninsured_18_up', 
                                                                              'Total Medicaid (Line 8a + 8b)-0-17 years old (a)':'medicaid_0_17',
                                                                              'Total Medicaid (Line 8a + 8b)-18 and older (b)':'medicaid_18_up',
                                                                              'Medicare (Inclusive of dually eligible and other Title XVIII beneficiaries)-18 and older (b)':'medicare_18_up',
                                                                              'Total Public Insurance (Line 10a + 10b)-0-17 years old (a)':'public_insurance_0_17',
                                                                              'Total Public Insurance (Line 10a + 10b)-18 and older (b)':'public_insurance_18_up',
                                                                              'Private Insurance-0-17 years old (a)':'private_insurance_0_17',
                                                                              'Private Insurance-18 and older (b)':'private_insurance_18_up',
                                                                              'TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-0-17 years old (a)':'total_0_17',
                                                                              'TOTAL (Sum of Lines 7 + 8 + 9 +10 +11)-18 and older (b)':'total_18_up',
                                                                              'Total Agricultural Workers or Dependents (All health centers report this line)-Number of Patients (a)':'migrant_patients',
                                                                              'Total Homeless (All health centers report this line)-Number of Patients (a)':'homeless_patients',
                                                                              'Total School-Based Service Site Patients (All health centers report this line)-Number of Patients (a)':'school_based_patients',
                                                                              'Total Veterans (All health centers report this line)-Number of Patients (a)':'veteran_patients',
                                                                              'Total Patients Served at a Health Center Located In or Immediately Accessible to a Public Housing Site (All health centers report this line)-Number of Patients (a)':'public_housing_patients'})

### Subsetting the `paitents_race` df

FQHCs aim to provide care to underserved communities in the United States. Due to systemic racism and discriminations, individuals of color are less likely to have access to quality health care due to various social determinants of health (transportation, financial barriers, language barriers, etc.). Understanding the racial and ethnic distribution of patients across entities may be an important factor in the level of funding they receive through the Health Center Program. 

I'll simplify this table by only keeping the race-alone categories (ex: Asian Non-Hispanic individuals) and all Hispanic patients regardless of race. I will also keep the patients who face language barriers to see if this factor has a significant effect. 

In [64]:
patients_race_1 = patients_race[['BHCMISID', 
                                 'GrantNumber', 
                                 'Asian-Non-Hispanic or Latino/a (b)',
                                 'Native Hawaiian-Non-Hispanic or Latino/a (b)',
                                 'Other Pacific Islander-Non-Hispanic or Latino/a (b)',
                                 'Black/African American-Non-Hispanic or Latino/a (b)',
                                 'American Indian/Alaska Native-Non-Hispanic or Latino/a (b)',
                                 'White-Non-Hispanic or Latino/a (b)',
                                 'Unreported/Chose Not to Disclose race-Unreported/Chose Not to Disclose Ethnicity (c)',
                                 'Total Patients (Sum of Lines 1 + 2 + 3 to 7)-Hispanic or Latino/a (a)',
                                 'Total Patients (Sum of Lines 1 + 2 + 3 to 7)-Total (d) (Sum Columns a+b+c)',
                                 'Patients Best Served in a Language Other than English-Number (a)']]

In [65]:
patients_race_1 = patients_race_1.rename(columns={ 
                                 'Asian-Non-Hispanic or Latino/a (b)':'asian',
                                 'Native Hawaiian-Non-Hispanic or Latino/a (b)':'native_hawaiian',
                                 'Other Pacific Islander-Non-Hispanic or Latino/a (b)':'pac_isl',
                                 'Black/African American-Non-Hispanic or Latino/a (b)':'black',
                                 'American Indian/Alaska Native-Non-Hispanic or Latino/a (b)':'am_ind',
                                 'White-Non-Hispanic or Latino/a (b)':'white',
                                 'Unreported/Chose Not to Disclose race-Unreported/Chose Not to Disclose Ethnicity (c)':'unreported',
                                 'Total Patients (Sum of Lines 1 + 2 + 3 to 7)-Hispanic or Latino/a (a)':'hispanic_any_race',
                                 'Total Patients (Sum of Lines 1 + 2 + 3 to 7)-Total (d) (Sum Columns a+b+c)':'total_patients',
                                 'Patients Best Served in a Language Other than English-Number (a)':'lep_patients'})

# Handling Missing Values

I mentioned earlier that there are three main types of missing values in this dataset. However, running `.isnull().sum()` won't be informative because of how they are represented.

**Missing Values Representation**
1. "-" represents no data entry by health center
2. "--" represents suppressed patient counts between 1-15 to protect patient privacy
3. "---" represents suppressed health center confidential data

I'll create a function to take a closer look at the impact of each of these null types in my dataframe columns. 

In [66]:
new_df_dict = {'health_centers': health_centers_1, 
              'health_center_sites': health_center_sites_1, 
              'health_center_funding': health_center_funding_1, 
              'health_center_zipcodes': health_center_zipcodes,
              'health_center_costs': health_center_costs_1,
              'patient_services_revenue': patient_services_revenue_1, 
              'personnel_and_visits': personnel_and_visits_1, 
              'patients_age': patients_age_1,
              'patients_race': patients_race_1, 
              'patients_other_demographics': patients_other_demographics_1}

In [67]:
def find_missing_values(df_dictionary, key, missing_variable):

    '''Find the counts for an unusual missing variable type that is not a NaN data type.
    
    Arguments:
    1. df_dictionary: A dictionary of dataframes where each key is the df name and value is the df
    2. key: A key that represents the dataframe names
    3. missing_variable: A string of the missing variable to count
    
    Returns:
    Creates a dataframe of the missing_variable counts and the % of the total it represents for each column in 
    each dataframe in the dictionary.
    '''

    # establish the df from the df_dict and key
    df = df_dictionary[key]
   
    missing_values = {}

    for col in df.columns:
        count = (df[col] == missing_variable).sum()
        if count > 0:
            missing_values[col] = count
    
    if not missing_values:
        print(f'The {key} dataframe does not contain any instances of "{missing_variable}".')
        print()

    else:
        missing_values_df = pd.DataFrame.from_dict(missing_values, orient='index').reset_index()
        missing_values_df.columns = ['Column Name', 'Missing Value Count']
        missing_values_df['% of Total'] = (missing_values_df['Missing Value Count'] / len(df)) * 100
        missing_values['% of Total'] = missing_values_df['% of Total'].round(4)
        missing_values_df = missing_values_df.sort_values(by='Missing Value Count', ascending=False)
    
        print(f'The {key} dataframe has the following missing value counts for "{missing_variable}": ')
        print()
        
        print(missing_values_df)
        print()
    

### Use the `find_missing_values()` function to analyze the presence of the '-' null type

In [68]:
for key in new_df_dict.keys():
    find_missing_values(new_df_dict, key, '-')

The health_centers dataframe has the following missing value counts for "-": 

           Column Name  Missing Value Count  % of Total
0  HealthCenterZIPCode                    1    0.072993

The health_center_sites dataframe has the following missing value counts for "-": 

                   Column Name  Missing Value Count  % of Total
4                  SiteZIPCode                 1400    9.341429
1  TotalWeeklyHoursOfOperation                   17    0.113432
2                     SiteCity                    3    0.020017
0                     SiteName                    2    0.013345
3                    SiteState                    2    0.013345

The health_center_funding dataframe has the following missing value counts for "-": 

                  Column Name  Missing Value Count  % of Total
3                   ph_amount                  710   51.824818
0                  mhc_amount                  648   47.299270
2                   ho_amount                  603   44.014599
4

In [69]:
for key in new_df_dict.keys():
    find_missing_values(new_df_dict, key, '--')

The health_centers dataframe does not contain any instances of "--".

The health_center_sites dataframe does not contain any instances of "--".

The health_center_funding dataframe does not contain any instances of "--".

The health_center_zipcodes dataframe has the following missing value counts for "--": 

                         Column Name  Missing Value Count  % of Total
2                   MedicarePatients                50594   52.082518
0             None_UninsuredPatients                50399   51.881781
3                    PrivatePatients                47475   48.871755
1  Medicaid_CHIP_OtherPublicPatients                37127   38.219308
4              TotalNumberofPatients                19381   19.951205

The health_center_costs dataframe does not contain any instances of "--".

The patient_services_revenue dataframe does not contain any instances of "--".

The personnel_and_visits dataframe does not contain any instances of "--".

The patients_age dataframe does not co

In [70]:
for key in new_df_dict.keys():
    find_missing_values(new_df_dict, key, '---')

The health_centers dataframe does not contain any instances of "---".

The health_center_sites dataframe does not contain any instances of "---".

The health_center_funding dataframe does not contain any instances of "---".

The health_center_zipcodes dataframe does not contain any instances of "---".

The health_center_costs dataframe has the following missing value counts for "---": 

            Column Name  Missing Value Count  % of Total
0              BHCMISID                  629   45.912409
1           GrantNumber                  629   45.912409
2   total_costs_medical                  629   45.912409
3    total_costs_dental                  629   45.912409
4        total_costs_mh                  629   45.912409
5  total_costs_enabling                  629   45.912409
6   total_costs_quality                  629   45.912409
7       total_cost_care                  629   45.912409

The patient_services_revenue dataframe has the following missing value counts for "---": 

     

In [71]:
def show_missing_rows(df, col, missing_variable):
    df = df[df[col] == missing_variable]
    print(df.head(10))

In [72]:
show_missing_rows(patient_services_revenue_1, 'total_sfs', '-')

      BHCMISID GrantNumber total_charges total_collected total_sfs
552   04E01171  H80CS28959        317293          113492         -
994     080620  H80CS00040      13494171        14222485         -
1284    100800  H80CS01127      16296131        16851169         -


# Fill Missing Values


In [73]:
def fill_values(df, value_to_fill, new_value):
    """
    Replace specified values in a DataFrame with a new value.
    
    Parameters:
    df (pd.DataFrame): The DataFrame in which to replace values.
    value_to_fill: The value to replace with.
    new_value: The new value to impute.
    
    Returns:
    pd.DataFrame: The modified DataFrame with values replaced.
    """
    
    df = df.replace(to_replace=value_to_fill, value=new_value)
    return df

In [74]:
def drop_missing_rows(df):
    """
    Drops rows with missing data from a dataframe."""
    
    df = df.dropna(axis=0, how='any')
    return df

Since the "-" represents no entry by the health center, I will fill these values with 0 before doing further analysis.

In [75]:
# replace all instances of '-' with 0 for all dataframes.
updated_df_dict = {}

for key, df in new_df_dict.items():
    updated_df_dict[key] = fill_values(df, '-', 0)

In [76]:
for key in new_df_dict.keys():
    find_missing_values(updated_df_dict, key, '-')

The health_centers dataframe does not contain any instances of "-".

The health_center_sites dataframe does not contain any instances of "-".

The health_center_funding dataframe does not contain any instances of "-".

The health_center_zipcodes dataframe does not contain any instances of "-".

The health_center_costs dataframe does not contain any instances of "-".

The patient_services_revenue dataframe does not contain any instances of "-".

The personnel_and_visits dataframe does not contain any instances of "-".

The patients_age dataframe does not contain any instances of "-".

The patients_race dataframe does not contain any instances of "-".

The patients_other_demographics dataframe does not contain any instances of "-".



The "---" represents data that has been marked as confidential by health centers. I'll replace these with null values and then drop the rows. I'll drop these rows because there is no way to estimate what numbers should be there since nearly 50% of the data is missing from 3 of the tables.

During my analysis, it may be interesting to observe what these health centers have in common and other factors like geographic distribution and size, compared to health centers that report all of their data.

In [77]:
df_dict_confidential_null = {}

for key, df in updated_df_dict.items():
    df_dict_confidential_null[key] = fill_values(df, '---', pd.NA)

In [78]:
df_dict_dropped_rows = {}

for key, df in df_dict_confidential_null.items():
    df_dict_dropped_rows[key] = drop_missing_rows(df)

In [79]:
for key in df_dict_dropped_rows.keys():
    find_missing_values(df_dict_dropped_rows, key, '---')

The health_centers dataframe does not contain any instances of "---".

The health_center_sites dataframe does not contain any instances of "---".

The health_center_funding dataframe does not contain any instances of "---".

The health_center_zipcodes dataframe does not contain any instances of "---".

The health_center_costs dataframe does not contain any instances of "---".

The patient_services_revenue dataframe does not contain any instances of "---".

The personnel_and_visits dataframe does not contain any instances of "---".

The patients_age dataframe does not contain any instances of "---".

The patients_race dataframe does not contain any instances of "---".

The patients_other_demographics dataframe does not contain any instances of "---".



The "--" represents numbers between 1 and 15 that have been supressed to protect patient privacy. Later during my analysis, I will attempt to fill these types of null values using methods such as KNN and/or MICE. For now, I will keep these tables separate from the tables with null values from the "---". 

In [80]:
# replace all instances of '--' with null values for all dataframes.
updated_df_dict_null = {}

for key, df in df_dict_dropped_rows.items():
    updated_df_dict_null[key] = fill_values(df, '--', np.nan)

In [81]:
for key in updated_df_dict_null.keys():
    find_missing_values(updated_df_dict_null, key, '--')

The health_centers dataframe does not contain any instances of "--".

The health_center_sites dataframe does not contain any instances of "--".

The health_center_funding dataframe does not contain any instances of "--".

The health_center_zipcodes dataframe does not contain any instances of "--".

The health_center_costs dataframe does not contain any instances of "--".

The patient_services_revenue dataframe does not contain any instances of "--".

The personnel_and_visits dataframe does not contain any instances of "--".

The patients_age dataframe does not contain any instances of "--".

The patients_race dataframe does not contain any instances of "--".

The patients_other_demographics dataframe does not contain any instances of "--".



In [82]:
df_dict_clean = updated_df_dict_null

I now want to create three main tables that combine several of the earlier tables. 

I will create one table for all information relating to patient demographics by merging the `patients_age`, `patients_race`, and `patients_other_demographics` tables. 

I'll create a table for health center information that includes the `patients_df` and `health_center_funding`.

As I mentioned earlier, I will keep the three tables that had many "---" values separate for now. These will be part of the `health_center_ops_finance` table. 

In [83]:
patients_df = df_dict_clean['patients_age'].merge(df_dict_clean['patients_race'], 
                                                  on=['BHCMISID', 'GrantNumber'], how='outer') \
                                            .merge(df_dict_clean['patients_other_demographics'], 
                                                   on=['BHCMISID', 'GrantNumber'], how='outer')

In [84]:
health_center_combined = df_dict_clean['health_centers'].merge(df_dict_clean['health_center_funding'], 
                                                         on=['BHCMISID', 'GrantNumber'], how='outer') \
                                                        .merge(patients_df,
                                                         on=['BHCMISID', 'GrantNumber'], how='outer')

In [85]:
health_center_ops_finance = df_dict_clean['health_centers'].merge(df_dict_clean['health_center_costs'], 
                                                         on=['BHCMISID', 'GrantNumber'], how='outer') \
                                                        .merge(df_dict_clean['patient_services_revenue'], 
                                                         on=['BHCMISID', 'GrantNumber'], how='outer') \
                                                        .merge(df_dict_clean['personnel_and_visits'], 
                                                         on=['BHCMISID', 'GrantNumber'], how='outer')

In [86]:
health_center_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   BHCMISID                    1370 non-null   object
 1   GrantNumber                 1370 non-null   object
 2   HealthCenterName            1370 non-null   object
 3   HealthCenterCity            1370 non-null   object
 4   HealthCenterState           1370 non-null   object
 5   HealthCenterZIPCode         1370 non-null   object
 6   FundingCHC                  1370 non-null   bool  
 7   FundingMHC                  1370 non-null   bool  
 8   FundingHO                   1370 non-null   bool  
 9   FundingPH                   1370 non-null   bool  
 10  UrbanRuralFlag              1370 non-null   object
 11  mhc_amount                  1370 non-null   object
 12  chc_amount                  1370 non-null   object
 13  ho_amount                   1370 non-null   obje

In [87]:
service_area_df = df_dict_clean['health_center_zipcodes']
service_sites_df = df_dict_clean['health_center_sites']

Nearly all of the columns from each dataframe need to be converted to numeric types before analysis. Luckily, these all occur after the first few columns of object or boolean data. I'll write a function to convert the columns for each dataset. 

In [88]:
def convert_columns_to_numeric(df, start_col, stop_col):
    """
    Converts columns in a specified range to numeric dtypes.

    Parameters:
    df (pd.DataFrame): The DataFrame in which to convert columns.
    start_col: The index for the column to start converting data types.
    stop_col: The index for the colum to stop converting data types (inclusive).
    
    """
    columns_to_convert = df.columns[start_col : stop_col + 1] # adjust column range for 0-based indexing
    for col in columns_to_convert:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
    return df


In [89]:
convert_columns_to_numeric(health_center_combined, 11, 55)

Unnamed: 0,BHCMISID,GrantNumber,HealthCenterName,HealthCenterCity,HealthCenterState,HealthCenterZIPCode,FundingCHC,FundingMHC,FundingHO,FundingPH,UrbanRuralFlag,mhc_amount,chc_amount,ho_amount,ph_amount,total_hc_funding,bphc_total,total_other_federal_grants,total_state_grants,total_local_gov_grants,total_private_grants,total_other_revenue,total_revenue,total_male,total_female,asian,native_hawaiian,pac_isl,black,am_ind,white,unreported,hispanic_any_race,total_patients_x,lep_patients,fpl_100_below,fpl_101_to_150,fpl_151_to_200,fpl_over_200,fpl_unknown,total_patients_y,uninsured_0_17,uninsured_18_up,medicaid_0_17,medicaid_18_up,medicare_18_up,public_insurance_0_17,public_insurance_18_up,private_insurance_0_17,private_insurance_18_up,total_0_17,total_18_up,migrant_patients,homeless_patients,school_based_patients,veteran_patients,public_housing_patients
0,010030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Holyoke,MA,01040,True,False,False,False,Urban,0,5721128,0,0,5721128,7584461,568943.0,2915656.0,0.0,764680.0,14258919.0,28330029,8821,10323,148.0,17.0,,432.0,47.0,3035.0,2228.0,12631.0,19144,8371.0,2706.0,184.0,103.0,168.0,15983.0,19144,104.0,609.0,4882.0,8638,3320.0,,,166.0,1410.0,5157.0,13987,45.0,2998.0,0.0,332.0,19144
1,010040,H80CS00443,MAINE MOBILE HEALTH PROGRAM INC.,Augusta,ME,04330,False,True,False,False,Rural,1758567,0,0,0,1758567,2258650,0.0,0.0,0.0,25000.0,36578.0,2320228,609,241,0.0,0.0,,332.0,,,22.0,471.0,850,654.0,768.0,68.0,,,0.0,850,35.0,734.0,24.0,36,,0.0,0.0,,,64.0,786,769.0,,0.0,,0
2,010060,H80CS00741,"FAIR HAVEN COMMUNITY HEALTH CLINIC, INC.",New Haven,CT,06513,True,False,False,False,Urban,0,4569124,0,0,4569124,6571573,44836.0,511690.0,258065.0,1189046.0,270649.0,9064640,14150,17768,810.0,,23.0,7036.0,34.0,2601.0,729.0,19204.0,31918,12686.0,13138.0,3958.0,1685.0,1486.0,11651.0,31918,1329.0,3138.0,16495.0,5818,1686.0,0.0,0.0,1543.0,1908.0,19368.0,12550,,208.0,1937.0,24.0,29342
3,010070,H80CS00312,CORNELL SCOTT HILL HEALTH CORPORATION,New Haven,CT,06519,True,False,True,True,Urban,0,4553563,1041417,565694,6160674,9587880,1430533.0,2179279.0,2156460.0,104831.0,1753132.0,17483690,21230,31204,1504.0,,32.0,17046.0,116.0,11286.0,594.0,19860.0,52434,12197.0,22703.0,3842.0,1816.0,2160.0,21913.0,52434,809.0,4629.0,6433.0,26163,3210.0,0.0,0.0,882.0,10306.0,8126.0,44308,16.0,2206.0,2125.0,196.0,14603
4,010120,H80CS00001,"CITY OF SPRINGFIELD, MASSACHUSETTS",Springfield,MA,01105,False,False,True,False,Urban,0,0,1842241,0,1842241,2042241,0.0,0.0,0.0,0.0,0.0,2050671,1350,705,,,0.0,440.0,31.0,636.0,38.0,857.0,2055,461.0,1291.0,63.0,29.0,47.0,625.0,2055,0.0,331.0,36.0,1321,295.0,0.0,0.0,0.0,72.0,36.0,2019,85.0,1747.0,0.0,57.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365,10E01196,H80CS28997,GIRDWOOD HEALTH CLINIC,Girdwood,AK,99587,True,False,False,False,Rural,0,1218153,0,0,1218153,2106125,0.0,42426.0,0.0,175000.0,277171.0,2600722,544,636,25.0,,,,,967.0,60.0,55.0,1180,,186.0,95.0,339.0,202.0,358.0,1180,,92.0,57.0,165,233.0,0.0,0.0,117.0,509.0,181.0,999,0.0,,0.0,62.0,0
1366,10E01202,H80CS29023,WINDING WATERS MEDICAL CLINIC,Enterprise,OR,97828,True,False,False,False,Rural,0,1027595,0,0,1027595,2038889,0.0,319998.0,325503.0,171716.0,144679.0,3000785,2561,2908,18.0,0.0,,,,4567.0,629.0,145.0,5469,40.0,1064.0,239.0,202.0,3458.0,506.0,5469,205.0,594.0,588.0,976,1431.0,0.0,,260.0,1413.0,1053.0,4416,,,0.0,287.0,0
1367,10E01221,H80CS29030,MATTAWA COMMUNITY MEDICAL CLINIC,Mattawa,WA,99349,True,False,False,False,Rural,0,1087491,0,0,1087491,1206346,0.0,79262.0,0.0,0.0,505731.0,1791339,1885,1754,,0.0,,0.0,18.0,728.0,,2883.0,3639,2187.0,693.0,602.0,369.0,375.0,1600.0,3639,72.0,911.0,991.0,467,264.0,,,111.0,805.0,1189.0,2450,1435.0,,0.0,,0
1368,11E01249,H80CS30720,CHUUK STATE DEPARTMENT OF HEALTH SERVICES,Chuuk,FM,96942,True,False,False,False,Rural,0,792000,0,0,792000,1781062,0.0,0.0,0.0,0.0,0.0,1781062,1897,2667,,0.0,4552.0,0.0,0.0,0.0,0.0,0.0,4564,4564.0,2234.0,,,0.0,2323.0,4564,1533.0,2233.0,0.0,0,0.0,131.0,667.0,0.0,0.0,1664.0,2900,0.0,0.0,0.0,0.0,0


In [90]:
health_center_combined.dtypes

BHCMISID                       object
GrantNumber                    object
HealthCenterName               object
HealthCenterCity               object
HealthCenterState              object
HealthCenterZIPCode            object
FundingCHC                       bool
FundingMHC                       bool
FundingHO                        bool
FundingPH                        bool
UrbanRuralFlag                 object
mhc_amount                      int64
chc_amount                      int64
ho_amount                       int64
ph_amount                       int64
total_hc_funding                int64
bphc_total                      int64
total_other_federal_grants    float64
total_state_grants            float64
total_local_gov_grants        float64
total_private_grants          float64
total_other_revenue           float64
total_revenue                   int64
total_male                      int64
total_female                    int64
asian                         float64
native_hawai

In [91]:
health_center_ops_finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   BHCMISID                 1370 non-null   object
 1   GrantNumber              1370 non-null   object
 2   HealthCenterName         1370 non-null   object
 3   HealthCenterCity         1370 non-null   object
 4   HealthCenterState        1370 non-null   object
 5   HealthCenterZIPCode      1370 non-null   object
 6   FundingCHC               1370 non-null   bool  
 7   FundingMHC               1370 non-null   bool  
 8   FundingHO                1370 non-null   bool  
 9   FundingPH                1370 non-null   bool  
 10  UrbanRuralFlag           1370 non-null   object
 11  total_costs_medical      741 non-null    object
 12  total_costs_dental       741 non-null    object
 13  total_costs_mh           741 non-null    object
 14  total_costs_enabling     741 non-null   

In [92]:
convert_columns_to_numeric(health_center_ops_finance, 11, 33)

Unnamed: 0,BHCMISID,GrantNumber,HealthCenterName,HealthCenterCity,HealthCenterState,HealthCenterZIPCode,FundingCHC,FundingMHC,FundingHO,FundingPH,UrbanRuralFlag,total_costs_medical,total_costs_dental,total_costs_mh,total_costs_enabling,total_costs_quality,total_cost_care,total_charges,total_collected,total_sfs,medical_clinic_visits,medical_virtual_visits,medical_patients,dental_clinic_visits,dental_virtual_visits,dental_patients,mh_clinic_visits,mh_virtual_visits,mh_patients,enabling_clinic_visits,enabling_virtual_visits,enabling_patients,total_clinic_visits,total_virtual_visits
0,010030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Holyoke,MA,01040,True,False,False,False,Urban,,,,,,,,,,,,,,,,,,,,,,,
1,010040,H80CS00443,MAINE MOBILE HEALTH PROGRAM INC.,Augusta,ME,04330,False,True,False,False,Rural,774931.0,26613.0,313483.0,1024600.0,0.0,2183264.0,636869.0,2724.0,617158.0,1263.0,4.0,727.0,95.0,0.0,95.0,536.0,13.0,108.0,454.0,175.0,263.0,2381.0,192.0
2,010060,H80CS00741,"FAIR HAVEN COMMUNITY HEALTH CLINIC, INC.",New Haven,CT,06513,True,False,False,False,Urban,21619441.0,1391991.0,3879153.0,2212470.0,516537.0,33153465.0,34615790.0,24985087.0,1716660.0,77287.0,16068.0,30818.0,5592.0,0.0,2288.0,10435.0,6733.0,2346.0,1094.0,5057.0,3539.0,96387.0,27893.0
3,010070,H80CS00312,CORNELL SCOTT HILL HEALTH CORPORATION,New Haven,CT,06519,True,False,True,True,Urban,,,,,,,,,,,,,,,,,,,,,,,
4,010120,H80CS00001,"CITY OF SPRINGFIELD, MASSACHUSETTS",Springfield,MA,01105,False,False,True,False,Urban,1515135.0,413126.0,245920.0,369171.0,0.0,2684614.0,1307876.0,726972.0,8876.0,3408.0,1229.0,809.0,759.0,0.0,330.0,171.0,440.0,105.0,2856.0,244.0,1140.0,7612.0,1926.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365,10E01196,H80CS28997,GIRDWOOD HEALTH CLINIC,Girdwood,AK,99587,True,False,False,False,Rural,,,,,,,,,,,,,,,,,,,,,,,
1366,10E01202,H80CS29023,WINDING WATERS MEDICAL CLINIC,Enterprise,OR,97828,True,False,False,False,Rural,,,,,,,,,,,,,,,,,,,,,,,
1367,10E01221,H80CS29030,MATTAWA COMMUNITY MEDICAL CLINIC,Mattawa,WA,99349,True,False,False,False,Rural,1186873.0,719898.0,542918.0,154661.0,56199.0,3593396.0,,,,8280.0,78.0,3135.0,2418.0,0.0,932.0,690.0,49.0,147.0,0.0,0.0,0.0,11388.0,127.0
1368,11E01249,H80CS30720,CHUUK STATE DEPARTMENT OF HEALTH SERVICES,Chuuk,FM,96942,True,False,False,False,Rural,1286326.0,59363.0,17269.0,209655.0,27276.0,1623659.0,7314.0,3158.0,0.0,5392.0,0.0,4056.0,677.0,0.0,479.0,198.0,0.0,155.0,0.0,0.0,0.0,6267.0,0.0


In [93]:
health_center_ops_finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BHCMISID                 1370 non-null   object 
 1   GrantNumber              1370 non-null   object 
 2   HealthCenterName         1370 non-null   object 
 3   HealthCenterCity         1370 non-null   object 
 4   HealthCenterState        1370 non-null   object 
 5   HealthCenterZIPCode      1370 non-null   object 
 6   FundingCHC               1370 non-null   bool   
 7   FundingMHC               1370 non-null   bool   
 8   FundingHO                1370 non-null   bool   
 9   FundingPH                1370 non-null   bool   
 10  UrbanRuralFlag           1370 non-null   object 
 11  total_costs_medical      741 non-null    float64
 12  total_costs_dental       741 non-null    float64
 13  total_costs_mh           741 non-null    float64
 14  total_costs_enabling    

In [94]:
service_area_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97142 entries, 0 to 97141
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   BHCMISID                           97142 non-null  object
 1   GrantNumber                        97142 non-null  object
 2   ReportingYear                      97142 non-null  int64 
 3   ZipCode                            97142 non-null  object
 4   ZipCodeType                        97142 non-null  object
 5   None_UninsuredPatients             46743 non-null  object
 6   Medicaid_CHIP_OtherPublicPatients  60015 non-null  object
 7   MedicarePatients                   46548 non-null  object
 8   PrivatePatients                    49667 non-null  object
 9   TotalNumberofPatients              77761 non-null  object
dtypes: int64(1), object(9)
memory usage: 7.4+ MB


In [95]:
convert_columns_to_numeric(service_area_df, 5, 9)
service_area_df.dtypes

BHCMISID                              object
GrantNumber                           object
ReportingYear                          int64
ZipCode                               object
ZipCodeType                           object
None_UninsuredPatients               float64
Medicaid_CHIP_OtherPublicPatients    float64
MedicarePatients                     float64
PrivatePatients                      float64
TotalNumberofPatients                float64
dtype: object

In [96]:
service_sites_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14987 entries, 0 to 14986
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   BHCMISID                     14987 non-null  object
 1   GrantNumber                  14987 non-null  object
 2   SiteName                     14987 non-null  object
 3   SiteType                     14987 non-null  object
 4   SiteStatus                   14987 non-null  object
 5   LocationType                 14987 non-null  object
 6   TotalWeeklyHoursOfOperation  14987 non-null  object
 7   ServiceAreaPopulation        14987 non-null  object
 8   SiteOperatedBy               14987 non-null  object
 9   SiteCity                     14987 non-null  object
 10  SiteState                    14987 non-null  object
 11  SiteZIPCode                  14987 non-null  object
dtypes: object(12)
memory usage: 1.4+ MB


In [97]:
convert_columns_to_numeric(service_sites_df, 6, 6)

Unnamed: 0,BHCMISID,GrantNumber,SiteName,SiteType,SiteStatus,LocationType,TotalWeeklyHoursOfOperation,ServiceAreaPopulation,SiteOperatedBy,SiteCity,SiteState,SiteZIPCode
0,010030,H80CS00803,CHICOPEE HEALTH CENTER,Service Delivery Site,Active,Permanent,47.0,Urban,Health Center/Applicant,Chicopee,MA,01013-3140
1,010030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Service Delivery Site,Active,Permanent,48.0,Urban,Health Center/Applicant,Holyoke,MA,01040-5144
2,010030,H80CS00803,Holyoke Soldier Home,Service Delivery Site,Active,Permanent,40.0,Urban,Health Center/Applicant,Holyoke,MA,01040-7002
3,010030,H80CS00803,Western Massachusetts Hospital 91 East Mountai...,Service Delivery Site,Active,Permanent,40.0,Urban,Health Center/Applicant,Westfield,MA,01085-1801
4,010040,H80CS00443,Blueberry Harvest School,Service Delivery Site,Active,Seasonal,40.0,Rural,Health Center/Applicant,Milbridge,ME,04658-3527
...,...,...,...,...,...,...,...,...,...,...,...,...
14982,11E01386,H80CS35350,Tafunsak Community Health Center,Administrative/Service Delivery Site,Active,Permanent,40.0,Rural,Health Center/Applicant,Kosrae,FM,0
14983,11E01386,H80CS35350,Tafunsak Health Center,Service Delivery Site,Active,Permanent,40.0,Rural,Health Center/Applicant,Tafunsak,FM,96944-3000
14984,11E01386,H80CS35350,Tofol Dental site,Service Delivery Site,Active,Permanent,32.0,Rural,Health Center/Applicant,Kosrae,FM,96944-9998
14985,11E01386,H80CS35350,Utwa Health Center,Service Delivery Site,Active,Permanent,24.0,Rural,Health Center/Applicant,0,FM,0


In [98]:
service_sites_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14987 entries, 0 to 14986
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BHCMISID                     14987 non-null  object 
 1   GrantNumber                  14987 non-null  object 
 2   SiteName                     14987 non-null  object 
 3   SiteType                     14987 non-null  object 
 4   SiteStatus                   14987 non-null  object 
 5   LocationType                 14987 non-null  object 
 6   TotalWeeklyHoursOfOperation  14987 non-null  float64
 7   ServiceAreaPopulation        14987 non-null  object 
 8   SiteOperatedBy               14987 non-null  object 
 9   SiteCity                     14987 non-null  object 
 10  SiteState                    14987 non-null  object 
 11  SiteZIPCode                  14987 non-null  object 
dtypes: float64(1), object(11)
memory usage: 1.4+ MB


In [99]:
service_area_df.head()

Unnamed: 0,BHCMISID,GrantNumber,ReportingYear,ZipCode,ZipCodeType,None_UninsuredPatients,Medicaid_CHIP_OtherPublicPatients,MedicarePatients,PrivatePatients,TotalNumberofPatients
0,10030,H80CS00803,2022,1011,ZipCode,,,0.0,,
1,10030,H80CS00803,2022,1013,ZipCode,61.0,1346.0,385.0,182.0,1974.0
2,10030,H80CS00803,2022,1014,ZipCode,0.0,,,0.0,21.0
3,10030,H80CS00803,2022,1104,ZipCode,26.0,791.0,181.0,97.0,1095.0
4,10030,H80CS00803,2022,1108,ZipCode,25.0,640.0,79.0,61.0,805.0


In [100]:
health_center_combined.head()

Unnamed: 0,BHCMISID,GrantNumber,HealthCenterName,HealthCenterCity,HealthCenterState,HealthCenterZIPCode,FundingCHC,FundingMHC,FundingHO,FundingPH,UrbanRuralFlag,mhc_amount,chc_amount,ho_amount,ph_amount,total_hc_funding,bphc_total,total_other_federal_grants,total_state_grants,total_local_gov_grants,total_private_grants,total_other_revenue,total_revenue,total_male,total_female,asian,native_hawaiian,pac_isl,black,am_ind,white,unreported,hispanic_any_race,total_patients_x,lep_patients,fpl_100_below,fpl_101_to_150,fpl_151_to_200,fpl_over_200,fpl_unknown,total_patients_y,uninsured_0_17,uninsured_18_up,medicaid_0_17,medicaid_18_up,medicare_18_up,public_insurance_0_17,public_insurance_18_up,private_insurance_0_17,private_insurance_18_up,total_0_17,total_18_up,migrant_patients,homeless_patients,school_based_patients,veteran_patients,public_housing_patients
0,10030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Holyoke,MA,1040,True,False,False,False,Urban,0,5721128,0,0,5721128,7584461,568943.0,2915656.0,0.0,764680.0,14258919.0,28330029,8821,10323,148.0,17.0,,432.0,47.0,3035.0,2228.0,12631.0,19144,8371.0,2706.0,184.0,103.0,168.0,15983.0,19144,104.0,609.0,4882.0,8638,3320.0,,,166.0,1410.0,5157.0,13987,45.0,2998.0,0.0,332.0,19144
1,10040,H80CS00443,MAINE MOBILE HEALTH PROGRAM INC.,Augusta,ME,4330,False,True,False,False,Rural,1758567,0,0,0,1758567,2258650,0.0,0.0,0.0,25000.0,36578.0,2320228,609,241,0.0,0.0,,332.0,,,22.0,471.0,850,654.0,768.0,68.0,,,0.0,850,35.0,734.0,24.0,36,,0.0,0.0,,,64.0,786,769.0,,0.0,,0
2,10060,H80CS00741,"FAIR HAVEN COMMUNITY HEALTH CLINIC, INC.",New Haven,CT,6513,True,False,False,False,Urban,0,4569124,0,0,4569124,6571573,44836.0,511690.0,258065.0,1189046.0,270649.0,9064640,14150,17768,810.0,,23.0,7036.0,34.0,2601.0,729.0,19204.0,31918,12686.0,13138.0,3958.0,1685.0,1486.0,11651.0,31918,1329.0,3138.0,16495.0,5818,1686.0,0.0,0.0,1543.0,1908.0,19368.0,12550,,208.0,1937.0,24.0,29342
3,10070,H80CS00312,CORNELL SCOTT HILL HEALTH CORPORATION,New Haven,CT,6519,True,False,True,True,Urban,0,4553563,1041417,565694,6160674,9587880,1430533.0,2179279.0,2156460.0,104831.0,1753132.0,17483690,21230,31204,1504.0,,32.0,17046.0,116.0,11286.0,594.0,19860.0,52434,12197.0,22703.0,3842.0,1816.0,2160.0,21913.0,52434,809.0,4629.0,6433.0,26163,3210.0,0.0,0.0,882.0,10306.0,8126.0,44308,16.0,2206.0,2125.0,196.0,14603
4,10120,H80CS00001,"CITY OF SPRINGFIELD, MASSACHUSETTS",Springfield,MA,1105,False,False,True,False,Urban,0,0,1842241,0,1842241,2042241,0.0,0.0,0.0,0.0,0.0,2050671,1350,705,,,0.0,440.0,31.0,636.0,38.0,857.0,2055,461.0,1291.0,63.0,29.0,47.0,625.0,2055,0.0,331.0,36.0,1321,295.0,0.0,0.0,0.0,72.0,36.0,2019,85.0,1747.0,0.0,57.0,0


In [101]:
health_center_ops_finance.head()

Unnamed: 0,BHCMISID,GrantNumber,HealthCenterName,HealthCenterCity,HealthCenterState,HealthCenterZIPCode,FundingCHC,FundingMHC,FundingHO,FundingPH,UrbanRuralFlag,total_costs_medical,total_costs_dental,total_costs_mh,total_costs_enabling,total_costs_quality,total_cost_care,total_charges,total_collected,total_sfs,medical_clinic_visits,medical_virtual_visits,medical_patients,dental_clinic_visits,dental_virtual_visits,dental_patients,mh_clinic_visits,mh_virtual_visits,mh_patients,enabling_clinic_visits,enabling_virtual_visits,enabling_patients,total_clinic_visits,total_virtual_visits
0,10030,H80CS00803,"HOLYOKE HEALTH CENTER, INC.",Holyoke,MA,1040,True,False,False,False,Urban,,,,,,,,,,,,,,,,,,,,,,,
1,10040,H80CS00443,MAINE MOBILE HEALTH PROGRAM INC.,Augusta,ME,4330,False,True,False,False,Rural,774931.0,26613.0,313483.0,1024600.0,0.0,2183264.0,636869.0,2724.0,617158.0,1263.0,4.0,727.0,95.0,0.0,95.0,536.0,13.0,108.0,454.0,175.0,263.0,2381.0,192.0
2,10060,H80CS00741,"FAIR HAVEN COMMUNITY HEALTH CLINIC, INC.",New Haven,CT,6513,True,False,False,False,Urban,21619441.0,1391991.0,3879153.0,2212470.0,516537.0,33153465.0,34615790.0,24985087.0,1716660.0,77287.0,16068.0,30818.0,5592.0,0.0,2288.0,10435.0,6733.0,2346.0,1094.0,5057.0,3539.0,96387.0,27893.0
3,10070,H80CS00312,CORNELL SCOTT HILL HEALTH CORPORATION,New Haven,CT,6519,True,False,True,True,Urban,,,,,,,,,,,,,,,,,,,,,,,
4,10120,H80CS00001,"CITY OF SPRINGFIELD, MASSACHUSETTS",Springfield,MA,1105,False,False,True,False,Urban,1515135.0,413126.0,245920.0,369171.0,0.0,2684614.0,1307876.0,726972.0,8876.0,3408.0,1229.0,809.0,759.0,0.0,330.0,171.0,440.0,105.0,2856.0,244.0,1140.0,7612.0,1926.0


In [102]:
print('Shape of Service area dataframe: ', service_sites_df.shape)
print('Shape of Service sites dataframe: ', service_area_df.shape)
print('Shape of Health centers dataframe: ', health_center_combined.shape)
print('Shape of Health centers operations and finance dataframe: ', health_center_ops_finance.shape)

Shape of Service area dataframe:  (14987, 12)
Shape of Service sites dataframe:  (97142, 10)
Shape of Health centers dataframe:  (1370, 57)
Shape of Health centers operations and finance dataframe:  (1370, 34)


In [103]:
# save new dataframes to csv files
service_sites_df.to_csv(DATA_PATH +'service_sites_df.csv', index=False)
service_area_df.to_csv(DATA_PATH +'service_area_df.csv', index=False)
health_center_combined.to_csv(DATA_PATH +'health_center_combined.csv', index=False)
health_center_ops_finance.to_csv(DATA_PATH +'health_center_ops_finance.csv', index=False)