# Final Project

## 1. Programming environment

In [1]:
import pandas as pd
import requests
import json

## 2. Data Collection

Our group chose the website https://www.communitybenefitinsight.org/?page=info.data_api to collect our data. This website contains information about hospitals across the United States. 
The Community Benefit Insight data API allows us to retrieve the following types of data:
+ Hospital data (optionally filtered by state)
+ Detailed data about a single hospital

### 2.1 Retrieve hospital data


In [3]:
# Download the full hospital list from the Community Benefit Insight API and
# cache it locally as a CSV file.
url = 'https://www.communitybenefitinsight.org/api/get_hospitals.php'
response = requests.get(url, timeout=60)
# Stop on an HTTP error status instead of handing an error page to the JSON parser.
response.raise_for_status()
hospital_data = json.loads(response.text)
if not hospital_data:
    raise ValueError('The hospital list returned by the API is empty.')
# Column names come from the first record; a record that lacks one of these
# keys yields NaN in that column instead of aborting the whole download.
attribute_names = list(hospital_data[0].keys())
df = pd.DataFrame(hospital_data, columns=attribute_names)
df.to_csv('./data/full_hospital.csv', index=False)
df.shape

### 2.2 Retrieve detailed data from every hospital

To retrieve data for each hospital, we use https://www.communitybenefitinsight.org/api/get_hospital_data.php?hospital_id= + ID (ID is the hospital ID)

Get attributes for the dataset 

In [12]:
# Fetch the detail records of a single hospital (id 1) to learn which fields
# the per-hospital endpoint returns.
url = 'https://www.communitybenefitinsight.org/api/get_hospital_data.php?hospital_id=1'
response = requests.get(url, timeout=60)
# Stop on an HTTP error status instead of handing an error page to the JSON parser.
response.raise_for_status()
web = json.loads(response.text)
# The field names are read from the last record of the response.
attribute = list(web[-1].keys())
attribute

['hospital_data_id',
 'fiscal_yr',
 'data_ein',
 'data_name',
 'form_990_filed_part_grp_ret_f',
 'form_990_num_fac_oper',
 'form_990_name',
 'form_990_address',
 'form_990_city',
 'form_990_state',
 'form_990_zip',
 'tot_func_exp',
 'tot_revenue',
 'tot_comm_bnfts',
 'chrty_care',
 'unreim_medcd',
 'unreim_costs',
 'comm_hlth_impr_svcs_comm_bnft_oper',
 'hlth_prof_educ',
 'subsd_hlth_svcs',
 'rsrch',
 'cash_inknd_contrib_comm_grps',
 'comm_bldg',
 'comm_bldg_actvs',
 'comm_bldg_actvs_physimprvhse',
 'comm_bldg_actvs_econdevlp',
 'comm_bldg_actvs_cmntysuprt',
 'comm_bldg_actvs_envrnimprv',
 'comm_bldg_actvs_ldrdevlp',
 'comm_bldg_actvs_cltnbldg',
 'comm_bldg_actvs_htlhimprvadvcy',
 'comm_bldg_actvs_wrkfrcdevlp',
 'comm_bldg_actvs_other',
 'comm_bldg_actvs_physimprvhse_pct',
 'comm_bldg_actvs_econdevlp_pct',
 'comm_bldg_actvs_cmntysuprt_pct',
 'comm_bldg_actvs_envrnimprv_pct',
 'comm_bldg_actvs_ldrdevlp_pct',
 'comm_bldg_actvs_cltnbldg_pct',
 'comm_bldg_actvs_htlhimprvadvcy_pct',
 'comm_

Create a dictionary that stores the values of each attribute. The dictionary has one extra key to store the hospital ID.

In [13]:
# Column store for the scraped records: an empty list per attribute, with an
# extra 'hospital_id' column placed first.
single_data = {column: [] for column in ['hospital_id', *attribute]}

There are 3491 hospitals in total in the dataset, but the website only allows 100 requests per week. So I had to change my VPN connection to retrieve information about 100 hospitals at a time.

In [None]:
# Download the detail data of the hospitals with ids 1..99 and append the last
# record of each response to the column lists in `single_data`.
for hospital_id in range(1, 100):
    new_url = 'https://www.communitybenefitinsight.org/api/get_hospital_data.php?hospital_id=' + str(hospital_id)
    response = requests.get(new_url, timeout=60)
    # Stop on an HTTP error status instead of parsing an error page as data.
    response.raise_for_status()
    json_data = json.loads(response.text)
    last_record = json_data[-1]
    # Read every field before appending anything, so that a missing key cannot
    # leave the columns of `single_data` with unequal lengths.
    values = [last_record[name] for name in attribute]
    for name, value in zip(attribute, values):
        single_data[name].append(value)
    single_data['hospital_id'].append(hospital_id)

Save the data in CSV format.

In [None]:
# Turn the collected column lists into a DataFrame and write it to disk
# without the index column.
df = pd.DataFrame(single_data)
df.to_csv('./data/test.csv', index=False)

Finally, I will merge that dataset with the first one on the hospital_id attribute.

In [83]:
# Load both CSV files, join them on the shared hospital id and store the result.
first_df = pd.read_csv('./data/full_hospital.csv')
second_df = pd.read_csv('./data/test.csv')
raw_df = first_df.merge(second_df, on='hospital_id')
raw_df.to_csv('./data/raw_data.csv', index=False)
raw_df.head()

Unnamed: 0,hospital_id,hospital_org_id,ein,name,name_cr,street_address,city,state,zip_code,fips_state_and_county_code,...,pctttlexp_econdevlp,pctttlexp_cmntysuprt,pctttlexp_envrnimprv,pctttlexp_ldrdevlp,pctttlexp_cltnbldg,pctttlexp_htlhimprvadvcy,pctttlexp_wrkfrcdevlp,pctttlexp_other,pctttlexp_total,updated_dt_y
0,1,1,630307951,Mizell Memorial Hospital,Mizell Memorial Hospital,702 Main Street,Opp,AL,36462,1039,...,0.0,0.0001,0.0,0.0,0.0,0.0001,0.0017,0.0,0.0019,"November 20, 2023"
1,2,2,630578923,St Vincents East,St Vincents East,50 Medical Park Drive East,Birmingham,AL,35235,1073,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"November 20, 2023"
2,3,3,630312913,Shelby Baptist Medical Center,Shelby Baptist Medical Center,1000 First Street North,Alabaster,AL,35007,1117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"November 20, 2023"
3,4,4,630459034,Callahan Eye Foundation Hosp,Callahan Eye Foundation Hosp,1720 University Boulevard,Birmingham,AL,35233,1073,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"November 20, 2023"
4,5,5,581973570,Cherokee Medical Center,Cherokee Medical Center,400 Northwood Drive,Centre,AL,35960,1019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"November 20, 2023"


## 3. Data Preprocessing & Exploration

### 3.1 Data Preprocessing

#### How many rows and columns does this data have?

In [38]:
# Reload the merged dataset from disk and report its (rows, columns) size.
raw_df = pd.read_csv('./data/raw_data.csv')
shape = (len(raw_df.index), len(raw_df.columns))
print(f"Current shape: {shape}")

Current shape: (3491, 161)


#### Check for duplicate rows in the data

In [39]:
# Flag every row whose values are an exact copy of an earlier row.
# Note: the check must run on the rows themselves; checking `raw_df.index`
# only compares index labels, which are unique for a freshly read CSV, and
# would therefore never report a duplicate.
detectDupSeries = raw_df.duplicated(keep='first')
num_duplicated_rows = detectDupSeries.sum()

if num_duplicated_rows == 0:
    print("Data have no duplicated line.!")
else:
    # Singular/plural form for the message.
    ext = "lines" if num_duplicated_rows > 1 else "line"
    print(f"Data have {num_duplicated_rows} duplicated " + ext + ". Please de-deduplicate your raw data.!")

Data have no duplicated line.!


#### Choosing attribute

There are 161 features in the dataframe, which is too many, so we will keep only the essential features to explore.

First, find the columns that contain NaN values.

In [40]:
# Number of missing values in each column.
nan_count = raw_df.isnull().sum()

# Keep only the columns that have at least one missing value and count them.
has_missing = nan_count.gt(0)
columns_with_NaN_value = nan_count.loc[has_missing]
len(columns_with_NaN_value)

41

There are 41 columns in the dataset that contain NaN values. We will delete the columns in which more than 20% of the values are NaN.

In [41]:
# Drop every column in which more than 20% of the rows are NaN.
# The limit is derived from the current row count instead of a hard-coded
# number, and the NaN counts are recomputed from the current frame so the
# cell can be re-run without raising a KeyError for already-removed columns.
MAX_NAN_FRACTION = 0.2
nan_count = raw_df.isna().sum()
chosen_columns = nan_count[nan_count > MAX_NAN_FRACTION * len(raw_df)]
del_columns = chosen_columns.index.tolist()
raw_df = raw_df.drop(columns=del_columns)
raw_df.shape

Many columns in the dataframe contain a lot of 0 values, so we first list the columns that contain at least one 0.

In [48]:
# Count the zeros in each column, then keep the names of the columns that
# contain at least one zero.
zero_counts = raw_df.eq(0).sum()
columns_with_0 = raw_df.columns[zero_counts.gt(0)]
columns_with_0

Index(['tot_func_exp', 'tot_revenue', 'tot_comm_bnfts', 'chrty_care',
       'unreim_medcd', 'unreim_costs', 'comm_hlth_impr_svcs_comm_bnft_oper',
       'hlth_prof_educ', 'subsd_hlth_svcs', 'rsrch',
       ...
       'pctttlexp_physimprvhse', 'pctttlexp_econdevlp', 'pctttlexp_cmntysuprt',
       'pctttlexp_envrnimprv', 'pctttlexp_ldrdevlp', 'pctttlexp_cltnbldg',
       'pctttlexp_htlhimprvadvcy', 'pctttlexp_wrkfrcdevlp', 'pctttlexp_other',
       'pctttlexp_total'],
      dtype='object', length=101)

We will delete the columns in which more than 20% of the values are 0.

In [49]:
# Drop every column in which more than 20% of the rows are 0.
# The limit is derived from the current row count instead of a hard-coded
# number, so it stays correct if the number of rows changes.
MAX_ZERO_FRACTION = 0.2
zero_count = (raw_df == 0).sum()
drop_columns = raw_df.columns[zero_count > MAX_ZERO_FRACTION * len(raw_df)]
raw_df = raw_df.drop(columns=drop_columns)
raw_df.shape

(3491, 68)

The dataframe now has only 68 columns. Next, our group will eliminate the attributes that we consider irrelevant.

In [71]:
# Remove the organisation id and the two update-date columns.
for column in ('hospital_org_id', 'updated_dt_x', 'updated_dt_y'):
    del raw_df[column]

In [77]:
# Remove identifier, name and address columns, one after another.
columns_to_remove = (
    'zip_code',
    'fips_state_and_county_code',
    'data_ein',
    'name_cr',
    'data_name',
    'form_990_name',
    'form_990_address',
    'form_990_city',
    'form_990_state',
    'form_990_zip',
)
for column in columns_to_remove:
    del raw_df[column]

Many columns are just flags, so we will carefully select which attributes to delete.

In [72]:
# For every column that contains at least one 'Y', print the column name
# followed by the number of 'Y' values in it.
y_totals = (raw_df == 'Y').sum()
columns_with_Y = raw_df.columns[y_totals > 0]
for flag_column in columns_with_Y:
    print(flag_column)
    print(y_totals[flag_column])

chrch_affl_f
347
urban_location_f
2288
children_hospital_f
101
memb_counc_teach_hosps_f
1099
form_990_filed_part_grp_ret_f
1481
sole_comm_prov_f
25
written_fncl_asst_policy_f
3484
used_fed_pov_gdlns_det_elig_free_disc_care_f
3473
chna_cond_comm_hlth_needs_assmt_f
3316
chna_defined_comm_served_f
3314
chna_took_into_acct_input_ppl_rep_broad_intrst_comm_served_f
3313
chna_cond_w_mult_hosp_facilities_f
2004
chna_made_wide_avail_pub_f
3318
hosp_adp_imp_stg_addr_hlth_needs_comm_f
3284
st_impl_aca_medcd_exp_cvrg_adlts_w_incs_up_138pct_fpl_f
2205
st_law_req_hosp_rpt_comm_bnfts_f
2480
paper_return_f
65


In [73]:
# For every column that contains at least one 'N', print the column name
# followed by the number of 'N' values in it.
n_totals = (raw_df == 'N').sum()
columns_with_N = raw_df.columns[n_totals > 0]
for flag_column in columns_with_N:
    print(flag_column)
    print(n_totals[flag_column])

chrch_affl_f
3144
urban_location_f
1203
children_hospital_f
3390
memb_counc_teach_hosps_f
2392
form_990_filed_part_grp_ret_f
2010
sole_comm_prov_f
3466
written_fncl_asst_policy_f
1
used_fed_pov_gdlns_det_elig_free_disc_care_f
12
chna_cond_comm_hlth_needs_assmt_f
80
chna_took_into_acct_input_ppl_rep_broad_intrst_comm_served_f
9
chna_cond_w_mult_hosp_facilities_f
1318
chna_made_wide_avail_pub_f
4
hosp_adp_imp_stg_addr_hlth_needs_comm_f
26
st_impl_aca_medcd_exp_cvrg_adlts_w_incs_up_138pct_fpl_f
1286
st_law_req_hosp_rpt_comm_bnfts_f
1011
paper_return_f
3426


We drop the flag columns whose values are split very unevenly, i.e. those in which more than 80% of the rows share the same value ('Y' or 'N').

In [78]:
# Drop flag columns in which a single value ('Y' or 'N') covers more than 80%
# of the rows. The limit is derived from the current row count instead of a
# hard-coded number of rows.
MAX_FLAG_SHARE = 0.8
flag_threshold = MAX_FLAG_SHARE * len(raw_df)

drop_columns_Y = raw_df.columns[(raw_df == 'Y').sum() > flag_threshold]
raw_df = raw_df.drop(columns=drop_columns_Y)

# Evaluated on the frame that remains after the 'Y' pass.
drop_columns_N = raw_df.columns[(raw_df == 'N').sum() > flag_threshold]
raw_df = raw_df.drop(columns=drop_columns_N)

raw_df.shape

(3491, 44)

In [80]:
# Write the reduced dataframe to ./data/final_data.csv without the index column.
raw_df.to_csv('./data/final_data.csv', index=False)

### 3.2 Data exploration

In [82]:
# Load the preprocessed dataset from ./data/final_data.csv and show its
# (rows, columns) size.
df = pd.read_csv('./data/final_data.csv')
df.shape

(3491, 44)