In [12]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


We first fetch the raw CSV files from the NYC Health Department's GitHub repository: https://github.com/nychealth/coronavirus-data/tree/master. With pandas, we can read a CSV file directly from its URL into a DataFrame.


In [13]:
# Base URL of the raw files in the repository's "totals" folder.
url = "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/totals/"
# File names to append to `url`; files are selected by list position.
data_urls = [
    "by-group.csv",
    "data-by-modzcta.csv",
    "deaths-by-race-age.csv",
    "group-cases-by-boro.csv",
    "group-data-by-boro.csv",
    "group-death-by-boro.csv",
    "group-hosp-by-boro.csv",
    "summary.csv",
]

# Read by-group.csv with its first column as the index and preview the top rows.
df_group = pd.read_csv(f"{url}{data_urls[0]}", index_col=0)
df_group.head(5)

Unnamed: 0_level_0,subgroup,CONFIRMED_CASE_RATE,CASE_RATE,HOSPITALIZED_RATE,DEATH_RATE,CONFIRMED_CASE_COUNT,PROBABLE_CASE_COUNT,CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Age group,0-4,22479.08,26585.49,907.93,,117727.0,21506.0,139233.0,4755.0,
Age group,5-12,27155.33,32975.2,287.6,,203002.0,43507.0,246509.0,2150.0,
Age group,13-17,29559.28,36693.0,456.05,,127946.0,30878.0,158824.0,1974.0,
Age group,0-17,,,,3.52,,,,,60.0
Age group,18-24,38826.64,48513.57,960.31,11.49,273600.0,68261.0,341861.0,6767.0,81.0


We first look at which age groups, within each race group, have the highest rate of death from the coronavirus. The code below draws a grouped bar chart that places the death rates side by side for a simple visual comparison.

In [15]:
# Load deaths-by-race-age.csv; the first CSV column becomes the row index and
# every remaining column is plotted as its own bar series.
df_race = pd.read_csv(url+data_urls[2], index_col=0)
print(df_race.head(10))

plt.style.use("ggplot")
fig,ax = plt.subplots(figsize=(14,9))
bar_width = 0.1
r = np.arange(len(df_race.index))
n_series = len(df_race.columns)

# One cluster of bars per index row: series i is shifted right by i bar widths.
for i, col in enumerate(df_race.columns):
    ax.bar(r + i * bar_width, df_race[col], width=bar_width, label=col)

ax.set_xlabel('Race Group', fontweight='bold', fontsize=12)
ax.set_ylabel('Death Rate', fontweight='bold', fontsize=12)
# The bars show death rates per age column, so the title says so.
ax.set_title('Death Rate by Race Group and Age Group', fontweight='bold', fontsize=14)
# Centre each tick under its cluster. The midpoint is derived from the number
# of plotted columns instead of a constant that is only right for 8 columns.
ax.set_xticks(r + bar_width * (n_series - 1) / 2)
ax.set_xticklabels(df_race.index, rotation=45, ha='right')

# Place the legend outside the plotting area, to the right of the axes.
ax.legend(title='Age Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
def add_y_value_labels(ax, spacing=5, fmt="{:.0f}"):
    """Write each bar's height next to the end of the bar on a vertical bar chart.

    Args:
        ax: Object exposing ``patches`` (items with ``get_height``, ``get_x``
            and ``get_width``) and ``annotate``; in this notebook a Matplotlib
            ``Axes`` holding bars.
        spacing: Distance, in points, between the end of a bar and its label.
        fmt: ``str.format`` template applied to the bar height. The default
            rounds to a whole number.

    Bars with a non-finite height (e.g. NaN) are left unlabelled.
    """
    for rect in ax.patches:
        y_value = rect.get_height()
        # A NaN/inf height would otherwise produce a stray "nan"/"inf" label.
        if not np.isfinite(y_value):
            continue

        x_value = rect.get_x() + rect.get_width() / 2

        # Positive bars: label sits above the bar end. Negative bars: the label
        # is anchored at its top AND pushed downward, so it ends up below the
        # bar end instead of overlapping the bar.
        if y_value >= 0:
            va, offset = 'bottom', spacing
        else:
            va, offset = 'top', -spacing

        ax.annotate(
            fmt.format(y_value),
            (x_value, y_value),
            xytext=(0, offset),
            textcoords="offset points",
            ha='center',
            va=va)
        
# Label every bar with its height using the helper defined above in this cell.
add_y_value_labels(ax)
# Adjust layout and display the plot
plt.tight_layout()
plt.show()

                        AGE_0_17_YRS  AGE_18_24_YRS  AGE_25_34_YRS  \
RACE_GROUP                                                           
Asian/Pacific-Islander          2.78           4.06          12.22   
Black/African-American          6.43          21.24          49.40   
Hispanic/Latino                 2.67          11.85          39.43   
White                           2.64           4.15          11.21   

                        AGE_35_44_YRS  AGE_45_54_YRS  AGE_55_64_YRS  \
RACE_GROUP                                                            
Asian/Pacific-Islander          31.26         120.62         357.03   
Black/African-American         136.11         354.26         849.05   
Hispanic/Latino                147.18         352.80         794.58   
White                           28.13         102.48         383.10   

                        AGE_65_74_YRS  AGE_GE_75_YRS  
RACE_GROUP                                            
Asian/Pacific-Islander         920.74     

Across all race groups, the age group with by far the highest death rate is people aged 75 and older.

Let's take a closer look at the age groups. We now plot a bar graph comparing the case, hospitalization, and death rates within each age group.

In [16]:
# Load by-group.csv indexed by its second column, discard every row that has a
# missing value, and keep only the rows whose group is "Age group".
df_data = (
    pd.read_csv(url + data_urls[0], index_col=1)
    .dropna(axis=0)
    .loc[lambda frame: frame['group'] == "Age group"]
)

plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(16,9))
bar_height = 0.2
print(df_data.head(10))

r = np.arange(len(df_data.index))

# (column, legend label, colour) for each series; series k is drawn k bar
# heights above the base position of its row.
rate_series = [
    ('CASE_RATE', 'Case Rate', 'blue'),
    ('HOSPITALIZED_RATE', 'Hospitalized Rate', 'green'),
    ('DEATH_RATE', 'Death Rate', 'red'),
]
for slot, (column, legend_label, colour) in enumerate(rate_series):
    ax.barh(
        r + slot * bar_height,
        df_data[column],
        height=bar_height,
        label=legend_label,
        color=colour,
    )

# Axis titles and chart title
ax.set_ylabel('Age Group', fontweight='bold', fontsize=12)
ax.set_xlabel('Rate per 100,000', fontweight='bold', fontsize=12)
ax.set_title(
    'Case, Hospitalization, and Death Rates by Age Group',
    fontweight='bold',
    fontsize=14,
)

# Put the y-ticks on the middle bar of each three-bar group
ax.set_yticks(r + bar_height)
ax.set_yticklabels(df_data.index)

# Legend goes outside the axes, anchored at the upper-right corner
ax.legend(title='Rates', bbox_to_anchor=(1.05, 1), loc='upper left')

# The three rates differ by orders of magnitude, so the x-axis is logarithmic
ax.set_xscale('log')

def add_value_labels(ax, spacing=5, fmt="{:.0f}"):
    """Write each bar's width next to the end of the bar on a horizontal bar chart.

    Args:
        ax: Object exposing ``patches`` (items with ``get_width``, ``get_y``
            and ``get_height``) and ``annotate``; in this notebook a Matplotlib
            ``Axes`` holding horizontal bars.
        spacing: Distance, in points, between the end of a bar and its label.
        fmt: ``str.format`` template applied to the bar width. The default
            rounds to a whole number.

    Bars with a non-finite width (e.g. NaN) are left unlabelled.
    """
    for rect in ax.patches:
        x_value = rect.get_width()
        # A NaN/inf width would otherwise produce a stray "nan"/"inf" label.
        if not np.isfinite(x_value):
            continue

        y_value = rect.get_y() + rect.get_height() / 2

        # Positive bars: label starts to the right of the bar end. Negative
        # bars: the label is right-aligned AND pushed leftward, so it ends up
        # beyond the bar end instead of overlapping the bar.
        if x_value >= 0:
            ha, offset = 'left', spacing
        else:
            ha, offset = 'right', -spacing

        ax.annotate(
            fmt.format(x_value),
            (x_value, y_value),
            xytext=(offset, 0),
            textcoords="offset points",
            ha=ha,
            va='center')

# Label every horizontal bar with its width, then tidy the layout and render.
add_value_labels(ax)
plt.tight_layout()
plt.show()


              group  CONFIRMED_CASE_RATE  CASE_RATE  HOSPITALIZED_RATE  \
subgroup                                                                 
18-24     Age group             38826.64   48513.57             960.31   
25-34     Age group             39479.57   48830.39            1211.57   
35-44     Age group             40489.72   49408.66            1681.76   
45-54     Age group             38066.33   46355.71            2364.20   
55-64     Age group             36205.55   44081.80            3947.24   
65-74     Age group             32083.28   38877.43            6257.42   
75+       Age group             32631.19   38844.93           12666.62   

          DEATH_RATE  CONFIRMED_CASE_COUNT  PROBABLE_CASE_COUNT  CASE_COUNT  \
subgroup                                                                      
18-24          11.49              273600.0              68261.0    341861.0   
25-34          28.71              585758.0             138738.0    724496.0   
35-44          92

In [17]:

# Reload by-group.csv indexed by its second column, drop every row that has a
# missing value, and keep only the rows whose group is "Sex".
df_data = pd.read_csv(url + data_urls[0], index_col=1)
df_data = df_data.dropna(axis=0)
df_data = df_data[df_data['group'] == "Sex"]
print(df_data.head(10))

plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(14, 9))
bar_width = 0.25
r = np.arange(len(df_data.index))

# Three side-by-side bars per row: case, hospitalization and death rate.
ax.bar(r, df_data['CASE_RATE'], bar_width, label='Case Rate', color='blue')
ax.bar(r + bar_width, df_data['HOSPITALIZED_RATE'], bar_width, label='Hospitalized Rate', color='green')
ax.bar(r + 2 * bar_width, df_data['DEATH_RATE'], bar_width, label='Death Rate', color='red')

# This chart is filtered to the "Sex" group, so the axis label and title name
# sex rather than age group.
ax.set_xlabel('Sex', fontweight='bold', fontsize=12)
ax.set_ylabel('Rate per 100,000', fontweight='bold', fontsize=12)
ax.set_title('Case, Hospitalization, and Death Rates by Sex', fontweight='bold', fontsize=14)
# Ticks sit on the middle bar of each three-bar group.
ax.set_xticks(r + bar_width)
ax.set_xticklabels(df_data.index)

ax.legend(title='Rates', bbox_to_anchor=(1.05, 1), loc='upper left')
# The three rates differ by orders of magnitude, so the y-axis is logarithmic.
ax.set_yscale('log')

# add_y_value_labels is defined in an earlier cell and must have been run first.
add_y_value_labels(ax)
plt.tight_layout()
plt.show()

         group  CONFIRMED_CASE_RATE  CASE_RATE  HOSPITALIZED_RATE  DEATH_RATE  \
subgroup                                                                        
Female     Sex             37095.13   45544.15            2659.84      471.09   
Male       Sex             32976.85   39927.92            2932.91      651.26   

          CONFIRMED_CASE_COUNT  PROBABLE_CASE_COUNT  CASE_COUNT  \
subgroup                                                          
Female               1616746.0             368240.0   1984986.0   
Male                 1311964.0             276544.0   1588508.0   

          HOSPITALIZED_COUNT  DEATH_COUNT  
subgroup                                   
Female              115926.0      20532.0  
Male                116684.0      25910.0  
