In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import adjustText as aT
import geopandas as gpd
%matplotlib inline

ModuleNotFoundError: No module named 'adjustText'

# Creation of Choropleths

In [4]:
# Load statewide_cases.csv into df; the path is relative to the working directory.
df = pd.read_csv("statewide_cases.csv")

In [5]:
# Sanity checks: (rows, columns) of the raw case table
df.shape

(7325, 6)

In [5]:
# Number of distinct labels in the county column. The later filter on
# 'Unassigned' / 'Out Of Country' shows that not every label is a real county.
df['county'].nunique()

60

In [11]:
# Column names as a plain Python list
list(df.columns)

['county',
 'totalcountconfirmed',
 'totalcountdeaths',
 'newcountconfirmed',
 'newcountdeaths',
 'date']

In [6]:
# Column dtypes. Note that 'date' is loaded as object (strings), not datetime64.
df.dtypes

county                  object
totalcountconfirmed    float64
totalcountdeaths       float64
newcountconfirmed        int64
newcountdeaths           int64
date                    object
dtype: object

In [8]:
# Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,totalcountconfirmed,totalcountdeaths,newcountconfirmed,newcountdeaths
count,7322.0,7323.0,7325.0,7325.0
mean,1890.69,56.08,52.52,1.05
std,8818.98,294.12,217.21,5.23
min,0.0,0.0,-51.0,-3.0
25%,9.0,0.0,0.0,0.0
50%,82.5,2.0,2.0,0.0
75%,860.75,19.0,24.0,0.0
max,153265.0,4084.0,4416.0,81.0


In [10]:
# Non-null counts per column among rows reporting a negative number of new confirmed cases
df.loc[df['newcountconfirmed'].lt(0)].count()

county                 127
totalcountconfirmed    127
totalcountdeaths       127
newcountconfirmed      127
newcountdeaths         127
date                   127
dtype: int64

In [11]:
# Non-null counts per column among rows reporting a negative number of new deaths
df.loc[df['newcountdeaths'].lt(0)].count()

county                 21
totalcountconfirmed    21
totalcountdeaths       21
newcountconfirmed      21
newcountdeaths         21
date                   21
dtype: int64

In [19]:
# Per-county count of distinct values in each of the other columns.
# NOTE(review): df_county is not referenced again in the visible notebook.
df_county = df.groupby('county').nunique()

In [None]:
# Remove rows labelled 'Unassigned' or 'Out Of Country' (not real counties)
df = df[~df["county"].isin(['Unassigned', 'Out Of Country'])]

In [None]:
# Import Geography dataset (county shapes read from the shapefile)
fp = "CA_Counties/CA_Counties_TIGER2016.shp"
map_df = gpd.read_file(fp)
# (A bare `map_df.head()` used to sit here; mid-cell expressions are never
# displayed, so it was dead code. Put it last in its own cell to preview.)

# Merge with original dataset. The left join keeps every shape in map_df,
# including any county that has no matching case rows.
merged = map_df.merge(df, how='left', left_on='NAME', right_on='county')

# Select only desired variables
merged2 = merged[[
    'NAME',
    'geometry',
    'totalcountconfirmed',
    'totalcountdeaths',
    'newcountconfirmed',
    'newcountdeaths',
    'date',
]]

In [None]:
# Add County population data, without the growth rate column
county_pop = pd.read_csv('california_county_pop.csv').drop(columns='GrowthRate')

# Remove suffix from county names so can merge with DF.
# Assign the result back: calling .replace(..., inplace=True) on a column
# selection acts on a temporary object, which is deprecated and leaves
# county_pop unchanged once pandas Copy-on-Write is active.
county_pop['CTYNAME'] = county_pop['CTYNAME'].replace(' County', '', regex=True)

In [None]:
# Create new master DF with geography and county populations; the join key
# from the population table (CTYNAME) duplicates NAME, so it is dropped again.
merged3 = (
    merged2
    .merge(county_pop, how='left', left_on='NAME', right_on='CTYNAME')
    .drop(columns=['CTYNAME'])
)

In [None]:
# Replace negative values with 0 in the two daily-count columns only.
# The previous form, `merged3[mask] = 0`, overwrote EVERY column of the
# matching rows (NAME, geometry, date, pop2018, running totals) with 0.
# clip() touches just the selected columns and leaves NaN values untouched.
daily_count_columns = ['newcountdeaths', 'newcountconfirmed']
merged3[daily_count_columns] = merged3[daily_count_columns].clip(lower=0)

In [None]:
# Calculate incidence/prevalence/mortality: each rate is a count column
# divided by the county's pop2018 value.
rate_sources = {
    'incidence_rate': 'newcountconfirmed',
    'mortality_rate': 'newcountdeaths',
    'prevalence': 'totalcountconfirmed',
}
for rate_name, count_column in rate_sources.items():
    merged3[rate_name] = merged3[count_column] / merged3['pop2018']

In [None]:
# Split dataset into 4 month-long windows, each [start, end) and running
# from the 18th of one month to the 18th of the next.
# Fixes: the masks referenced an undefined `merged4` (NameError), and 'date'
# is read from the CSV as strings, which cannot be compared with Timestamps.
# Parse the dates once and build every mask from merged3.
case_dates = pd.to_datetime(merged3['date'])
month_edges = pd.to_datetime(
    ['2020-03-18', '2020-04-18', '2020-05-18', '2020-06-18', '2020-07-18']
)

month_1, month_2, month_3, month_4 = (
    merged3[(case_dates >= start) & (case_dates < end)]
    for start, end in zip(month_edges[:-1], month_edges[1:])
)

In [None]:
# Columns kept in each per-month county table.
# Fix: 'newcountdeaths' was listed twice, which produced a duplicated column
# and silently left out 'totalcountdeaths'.
MONTH_GRP_COLUMNS = [
    'NAME', 'geometry',
    'newcountconfirmed', 'newcountdeaths',
    'totalcountconfirmed', 'totalcountdeaths',
    'pop2018', 'incidence_rate', 'mortality_rate', 'prevalence',
]


def _county_monthly_means(month_df):
    """Average one month of daily rows per county and re-attach county shapes.

    Parameters
    ----------
    month_df : DataFrame with a 'NAME' column plus the numeric count/rate columns.

    Returns
    -------
    One row per shape in ``map_df`` (left join), restricted to MONTH_GRP_COLUMNS.
    """
    # numeric_only=True: the frame also carries geometry and date columns,
    # which cannot be averaged (pandas >= 2.0 raises instead of dropping them).
    county_means = (
        month_df
        .groupby('NAME')
        .mean(numeric_only=True)
        .reset_index()  # previously called but the result was discarded
    )

    # Replace lost geometry column
    with_geometry = map_df.merge(county_means, how='left', on='NAME')

    # Eliminate unneeded variables; copy so later column assignments do not
    # act on a slice of the merged frame.
    return with_geometry[MONTH_GRP_COLUMNS].copy()


# Groupby county name and calculate mean, one table per month
month_1_grp = _county_monthly_means(month_1)
month_2_grp = _county_monthly_means(month_2)
month_3_grp = _county_monthly_means(month_3)
month_4_grp = _county_monthly_means(month_4)


In [None]:
# The code to create the choropleths and label them - a county is labelled
# when its monthly mean incidence is at or above the mean over all counties.
# Month 1 incidence
#
# Fixes in this cell:
#   * `merged4` was undefined. The colour limits now come from the four
#     monthly county tables so that every month's map shares one scale.
#   * pandas .min()/.max() skip NaN; the builtin min()/max() do not.
#   * The map draws the per-county averages (month_1_grp), matching the title
#     and the labelling rule, instead of the overlapping daily rows in month_1.
#   * Label positions are computed here, so the cell no longer depends on
#     `month_1_points`, which is only created in a later cell.
#   * `norm=` was dropped because it duplicated vmin/vmax.
monthly_tables = (month_1_grp, month_2_grp, month_3_grp, month_4_grp)
all_monthly_rates = pd.concat([table['incidence_rate'] for table in monthly_tables])
vmin, vmax = all_monthly_rates.min(), all_monthly_rates.max()

ax = month_1_grp.plot(
    column='incidence_rate',
    figsize=(10, 10),
    cmap='Reds',
    linewidth=1.0,
    edgecolor='0.7',
    vmin=vmin,
    vmax=vmax,
    legend=True,
)
ax.set_axis_off()
ax.set_title('Average Incidence', fontsize=17)

# Threshold is loop-invariant, so compute it once.
label_threshold = month_1_grp['incidence_rate'].mean()
label_positions = month_1_grp['geometry'].centroid

# Counties without data have a NaN rate; NaN >= threshold is False, so they
# are simply not labelled.
texts = [
    ax.text(point.x, point.y, county_name, fontsize=8)
    for point, county_name, rate in zip(
        label_positions, month_1_grp['NAME'], month_1_grp['incidence_rate']
    )
    if rate >= label_threshold
]

# Nudge overlapping labels apart and draw an arrow back to each county.
aT.adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle="->", color='black'))

In [None]:
# Calculate centroids for labeling tasks: every monthly table gets a
# 'center' column holding the centroid of its county geometry.
for county_table in (month_1_grp, month_2_grp, month_3_grp, month_4_grp):
    county_table['center'] = county_table['geometry'].centroid

# Create copies for use by geopandas, with 'center' as the active geometry.
# set_geometry() without inplace=True returns a modified copy, so the
# *_grp tables keep their polygon geometry.
month_1_points = month_1_grp.set_geometry('center')
month_2_points = month_2_grp.set_geometry('center')
month_3_points = month_3_grp.set_geometry('center')
month_4_points = month_4_grp.set_geometry('center')

# Create Ethnicity Analyses

In [None]:
import seaborn as sns

In [None]:
# Import dataset and perform sanity checks (the checks follow in the next cells).
# The CSV path is relative to the working directory.
ethnicity = pd.read_csv('case_demographics_ethnicity.csv')

In [None]:
# (rows, columns) of the ethnicity table
ethnicity.shape

In [None]:
# Column names as a plain Python list
list(ethnicity.columns)

In [None]:
# Column dtypes of the ethnicity table
ethnicity.dtypes

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals
ethnicity.describe().round(2)

In [None]:
# Distinct category labels, followed by the number of missing labels
print(ethnicity['race_ethnicity'].unique())
print(ethnicity['race_ethnicity'].isna().sum())

In [None]:
# Replace repeated races: two categories appear under two spellings each, so
# map the variant spelling onto the canonical one.
# The result is assigned back because .replace(..., inplace=True) on a column
# selection acts on a temporary object, which is deprecated and has no effect
# on `ethnicity` once pandas Copy-on-Write is active.
ethnicity['race_ethnicity'] = ethnicity['race_ethnicity'].replace({
    'Multi-Race': 'Multiracial',
    'Native Hawaiian and other Pacific Islander': 'Native Hawaiian or Pacific Islander',
})

In [None]:
# Eliminate unused ethnic categories in a single pass.
# .copy() makes the result an independent frame, so adding columns to
# `ethnicity` later does not trigger SettingWithCopyWarning.
unused_categories = [
    'Other',
    'American Indian or Alaska Native',
    'Native Hawaiian or Pacific Islander',
    'Multiracial',
]
ethnicity = ethnicity[~ethnicity['race_ethnicity'].isin(unused_categories)].copy()

In [None]:
# Prepare data for plotting
# The three lists below are index-aligned: one entry per remaining group.
ethnicities = ['Asian','Black','Latino','White']
# One hex colour per group.
# NOTE(review): `colors` is not referenced elsewhere in the visible notebook.
colors = ['#db5f57','#dbc257','#91db57','#57db80']
# Baseline percentage per group; the same numbers are subtracted inside the
# normalize_* functions. Presumably each group's share of the state
# population - TODO confirm the source.
lines = [15.4, 6.0, 38.9, 36.6]

In [None]:
# Baseline percentage subtracted for each group (same values as before; they
# used to be repeated inside both functions).
_BASELINE_PERCENT = {
    'Asian': 15.4,
    'Black': 6.0,
    'Latino': 38.9,
    'White': 36.6,
}


def _normalize_percent(df, column, baselines=None):
    """Subtract each row's group baseline from ``df[column]``.

    The earlier implementation tested ``df['race_ethnicity'].all() == 'Asian'``.
    On current pandas ``Series.all()`` returns a boolean, which never equals a
    group name, so every branch was skipped and None came back. Looking the
    baseline up row by row avoids that, and also works on a frame that holds
    several groups at once.

    Returns None when no row belongs to a known group (this includes an empty
    frame), matching the old fall-through result. Rows of unknown groups inside
    an otherwise known frame come back as NaN.
    """
    lookup = _BASELINE_PERCENT if baselines is None else baselines
    row_baseline = df['race_ethnicity'].map(lookup)
    if row_baseline.isna().all():
        return None
    return df[column] - row_baseline


# Normalize case percentage
def normalize_case_percent(df, baselines=None):
    """Return ``df['case_percentage']`` minus the baseline of each row's group.

    Parameters
    ----------
    df : DataFrame with 'race_ethnicity' and 'case_percentage' columns.
    baselines : optional mapping of group name -> percentage to subtract;
        defaults to the built-in table for Asian/Black/Latino/White.

    Returns
    -------
    Series aligned with ``df``'s index, or None if no row has a known group.
    """
    return _normalize_percent(df, 'case_percentage', baselines)


# Normalize death percentage
def normalize_death_percent(df, baselines=None):
    """Return ``df['death_percentage']`` minus the baseline of each row's group.

    Parameters
    ----------
    df : DataFrame with 'race_ethnicity' and 'death_percentage' columns.
    baselines : optional mapping of group name -> percentage to subtract;
        defaults to the built-in table for Asian/Black/Latino/White.

    Returns
    -------
    Series aligned with ``df``'s index, or None if no row has a known group.
    """
    return _normalize_percent(df, 'death_percentage', baselines)

In [None]:
# Example of graphing process using case percentage (same process can be used
# for death percentage if using that as a variable)
#
# Normalisation is done row by row: `ethnicities` and `lines` are paired
# lists, so zipping them gives the baseline to subtract for each group. This
# replaces groupby().apply() followed by an index reset, which relied on the
# grouping column being visible inside apply and on a fragile group test.
# assign() returns a new frame, so no column is written onto a filtered slice.
baseline_by_group = dict(zip(ethnicities, lines))
ethnicity = ethnicity.assign(
    case_percent_normalized=(
        ethnicity['case_percentage']
        - ethnicity['race_ethnicity'].map(baseline_by_group)
    )
)

sns.set_palette(sns.color_palette("hls", 8))

# A single figure; the earlier plt.clf() / plt.figure() calls only produced
# extra empty figures.
fig, ax = plt.subplots(figsize=[9, 9])
sns.lineplot(
    x='date',
    y='case_percent_normalized',
    hue='race_ethnicity',
    data=ethnicity,
    ax=ax,
)
# Zero line: above it a group's case share exceeds its baseline percentage.
ax.axhline(y=0, c='black', alpha=0.6)
ax.set_ylabel('Normalized Case Percentage')
ax.set_xlabel('Date')
# The legend is drawn by seaborn because `hue` is set; the former bare
# `plt.legend` (no parentheses) never called anything and was removed.

plt.xticks(rotation=90)

# Show only every 4th date label to keep the x axis readable.
every_nth = 4
for n, label in enumerate(ax.xaxis.get_ticklabels()):
    if n % every_nth != 0:
        label.set_visible(False)
plt.show()