In [1]:
import pandas as pd
import scipy.stats as sts
import matplotlib.pyplot as plt
import numpy as np
import gmaps
import requests
import json

#import API key for gmaps
from config import gkey
gmaps.configure(api_key=gkey)

In [2]:
college_data = pd.read_csv('Data/ca_colleges.csv')

college_data.columns = ['School_ID', 'School_name', 'Year', 'Duplicate', 'Street_Address', 'City', 'Zip_code', 
                        'county', 'Longitude', 'Latitude', 'Total_Enrollment', 'Full_time_enrollment', 
                        'Part_time_enrollment']

college_data.drop(columns=['School_ID', 'Year', 'Duplicate', 'Street_Address', 'City', 'Zip_code', 
                        'Full_time_enrollment', 'Part_time_enrollment'], inplace=True)

college_data.county = college_data.county.map(lambda x: x[ :-7])
college_data.dropna(subset=['Total_Enrollment'], inplace=True)
college_data.to_csv('Data/college_data.csv', index=False)

college_data

Unnamed: 0,School_name,county,Longitude,Latitude,Total_Enrollment
0,Academy of Art University,San Francisco,-122.400578,37.787943,9812.0
1,Academy of Chinese Culture and Health Sciences,Alameda,-122.269839,37.805972,133.0
3,Avalon School of Cosmetology-Alameda,Alameda,-122.243566,37.764203,47.0
4,College of Alameda,Alameda,-122.279303,37.781017,5667.0
5,Allan Hancock College,Santa Barbara,-120.421144,34.943716,11894.0
...,...,...,...,...,...
686,MIXED Institute of Cosmetology & Barber,Sacramento,-121.435770,38.495699,75.0
687,UEI College-Sacramento,Sacramento,-121.454197,38.494532,613.0
688,Paul Mitchell the School-San Jose,Santa Clara,-121.891967,37.332192,142.0
689,KC Beauty Academy,Los Angeles,-118.235894,34.048458,20.0


In [3]:
grouped_colleges = college_data.groupby('county')['Total_Enrollment'].sum().astype(int)
college_population = grouped_colleges.to_frame()
len(college_population)

41

In [4]:
clean_case = pd.read_csv('Data/clean_case.csv')
clean_case = clean_case.rename(columns={'area': 'county'})
clean_case.head()

Unnamed: 0,date,county,population,cases,deaths,total_tests,positive_tests
0,2021-03-31,Alameda,1685886.0,98.0,1.0,10553.0,123.0
1,2021-03-31,Alpine,1117.0,0.0,0.0,3.0,0.0
2,2021-03-31,Amador,38531.0,3.0,0.0,317.0,8.0
3,2021-03-31,Butte,217769.0,11.0,0.0,632.0,11.0
4,2021-03-31,Calaveras,44289.0,4.0,0.0,80.0,4.0


In [5]:
vaccine_df = pd.read_csv('Data/vaccine_df.csv')
vaccine_df.tail()

Unnamed: 0,county,administered_date,total_doses,pfizer_doses,moderna_doses,jj_doses,partially_vaccinated,total_partially_vaccinated,fully_vaccinated,cumulative_fully_vaccinated,at_least_one_dose
7250,Yuba,4/19/2021,244,102,142,0,103,5934,141,12839,103
7251,Yuba,4/20/2021,302,79,223,0,177,5987,125,12964,178
7252,Yuba,4/21/2021,694,283,411,0,299,5891,395,13359,299
7253,Yuba,4/22/2021,269,95,174,0,149,5920,120,13479,149
7254,Yuba,4/23/2021,177,27,150,0,96,5935,81,13560,96


In [40]:
# Grouping Case data by county

cases_grouped = clean_case.groupby('county').agg({'cases': 'sum',
                                                  'deaths': 'sum',
                                                  'total_tests': 'sum',
                                                  'positive_tests': 'sum',
                                                  'population': 'max'})

cases_grouped['Cases per 100,000'] = (cases_grouped['cases']/cases_grouped['population']*100000).astype(int)
cases_grouped.head()

Unnamed: 0_level_0,cases,deaths,total_tests,positive_tests,population,"Cases per 100,000"
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alameda,84220.0,1240.0,2288714.0,98015.0,1685886.0,4995
Alpine,86.0,0.0,1870.0,35.0,1117.0,7699
Amador,3588.0,46.0,105842.0,4096.0,38531.0,9311
Butte,11415.0,196.0,198445.0,12035.0,217769.0,5241
Calaveras,2013.0,50.0,37082.0,2258.0,44289.0,4545


In [8]:
# Grouping Vaccine data by county and finding dose and fully vaccinated totals

vaccine_grouped = vaccine_df.groupby('county').agg({'total_doses': 'sum',
                                                  'cumulative_fully_vaccinated': 'max'})

vaccine_grouped['Fully Vaccinated per 100,000'] = (vaccine_grouped['cumulative_fully_vaccinated']/
                                                         clean_case_2021['population']*100000).astype(int)

vaccine_grouped.describe()


Unnamed: 0,total_doses,cumulative_fully_vaccinated,"Fully Vaccinated per 100,000"
count,58.0,58.0,58.0
mean,470699.6,191958.4,27272.793103
std,1029514.0,417026.4,7847.48585
min,1391.0,593.0,14711.0
25%,26740.75,11188.0,21636.5
50%,125405.0,51515.0,26580.0
75%,423529.5,176592.8,30233.75
max,7003153.0,2839079.0,53088.0


In [9]:
# Create a list of Counties included in data sets to use for geocoding API
# Create columns to append in for loop

print(len(clean_case['county'].unique()))
print(len(vaccine_df['county'].unique()))
counties = clean_case['county'].unique()
counties

58
58


array(['Alameda', 'Alpine', 'Amador', 'Butte', 'Calaveras', 'Colusa',
       'Contra Costa', 'Del Norte', 'El Dorado', 'Fresno', 'Glenn',
       'Humboldt', 'Imperial', 'Inyo', 'Kern', 'Kings', 'Lake', 'Lassen',
       'Los Angeles', 'Madera', 'Marin', 'Mariposa', 'Mendocino',
       'Merced', 'Modoc', 'Mono', 'Monterey', 'Napa', 'Nevada', 'Orange',
       'Placer', 'Plumas', 'Riverside', 'Sacramento', 'San Benito',
       'San Bernardino', 'San Diego', 'San Francisco', 'San Joaquin',
       'San Luis Obispo', 'San Mateo', 'Santa Barbara', 'Santa Clara',
       'Santa Cruz', 'Shasta', 'Sierra', 'Siskiyou', 'Solano', 'Sonoma',
       'Stanislaus', 'Sutter', 'Tehama', 'Trinity', 'Tulare', 'Tuolumne',
       'Ventura', 'Yolo', 'Yuba'], dtype=object)

In [10]:
# Creating Lists to fill coordinates
lat = []
lng = []

# Set up for API call
base_url = 'https://maps.googleapis.com/maps/api/geocode/json?address='
    
for county in counties:
    target_county = f'{base_url}{county},+CA&key={gkey}'
    response = requests.get(target_county).json()
    try:
        lat.append(response['results'][0]['geometry']['location']['lat'])
        lng.append(response['results'][0]['geometry']['location']['lng'])
    except:
        print(f'{county} could not be located...Skipping')
        pass
print('Coordinates have been added to datasets.')

Coordinates have been added to datasets.


In [11]:
county_dict = {'county': counties,
              'latitude': lat,
              'longitude': lng}
county_data = pd.DataFrame(county_dict)

county_data.head()

Unnamed: 0,county,latitude,longitude
0,Alameda,37.779872,-122.282185
1,Alpine,32.835052,-116.766411
2,Amador,38.348892,-120.774093
3,Butte,39.625395,-121.537
4,Calaveras,38.196048,-120.680504


In [12]:
county_data.set_index('county', inplace=True)
county_data['population'] = cases_grouped['population']
county_data['student_pop'] = college_population['Total_Enrollment']
county_data['cases per 100,000'] = cases_grouped['Cases per 100,000']
county_data['Fully Vaccinated per 100,000'] = vaccine_grouped['Fully Vaccinated per 100,000']

In [13]:
# confirmed 17 counties do not have colleges, replacing nan with 0

county_data['student_pop'] = county_data['student_pop'].fillna(0)
county_data.reset_index(inplace=True)
county_data['% student population'] = round(county_data['student_pop']/county_data['population']*100,2)
county_data.head()

Unnamed: 0,county,latitude,longitude,population,student_pop,"2020 cases per 100,000","2021 cases per 100,000","Fully Vaccinated per 100,000",% student population
0,Alameda,37.779872,-122.282185,1685886.0,130485.0,3347,1648,33949,7.74
1,Alpine,32.835052,-116.766411,1117.0,0.0,6266,1432,53088,0.0
2,Amador,38.348892,-120.774093,38531.0,0.0,7046,2265,25405,0.0
3,Butte,39.625395,-121.537,217769.0,27274.0,3686,1555,27251,12.52
4,Calaveras,38.196048,-120.680504,44289.0,0.0,2824,1720,25898,0.0


In [14]:
# using describe to see % student population details
county_data.describe()

Unnamed: 0,latitude,longitude,population,student_pop,"2020 cases per 100,000","2021 cases per 100,000","Fully Vaccinated per 100,000",% student population
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,37.681321,-120.762809,691882.1,47732.362069,5165.931034,2111.87931,27272.793103,4.716552
std,2.279756,1.863556,1496476.0,113492.866482,2617.104003,817.696788,7847.48585,4.270994
min,32.715738,-123.965971,1117.0,0.0,1059.0,366.0,14711.0,0.0
25%,36.582126,-122.026564,46304.5,0.0,3468.25,1517.0,21636.5,0.0
50%,37.942282,-120.89579,192373.5,9483.5,4654.0,2028.0,26580.0,4.7
75%,39.244611,-119.902711,724076.5,40256.0,6491.0,2678.5,30233.75,7.425
max,41.774326,-115.569439,10257560.0,757021.0,15080.0,3998.0,53088.0,18.96


In [17]:
# Locating all counties in the top quartile as "College Counties"
college_counties = county_data.loc[county_data['% student population']>=7.43]
college_counties

Unnamed: 0,county,latitude,longitude,population,student_pop,"2020 cases per 100,000","2021 cases per 100,000","Fully Vaccinated per 100,000",% student population
0,Alameda,37.779872,-122.282185,1685886.0,130485.0,3347,1648,33949,7.74
3,Butte,39.625395,-121.537,217769.0,27274.0,3686,1555,27251,12.52
9,Fresno,36.737798,-119.787125,1032227.0,81646.0,7035,2635,24361,7.91
11,Humboldt,40.745005,-123.869509,134098.0,11931.0,1445,1152,26662,8.9
17,Lassen,40.539439,-120.712002,30065.0,2526.0,15080,1945,16737,8.4
26,Monterey,36.600238,-121.894676,448732.0,33378.0,6693,2775,26569,7.44
29,Orange,33.787914,-117.853101,3228519.0,285987.0,5438,2316,29307,8.86
31,Plumas,39.992683,-120.803947,18997.0,2058.0,2542,942,30704,10.83
33,Sacramento,38.581572,-121.4944,1567975.0,116772.0,4460,1861,25723,7.45
36,San Diego,32.715738,-117.161084,3370418.0,317016.0,5228,2803,29933,9.41


In [43]:
# Creating heat map of Covid-19 cases in 2020

center = 36.7783,-119.4179
counties = county_data[['latitude', 'longitude']]
cases = cases_grouped['Cases per 100,000']

fig1 = gmaps.figure(center=center, zoom_level=5.5)

heatmap_layer_cases = gmaps.heatmap_layer(counties, weights=cases, dissipating=False)
heatmap_layer_cases.max_intensity = 6500
heatmap_layer_cases.point_radius = 0.25

fig1.add_layer(heatmap_layer_cases)

In [44]:
#add college counties to the 2020 heatmap

info_box_template = """
<dl>
<dt>County</dt><dd>{county}</dd>
<dt>% Students</dt><dd>{% student population}</dd>
<dt>Cases per 100,000</dt><dd>{2020 cases per 100,000}</dd>
</dl>
"""
# Store the DataFrame Row

county_info = [info_box_template.format(**row) for index, row in college_counties.iterrows()]
locations = college_counties[['latitude', 'longitude']]

# Add marker layer ontop of heat map

markers = gmaps.marker_layer(locations, info_box_content=county_info)
fig1.add_layer(markers)

# Display figure

fig1

Figure(layout=FigureLayout(height='420px'))

The above heatmap was originally broken down by year to see the affect of lower student populations due to virtual learning. However, the maps were identical. This also suggests that each county is maintaining it's own Covid-19 curve. No county appears to have dramatically changed its numbers between 2020 and 2021.

In [58]:
# Creating heat map of Vaccines

center = 36.7783,-119.4179
counties = county_data[['latitude', 'longitude']]
cases = county_data['Fully Vaccinated per 100,000']

fig3 = gmaps.figure(center=center, zoom_level=5.5)

heatmap_layer_vaccines = gmaps.heatmap_layer(counties, weights=cases, dissipating=False)
heatmap_layer_vaccines.max_intensity = 30233
heatmap_layer_vaccines.point_radius = 0.25
heatmap_layer_vaccines.gradient = [(255,0,0, 0),
                                   (255,0,0, 1),
                                   (255,255,0, 1),
                                   (0,255,0, 1)]

fig3.add_layer(heatmap_layer_vaccines)

In [59]:
#add college counties to the Vaccine heatmap

info_box_template = """
<dl>
<dt>County</dt><dd>{county}</dd>
<dt>% Students</dt><dd>{% student population}</dd>
<dt>Cases per 100,000</dt><dd>{Fully Vaccinated per 100,000}</dd>
</dl>
"""
# Store the DataFrame Row

county_info = [info_box_template.format(**row) for index, row in college_counties.iterrows()]
locations = college_counties[['latitude', 'longitude']]

# Add marker layer ontop of heat map

markers = gmaps.marker_layer(locations, info_box_content=county_info)
fig3.add_layer(markers)

# Display figure

fig3

Figure(layout=FigureLayout(height='420px'))

Inverted colors on Vaccines heat map as a greater number of vaccines per 100,000 people is considered good.