In [12]:
import numpy as np
import pandas as pd

import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline  

In [13]:
# Load the raw CSV extracts from the local data/raw_data folder.
# For the mobility report the file's own header row is consumed (header=0)
# and replaced by the shorter column names listed here.
mobility_columns = [
    'country_code', 'country', 'region_1', 'region_2', 'date',
    'retail_recreation', 'grocery_pharmacy', 'parks', 'transit',
    'workplaces', 'residential',
]
global_mobility = pd.read_csv(
    'data/raw_data/google_mobility.csv',
    header=0,
    names=mobility_columns,
    low_memory=False,
)
covid19_canada = pd.read_csv('data/raw_data/covid19_canada.csv', low_memory=False)
covid19_global = pd.read_csv('data/raw_data/covid19_global.csv', low_memory=False)
weather_ontario = pd.read_csv('data/raw_data/weather_ontario.csv', low_memory=False)

# Alternative (disabled): read two of the datasets straight from their source URLs.
# Kept for provenance; enable only if a network download is acceptable.
# global_mobility_url = pd.read_csv('https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=6d352e35dcffafce')
# covid19_canada_url = pd.read_csv('https://health-infobase.canada.ca/src/data/covidLive/covid19.csv')

In [14]:
# preview the first rows to check that the renamed mobility columns loaded as expected
global_mobility.head()

Unnamed: 0,country_code,country,region_1,region_2,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,AE,United Arab Emirates,,,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0


# Create data for Canada provinces

In [15]:
# create dataframe for Canada: keep only the rows whose country code is 'CA'
canada_mobility = global_mobility.loc[global_mobility.country_code == 'CA']

# drop 'region_2' (per the original note, all of its values are NaN for Canada).
# The column is named via `columns=` because passing the axis positionally
# (`drop('region_2', 1)`) raises a TypeError on pandas >= 2.0.
canada_mobility = canada_mobility.drop(columns='region_2')

# export data (the DataFrame index is written as the first CSV column)
canada_mobility.to_csv(path_or_buf='data/cleaned_data/canada_mobility.csv')

In [16]:
# preview the Canada-only subset; 'region_2' should no longer be among the columns
canada_mobility.head()

Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
16799,CA,Canada,,2020-02-15,4.0,2.0,10.0,3.0,1.0,0.0
16800,CA,Canada,,2020-02-16,13.0,8.0,41.0,4.0,0.0,-2.0
16801,CA,Canada,,2020-02-17,-12.0,-15.0,63.0,-28.0,-52.0,11.0
16802,CA,Canada,,2020-02-18,-1.0,4.0,6.0,-1.0,-1.0,1.0
16803,CA,Canada,,2020-02-19,1.0,1.0,9.0,0.0,0.0,0.0


In [17]:
# create dataframe for provinces in Canada

# Rows to exclude:
#   - NaN in 'region_1' (rows without a province, i.e. the country-level records)
#   - the three territories
#   - 'Prince Edward Island', dropped because it has too many missing values
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
non_provinces = [np.nan, 'Northwest Territories', 'Nunavut', 'Yukon', 'Prince Edward Island']
non_provinces_bool = ~canada_mobility.region_1.isin(non_provinces)
canada_provinces_mobility = canada_mobility[non_provinces_bool]

# Share of missing values in 'parks' and 'transit' before dropping them.
# The original note states these are below 1% of the records; the shares are
# printed so the reader can confirm it (a bare expression in the middle of a
# cell is evaluated but never displayed).
missing_share = canada_provinces_mobility[['parks', 'transit']].isna().mean()
print(missing_share)

# drop rows that contain null values in 'parks' or 'transit'
# filling null values with mean/median may not be a good idea
# since we would be using insights from future data which will not be available at the time
canada_provinces_mobility = canada_provinces_mobility.dropna(subset=['parks', 'transit'])

# export data (the DataFrame index is written as the first CSV column)
canada_provinces_mobility.to_csv(path_or_buf='data/cleaned_data/canada_provinces_mobility.csv')

In [18]:
# inspect the last rows of the province-level mobility data after filtering and dropna
canada_provinces_mobility.tail()

Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
18094,CA,Canada,Saskatchewan,2020-05-21,-12.0,20.0,80.0,-38.0,-39.0,13.0
18095,CA,Canada,Saskatchewan,2020-05-22,-21.0,8.0,83.0,-36.0,-37.0,12.0
18096,CA,Canada,Saskatchewan,2020-05-23,-24.0,6.0,68.0,-27.0,-14.0,6.0
18097,CA,Canada,Saskatchewan,2020-05-24,-19.0,3.0,197.0,-24.0,-16.0,3.0
18098,CA,Canada,Saskatchewan,2020-05-25,-13.0,11.0,151.0,-38.0,-39.0,10.0


# Create Google trends park data and Canada provinces park data

In [19]:
# import the Google Trends export for the search term "park"
google_trends_park = pd.read_csv('data/raw_data/google_trends_park.csv')

# convert date to datetime
# (`infer_datetime_format` is omitted: it is deprecated since pandas 2.0, where
# consistent-format inference is already the default behaviour)
google_trends_park['date'] = pd.to_datetime(google_trends_park['date'])

# keep only observations up to and including 2020-05-25
google_trends_park = google_trends_park[google_trends_park['date'] <= '2020-05-25']

# export data (the DataFrame index is written as the first CSV column)
google_trends_park.to_csv(path_or_buf='data/cleaned_data/google_trends_park.csv')

Google Trends provides daily data only for the most recent 90 days, not for the full past 12 months, so we used the daily Google search trends for "park" over the past 90 days.

In [20]:
# inspect the last rows to confirm the series ends at the 2020-05-25 cut-off
google_trends_park.tail()

Unnamed: 0,date,frequency_score
78,2020-05-21,53
79,2020-05-22,59
80,2020-05-23,72
81,2020-05-24,100
82,2020-05-25,83


In [21]:
# create dataframe for Canada province park data

# Keep only the two columns needed and average 'parks' across provinces per date.
# The string column 'region_1' is deliberately excluded before aggregating:
# on pandas >= 2.0, calling .mean() on a group that still contains a
# non-numeric column raises a TypeError instead of silently dropping it.
canada_provinces_park = (
    canada_provinces_mobility[['date', 'parks']]
    .groupby('date', as_index=False)
    .mean()
)

# drop dates before 2020-03-04 since Google trends data does not contain data before this date
# (a plain string comparison is used here, as in the original cell;
#  assumes 'date' holds ISO 'YYYY-MM-DD' strings as in the displayed output — TODO confirm)
canada_provinces_park = canada_provinces_park[canada_provinces_park['date'] >= '2020-03-04']

# export data (the DataFrame index is written as the first CSV column)
canada_provinces_park.to_csv(path_or_buf='data/cleaned_data/canada_provinces_park.csv')

In [22]:
# inspect the last rows of the per-date average park mobility
canada_provinces_park.tail()

Unnamed: 0,date,parks
96,2020-05-21,93.0
97,2020-05-22,81.444444
98,2020-05-23,83.666667
99,2020-05-24,169.75
100,2020-05-25,93.375


# Create a social distancing score for each province

In [24]:
# re-display the province-level mobility data as the starting point for this section
canada_provinces_mobility.head()

Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
16900,CA,Canada,Alberta,2020-02-15,7.0,3.0,24.0,7.0,-2.0,-1.0
16901,CA,Canada,Alberta,2020-02-16,10.0,-2.0,28.0,3.0,-3.0,-2.0
16902,CA,Canada,Alberta,2020-02-17,-7.0,-2.0,10.0,-40.0,-67.0,15.0
16903,CA,Canada,Alberta,2020-02-18,-1.0,1.0,-2.0,-9.0,-5.0,2.0
16904,CA,Canada,Alberta,2020-02-19,3.0,4.0,21.0,-7.0,-1.0,1.0
