In [1]:
import numpy as np
import pandas as pd

import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline  

In [2]:
# Load the raw inputs from the local data directory.
# header=0 together with names= replaces the mobility file's own header row
# with the shorter column names listed here.
mobility_column_names = [
    'country_code', 'country', 'region_1', 'region_2', 'date',
    'retail_recreation', 'grocery_pharmacy', 'parks', 'transit',
    'workplaces', 'residential',
]
global_mobility = pd.read_csv(
    '../../data/raw_data/google_mobility.csv',
    header=0,
    names=mobility_column_names,
    low_memory=False,
)
covid19_canada = pd.read_csv('../../data/raw_data/covid19_canada.csv', low_memory=False)
covid19_global = pd.read_csv('../../data/raw_data/covid19_global.csv', low_memory=False)
weather_ontario = pd.read_csv('../../data/raw_data/weather_ontario.csv', low_memory=False)

# Alternative (disabled): read the data straight from the source URLs.
# global_mobility_url = pd.read_csv('https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=6d352e35dcffafce')
# covid19_canada_url = pd.read_csv('https://health-infobase.canada.ca/src/data/covidLive/covid19.csv')

In [3]:
# Preview the first rows of the global mobility data with the renamed columns.
global_mobility.head()

Unnamed: 0,country_code,country,region_1,region_2,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,AE,United Arab Emirates,,,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0
2,AE,United Arab Emirates,,,2020-02-17,-1.0,1.0,5.0,1.0,2.0,1.0
3,AE,United Arab Emirates,,,2020-02-18,-2.0,1.0,5.0,0.0,2.0,1.0
4,AE,United Arab Emirates,,,2020-02-19,-2.0,0.0,4.0,-1.0,2.0,1.0


In [4]:
# pd.set_option('display.max_rows', canada_provinces_mobility.shape[0]+1)

# Create data for Canada provinces

In [5]:
# Keep only the Canadian rows of the global mobility report.
canada_mobility = global_mobility.loc[global_mobility.country_code == 'CA']

# Drop 'region_2' because all of its values are NaN.
# `columns=` is used instead of the positional axis argument (`drop('region_2', 1)`),
# which pandas 2.0 no longer accepts.
canada_mobility = canada_mobility.drop(columns='region_2')

# Export the Canada-only data.
canada_mobility.to_csv(path_or_buf='../../data/cleaned_data/canada_mobility.csv')

In [6]:
# Preview the first rows of the Canada-only mobility data.
canada_mobility.head()

Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
16799,CA,Canada,,2020-02-15,4.0,2.0,10.0,3.0,1.0,0.0
16800,CA,Canada,,2020-02-16,13.0,8.0,41.0,4.0,0.0,-2.0
16801,CA,Canada,,2020-02-17,-12.0,-15.0,63.0,-28.0,-52.0,11.0
16802,CA,Canada,,2020-02-18,-1.0,4.0,6.0,-1.0,-1.0,1.0
16803,CA,Canada,,2020-02-19,1.0,1.0,9.0,0.0,0.0,0.0


In [7]:
# Build the province-level mobility frame for Canada.

# Exclude rows without a region_1 value, the three territories, and
# Prince Edward Island (dropped because it has too many missing values).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
non_provinces = [np.nan, 'Northwest Territories', 'Nunavut', 'Yukon', 'Prince Edward Island']
non_provinces_bool = ~canada_mobility.region_1.isin(non_provinces)
canada_provinces_mobility = canada_mobility[non_provinces_bool]

# Drop rows with null values in 'parks' or 'transit', since they make up less than 1% of the records.
# Filling null values with the mean/median may not be a good idea,
# since that would use insights from future data which would not be available at the time.
# The shares are kept in named variables so they can be inspected; as bare
# expressions in the middle of a cell they were computed and then discarded.
parks_null_share = canada_provinces_mobility.parks.isna().sum() / len(canada_provinces_mobility.parks)
transit_null_share = canada_provinces_mobility.transit.isna().sum() / len(canada_provinces_mobility.transit)
canada_provinces_mobility = canada_provinces_mobility.dropna(subset=['parks', 'transit'])

# Export the province-level data.
canada_provinces_mobility.to_csv(path_or_buf='../../data/cleaned_data/canada_provinces_mobility.csv')

In [8]:
# Inspect the last rows of the province-level mobility data.
canada_provinces_mobility.tail()

Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
18094,CA,Canada,Saskatchewan,2020-05-21,-12.0,20.0,80.0,-38.0,-39.0,13.0
18095,CA,Canada,Saskatchewan,2020-05-22,-21.0,8.0,83.0,-36.0,-37.0,12.0
18096,CA,Canada,Saskatchewan,2020-05-23,-24.0,6.0,68.0,-27.0,-14.0,6.0
18097,CA,Canada,Saskatchewan,2020-05-24,-19.0,3.0,197.0,-24.0,-16.0,3.0
18098,CA,Canada,Saskatchewan,2020-05-25,-13.0,11.0,151.0,-38.0,-39.0,10.0


# Create Google trends park data and Canada provinces park data

In [9]:
# Load the raw Google Trends data.
google_trends_park = pd.read_csv('../../data/raw_data/google_trends_park.csv')

# Convert the date column to datetime. The `infer_datetime_format` argument is
# deprecated in pandas 2.x and is no longer passed.
google_trends_park['date'] = pd.to_datetime(google_trends_park['date'])

# Keep only the rows dated on or before 2020-05-25.
google_trends_park = google_trends_park[google_trends_park['date'] <= '2020-05-25']

# Export the filtered data.
google_trends_park.to_csv(path_or_buf='../../data/cleaned_data/google_trends_park.csv')

Google Trends provides daily data only for the past 90 days, not for the full past 12 months, so we used the daily Google search trend for "park" over the past 90 days.

In [10]:
# Inspect the last rows of the filtered Google Trends data.
google_trends_park.tail()

Unnamed: 0,date,frequency_score
78,2020-05-21,53
79,2020-05-22,59
80,2020-05-23,72
81,2020-05-24,100
82,2020-05-25,83


In [11]:
# Build the daily park-mobility series across the Canadian provinces.

# Keep only the two columns needed for the daily average.
canada_provinces_park = canada_provinces_mobility[['date', 'parks']]

# Average 'parks' over all provinces for each date. The column is selected
# explicitly so the mean never touches a text column such as 'region_1';
# pandas >= 2.0 raises on those instead of silently dropping them.
canada_provinces_park = canada_provinces_park.groupby('date')['parks'].mean().reset_index()

# Drop dates before 2020-03-04, since the Google Trends data does not contain data before this date.
canada_provinces_park = canada_provinces_park[canada_provinces_park.date >= '2020-03-04']

# Export the daily averages.
canada_provinces_park.to_csv(path_or_buf='../../data/cleaned_data/canada_provinces_park.csv')

In [12]:
# Inspect the last rows of the daily average parks series.
canada_provinces_park.tail()

Unnamed: 0,date,parks
96,2020-05-21,93.0
97,2020-05-22,81.444444
98,2020-05-23,83.666667
99,2020-05-24,169.75
100,2020-05-25,93.375


# Standardize each mobility category within each province & create a social-distancing score for each province and date

In [280]:
# Reload the cleaned province-level mobility data from disk, replacing the
# in-memory frame of the same name.
canada_provinces_mobility = pd.read_csv(
    '../../data/cleaned_data/canada_provinces_mobility.csv'
)

# Preview the reloaded frame.
canada_provinces_mobility.head()

Unnamed: 0.1,Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,16900,CA,Canada,Alberta,2020-02-15,7,3,24,7,-2,-1
1,16901,CA,Canada,Alberta,2020-02-16,10,-2,28,3,-3,-2
2,16902,CA,Canada,Alberta,2020-02-17,-7,-2,10,-40,-67,15
3,16903,CA,Canada,Alberta,2020-02-18,-1,1,-2,-9,-5,2
4,16904,CA,Canada,Alberta,2020-02-19,3,4,21,-7,-1,1


In [281]:
# Per-province mean and standard deviation of each mobility category.
# The category columns are selected explicitly: the frame also holds text
# columns (country_code, country, date), and pandas >= 2.0 raises a TypeError
# on those instead of silently dropping them from mean()/std().
score_columns = ['retail_recreation', 'grocery_pharmacy', 'parks', 'transit', 'workplaces', 'residential']
grouped_by_province = canada_provinces_mobility.groupby('region_1')[score_columns]
mean_mobility_provinces = grouped_by_province.mean().reset_index()
std_mobility_provinces = grouped_by_province.std().reset_index()

In [282]:
def standardize_data_z(province, category, i):
    """Replace one province's values in one column with their z-scores, in place.

    Reads the per-province mean and standard deviation from the module-level
    frames `mean_mobility_provinces` and `std_mobility_provinces`, and
    overwrites the matching rows of `canada_provinces_mobility`.

    Parameters
    ----------
    province : str
        Value of `region_1` selecting the rows to standardize.
    category : str
        Name of the column to standardize.
    i : int
        Kept for backward compatibility and no longer used. The statistics are
        looked up by province name, so the result does not depend on the
        caller's counter matching the row order of the statistics frames.

    Raises
    ------
    IndexError
        If `province` has no row in the statistics frames.
    """
    in_province = canada_provinces_mobility.region_1 == province
    province_mean = mean_mobility_provinces.loc[
        mean_mobility_provinces.region_1 == province, category
    ].iloc[0]
    province_std = std_mobility_provinces.loc[
        std_mobility_provinces.region_1 == province, category
    ].iloc[0]
    canada_provinces_mobility.loc[in_province, category] = (
        canada_provinces_mobility.loc[in_province, category] - province_mean
    ) / province_std

def standardize_data_min_max(province, category):
    """Min-max rescale one province's values in one column to [0, 100], in place.

    Overwrites the matching rows of the module-level frame
    `canada_provinces_mobility` with `100 * (x - min) / (max - min)`, where
    min and max are taken over that province's values in `category`.

    Parameters
    ----------
    province : str
        Value of `region_1` selecting the rows to rescale.
    category : str
        Name of the column to rescale.

    Raises
    ------
    ValueError
        If no row matches `province`.

    Notes
    -----
    If all selected values are equal the range is zero and the result is NaN.
    """
    in_province = canada_provinces_mobility.region_1 == province
    if not in_province.any():
        raise ValueError('no rows found for province %r' % (province,))

    values = canada_provinces_mobility.loc[in_province, category]
    # Series.min()/max() skip NaN; the builtin min()/max() iterate in Python
    # and give order-dependent results when a NaN is present.
    low = values.min()
    high = values.max()
    canada_provinces_mobility.loc[in_province, category] = 100 * ((values - low) / (high - low))

In [283]:
# Provinces to process, in alphabetical order.
provinces = [
    'Alberta',
    'British Columbia',
    'Manitoba',
    'New Brunswick',
    'Newfoundland and Labrador',
    'Nova Scotia',
    'Ontario',
    'Quebec',
    'Saskatchewan',
]

# Mobility category columns.
categories = [
    'retail_recreation',
    'grocery_pharmacy',
    'parks',
    'transit',
    'workplaces',
    'residential',
]

In [284]:
# Standardize every category column by z-score within each province.
# enumerate() supplies the province's position in `provinces`, which is the
# third argument standardize_data_z expects.
# NOTE: standardize_data_z overwrites the columns in place, so re-running this
# cell without reloading the frame would standardize already-standardized values.
for i, province in enumerate(provinces):
    for category in categories:
        standardize_data_z(province, category, i)

In [285]:
# Preview the frame after the z-score standardization.
canada_provinces_mobility.head()

Unnamed: 0.1,Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential
0,16900,CA,Canada,Alberta,2020-02-15,2,1,1,2,1,-2
1,16901,CA,Canada,Alberta,2020-02-16,2,0,1,2,1,-2
2,16902,CA,Canada,Alberta,2020-02-17,1,0,0,0,-1,0
3,16903,CA,Canada,Alberta,2020-02-18,1,0,0,1,1,-1
4,16904,CA,Canada,Alberta,2020-02-19,1,1,1,1,1,-1


In [286]:
# Create the new feature 'Social Distancing Score' for each row: the sum of
# the five non-residential category columns as currently stored in the frame.
# The columns are named explicitly instead of sliced by position
# (`iloc[:, -6:-1]`), so re-running this cell after 's_d_score' has been
# appended still sums the same five columns.
s_d_score_columns = ['retail_recreation', 'grocery_pharmacy', 'parks', 'transit', 'workplaces']
canada_provinces_mobility['s_d_score'] = canada_provinces_mobility[s_d_score_columns].sum(axis=1)


In [287]:
# Min-max rescale the social distancing score and every category column
# within each province.
# 's_d_score' is rescaled once per province; it used to be rescaled again on
# every pass of the inner category loop.
for province in provinces:
    standardize_data_min_max(province, 's_d_score')
    for category in categories:
        standardize_data_min_max(province, category)

In [288]:
# Preview the frame after min-max rescaling, including the 's_d_score' column.
canada_provinces_mobility.head()

Unnamed: 0.1,Unnamed: 0,country_code,country,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,s_d_score
0,16900,CA,Canada,Alberta,2020-02-15,96,61,38,100,90,3,95
1,16901,CA,Canada,Alberta,2020-02-16,100,54,40,95,89,0,92
2,16902,CA,Canada,Alberta,2020-02-17,77,54,31,44,13,50,52
3,16903,CA,Canada,Alberta,2020-02-18,85,58,25,81,87,12,81
4,16904,CA,Canada,Alberta,2020-02-19,90,62,36,83,92,9,90


In [289]:
# Remove the two country identifier columns before saving.
canada_provinces_mobility = canada_provinces_mobility.drop(columns=['country_code', 'country'])

# Write the standardized data to disk.
canada_provinces_mobility.to_csv('../../data/cleaned_data/standardized_mobility_data.csv')

# Create social distancing score for each province

In [290]:
# Read the standardized mobility data back in, parsing 'date' as datetime and
# using the 'Unnamed: 0' column as the index, then discard the
# 'Unnamed: 0.1' column (presumably an index left over from an earlier CSV
# round trip; verify against the file).
mobility_data = pd.read_csv(
    '../../data/cleaned_data/standardized_mobility_data.csv',
    index_col='Unnamed: 0',
    parse_dates=['date'],
).drop(columns=['Unnamed: 0.1'])

In [291]:
# List the columns of the standardized mobility data.
mobility_data.columns

Index(['region_1', 'date', 'retail_recreation', 'grocery_pharmacy', 'parks',
       'transit', 'workplaces', 'residential', 's_d_score'],
      dtype='object')

In [292]:
# Preview the first rows of the standardized mobility data.
mobility_data.head()

Unnamed: 0,region_1,date,retail_recreation,grocery_pharmacy,parks,transit,workplaces,residential,s_d_score
0,Alberta,2020-02-15,96,61,38,100,90,3,95
1,Alberta,2020-02-16,100,54,40,95,89,0,92
2,Alberta,2020-02-17,77,54,31,44,13,50,52
3,Alberta,2020-02-18,85,58,25,81,87,12,81
4,Alberta,2020-02-19,90,62,36,83,92,9,90


In [293]:
# Average every numeric score per province.
# numeric_only=True states explicitly that the parsed 'date' column is left
# out of the averages; newer pandas versions no longer drop non-numeric
# columns automatically.
# NOTE: this cell overwrites `mobility_data`, so re-running it requires
# re-running the import cell first.
mobility_data = mobility_data.groupby('region_1').mean(numeric_only=True).reset_index()

# Use presentation-friendly column names.
mobility_data = mobility_data.rename(columns={
    'region_1': 'Province',
    's_d_score': 'Social Distancing Score',
    'retail_recreation': 'Retail & Recreation',
    'grocery_pharmacy': 'Grocery & Pharmacy',
    'parks': 'Parks',
    'transit': 'Transit',
    'workplaces': 'Workplaces',
    'residential': 'Residential',
})

# Show floats without decimals. NOTE: set_option is session-wide, so it also
# changes how every later float output is displayed.
pd.set_option('display.float_format', lambda x: '%.f' % x)

# Display the provinces ranked by score; the sorted frame is not stored.
mobility_data.sort_values(by=['Social Distancing Score'], axis=0, ascending=False)

Unnamed: 0,Province,Retail & Recreation,Grocery & Pharmacy,Parks,Transit,Workplaces,Residential,Social Distancing Score
2,Manitoba,58,57,28,41,60,38,61
4,Newfoundland and Labrador,52,53,26,44,41,41,58
3,New Brunswick,53,63,30,49,54,40,56
8,Saskatchewan,50,53,30,45,58,41,55
1,British Columbia,43,42,36,36,53,49,52
0,Alberta,48,51,26,39,55,43,52
6,Ontario,48,52,32,39,51,44,51
7,Quebec,49,57,33,37,49,49,50
5,Nova Scotia,40,46,20,37,47,47,44


In [None]:
# Save the per-province average scores to disk.
mobility_data.to_csv('../../data/cleaned_data/social_distancing_score_province.csv')