# Sub-question: Travelling Behaviour
## Part 1: Data Import

To analyze the impact of the epidemic on mobility in Poland and the Netherlands, we use Google mobility data to study how trips to six categories of destinations changed during the epidemic relative to the baseline (traffic volume before the epidemic). The six categories are Retail & recreation, Grocery & pharmacy, Workplaces, Residential, Parks and Transit stations.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

### 1.1 Google Community Mobility Data

Data source: Google Community Mobility Reports

https://www.google.com/covid19/mobility/

The reports are broken down by location and show the change in visits to places such as grocery stores and parks.

In [3]:
# Read the yearly Google Community Mobility reports for the Netherlands (NL) and Poland (PL)
df_nl_2020 = pd.read_csv("./raw/2020_NL_Region_Mobility_Report.csv", sep=",")
df_nl_2021 = pd.read_csv("./raw/2021_NL_Region_Mobility_Report.csv", sep=",")
df_nl_2022 = pd.read_csv("./raw/2022_NL_Region_Mobility_Report.csv", sep=",")

df_pl_2020 = pd.read_csv("./raw/2020_PL_Region_Mobility_Report.csv", sep=",")
df_pl_2021 = pd.read_csv("./raw/2021_PL_Region_Mobility_Report.csv", sep=",")
df_pl_2022 = pd.read_csv("./raw/2022_PL_Region_Mobility_Report.csv", sep=",")

# Stack the three yearly reports of each country into a single frame
nl_yearly_reports = [df_nl_2020, df_nl_2021, df_nl_2022]
pl_yearly_reports = [df_pl_2020, df_pl_2021, df_pl_2022]
df_nl_m = pd.concat(nl_yearly_reports)
df_pl_m = pd.concat(pl_yearly_reports)

# National rows are the ones without a 'sub_region_1' value; provincial rows are discarded
nl_is_national = df_nl_m["sub_region_1"].isna().to_numpy()
pl_is_national = df_pl_m["sub_region_1"].isna().to_numpy()
df_nl_m = df_nl_m[nl_is_national]
df_pl_m = df_pl_m[pl_is_national]

  df_nl_2021 = pd.read_csv("./raw/2021_NL_Region_Mobility_Report.csv", delimiter=',')


### 1.2 COVID-19 Data

Data source: WHO Coronavirus (COVID-19) Data
    
https://covid19.who.int/data

In [4]:
# Import the CSV file containing global daily new confirmed cases per million people
df_covid_raw = pd.read_csv("./raw/Daily new confirmed cases per 1M.csv", delimiter=',')

# Keep only the rows for the Netherlands and Poland and only the columns used later.
# .copy() gives an independent frame, so the column assignment below does not
# operate on a slice of df_covid_raw (avoids SettingWithCopyWarning).
df_covid = df_covid_raw.loc[
    df_covid_raw['location'].isin(['Netherlands', 'Poland']),
    ['location', 'date', 'new_cases', 'new_cases_per_million', 'new_deaths', 'new_deaths_per_million'],
].copy()

# Assign the converted column directly: `df.loc[:, 'date'] = ...` sets the values
# in place and can leave the column as object dtype on pandas >= 2.0.
df_covid['date'] = pd.to_datetime(df_covid['date'])
df_covid = df_covid.set_index('date')

df_covid

Unnamed: 0_level_0,location,new_cases,new_cases_per_million,new_deaths,new_deaths_per_million
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-27,Netherlands,1.0,0.057,,
2020-02-28,Netherlands,5.0,0.285,,
2020-02-29,Netherlands,4.0,0.228,,
2020-03-01,Netherlands,8.0,0.455,,
2020-03-02,Netherlands,6.0,0.342,,
...,...,...,...,...,...
2022-10-26,Poland,1023.0,25.667,23.0,0.577
2022-10-27,Poland,974.0,24.437,22.0,0.552
2022-10-28,Poland,790.0,19.821,12.0,0.301
2022-10-29,Poland,806.0,20.222,17.0,0.427


## Part 2: Data Processing
### 2.1 Google Community Mobility Data

In [5]:
# Region/identifier columns that are not used in the analysis
region_meta_cols = [
    'country_region_code',
    'country_region',
    'sub_region_1',
    'sub_region_2',
    'metro_area',
    'iso_3166_2_code',
    'census_fips_code',
    'place_id',
]
df_nl_m = df_nl_m.drop(columns=region_meta_cols)
df_pl_m = df_pl_m.drop(columns=region_meta_cols)

In [6]:
# Convert the 'date' column of both Google Mobility frames to pandas datetime.
# Direct column assignment replaces the column (and its dtype); the previous
# `df.loc[:, 'date'] = ...` form sets values in place and can leave the column
# as object dtype on pandas >= 2.0.
df_nl_m['date'] = pd.to_datetime(df_nl_m['date'])
df_pl_m['date'] = pd.to_datetime(df_pl_m['date'])

# Rename the columns with a country prefix (assigned by position, so the order
# must match the order of the remaining columns in the frames)
df_nl_m.columns = ['date', 'NL_retail_and_recreation', 'NL_grocery_and_pharmacy', 'NL_parks', 'NL_transit', 'NL_workplaces', 'NL_residential']
df_pl_m.columns = ['date', 'PL_retail_and_recreation', 'PL_grocery_and_pharmacy', 'PL_parks', 'PL_transit', 'PL_workplaces', 'PL_residential']
display(df_nl_m)

Unnamed: 0,date,NL_retail_and_recreation,NL_grocery_and_pharmacy,NL_parks,NL_transit,NL_workplaces,NL_residential
0,2020-02-15,1.0,1.0,11.0,3.0,0.0,0.0
1,2020-02-16,-10.0,-8.0,-31.0,-5.0,-3.0,2.0
2,2020-02-17,0.0,-1.0,8.0,-3.0,-5.0,1.0
3,2020-02-18,5.0,4.0,21.0,-3.0,-5.0,1.0
4,2020-02-19,3.0,0.0,20.0,-3.0,-5.0,1.0
...,...,...,...,...,...,...,...
283,2022-10-11,-1.0,16.0,60.0,-18.0,-16.0,3.0
284,2022-10-12,-2.0,13.0,53.0,-18.0,-16.0,3.0
285,2022-10-13,-9.0,8.0,11.0,-19.0,-16.0,4.0
286,2022-10-14,-7.0,11.0,30.0,-19.0,-17.0,4.0


In [7]:
# Join the NL and PL mobility frames on their common dates and index the result by date
df_Mobility_Data = df_nl_m.merge(df_pl_m, on='date').set_index('date')
display(df_Mobility_Data)

Unnamed: 0_level_0,NL_retail_and_recreation,NL_grocery_and_pharmacy,NL_parks,NL_transit,NL_workplaces,NL_residential,PL_retail_and_recreation,PL_grocery_and_pharmacy,PL_parks,PL_transit,PL_workplaces,PL_residential
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-02-15,1.0,1.0,11.0,3.0,0.0,0.0,7.0,-1.0,26.0,4.0,0.0,-1.0
2020-02-16,-10.0,-8.0,-31.0,-5.0,-3.0,2.0,12.0,-13.0,18.0,6.0,-2.0,0.0
2020-02-17,0.0,-1.0,8.0,-3.0,-5.0,1.0,6.0,1.0,20.0,1.0,1.0,0.0
2020-02-18,5.0,4.0,21.0,-3.0,-5.0,1.0,3.0,-1.0,13.0,-1.0,1.0,1.0
2020-02-19,3.0,0.0,20.0,-3.0,-5.0,1.0,5.0,0.0,13.0,-1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-11,-1.0,16.0,60.0,-18.0,-16.0,3.0,7.0,32.0,77.0,12.0,8.0,1.0
2022-10-12,-2.0,13.0,53.0,-18.0,-16.0,3.0,10.0,32.0,76.0,13.0,8.0,1.0
2022-10-13,-9.0,8.0,11.0,-19.0,-16.0,4.0,12.0,36.0,89.0,16.0,8.0,0.0
2022-10-14,-7.0,11.0,30.0,-19.0,-17.0,4.0,11.0,34.0,78.0,11.0,-2.0,1.0


### 2.2 Covid-19 Data

In [8]:
# Columns removed from each per-country frame; the two per-million columns remain
dropped_cols = ['location', 'new_cases', 'new_deaths']

# Netherlands: every row whose location is not Poland
not_poland = df_covid['location'].ne('Poland')
df_covid_nl = (
    df_covid[not_poland]
    .drop(columns=dropped_cols)
    .set_axis(['new_cases_per_million_nl', 'new_deaths_per_million_nl'], axis='columns')
)

# Poland: every row whose location is not Netherlands
not_netherlands = df_covid['location'].ne('Netherlands')
df_covid_pl = (
    df_covid[not_netherlands]
    .drop(columns=dropped_cols)
    .set_axis(['new_cases_per_million_pl', 'new_deaths_per_million_pl'], axis='columns')
)

# Place both countries side by side, aligned on the date index
df_covid = pd.concat([df_covid_nl, df_covid_pl], axis='columns')
display(df_covid)

Unnamed: 0_level_0,new_cases_per_million_nl,new_deaths_per_million_nl,new_cases_per_million_pl,new_deaths_per_million_pl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-27,0.057,,,
2020-02-28,0.285,,,
2020-02-29,0.228,,,
2020-03-01,0.455,,,
2020-03-02,0.342,,,
...,...,...,...,...
2022-10-26,0.000,0.00,25.667,0.577
2022-10-27,0.000,0.00,24.437,0.552
2022-10-28,370.587,1.48,19.821,0.301
2022-10-29,,,20.222,0.427


### 2.3 Merge to one dataframe

In [9]:
# Put the COVID-19 columns and the mobility columns side by side, aligned on the date index.
# The join is an outer join, so dates present in only one source are kept with NaN
# in the other source's columns.
frames_by_date = [df_covid, df_Mobility_Data]
df_Mobility_Data = pd.concat(frames_by_date, axis='columns', join='outer')
display(df_Mobility_Data)

Unnamed: 0_level_0,new_cases_per_million_nl,new_deaths_per_million_nl,new_cases_per_million_pl,new_deaths_per_million_pl,NL_retail_and_recreation,NL_grocery_and_pharmacy,NL_parks,NL_transit,NL_workplaces,NL_residential,PL_retail_and_recreation,PL_grocery_and_pharmacy,PL_parks,PL_transit,PL_workplaces,PL_residential
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-02-15,,,,,1.0,1.0,11.0,3.0,0.0,0.0,7.0,-1.0,26.0,4.0,0.0,-1.0
2020-02-16,,,,,-10.0,-8.0,-31.0,-5.0,-3.0,2.0,12.0,-13.0,18.0,6.0,-2.0,0.0
2020-02-17,,,,,0.0,-1.0,8.0,-3.0,-5.0,1.0,6.0,1.0,20.0,1.0,1.0,0.0
2020-02-18,,,,,5.0,4.0,21.0,-3.0,-5.0,1.0,3.0,-1.0,13.0,-1.0,1.0,1.0
2020-02-19,,,,,3.0,0.0,20.0,-3.0,-5.0,1.0,5.0,0.0,13.0,-1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-26,0.000,0.00,25.667,0.577,,,,,,,,,,,,
2022-10-27,0.000,0.00,24.437,0.552,,,,,,,,,,,,
2022-10-28,370.587,1.48,19.821,0.301,,,,,,,,,,,,
2022-10-29,,,20.222,0.427,,,,,,,,,,,,


In [12]:
# Discard every row (date) in which at least one column is missing
df_Mobility_Data = df_Mobility_Data.dropna(axis='index', how='any')

In [13]:
# Save file
output_path = Path("./process/travel_behaviour.csv")

# to_csv does not create missing folders, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the index as well: it holds the dates (set with set_index('date')),
# and index=False would silently drop them from the saved time series.
df_Mobility_Data.to_csv(output_path, index=True, index_label='date')