# Project IART Class - COVID-19 mobility records 
## Machine Learning Analysis - Regression problems
### Explore data, create models and evaluate

In [1]:
#Main imports
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sb
import sys
import os

### Explore the data
Let's start by creating some directory variables:

In [19]:
# Directory layout: the notebook runs from the project's code directory and the
# CSV files live in a sibling directory named "data".
code_path = os.getcwd()
# Swap only the final path component. A plain str.replace('code', 'data') would
# also rewrite any *other* occurrence of "code" earlier in the path
# (e.g. a parent directory such as ".../vscode/...").
data_path = os.path.join(os.path.dirname(code_path), 'data')
# Show both locations so the reader can verify them before any file is read.
print(code_path)
print(data_path)

/home/fatimab/IART_classes/COVID-19/code
/home/fatimab/IART_classes/COVID-19/data


Now let's read data.csv from the data_path:

In [21]:
# Load the mobility records, treating the literal string "NA" as missing.
# The file is addressed by its full path instead of chdir-ing into data_path
# and back: changing the working directory is global kernel state, and a failed
# read would otherwise leave the notebook stranded in the data directory.
covid_data = pd.read_csv(os.path.join(data_path, "data.csv"), na_values=["NA"])
# Preview the first rows as the cell output.
covid_data.head()

Unnamed: 0,iso,country,date,grocery_pharmacy,parks,residential,retail_recreation,transit_stations,workplaces,total_cases,fatalities
0,AR,Argentina,2020-02-23,8.185,17.563,0.437,13.644,5.334,-2.486,0,0
1,AR,Argentina,2020-02-24,-15.875,25.605,6.321,-9.973,-26.785,-53.687,0,0
2,AR,Argentina,2020-02-25,-17.135,1.325,6.895,-19.717,-28.485,-50.053,0,0
3,AR,Argentina,2020-02-26,2.304,5.399,-2.273,0.845,9.461,12.584,0,0
4,AR,Argentina,2020-02-27,-0.404,-5.267,-0.493,-0.584,7.983,12.407,0,0


Print out summary statistics about the data set:

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; displayed as the cell output.
covid_data.describe()

Unnamed: 0,grocery_pharmacy,parks,residential,retail_recreation,transit_stations,workplaces,total_cases,fatalities
count,817.0,817.0,817.0,817.0,817.0,817.0,817.0,817.0
mean,-8.229424,-13.394035,9.545845,-26.205257,-28.370122,-19.126192,10699.773562,566.007344
std,21.313369,32.891636,10.545377,30.992728,30.211025,25.983021,31656.464952,2008.520488
min,-84.798,-91.456,-3.647,-95.158,-88.956,-79.293,0.0,0.0
25%,-16.578,-32.841,0.744,-49.975,-54.145,-40.582,27.0,0.0
50%,0.088,-6.498,5.912,-14.427,-19.502,-8.395,648.0,6.0
75%,4.838,6.961,16.695,-0.632,-0.97,1.037,4585.0,79.0
max,59.474,75.002,40.343,19.699,16.822,21.347,336912.0,15887.0


Let's also see how many samples the dataset has:

In [24]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = covid_data.shape
print((n_rows, n_cols))

(817, 11)


This means we have 817 rows (samples) and 11 columns (features). For a simpler analysis, let's transform the date column. The date column goes from 23-02-2020 to 05-04-2020 for each country, so let's turn it into a day index running from day 0 to day 42. If there is a total of 43 days per country, then we should have 817 / 43 = 19 countries, but let's confirm that:

In [31]:
# Collect the distinct country names (np.unique returns them sorted) and
# report how many there are.
countries_column = np.asarray(covid_data["country"])
countries = np.unique(countries_column)
print("Countries: ", countries)
print("Number of countries: ", len(countries))

Countries:  ['Argentina' 'Australia' 'Brazil' 'Canada' 'France' 'Germany' 'India'
 'Indonesia' 'Italy' 'Japan' 'Mexico' 'Saudi Arabia' 'South Africa'
 'South Korea' 'Spain' 'Sweden' 'Turkey' 'US' 'United Kingdom']
Number of countries:  19


In [45]:
# Replace each calendar date with a per-country day index, where day 0 is the
# first date recorded for that country.
#
# The index is derived from the dates themselves rather than by tiling
# 0..42 over the rows, so it stays correct if the rows are not sorted by
# country/date or if a country is missing a day.
# The numeric-dtype guard makes the cell safe to re-run: once "date" already
# holds day numbers there is nothing left to convert.
if not pd.api.types.is_numeric_dtype(covid_data["date"]):
    dates = pd.to_datetime(covid_data["date"])
    first_day = dates.groupby(covid_data["country"]).transform("min")
    # Assigning with [] replaces the column, so it ends up with an integer
    # dtype instead of keeping the original string/object dtype.
    covid_data["date"] = (dates - first_day).dt.days
# Preview the first rows to check the conversion.
covid_data.head(10)

Unnamed: 0,iso,country,date,grocery_pharmacy,parks,residential,retail_recreation,transit_stations,workplaces,total_cases,fatalities
0,AR,Argentina,0,8.185,17.563,0.437,13.644,5.334,-2.486,0,0
1,AR,Argentina,1,-15.875,25.605,6.321,-9.973,-26.785,-53.687,0,0
2,AR,Argentina,2,-17.135,1.325,6.895,-19.717,-28.485,-50.053,0,0
3,AR,Argentina,3,2.304,5.399,-2.273,0.845,9.461,12.584,0,0
4,AR,Argentina,4,-0.404,-5.267,-0.493,-0.584,7.983,12.407,0,0
5,AR,Argentina,5,2.075,-7.859,-0.577,2.877,7.973,12.857,0,0
6,AR,Argentina,6,4.685,-7.582,1.409,5.265,5.595,2.156,0,0
7,AR,Argentina,7,2.873,-10.518,2.417,-2.509,0.944,-2.36,0,0
8,AR,Argentina,8,3.958,-8.264,-1.363,0.537,6.931,16.164,0,0
9,AR,Argentina,9,5.09,-6.241,-1.33,0.2,8.202,16.192,1,0


Notice that we don't need two columns to identify the country (we have both ISO and COUNTRY), so let's delete the ISO column.


In [51]:
# The ISO code duplicates the information in "country", so drop it.
# errors="ignore" keeps the cell re-runnable: `del covid_data["iso"]` raises
# KeyError the second time it is executed because the column is already gone.
covid_data = covid_data.drop(columns=["iso"], errors="ignore")
# Preview the frame without the ISO column.
covid_data.head()

Unnamed: 0,country,date,grocery_pharmacy,parks,residential,retail_recreation,transit_stations,workplaces,total_cases,fatalities
0,Argentina,0,8.185,17.563,0.437,13.644,5.334,-2.486,0,0
1,Argentina,1,-15.875,25.605,6.321,-9.973,-26.785,-53.687,0,0
2,Argentina,2,-17.135,1.325,6.895,-19.717,-28.485,-50.053,0,0
3,Argentina,3,2.304,5.399,-2.273,0.845,9.461,12.584,0,0
4,Argentina,4,-0.404,-5.267,-0.493,-0.584,7.983,12.407,0,0


So now let's look at some pairwise combinations of the variables, colored by country:

In [None]:
# Pairwise plots of the variables, colored by country; rows with missing
# values are dropped first.
# The trailing semicolon suppresses the returned object's repr. It must sit at
# the end of the statement: a line containing only ";" is a SyntaxError.
sb.pairplot(covid_data.dropna(), hue="country");