# Exploratory data analysis

## 1. Dataset description




## 2. Load the dataset

In [103]:
library(httr)
library(jsonlite)
library(tidyverse)


"package 'httr' was built under R version 3.6.3"

Attaching package: 'jsonlite'


The following object is masked from 'package:purrr':

    flatten




In [34]:
url <- "https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series"
covid19_confirmed <- read_csv(paste(url, "/time_series_covid19_confirmed_global.csv?raw=true", sep = ""))
covid19_death <- read_csv(paste(url, "/time_series_covid19_deaths_global.csv?raw=true", sep = ""))

Parsed with column specification:
cols(
  .default = col_double(),
  `Province/State` = [31mcol_character()[39m,
  `Country/Region` = [31mcol_character()[39m
)

See spec(...) for full column specifications.

Parsed with column specification:
cols(
  .default = col_double(),
  `Province/State` = [31mcol_character()[39m,
  `Country/Region` = [31mcol_character()[39m
)

See spec(...) for full column specifications.



In [111]:
head(covid19_confirmed)

Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,784,840,906,933,996,1026,1092,1176,1279,1351
,Albania,41.1533,20.1683,0,0,0,0,0,0,...,494,518,539,548,562,584,609,634,663,678
,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,2160,2268,2418,2534,2629,2718,2811,2910,3007,3127
,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,673,673,696,704,713,717,717,723,723,731
,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,19,19,19,24,24,24,24,25,25,25
,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,23,23,23,23,23,23,23,24,24,24


In [162]:
country_data <- read_csv("../data/clean_data/country_data.csv")

Parsed with column specification:
cols(
  country = [31mcol_character()[39m,
  age_1564 = [32mcol_double()[39m,
  age_64up = [32mcol_double()[39m,
  age_0014 = [32mcol_double()[39m,
  smok = [32mcol_double()[39m,
  air_polution = [32mcol_double()[39m,
  doctor = [32mcol_double()[39m,
  nurse_midwivies = [32mcol_double()[39m
)



In [138]:
head(country_data)

country,age_1564,age_64up,age_0014,smok,air_polution,doctor,nurse_midwivies
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,54.3249,2.584927,43.09018,,56.91081,0.2907,
Albania,68.58239,13.744736,17.67287,28.7,18.2006,,
Algeria,63.48882,6.362497,30.14868,15.6,38.88401,,
American Samoa,,,,,12.47382,,
Andorra,,,,33.5,10.30762,3.3333,4.0128
Angola,50.9747,2.216374,46.80892,,32.3885,,


## 3. Explore the dataset

In [56]:
dim(covid19_confirmed)

In [35]:
colnames(covid19_confirmed)[length(covid19_confirmed)]

In [57]:
dim(covid19_death)

In [58]:
colnames(covid19_death)[length(covid19_death)]

In [60]:
all(colnames(covid19_confirmed) == colnames(covid19_death))

In [61]:
all(covid19_confirmed[2] == covid19_death[2])

In [101]:
all(!is.na(covid19_confirmed[length(covid19_confirmed)]))

In [102]:
all(!is.na(covid19_death[length(covid19_death)]))

In [142]:
dim(country_data)

## 4. Initial thoughts


## 5. Wrangling

In [87]:
confirmed <- covid19_confirmed[c(2, length(covid19_confirmed))]
colnames(confirmed) = c("country", "n")

confirmed <- confirmed %>%
    group_by(country) %>%
    summarize(confirmed = sum(n))

In [88]:
death <- covid19_death[c(2, length(covid19_death))]
colnames(death) = c("country", "n")

death <- death %>%
    group_by(country) %>%
    summarize(death = sum(n))

In [96]:
covid19 <- merge(confirmed, death, by = "country") %>%
    mutate(rate = death / confirmed)
head(covid19)

country,confirmed,death,rate
<chr>,<dbl>,<dbl>,<dbl>
Afghanistan,1351,43,0.03182828
Albania,678,27,0.03982301
Algeria,3127,415,0.13271506
Andorra,731,40,0.05471956
Angola,25,2,0.08
Antigua and Barbuda,24,3,0.125


In [132]:
dim(covid19)

In [95]:
covid19 %>%
    filter(rate >= 1)

country,confirmed,death,rate
<chr>,<dbl>,<dbl>,<dbl>


In [145]:
covid19 %>%
    filter(!country %in% country_data$country)

country,confirmed,death,rate
<chr>,<dbl>,<dbl>,<dbl>
Bahamas,73,11,0.150684932
Brunei,138,1,0.007246377
Burma,144,5,0.034722222
Congo (Brazzaville),200,6,0.03
Congo (Kinshasa),394,25,0.063451777
Czechia,7273,214,0.029423897
Diamond Princess,712,13,0.018258427
Egypt,4092,294,0.071847507
Gambia,10,1,0.1
Holy See,9,0,0.0


In [163]:
country_data <- country_data %>%
    mutate(country = case_when(country == 'Bahamas, The' ~ 'Bahamas',
                               country == 'Brunei Darussalam' ~ 'Brunei',
                               country == 'Egypt, Arab Rep.' ~ 'Egypt',
                               country == 'Gambia, The' ~ 'Gambia',
                               country == 'Iran, Islamic Rep.' ~ 'Iran',
                               country == 'Korea, Dem. People’s Rep.' ~ 'Korea, South',
                               country == 'Czech Republic' ~ 'Czechia',
                               country == 'Lao PDR' ~ 'Laos',
                               country == 'Russian Federation' ~ 'Russia',
                               country == 'St. Lucia' ~ 'Saint Lucia',
                               country == 'St. Vincent and the Grenadines' ~ 'Saint Vincent and the Grenadines',
                               country == 'Slovak Republic' ~ 'Slovakia',
                               country == 'Syrian Arab Republic' ~ 'Syria',
                               country == 'Venezuela, RB' ~ 'Venezuela',
                               country == 'Sub-Saharan Africa' ~ 'Western Sahara',
                               country == 'Yemen, Rep.' ~ 'Yemen',
                               TRUE ~ country))

In [153]:
covid19 <- covid19 %>%
    mutate(country = case_when(country == 'Burma' ~ 'Myanmar',
                               country == 'Congo (Brazzaville)' ~ 'Congo, Rep.',
                               country == 'Congo (Kinshasa)' ~ 'Congo, Dem. Rep.',
                               country == 'Kyrgyzstan' ~ 'Kyrgyz Republic',
                               country == 'Kyrgyzstan' ~ 'Kyrgyz Republic',
                               country == 'US' ~ 'United States',
                               TRUE ~ country))

In [164]:
covid19 %>%
    filter(!country %in% country_data$country)

country,confirmed,death,rate
<chr>,<dbl>,<dbl>,<dbl>
Diamond Princess,712,13,0.01825843
Holy See,9,0,0.0
MS Zaandam,9,2,0.22222222
Saint Kitts and Nevis,15,0,0.0
Taiwan*,428,6,0.01401869


In [166]:
data <- merge(country_data, covid19, by = "country")

In [167]:
head(data)

country,age_1564,age_64up,age_0014,smok,air_polution,doctor,nurse_midwivies,confirmed,death,rate
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,54.3249,2.584927,43.09018,,56.91081,0.2907,,1351,43,0.03182828
Albania,68.58239,13.744736,17.67287,28.7,18.2006,,,678,27,0.03982301
Algeria,63.48882,6.362497,30.14868,15.6,38.88401,,,3127,415,0.13271506
Andorra,,,,33.5,10.30762,3.3333,4.0128,731,40,0.05471956
Angola,50.9747,2.216374,46.80892,,32.3885,,,25,2,0.08
Antigua and Barbuda,69.11908,8.799826,22.08109,,18.62234,,,24,3,0.125


## 6. Research questions



## 7. Data Analysis & Visualizations

## 8. Summary and conclusions
