In [1]:
install.packages("xml2")
install.packages("rvest")

library(httr)
library(rvest)

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
“installation of package ‘rvest’ had non-zero exit status”Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
Loading required package: xml2


In [2]:
# Endpoint and query string used to request the Wikipedia template page
# "Template:COVID-19_testing_by_country".
wiki_base_url <- "https://en.wikipedia.org/w/index.php"
query_params <- list(
    title = "Template:COVID-19_testing_by_country"
)

In [3]:
# Send a GET request for a Wikipedia page.
#
# Args:
#   wiki_base_url: URL of the endpoint to call.
#   query: Named list of query-string parameters sent with the request.
#
# Returns:
#   The httr response object. An HTTP error status is reported as a warning;
#   the response is still returned so the caller can inspect it.
get_wiki_covid19_page <- function(wiki_base_url, query) {
  response <- httr::GET(wiki_base_url, query = query)
  # A 4xx/5xx reply still comes back as a response object, so flag it here
  # rather than letting a later parsing step work on an error page.
  httr::warn_for_status(response)
  response
}

In [4]:
# Fetch the page using the base URL and query parameters
response <- get_wiki_covid19_page(
    wiki_base_url = wiki_base_url,
    query = query_params
)

In [13]:
# Parse the HTTP response into an HTML document
root_node <- read_html(x = response)

In [19]:
# Convert every HTML table in the document to a data frame, then keep the
# second one for the rest of the analysis
data_tables <- html_table(x = root_node)
data_frame <- data_tables[[2L]]


In [20]:
# Preview the first six rows of the raw table
head(data_frame, n = 6L)

Unnamed: 0_level_0,Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[1]
2,Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
3,Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
4,Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[5]
5,Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[6]
6,Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]


In [23]:
# Inspect the column names and types of the raw table
str(object = data_frame)

'data.frame':	173 obs. of  9 variables:
 $ Country or region      : chr  "Afghanistan" "Albania" "Algeria" "Andorra" ...
 $ Date[a]                : chr  "17 Dec 2020" "18 Feb 2021" "2 Nov 2020" "23 Feb 2022" ...
 $ Tested                 : chr  "154,767" "428,654" "230,553" "300,307" ...
 $ Units[b]               : chr  "samples" "samples" "samples" "samples" ...
 $ Confirmed(cases)       : chr  "49,621" "96,838" "58,574" "37,958" ...
 $ Confirmed /tested,%    : chr  "32.1" "22.6" "25.4" "12.6" ...
 $ Tested /population,%   : chr  "0.40" "15.0" "0.53" "387" ...
 $ Confirmed /population,%: chr  "0.13" "3.4" "0.13" "49.0" ...
 $ Ref.                   : chr  "[1]" "[2]" "[3][4]" "[5]" ...


In [28]:
# Clean the raw COVID-19 testing table.
#
# Args:
#   data_frame: Data frame with nine columns, in this order:
#     "Country or region", "Date[a]", "Tested", "Units[b]",
#     "Confirmed(cases)", three percentage columns, and "Ref.".
#   max_rows: Maximum number of leading rows to keep after the "World" row
#     has been removed (default 172). Inputs with fewer rows are returned in
#     full rather than padded with NA rows.
#
# Returns:
#   A data frame with factor columns `country` and `date`, and numeric columns
#   `tested`, `confirmed`, `confirmed.tested.ratio`,
#   `tested.population.ratio` and `confirmed.population.ratio`.
preprocess_covid_data_frame <- function(data_frame, max_rows = 172L) {
    # Strip thousands separators before converting text to numbers.
    to_number <- function(values) {
        as.numeric(gsub(",", "", values, fixed = TRUE))
    }

    # Remove the World row. `%in%` never returns NA, so a missing country
    # name cannot become an all-NA row the way `==` indexing would.
    is_world <- data_frame[["Country or region"]] %in% "World"
    data_frame <- data_frame[!is_world, ]

    # Keep only the leading rows. head() stops at the end of the table, so a
    # shorter input does not gain NA rows as it would with 1:max_rows.
    data_frame <- utils::head(data_frame, max_rows)

    # The Units and Ref columns are not needed for the analysis.
    data_frame["Ref."] <- NULL
    data_frame["Units[b]"] <- NULL

    # Rename the remaining seven columns.
    names(data_frame) <- c(
        "country", "date", "tested", "confirmed",
        "confirmed.tested.ratio", "tested.population.ratio",
        "confirmed.population.ratio"
    )

    # Convert column data types.
    data_frame$country <- as.factor(data_frame$country)
    data_frame$date <- as.factor(data_frame$date)
    numeric_columns <- c(
        "tested", "confirmed", "confirmed.tested.ratio",
        "tested.population.ratio", "confirmed.population.ratio"
    )
    for (column in numeric_columns) {
        data_frame[[column]] <- to_number(data_frame[[column]])
    }

    data_frame
}

In [29]:
# Clean the raw table, store the result in a new data frame, and preview it
postprocess <- preprocess_covid_data_frame(data_frame = data_frame)
head(postprocess, n = 6L)

Unnamed: 0_level_0,country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,Afghanistan,17 Dec 2020,154767,49621,32.1,0.4,0.13
2,Albania,18 Feb 2021,428654,96838,22.6,15.0,3.4
3,Algeria,2 Nov 2020,230553,58574,25.4,0.53,0.13
4,Andorra,23 Feb 2022,300307,37958,12.6,387.0,49.0
5,Angola,2 Feb 2021,399228,20981,5.3,1.3,0.067
6,Antigua and Barbuda,6 Mar 2021,15268,832,5.4,15.9,0.86


In [30]:
# Show the structure of the processed data frame
str(object = postprocess)

'data.frame':	172 obs. of  7 variables:
 $ country                   : Factor w/ 172 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ date                      : Factor w/ 141 levels "1 Aug 2021","1 Feb 2023",..: 32 38 49 60 44 124 29 90 141 2 ...
 $ tested                    : num  154767 428654 230553 300307 399228 ...
 $ confirmed                 : num  49621 96838 58574 37958 20981 ...
 $ confirmed.tested.ratio    : num  32.1 22.6 25.4 12.6 5.3 5.4 25.4 13.6 12.9 2.8 ...
 $ tested.population.ratio   : num  0.4 15 0.53 387 1.3 ...
 $ confirmed.population.ratio: num  0.13 3.4 0.13 49 0.067 0.86 20 14.3 40.3 65 ...


In [31]:
# Save the processed data frame as covid.csv, leaving out the row names
write.csv(x = postprocess, file = "covid.csv", row.names = FALSE)

In [32]:
# Get the working directory
wd <- getwd()
# Build the full path of the exported file inside it
file_path <- file.path(wd, "covid.csv")
# Show the path and check that the file exists
print(file_path)
file.exists(file_path)

[1] "/resources/labs/authoride/IBMSkillsNetwork+RP0101EN/v2/M5_Final/covid.csv"


In [34]:
# Load the exported CSV back in (comma-separated, first line is the header),
# then show rows 5 to 10 of the country and confirmed columns
covid_data_frame_csv <- read.csv(file = "covid.csv")
covid_data_frame_csv[5:10, c("country", "confirmed")]

Unnamed: 0_level_0,country,confirmed
Unnamed: 0_level_1,<fct>,<int>
5,Angola,20981
6,Antigua and Barbuda,832
7,Argentina,9060495
8,Armenia,422963
9,Australia,10112229
10,Austria,5789991


In [35]:
# Preview the first six rows of the data read from the CSV file
head(covid_data_frame_csv, n = 6L)

Unnamed: 0_level_0,country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<fct>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1,Afghanistan,17 Dec 2020,154767,49621,32.1,0.4,0.13
2,Albania,18 Feb 2021,428654,96838,22.6,15.0,3.4
3,Algeria,2 Nov 2020,230553,58574,25.4,0.53,0.13
4,Andorra,23 Feb 2022,300307,37958,12.6,387.0,49.0
5,Angola,2 Feb 2021,399228,20981,5.3,1.3,0.067
6,Antigua and Barbuda,6 Mar 2021,15268,832,5.4,15.9,0.86


In [36]:
library(MASS)

In [37]:
# Sum of the confirmed column
sum_confirmed <- sum(covid_data_frame_csv[["confirmed"]])
print(paste("Total confirmed cases worldwide:", sum_confirmed, "cases"))

# Sum of the tested column
sum_tested <- sum(covid_data_frame_csv[["tested"]])
print(paste("Total tested cases worldwide:", sum_tested, "cases"))

# Confirmed divided by tested, rounded to two decimals and printed as a
# fraction
positive_ratio <- round(sum_confirmed / sum_tested, digits = 2)
print(paste("Overall positive ratio:", fractions(positive_ratio)))

[1] "Total confirmed cases worldwide: 431434555 cases"
[1] "Total tested cases worldwide: 5396881644 cases"
[1] "Overall positive ratio: 2/25"


In [38]:
# Take the country column as a one-column data frame and preview it
country_col <- covid_data_frame_csv[, "country", drop = FALSE]
head(country_col, n = 6L)

# Class of the column when extracted as a vector
class(covid_data_frame_csv[["country"]])

# Convert the column to character and confirm the new class
country_col <- as.character(covid_data_frame_csv[["country"]])
class(country_col)

# Sort in increasing order, then in decreasing order
country_col <- sort(country_col)
country_col <- sort(country_col, decreasing = TRUE)

# Display the sorted names
country_col

Unnamed: 0_level_0,country
Unnamed: 0_level_1,<fct>
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola
6,Antigua and Barbuda


In [39]:
# Names containing "United" followed by at least one more character:
# first their positions in country_col, then the matching names themselves
grep(pattern = "United.+", x = country_col)
grep(pattern = "United.+", x = country_col, value = TRUE)

In [40]:
# Malaysia: country, confirmed count and confirmed-to-population ratio
country_1 <- covid_data_frame_csv[, "country", drop = FALSE] == "Malaysia"
malaysia <- covid_data_frame_csv[
    country_1,
    c("country", "confirmed", "confirmed.population.ratio")
]
malaysia

# Norway: the same three columns
country_2 <- covid_data_frame_csv[, "country", drop = FALSE] == "Norway"
norway <- covid_data_frame_csv[
    country_2,
    c("country", "confirmed", "confirmed.population.ratio")
]
norway

Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<int>,<dbl>
99,Malaysia,1880734,5.7


Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<int>,<dbl>
122,Norway,554778,10.3


In [41]:
# Select both countries with one filter. `%in%` tests every row against the
# whole set of names. Comparing with `==` against a two-element vector would
# recycle that vector down the rows, so a country would only match when it
# happens to sit at the right odd or even row position.
country_3 <- covid_data_frame_csv[["country"]] %in% c("Malaysia", "Norway")
combined <- covid_data_frame_csv[
    country_3,
    c("country", "confirmed", "confirmed.population.ratio")
]
combined

Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<int>,<dbl>
99,Malaysia,1880734,5.7
122,Norway,554778,10.3


In [42]:
# Row masks that pick each country out of the combined data frame
x <- combined[, "country", drop = FALSE] == "Malaysia"
y <- combined[, "country", drop = FALSE] == "Norway"

In [43]:
# Print which country has the larger confirmed-to-population ratio; the
# Norway message is also the one printed when the two ratios are equal
print(
    if (combined[x, "confirmed.population.ratio"] >
        combined[y, "confirmed.population.ratio"]) {
        "Malaysia has higher COVID-19 infection risk than Norway"
    } else {
        "Norway has higher COVID-19 infection risk than Malaysia"
    }
)

[1] "Norway has higher COVID-19 infection risk than Malaysia"


In [45]:
# List the rows whose confirmed-to-population ratio is below a threshold.
#
# Args:
#   data: Data frame with `country` and `confirmed.population.ratio` columns.
#   isLowerCPR: Ignored. Kept in the signature so existing calls that pass a
#     value in this position keep working.
#   threshold: Exclusive upper bound for the ratio (default 0.01).
#
# Returns:
#   The `country` and `confirmed.population.ratio` columns of the rows whose
#   ratio is below `threshold`. Rows with a missing ratio are left out.
z <- function(data, isLowerCPR, threshold = 0.01) {
    ratio <- data[["confirmed.population.ratio"]]
    # which() discards NA comparisons; indexing with the raw logical mask
    # would return an all-NA row for every missing ratio.
    keep <- which(ratio < threshold)
    data[keep, c("country", "confirmed.population.ratio")]
}

In [46]:
# List the low-ratio countries using the function's default threshold
z(data = covid_data_frame_csv)

Unnamed: 0_level_0,country,confirmed.population.ratio
Unnamed: 0_level_1,<fct>,<dbl>
28,Burundi,0.0074
34,China[c],0.0061
89,Laos,0.00063
119,North Korea,0.0
156,Tanzania,0.00085
