## Project

- Task 1: Get a COVID-19 pandemic Wiki page using HTTP request (2 pts)
- Task 2: Extract COVID-19 testing data table from the wiki HTML page (2 pts)
- Task 3: Pre-process and export the extracted data frame (2 pts)
- Task 4: Get a subset of the extracted data frame (2 pts)
- Task 5: Calculate worldwide COVID testing positive ratio (2 pts)
- Task 6: Get a sorted name list of countries that reported their testing data (2 pts)
- Task 7: Identify country names with a specific pattern (2 pts)
- Task 8: Pick two countries you are interested in, and then review their testing data (2 pts)
- Task 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population (2 pts)
- Task 10: Find countries with a confirmed-cases-to-population ratio less than a threshold (2 pts)

In [1]:
# Task 0
library(httr)
library(rvest)
library(tidyr)

In [2]:
# Task 1: Get a COVID-19 pandemic Wiki page using HTTP request (2 pts)

# Fetch the Wikipedia "COVID-19 testing by country" template page.
#
# Takes no arguments. Returns the httr response object for the page.
# Raises an error (via httr::stop_for_status) when the server answers with an
# HTTP error status, so that an error page is never handed to the HTML parser
# as if it contained the testing table.
get_wiki_covid19_page <- function() {
  # Target URL:
  #   https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country
  # It has two parts:
  #   1) the base URL `https://en.wikipedia.org/w/index.php`
  #   2) the URL parameter `title=Template:COVID-19_testing_by_country`,
  #      separated from the base by a question mark `?`
  wiki_base_url <- "https://en.wikipedia.org/w/index.php"

  # The query is a named list; its `title` element selects the wiki page.
  query_params <- list(title = "Template:COVID-19_testing_by_country")

  # `GET` builds the final URL from the `url` and `query` arguments and
  # performs the HTTP request.
  response <- GET(url = wiki_base_url, query = query_params)

  # Fail fast on 4xx/5xx responses instead of returning an error page.
  stop_for_status(response)

  response
}

In [3]:
# Call the get_wiki_covid19_page function; the result is left unassigned so the
# notebook prints the response (URL, date, status, content type, size and the
# first lines of the HTML body)
get_wiki_covid19_page()

Response [https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
  Date: 2024-02-27 10:51
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 448 kB
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-fea...
<head>
<meta charset="UTF-8">
<title>Template:COVID-19 testing by country - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-heade...
"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["...
"CS1 uses Russian-language script (ru)","CS1 Russian-language sources (ru)","...
,"CS1 Lithuanian-language sources (lt)","CS1 Malagasy-language sources (mg)",...
"wgRelevantArticleId":63303421,"wgIsProbablyEditable":false,"wgRelevantPageIs...
...

In [4]:
# Task 2: Extract COVID-19 testing data table from the wiki HTML page (2 pts)

# Request the page again (task 1), decode the body as UTF-8 text and parse it
# into the root HTML node
root_html_node <- read_html(
  content(get_wiki_covid19_page(), as = "text", encoding = "UTF-8")
)

# Collect every <table> node found under the root node
tables <- root_html_node %>% html_nodes("table")

# The second table on the page is used as the testing data; turn it into a
# plain data frame and preview the first rows
covid_testing_table <- tables[[2]] %>% html_table()
df <- as.data.frame(covid_testing_table)
head(df)


Unnamed: 0_level_0,Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[1]
2,Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
3,Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
4,Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[5]
5,Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[6]
6,Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]


In [5]:
# Task 3: Pre-process and export the extracted data frame (2 pts)

# Print the summary of the raw data frame; in the captured output every scraped
# column is still of class character, which is why the numeric columns are
# converted during pre-processing
summary(df)

 Country or region    Date[a]             Tested            Units[b]        
 Length:173         Length:173         Length:173         Length:173        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Confirmed(cases)   Confirmed /tested,% Tested /population,%
 Length:173         Length:173          Length:173          
 Class :character   Class :character    Class :character    
 Mode  :character   Mode  :character    Mode  :character    
 Confirmed /population,%     Ref.          
 Length:173              Length:173        
 Class :character        Class :character  
 Mode  :character        Mode  :character  

In [6]:
# Clean the scraped COVID-19 testing table.
#
# Args:
#   data_frame: the raw data frame extracted from the wiki page. It must hold
#     the nine scraped columns, including "Units[b]" and "Ref.", in their
#     original order.
#
# Returns:
#   A data frame without the trailing row and without the "Units[b]" / "Ref."
#   columns, with columns renamed to country, date, tested, confirmed,
#   confirmed.tested.ratio, tested.population.ratio and
#   confirmed.population.ratio. All columns except country and date are numeric.
preprocess_covid_data_frame <- function(data_frame) {
  # Remove the last row, whatever the table length is. The previous hard-coded
  # `1:172` padded shorter tables with NA rows and kept extra rows on longer
  # ones. NOTE(review): the trailing row is presumably the table's
  # footer/summary row -- confirm against the scraped page.
  data_frame <- head(data_frame, -1)

  # The Units and Ref columns are not needed
  data_frame[["Ref."]] <- NULL
  data_frame[["Units[b]"]] <- NULL

  # Rename the remaining seven columns
  names(data_frame) <- c(
    "country", "date", "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )

  # The scraped values are character strings that may contain thousands
  # separators; strip the commas before converting to numeric
  numeric_columns <- c(
    "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )
  data_frame[numeric_columns] <- lapply(
    data_frame[numeric_columns],
    function(values) as.numeric(gsub(",", "", values))
  )

  data_frame
}


In [7]:
# Run the raw table through `preprocess_covid_data_frame` and keep the cleaned
# result in a new data frame
new_df <- preprocess_covid_data_frame(df)
# Summarise the processed data frame to check the converted columns
summary(new_df)

   country              date               tested            confirmed       
 Length:172         Length:172         Min.   :     3880   Min.   :       0  
 Class :character   Class :character   1st Qu.:   512037   1st Qu.:   37839  
 Mode  :character   Mode  :character   Median :  3029859   Median :  281196  
                                       Mean   : 31377219   Mean   : 2508340  
                                       3rd Qu.: 12386725   3rd Qu.: 1278105  
                                       Max.   :929349291   Max.   :90749469  
 confirmed.tested.ratio tested.population.ratio confirmed.population.ratio
 Min.   : 0.00          Min.   :   0.006        Min.   : 0.000            
 1st Qu.: 5.00          1st Qu.:   9.475        1st Qu.: 0.425            
 Median :10.05          Median :  46.950        Median : 6.100            
 Mean   :11.25          Mean   : 175.504        Mean   :12.769            
 3rd Qu.:15.25          3rd Qu.: 156.500        3rd Qu.:16.250            
 Max

In [8]:
# Write the processed data frame to covid.csv, without a row-name column
write.csv(new_df, file = "covid.csv", row.names = FALSE)
# Current working directory
wd <- getwd()
# Full path of the exported file inside the working directory
file_path <- paste0(wd, "/covid.csv")
# Show the path and check that the file exists there
print(file_path)
file.exists(file_path)

[1] "c:/Users/hakan/PycharmProjects/VSCode/random/Notes/data_analytics_ibm/covid.csv"


In [9]:
# Task 4: Get a subset of the extracted data frame (2 pts)
# Read the exported CSV back in, then keep rows 5 through 10 and only the
# `country` and `confirmed` columns
dff <- read.csv("covid.csv")
subset_df <- dff[seq(5, 10), c("country", "confirmed")]
subset_df

Unnamed: 0_level_0,country,confirmed
Unnamed: 0_level_1,<chr>,<int>
5,Angola,20981
6,Antigua and Barbuda,832
7,Argentina,9060495
8,Armenia,422963
9,Australia,10112229
10,Austria,5789991


In [10]:
# Task 5: Calculate worldwide COVID testing positive ratio (2 pts)

# Both totals are accumulated as doubles. `confirmed` is read back from the CSV
# as an integer column, and summing integers in R returns NA (with a warning)
# once the total exceeds .Machine$integer.max.

# Get the total confirmed cases worldwide
confirmed <- sum(as.numeric(dff$confirmed), na.rm = TRUE)
# Get the total tested cases worldwide
total <- sum(as.numeric(dff$tested), na.rm = TRUE)
# Get the positive ratio (confirmed / tested)
ratio <- confirmed / total
ratio

In [11]:
# Task 6: Get a sorted name list of countries that reported their testing data (2 pts)

# Pull out the `country` column
country <- dff$country
# Inspect its class (character here; it would be a factor if the CSV had been
# read with strings converted to factors)
class(country)
# Convert the country column to character so that it sorts as plain text
country <- as.character(country)
class(country)
# Countries in A-to-Z order
sorted_countries <- sort(country)
# Countries in Z-to-A order
sort(country, decreasing = TRUE)

In [12]:
# Task 7: Identify country names with a specific pattern (2 pts)

# Keep the names matched by the regular expression `United.+`
# ("United" followed by at least one more character)
matches <- country[grepl("United.+", country)]
# Show the matched country names
matches

In [13]:
# Task 8: Pick two countries you are interested in, and then review their testing data (2 pts)
# For each country keep only the country, confirmed and
# confirmed.population.ratio columns

# United States
us_data <- dff[
  dff[["country"]] == "United States",
  c("country", "confirmed", "confirmed.population.ratio")
]
us_data

# Germany
germany_data <- dff[
  dff[["country"]] == "Germany",
  c("country", "confirmed", "confirmed.population.ratio")
]
germany_data

Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<chr>,<int>,<dbl>
166,United States,90749469,27.4


Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<chr>,<int>,<dbl>
60,Germany,3733519,4.5


In [14]:
# Task 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population (2 pts)

# The ratios were already converted to numeric during pre-processing; they are
# coerced again here purely as a precaution.
us_ratio <- as.numeric(us_data[["confirmed.population.ratio"]])
germany_ratio <- as.numeric(germany_data[["confirmed.population.ratio"]])

# Pick the message for greater / smaller / equal and print it once
print(
  if (us_ratio > germany_ratio) {
    "US has a greater COVID-19 infection risk."
  } else if (us_ratio < germany_ratio) {
    "Germany has a greater COVID-19 infection risk."
  } else {
    "U.S. and Germany have equal COVID-19 infection risks."
  }
)

[1] "US has a greater COVID-19 infection risk."


In [15]:
# Task 10: Find countries with a confirmed-cases-to-population ratio less than a threshold (2 pts)

# Threshold applied to `confirmed.population.ratio`
threshold <- 0.01

# Rows whose ratio is below the threshold (rows with a missing ratio are left out)
countries_below_threshold <- dff[which(dff$confirmed.population.ratio < threshold), ]

# Show only the country name and its ratio
selected_columns <- countries_below_threshold[, c("country", "confirmed.population.ratio")]
selected_columns

Unnamed: 0_level_0,country,confirmed.population.ratio
Unnamed: 0_level_1,<chr>,<dbl>
28,Burundi,0.0074
34,China[c],0.0061
89,Laos,0.00063
119,North Korea,0.0
156,Tanzania,0.00085


In [16]:
# Display the whole data frame that was read back from covid.csv
dff

country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
Afghanistan,17 Dec 2020,154767,49621,32.10,0.40,0.1300
Albania,18 Feb 2021,428654,96838,22.60,15.00,3.4000
Algeria,2 Nov 2020,230553,58574,25.40,0.53,0.1300
Andorra,23 Feb 2022,300307,37958,12.60,387.00,49.0000
Angola,2 Feb 2021,399228,20981,5.30,1.30,0.0670
Antigua and Barbuda,6 Mar 2021,15268,832,5.40,15.90,0.8600
Argentina,16 Apr 2022,35716069,9060495,25.40,78.30,20.0000
Armenia,29 May 2022,3099602,422963,13.60,105.00,14.3000
Australia,9 Sep 2022,78548492,10112229,12.90,313.00,40.3000
Austria,1 Feb 2023,205817752,5789991,2.80,2312.00,65.0000
