# Part 02 - Data Preprocessing with R

#### by Liew Jun Yen

## Packages Installation

In [1]:
# dplyr provides verbs for manipulating, cleaning and summarising tabular data.
# Install it only when it is not already available, so that re-running this
# cell does not download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

also installing the dependencies 'pkgconfig', 'withr', 'generics', 'magrittr', 'R6', 'tibble', 'tidyselect'




package 'pkgconfig' successfully unpacked and MD5 sums checked
package 'withr' successfully unpacked and MD5 sums checked
package 'generics' successfully unpacked and MD5 sums checked
package 'magrittr' successfully unpacked and MD5 sums checked
package 'R6' successfully unpacked and MD5 sums checked
package 'tibble' successfully unpacked and MD5 sums checked
package 'tidyselect' successfully unpacked and MD5 sums checked
package 'dplyr' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\USER\AppData\Local\Temp\Rtmpkz5RoX\downloaded_packages


In [2]:
# Read owid-covid-data.csv into a data frame; the first row of the file
# supplies the column names.
cov_data <- read.csv(
  file = "C:\\Users\\USER\\Desktop\\Portfolio Project\\owid-covid-data.csv",
  header = TRUE
)

## 2.1 Data Reduction

In [3]:
# [Feature Selection]
# Remove unwanted columns from the cov_data dataset.
#
# Columns are matched by name as plain strings rather than through
# subset(select = -c(...)): subset() evaluates the names non-standardly, so a
# listed name that is not a column of cov_data is looked up in the calling
# environment and may either fail with a bare "object not found" error or
# silently resolve to an unrelated variable.
cov_data <- local({
  cols_to_drop <- c(
    "iso_code",
    "new_cases_smoothed",
    "total_deaths",
    "new_deaths_smoothed",
    "total_cases_per_million",
    "new_cases_per_million",
    "new_cases_smoothed_per_million",
    "total_deaths_per_million",
    "new_deaths_per_million",
    "new_deaths_smoothed_per_million",
    "icu_patients",
    "icu_patients_per_million",
    "hosp_patients",
    "hosp_patients_per_million",
    "weekly_icu_admissions",
    "weekly_icu_admissions_per_million",
    "weekly_hosp_admissions",
    "weekly_hosp_admissions_per_million",
    "total_tests",
    "total_tests_per_thousand",
    "new_tests_per_thousand",
    "new_tests_smoothed",
    "new_tests_smoothed_per_thousand",
    "tests_units",
    "total_vaccinations",
    "new_vaccinations_smoothed",
    "total_vaccinations_per_hundred",
    "people_vaccinated_per_hundred",
    "people_fully_vaccinated_per_hundred",
    "total_boosters_per_hundred",
    "new_vaccinations_smoothed_per_million",
    "new_people_vaccinated_smoothed",
    "new_people_vaccinated_smoothed_per_hundred",
    "excess_mortality_cumulative_absolute",
    "excess_mortality_cumulative",
    "excess_mortality",
    "excess_mortality_cumulative_per_million"
  )

  # Fail fast with a readable message when a listed column is absent
  # (e.g. the CSV schema changed or this cell was already run once).
  missing_cols <- setdiff(cols_to_drop, names(cov_data))
  if (length(missing_cols) > 0) {
    stop(
      "Columns to drop not found in cov_data: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # drop = FALSE keeps the result a data frame regardless of how many
  # columns remain.
  cov_data[, !(names(cov_data) %in% cols_to_drop), drop = FALSE]
})

## 2.2 Data Cleaning

In [5]:
# Replace every NA with 0 in the columns listed below.
library(dplyr)

cov_data <- cov_data %>%
  mutate(
    across(
      all_of(c(
        "total_cases",
        "new_cases",
        "new_deaths",
        "reproduction_rate",
        "new_tests",
        "positive_rate",
        "tests_per_case",
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
        "new_vaccinations",
        "stringency_index",
        "population_density",
        "median_age",
        "aged_65_older",
        "aged_70_older",
        "gdp_per_capita"
      )),
      # coalesce() keeps existing values and substitutes 0 where the value is NA
      function(column) coalesce(column, 0)
    )
  )

In [6]:
# Replace every NA in the columns listed below with that column's own mean,
# computed over its non-missing values.
cov_data <- local({
  mean_fill_cols <- c(
    "extreme_poverty",
    "cardiovasc_death_rate",
    "diabetes_prevalence",
    "female_smokers",
    "male_smokers",
    "handwashing_facilities",
    "hospital_beds_per_thousand",
    "life_expectancy",
    "human_development_index"
  )

  imputed <- cov_data
  imputed[mean_fill_cols] <- lapply(
    imputed[mean_fill_cols],
    function(values) {
      values[is.na(values)] <- mean(values, na.rm = TRUE)
      values
    }
  )
  imputed
})

## 2.3 Data Transformation

In [7]:
# Rename the `location` column to `country`; all other column names are kept.
colnames(cov_data)[colnames(cov_data) %in% "location"] <- "country"

In [8]:
# Convert the `date` column to class Date, parsing values as year-month-day.
cov_data[["date"]] <- as.Date(cov_data[["date"]], format = "%Y-%m-%d")