DataJoiner.Rmd

---
title: "R Notebook"
output: html_notebook
---

```{r}
# load data.table for faster operations
library(data.table)
library(tidyverse)
```


```{r}
# use fread for fast reading of data csv files
  # ColumnNameCleaner.rmd should be run first
cases <- fread("Data/time_series_covid19_confirmed_US.csv")
vaccineCounty <- fread("Data/Vaccination/COVID-19_Vaccinations_in_the_United_States_County.csv")
#vaccineJurisdiction <- fread("Data/Vaccination/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")
pop <- fread("Data/Population/co-est2019-alldata.csv")
distancing <- fread("Data/Distancing/Unacast_Social_Distancing_Grades.csv")
masking <- fread("Data/Masking/mask-use-by-county.csv")
```


```{r}
# preview the data tables
head(cases)
head(vaccineCounty)
#head(vaccineJurisdiction)
head(distancing)
head(masking)
head(pop)
```


```{r}
# either create or fix the FIPS column using available data
cases[, FIPS := sprintf("%05d", cases[,FIPS])]
pop[, FIPS := sprintf("%02d%03d", pop[,STATE], pop[,COUNTY])]
# create a population density column too
distancing[, c("FIPS", "popDensity") := .(sprintf("%05d", county_fips), ((county_population / Shape__Area * 100) + 1))]
masking[, FIPS := sprintf("%05d", COUNTYFP)]
masking <- cbind("FIPS" = masking[, FIPS], masking[,!c("FIPS")] + 1)
```


```{r}
# drops unneeded columns
cases <- cases[, !c("V1", "UID", "iso2", "iso3", "code3", "CombinedKey")]
# selects the most recent population estimate (2019) and drops what will be redundant columns
years2019 <- grep("^([^0-9]*)$|2019", colnames(pop))
pop <- pop[, ..years2019][, !c("SUMLEV", "STATE", "COUNTY", "STNAME", "CTYNAME")]
# drops what will be redundant columns
distancing <- distancing[, !c("OBJECTID", "state_fips", "state_name", "county_fips", "county_name")]
masking <- masking[, !c("COUNTYFP")]
```


```{r}
# Overall total
totalCases <- grep("[0-9]{4}", colnames(cases))
cases[, TotalCases := rowSums(cases[, ..totalCases], na.rm = TRUE)]
# Total before vaccinations
# 22 January 2020 - 30 November 2020
# https://www.washingtonpost.com/nation/2020/12/14/first-covid-vaccines-new-york/
totalCases <- totalCases[1:314]
cases[, TotalCasesVaccines := rowSums(cases[, ..totalCases], na.rm = TRUE)]
head(cases)
```


```{r}
grades <- c("A+"=4.3,"A"=4,"A-"=3.7,"B+"=3.3,"B"=3,"B-"=2.7, "C+"=2.3,"C"=2,"C-"=1.7,"D+"=1.3,"D"=1,"D-"= 0.7, "F"=0)

gfunc <- function(x) {
  grades[as.character(x)]
}

distancing[, c("grade_total", "grade_distance", "grade_visitation", "grade_encounters") :=
             .(gfunc(grade_total), gfunc(grade_distance), gfunc(grade_visitation), gfunc(grade_encounters))]
```


```{r}
# set the FIPS column as the key for faster data.table operations
# time series
setkey(cases, FIPS)
setkey(vaccineCounty, FIPS)
setkey(distancing, FIPS)
# not time series
setkey(masking, FIPS)
setkey(pop, FIPS)
```


```{r}
# pivot_wider(vaccineCounty, names_from = Date, values_from = )
```


```{r}
csv <- cases[pop,][distancing,][masking,]
write.csv(csv, "Data/casesPopDistancingMasking.csv", row.names = FALSE)
openxlsx::write.xlsx(csv, "Data/casesPopDistancingMasking.xlsx", overwrite = TRUE)
dates <- grep("D[0-9]{4}_[0-9]{2}_[0-9]{2}", colnames(csv))
openxlsx::write.xlsx(csv[, !..dates], "Data/noDates.xlsx", overwrite = TRUE)
csv
```