# Working Environment Setup

In [None]:
library(foreign)    # Importing package
library(psych)

```
Error: package or namespace load failed for ‘psych’ in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
 there is no package called ‘mnormt’
 ```

In [None]:
# Install psych from CRAN (the library(psych) call above failed to load).
install.packages(pkgs = "psych")

In [None]:
getwd()    # Where is the current working directory?

```
[1] "C:/Users/clayop/Documents"
```

In [None]:
# NOTE: this is a machine-specific absolute path; adjust it to your own copy
# of the workshop folder. The relative "./Data/..." paths used when importing
# the CSV files are resolved against this directory.
setwd("D:/Presentations/2018_PPD_R_Workshop")    # Changing working directory
getwd()    # Confirm that the change took effect

In [None]:
# A large scipen penalty makes R print numbers in fixed notation
# instead of scientific notation.
options(scipen = 999)

# Importing Data

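`read.csv` is part of base R (the `utils` package), so no extra package is needed to read CSV files; `library(foreign)` is only required for other formats such as SPSS, Stata or DBF.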

In [None]:
# Read both ACS tables; keep text columns as character rather than factor.
dp02 <- read.csv(
  "./Data/ACS_16_5YR_DP02/ACS_16_5YR_DP02_with_ann.csv",
  stringsAsFactors = FALSE
)
dp03 <- read.csv(
  "./Data/ACS_16_5YR_DP03/ACS_16_5YR_DP03_with_ann.csv",
  stringsAsFactors = FALSE
)

Let's do some checks

In [None]:
nrow(dp02)    # Number of rows in the DP02 table
nrow(dp03)    # Number of rows in the DP03 table

In [None]:
head(dp02)            # First rows (default: 6)
head(dp03, n = 10)    # First 10 rows
tail(dp02)            # Last rows (default: 6)

Oops! The first row is not real data. Let's remove it later.

Is a variable encoded in character or number?

In [None]:
dp02$HC01_VC03    # Dataframe$Variable: prints the whole column
head(dp02$HC01_VC03)    # Only the first few values of that column
class(dp02$HC01_VC03)    # Storage class of the column (e.g. character or numeric)

# Basic Data Manipulation

How to index data?

In [None]:
# Data frames are indexed as [row, column]; an empty slot means "all".
dp02[1, ]     # First row of the data frame
dp02[, 1]     # First column of the data frame
dp02[3, 5]    # Single cell: row 3, column 5

Let's remove the annoying first row

In [None]:
# A negative index drops that row: everything except row 1 is kept.
dp02a <- dp02[-1, ]
dp03a <- dp03[-1, ]
head(dp02a)

In [None]:
dp02a[1:5, ]       # Rows 1 to 5, all columns
dp03a[1:5, 2:3]    # Rows 1 to 5, columns 2 and 3

In [None]:
# Extract one column by its name and show the first few values.
head(dp02a[["GEO.id"]])

In [None]:
# Keep the identifier plus the variables of interest.
# Variable meanings: see "ACS_16_5YR_DP02_metadata.csv".
dp02b <- dp02a[
  ,
  c("GEO.id", "HC01_VC03", "HC01_VC21", "HC01_VC26", "HC01_VC85", "HC01_VC92")
]
# Variable meanings: see "ACS_16_5YR_DP03_metadata.csv".
dp03b <- dp03a[
  ,
  c("GEO.id", "HC01_VC03", "HC01_VC09", "HC01_VC36", "HC01_VC85", "HC01_VC161")
]

# Advanced Data Manipulation

## Rename variable names

In [None]:
# rename() is a dplyr verb, so dplyr must be attached before this cell runs.
library(dplyr)

# rename(data, new_name = old_name): give the ACS codes readable names.
dp02c <- rename(
  dp02b,
  "TOTHH" = "HC01_VC03",
  "AVRHHSIZE" = "HC01_VC21",
  "TOTPOP" = "HC01_VC26",
  "POP25OVER" = "HC01_VC85",
  "POP25GRADDEGREE" = "HC01_VC92"
)
dp03c <- rename(
  dp03b,
  "POP16OVER" = "HC01_VC03",
  "POP16NOLABOR" = "HC01_VC09",
  "COMMTIME" = "HC01_VC36",
  "MEDHHINC" = "HC01_VC85",
  "POVERTY" = "HC01_VC161"
)
head(dp02c)
head(dp03c)

## Modify variable class / Dealing with NA

In [None]:
# Try to compute persons per household directly from the renamed columns.
dp02c$TOTPOP / dp02c$TOTHH

```
Error in dp02b$TOTPOP/dp02b$TOTHH: 이항연산자에 수치가 아닌 인수입니다
Traceback:
```

In [None]:
class(dp02c$TOTPOP)    # Class of a single column
class(dp02c$TOTHH)
sapply(dp03c, class)    # Class of every column at once

In [None]:
# Work on copies so the renamed tables (dp02c, dp03c) stay untouched.
dp02d <- dp02c
dp03d <- dp03c

In [None]:
# Convert the two columns needed for the division to numeric.
dp02d[["TOTPOP"]] <- as.numeric(dp02d[["TOTPOP"]])
dp02d[["TOTHH"]] <- as.numeric(dp02d[["TOTHH"]])

In [None]:
# Assigning to a new name with $ adds a column: persons per household.
dp02d$AVRHHSIZE2 <- dp02d$TOTPOP / dp02d$TOTHH
head(dp02d)

In [None]:
dp02d$AVRHHSIZE2 <- NULL    # Assigning NULL removes the column again
head(dp02d)

In [None]:
# Convert columns 2 to 5 to numeric in one step.
# NOTE(review): column 6 (POVERTY) is left as character - confirm intended.
dp03d[, 2:5] <- sapply(dp03d[, 2:5], as.numeric)

In [None]:
# Check the class of every column after the conversion.
sapply(dp03d, class)

In [None]:
# Rows that contain at least one NA.
dp03d[!complete.cases(dp03d), ]

In [None]:
# Look up individual rows by row name in the unconverted table ...
dp03c["133", ]
dp03c["362", ]
dp03c["3078", ]
dp03c["5658", ]
# ... or fetch every row that is incomplete in dp03d in one go.
dp03c[row.names(dp03d[!complete.cases(dp03d), ]), ]

In [None]:
# Start over from the unconverted table so the original text values are back.
dp03d <- dp03c

In [None]:
# Recode the special text values before converting to numeric:
# the top-coded income becomes 250000, the missing markers become -9.
dp03d$MEDHHINC[dp03d$MEDHHINC %in% "250,000+"] <- "250000"
dp03d$MEDHHINC[dp03d$MEDHHINC %in% "-"] <- "-9"
dp03d$COMMTIME[dp03d$COMMTIME %in% c("-", "N")] <- "-9"

In [None]:
# Show the classes before converting, then convert columns 2 to 5 to numeric.
sapply(dp03d, class)
dp03d[, 2:5] <- sapply(dp03d[, 2:5], as.numeric)

In [None]:
# Re-check for rows with NA after the recoding.
dp03d[!complete.cases(dp03d), ]

In [None]:
# Show the classes before converting, then convert columns 2 to 6 to numeric.
sapply(dp02d, class)
dp02d[, 2:6] <- sapply(dp02d[, 2:6], as.numeric)

In [None]:
dp02d[!complete.cases(dp02d), ]    # Rows with at least one NA
nrow(dp02d)                        # Total number of rows
sum(!complete.cases(dp02d))        # Number of rows with NA

In [None]:
# Look up individual rows by row name in the unconverted table.
dp02c["420", ]
dp02c["5709", ]
dp02c["8061", ]

In [None]:
# Drop every row that contains at least one NA.
dp02e <- na.omit(dp02d)

In [None]:
dp02e[!complete.cases(dp02e), ]    # Rows with at least one NA
nrow(dp02e)                        # Total number of rows
sum(!complete.cases(dp02e))        # Number of rows with NA

## Subset dataframe

In [None]:
# subset(data, condition) keeps the rows for which the condition is TRUE.
subset(dp03d, COMMTIME == -9)
nrow(subset(dp03d, COMMTIME == -9))    # How many rows match

In [None]:
# Conditions can be combined: & means "and", | means "or", != means "not equal".
subset(dp03d, COMMTIME == -9 & MEDHHINC == -9)    # Both columns are -9
subset(dp03d, COMMTIME == -9 | MEDHHINC == -9)    # At least one column is -9
subset(dp03d, COMMTIME != -9 & MEDHHINC != -9)    # Neither column is -9
# Row counts for the same three conditions.
nrow(subset(dp03d, COMMTIME == -9 & MEDHHINC == -9))
nrow(subset(dp03d, COMMTIME == -9 | MEDHHINC == -9))
nrow(subset(dp03d, COMMTIME != -9 & MEDHHINC != -9))

In [None]:
# Keep only the rows where neither COMMTIME nor MEDHHINC is the -9 code.
dp03e <- subset(dp03d, COMMTIME != -9 & MEDHHINC != -9)

# Super Advanced Data Manipulation<sup>TM</sup>

In [None]:
# Preview both cleaned tables.
head(dp02e)
head(dp03e)

In [None]:
# substr(x, start, stop): take characters 10 to 20 of GEO.id as GEOID.
dp02e$GEOID <- substr(dp02e$GEO.id, start = 10, stop = 20)
dp03e$GEOID <- substr(dp03e$GEO.id, start = 10, stop = 20)
head(dp02e)
head(dp03e)

In [None]:
# Split GEOID further: characters 3-5 become COUNTY, characters 6-11 TRACT.
dp02e$COUNTY <- substr(dp02e$GEOID, start = 3, stop = 5)
dp02e$TRACT <- substr(dp02e$GEOID, start = 6, stop = 11)
dp03e$COUNTY <- substr(dp03e$GEOID, start = 3, stop = 5)
dp03e$TRACT <- substr(dp03e$GEOID, start = 6, stop = 11)
head(dp02e)
head(dp03e)

In [None]:
# Strings can be glued back together without a separator.
dp02e$GEOID2 <- paste0(dp02e$COUNTY, dp02e$TRACT)
head(dp02e)
dp02e$GEOID2 <- NULL    # Remove the demo column again

In [None]:
library(dplyr)

In [None]:
head(dp02e)    # Preview the table before aggregating

In [None]:
# Sum the count variables over all rows that share a COUNTY code.
# The grouping column of the result is named "Group.1" by aggregate().
dp02_county <- aggregate(
  dp02e[c("TOTHH", "TOTPOP", "POP25OVER", "POP25GRADDEGREE")],
  by = list(dp02e$COUNTY),
  FUN = sum
)

In [None]:
head(dp02_county)
# Give the grouping column a meaningful name (new = old).
dp02_county <- rename(dp02_county, COUNTY = Group.1)

In [None]:
# Join the two tables on every identifier column they share.
# TRACT alone is not a safe key: it is only the last part of GEOID, so the
# same tract code can occur under different COUNTY codes, and a TRACT-only
# join would pair unrelated rows. Using all shared identifiers also avoids
# duplicated GEO.id.x / GEO.id.y style columns in the result.
joined <- inner_join(
  dp02e,
  dp03e,
  by = c("GEO.id", "GEOID", "COUNTY", "TRACT")
)