# Tidy Data

- Each column = one variable (convention is to have "fixed" columns first)
- Each row = one observation
- Each cell = one value

In [None]:
library(tidyverse)

## Data set

In [None]:
# Build a small toy data set in "wide" format: one row per participant.
# `desc` packs two variables (sex and age) into one string such as "M-42",
# and `visit1` / `visit2` hold one measurement per visit in separate columns,
# so the table is deliberately NOT tidy yet.
# The values are random draws, so they differ between runs unless a seed
# has been set beforehand.
n <- 4
df <- data.frame(
  pid = c(1, 3, 4, 5),
  desc = paste0(
    sample(c("M", "F"), n, replace = TRUE),
    "-",
    sample(10:70, n)
  ),
  visit1 = rpois(n, lambda = 20),
  visit2 = rpois(n, lambda = 10)
)

In [None]:
# Simulate a missing measurement: blank out the first visit (column 3 of df)
# for the participant in row 3.
df[3, "visit1"] <- NA

In [None]:
# Display the wide-format data; row 3 of visit1 is now NA.
df

## Gather

In [None]:
# Wide -> long, choosing columns by exclusion: every column except pid and
# desc is stacked. The old column name goes into `visit` and the cell value
# into `measurement`.
df %>%
  gather("visit", "measurement", -pid, -desc)

In [None]:
# Same reshape as above, but with named key/value arguments and the columns
# to stack given as an explicit range (visit1 through visit2).
df %>%
  gather(key = "visit", value = "measurement", visit1:visit2)

## Separate

In [None]:
# After stacking the visit columns, split the combined `desc` string at the
# "-" so that sex and age each get their own column (one value per cell).
df %>%
  gather(key = "visit", value = "measurement", visit1:visit2) %>%
  separate(col = desc, into = c("sex", "age"), sep = "-")

## Clean-up and type coercion

In [None]:
# Full tidy-up pipeline; the result is stored in df1.
#   1. gather()   - stack visit1/visit2 into visit/measurement pairs
#   2. separate() - split desc at "-" into sex and age
#   3. mutate()   - coerce age to integer; strip the "visit" prefix from
#                   visit and coerce what remains to integer
#   4. drop_na()  - discard rows whose measurement is missing
# The assignment is written up front (df1 <- ...) so the target is visible
# before the pipeline rather than hidden after its last step.
df1 <- df %>%
  gather(key = visit, value = measurement, visit1:visit2) %>%
  separate(desc, sep = "-", into = c("sex", "age")) %>%
  mutate(
    age = as.integer(age),
    visit = str_remove(visit, "visit"),
    visit = as.integer(visit)
  ) %>%
  drop_na(measurement)

In [None]:
# Display the tidied result: one row per participant and visit, with rows
# lacking a measurement removed.
df1

## Joins

In [None]:
# Lookup table of participant names keyed by pid (1 through 6).
# It covers more pids than df does, which is what makes the join
# variants below produce different results.
names <- data.frame(
  pid = seq_len(6L),
  first = c("bob", "dan", "ann", "liz", "joe", "jen"),
  last = c("lim", "tan", "liu", "nguyn", "smith", "finkelstein")
)

In [None]:
# Display the name lookup table.
names

In [None]:
# Inner join: keep only pids that occur in both df and names
# (here 1, 3, 4 and 5).
df %>%
  inner_join(names, by = "pid")

In [None]:
# Left join: keep every row of df and attach the matching name columns.
# Every pid in df has a match in names, so no NAs are introduced here.
df %>%
  left_join(names, by = "pid")

In [None]:
# Right join: keep every row of names; pids 2 and 6 have no match in df,
# so their df columns (desc, visit1, visit2) are filled with NA.
df %>%
  right_join(names, by = "pid")

In [None]:
# Full join: keep all pids found in either table, filling the columns of
# the table that lacks a match with NA.
df %>%
  full_join(names, by = "pid")

## Exercise

**1**. Using the `who` data set, summarize the total count for each method of TB diagnosis, across all years for which there is data, for countries whose names begin with 'Z'.

In [None]:
# Open the help page for the `who` data set used in the exercise above.
help(who)