## A.1 Install packages

In [5]:
#Step 1
#install.packages('downloader')
#install.packages('foreign')
#install.packages('dplyr')
#install.packages('ggplot2')

In [6]:
#Step 2
library(foreign) #Imports dta files
library(dplyr) #Data manipulation
library(downloader) #Downloads files from the internet
library(ggplot2)

## A.2 Data set 

In [8]:
# Location of the zipped data set
url <- "http://www.ennvih-mxfls.org/english/assets/hh02dta_bc.zip"
# Local file name for the downloaded archive
file_name <- "mxfls.zip"
# download.file() ships with base R (utils package); the downloader package
# loaded above is not needed for this call. mode = "wb" forces a binary
# transfer so the zip archive cannot be corrupted by a text-mode download.
download.file(url, file_name, mode = "wb")
# Extract the archive that was just downloaded (same variable, so the two
# steps cannot drift apart if the file name is changed)
unzip(file_name)
# read.dta() (foreign package) imports the Stata file from the extracted folder
df <- read.dta("hh02dta_bc/c_ls.dta")
df %>% head(3)

folio,ls,secuencia,ls00,ls02_1,ls02_2,ls03_1,ls03_21,ls03_22,ls04,...,ls09,ls10,ls11,ls12,ls13_1,ls13_2,ls14,ls15_1,ls16,ls18
1000,1,1,1,1,37,,,,1,...,1,5,2.0,1,1.0,32000.0,3,6.0,3,
1000,2,2,2,1,35,,,,3,...,1,5,1.0,3,,,1,,3,
1000,3,3,3,1,16,,,,3,...,1,6,,3,,,6,0.0,1,2.0


## B.1 Problem: "Could not find function “%>%”"

In [12]:
# Give the columns used in the rest of the notebook readable names.
# rename() takes pairs of the form new_name = old_name.
df_renamed <- df %>%
  rename(
    Household_ID = folio,
    Individual_ID = ls,
    Age = ls02_2,
    Gender = ls04,
    Attendance = ls16
  )

# Preview the first three rows with the new column names
df_renamed %>% head(3)

Household_ID,Individual_ID,secuencia,ls00,ls02_1,Age,ls03_1,ls03_21,ls03_22,Gender,...,ls09,ls10,ls11,ls12,ls13_1,ls13_2,ls14,ls15_1,Attendance,ls18
1000,1,1,1,1,37,,,,1,...,1,5,2.0,1,1.0,32000.0,3,6.0,3,
1000,2,2,2,1,35,,,,3,...,1,5,1.0,3,,,1,,3,
1000,3,3,3,1,16,,,,3,...,1,6,,3,,,6,0.0,1,2.0


## B.2 Recode

In [13]:
# Turn the 1/3 answer codes of Attendance and Gender into 1/0 dummies
# (code 1 stays 1, code 3 becomes 0).
df_renamed <- mutate(
  df_renamed,
  Attendance = recode(Attendance, "1" = 1, "3" = 0),
  Gender = recode(Gender, "1" = 1, "3" = 0)
)

# List the distinct values left in Attendance after recoding
unique(select(df_renamed, Attendance))

Unnamed: 0,Attendance
1,0.0
3,1.0
22,


# 1. Troubleshooting in R

## 1.1 PROBLEM: Column 'NAME' is not found


In [14]:
#Method 1
# This cell reproduces the error named in the section title: the column
# `folio` was renamed to `Household_ID` when df_renamed was created, so
# group_by(folio) cannot find it. Grouping by Household_ID resolves the error.
df_renamed %>% 
  group_by(folio) %>% #Groupby house
  count() %>% #Counts each member in each house
  summary() #Shows summary stats

ERROR: Error: Column `folio` is unknown


In [None]:
# Method 2: one row per household with its number of members; the mean and
# standard deviation of those counts are repeated on every row.
group_by(df_renamed, Household_ID) %>%
  summarise(members = n()) %>%
  mutate(
    members_mean = mean(members),
    members_sd = sd(members)
  )

In [None]:
# Method 3: standard deviation of household size as a single number
# (replace sd() with mean() to get the average size instead).
df_renamed %>%
  count(Household_ID) %>% # one row per household, size stored in column n
  pull(n) %>% # extract column n as a plain vector
  sd()

## 1.2 PROBLEM: "Problem with function()"


In [15]:
# Method 1
# This cell reproduces the error shown below it: inside filter(), `Age=18`
# is read as a named argument rather than a comparison. Conditions need a
# comparison operator such as `==` (equal to) or `<` (less than).
df_renamed %>% 
  filter(Age=18) %>% 
  group_by(Household_ID) %>% 
  count() %>% 
  summary()

ERROR: `Age` (`Age = 18`) must not be named, do you need `==`?

In [None]:
# Standard deviation, across households, of the number of members under 18.
# Households without any member under 18 do not appear in the counts.
df_renamed %>%
  filter(Age < 18) %>%
  count(Household_ID) %>% # rows per household, stored in column n
  pull(n) %>%
  sd()

## 1.3 PROBLEM: object 'NAME OF DATA' not found


In [17]:
# Load the second Stata file from the extracted archive into its own data frame
df_dwelling <- read.dta("hh02dta_bc/c_cv.dta")
# Show the first three rows
head(df_dwelling, 3)

folio,cv01_1,cv02_1,cv03_1,cv03_2,cv04_1,cv04_2,cv05,cv06,cv07,...,cv19_1e,cv19_1f,cv19_1g,cv19_1h,cv20_1a,cv20_1b,cv20_1c,cv20_1d,cv20_1e,cv20_1f
1000,3,4,,,,,1,3,2,...,,,,,1,,,4.0,,
2000,3,2,,,,,1,3,2,...,,,,,1,,,4.0,,
3000,3,2,,,,,1,3,1,...,,,,,1,2.0,,,,


### 1.3.1 Solution: Method 1

In [18]:
# This cell reproduces the "object not found" error from the section title:
# the pipeline starts from df_renamed, which has no column named cv16.
# NOTE(review): cv16 presumably belongs to df_dwelling (its columns are named
# cv*), so starting the pipeline from df_dwelling should fix it -- confirm.
# The filter below compares against the string 'NA'; !is.na(toliet_dummy) is
# the explicit way to drop missing values.
df_renamed  %>% 
  mutate(toliet_dummy = as.numeric(cv16==1)) %>% 
  filter(toliet_dummy!='NA') %>% 
  summarise(
    mean_toliet = mean(toliet_dummy),
    sd_toliet = sd(toliet_dummy)
  )

ERROR: Error: object 'cv16' not found


# 1.4 Problem: incorrect spacing, pipes, etc

## 1.4.1 Explore data

In [20]:
df_dwelling %>% # NOTE: the last line of this chain ends with a dangling %>%,
  select(cv07) %>% # so R reaches the end of the cell while still expecting another step.
  unique() %>% 
  

ERROR: Error in parse(text = x, srcfile = src): <text>:5:0: unexpected end of input
3:   unique() %>% 
4:   
  ^


  ### 1.4.2 Answer


In [21]:
# Percentage of rows whose cv07 equals 0, and the standard deviation of that
# 0/1 indicator scaled by 100.
df_dwelling %>%
  transmute(cv07, no_sleeping_dummy = as.numeric(cv07 == 0)) %>%
  summarise(
    mean(no_sleeping_dummy)*100,
    sd(no_sleeping_dummy)*100
  )

mean(no_sleeping_dummy) * 100,sd(no_sleeping_dummy) * 100
0.01184834,1.088501


# 1.5 Problem: break down function into pieces

## 1.5.1 Explore data

In [22]:
# Frequency table of the values in cv20_1a, most frequent value first
sort(table(select(df_dwelling, cv20_1a)), decreasing = TRUE)

### 1.5.2 Method 1: replace function

In [23]:
# This cell reproduces the error shown below it: mean() refers to
# `firewoo_dummy`, a misspelling of the `firewood_dummy` column created in
# mutate(), so the object cannot be found.
# replace() only turns missing values of cv20_1a into 0; every non-missing
# value is kept as it is.
df_dwelling %>% 
  mutate(
    firewood_dummy = replace(cv20_1a, is.na(cv20_1a), 0)
  ) %>% 
  summarise(
    mean(firewoo_dummy),
    sd(firewood_dummy)
  )

ERROR: Error in mean(firewoo_dummy): object 'firewoo_dummy' not found


## 1.5.3 Method 2: case_when

In [24]:
df_dwelling %>% 
  mutate(firewood_dummy = case_when(cv20_1a==1~1, TRUE~0)) %>% 
  summarise(
    mean(firewood_dummy)
    sd(firewood_dummy)
  )
# The parse error for this cell comes from the missing comma after
# mean(firewood_dummy): arguments inside summarise() must be separated by
# commas, so R stops at the unexpected symbol `sd` on the following line.
# case_when() above yields 1 where cv20_1a equals 1 and 0 in every other case.

ERROR: Error in parse(text = x, srcfile = src): <text>:5:5: unexpected symbol
4:     mean(firewood_dummy)
5:     sd
       ^


# 2.  Problem: no NA filter
# 2.1 average age

In [26]:
# Keep only the rows whose ls05_1 code equals 1 (the household heads)
df_head <- filter(df_renamed, ls05_1 == 1)

In [27]:
# Hint: remove the missing values before summarising.
# The result of this cell is empty (NA) for both statistics, which is what
# mean() and sd() return when their input contains missing values.
# Either drop the missing ages first, e.g. filter(!is.na(Age)), or pass
# na.rm = TRUE to mean() and sd().
df_head %>% 
  #filter(Age!= "NA") %>% 
  summarise(
    mean(Age),
    sd(Age)
  )

mean(Age),sd(Age)
,


# 2.2 Problem: Missing pipe operator

In [None]:
# Exercise cell for the section title: there is no %>% after filter(), so the
# first statement ends on that line and summarise() below runs as a separate
# call that never receives the filtered data. Adding %>% at the end of the
# filter() line joins the two steps.
# The filter compares against the string "NA"; !is.na(Gender) is the explicit
# way to drop missing values.
df_head %>% 
  filter(Gender!= "NA") 
summarise(
  mean(Gender),
  sd(Gender)
)

## Omit 2.3 & 2.4

# 3. Problem solving: Breakdown function

In [31]:
# Keep the rows with Age above 5 and below 19; filter() combines its
# comma-separated conditions with a logical AND.
df_ages <- filter(df_renamed, Age > 5, Age < 19)

In [32]:
# First piece of the pipeline: keep the rows with a recorded Attendance value
# and preview two of them.
# is.na() is the explicit missing-value test. Comparing against the string
# "NA" only drops missing rows as a side effect of the comparison itself
# returning NA for them.
df_ages %>%
  filter(!is.na(Attendance)) %>%
  head(2)

Household_ID,Individual_ID,secuencia,ls00,ls02_1,Age,ls03_1,ls03_21,ls03_22,Gender,...,ls09,ls10,ls11,ls12,ls13_1,ls13_2,ls14,ls15_1,Attendance,ls18
1000,3,3,3,1,16,,,,0,...,1,6.0,,3,,,6,0,1,2
1000,4,4,4,1,11,,,,1,...,1,,,3,,,3,4,1,2


In [None]:
# Complete pipeline: average Attendance by Age, drawn as a bar chart.
# The cell now starts from df_ages (a chain cannot begin with a bare %>%) and
# repeats the missing-value filter so it runs on its own.
df_ages %>%
  filter(!is.na(Attendance)) %>%
  group_by(Age) %>%
  summarise(
    # Attendance is a 0/1 dummy, so its mean times 100 is a percentage
    Avg_attend = mean(Attendance) * 100
  ) %>%
  ggplot(aes(x = Age, y = Avg_attend)) +
  geom_bar(stat = "identity") +
  # Labels follow the aesthetics: Age is on x, average attendance on y
  xlab("Age") +
  ylab("Attendance (%)")

## 3.1.2 Omitted


# 3.2 Omitted

# 4. Problem: Too much code!

## 4.1 My solution


In [None]:
# Number of distinct households; used as the denominator of the share below
total_houses <- df_renamed %>% select(Household_ID) %>% unique() %>% nrow()

# Share of households made up of exactly one head, exactly one spouse, at
# least one child and at least one other resident.
df_renamed %>%
  group_by(Household_ID) %>% # one group per household
  # Count the members in each role directly: summing a logical condition
  # counts the rows where it is TRUE. This replaces the separate dummy
  # columns and the deprecated summarise_each(funs(sum)) step.
  summarise(
    house_head = sum(ls05_1 == 1),
    house_spouse = sum(ls05_1 == 2),
    house_child = sum(ls05_1 == 3),
    # everyone who is not head, spouse or child
    house_other_resident = sum(ls05_1 != 3 & ls05_1 != 2 & ls05_1 != 1)
  ) %>%
  filter(
    house_head == 1 & # exactly one household head
      house_spouse == 1 & # exactly one spouse
      house_child > 0 & # at least one child
      # At least one other resident.
      # NOTE(review): the earlier comment on this condition read "no other
      # types of residents", which would be `== 0` -- confirm the intent.
      house_other_resident != 0
  ) %>%
  count() / total_houses # matching households divided by all households

## 4.2 A brilliant student's solution

In [None]:
# Share of households in which at least one member has a relationship code
# above 4. Uses total_houses, computed in the 4.1 cell, as the denominator.
df_renamed %>%
  rename(relatheadhh = ls05_1) %>%
  # is.na() is the explicit missing-value test (instead of comparing to "NA")
  filter(!is.na(relatheadhh)) %>%
  # 1 when the member's relationship code is greater than 4, otherwise 0
  mutate(relatheadhh2 = as.numeric(relatheadhh > 4)) %>%
  group_by(Household_ID) %>%
  # A household gets 1 as soon as any of its members is flagged
  summarise(extfamilies = max(relatheadhh2)) %>%
  filter(extfamilies == 1) %>%
  summarise(pct = n() / total_houses)