# Setup and data ingestion

In [1]:
library(conflicted)

# Data manipulation
library(tidyverse)
conflict_prefer("filter", "dplyr")
library(reshape2)

# Data loading
library(jsonlite)
library(writexl)
library(readxl)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.2  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
[conflicted] Will prefer dplyr::filter over any other package


In [15]:
# Workbook settings
options(repr.matrix.max.rows=50, repr.matrix.max.cols=30)

First we need to import the data and tidy up the datatypes a bit

In [2]:
get_rel_file <- function(file) {
    return(file.path("output", "2018-12-07_2019-03-19", file))
}

In [3]:
datafile <- bzfile(get_rel_file("out.json.bz2"), open='r')
data <- stream_in(datafile, flatten=TRUE)
close(datafile)

 Imported 29863 records. Simplifying...


In [4]:
data$treatment.ethnicity <- fct_explicit_na(as.factor(data$treatment.ethnicity))
data$treatment.gender <- fct_explicit_na(as.factor(data$treatment.gender))
data$type <- as.factor(data$type)
data$scraper <- as.factor(data$scraper)
data$block_id <- as.factor(data$block_id)
data$agent_id <- as.factor(data$agent_id)

Now we split off just the ads from our data

In [17]:
ads <- data[data$type == 'ad', ]
ads$ranking <- NULL
nrow(ads)

Next we write out the data for coding

In [6]:
# str(data)

In [12]:
unique_ads <- unique(ads[c("ad.title", "ad.url", "ad.image_path")])
write_xlsx(unique_ads, get_rel_file("unique-ads.xlsx"))
nrow(unique_ads)

Now code the results, read them back in, and join them to the existing data

In [49]:
categorized_ads <- read_excel(get_rel_file("unique-ads-coded.xlsx"))
categorized_ads$ad.category <- as.factor(categorized_ads$category)
categorized_ads$category <- NULL
categorized_ads$ad.title <- gsub("\r\n", "\n", categorized_ads$ad.title)
levels(categorized_ads$ad.category)
# colnames(ads)
# colnames(categorized_ads)
# nrow(categorized_ads)

In [23]:
# google_ads <- ads %>%
#   filter(is.na(ad.image_path)) %>%
#   filter(url != "https://www.bbc.com/news/world")

# categorized_google_ads <- google_ads %>%
#   left_join(categorized_ads, by = c("ad.title", "ad.url", "ad.image_path"))

# nrow(google_ads)
# nrow(filter(categorized_google_ads,!is.na(ad.category)))
# nrow(filter(categorized_google_ads,is.na(ad.category)))

# filter(categorized_ads, grepl("www.centralilhomefinder.com/Champaign/Houses", ad.title, fixed=TRUE))[1:10,]
# filter(categorized_google_ads, is.na(ad.category))[1,]$ad.title
# filter(categorized_google_ads, is.na(ad.category))

# categorized_bbc_ads <- ads %>%
#   filter(url == "https://www.bbc.com/news/world") %>%
#   inner_join(categorized_ads, by = c("ad.title", "ad.url", "ad.image_path"))

# nrow(filter(ads, url == "https://www.bbc.com/news/world"))
# nrow(categorized_bbc_ads)

In [26]:
merged <- ads %>%
  inner_join(categorized_ads, by = c("ad.url", "ad.image_path", "ad.title"))

nrow(ads)
nrow(merged)

# Ad Analysis

## Overall ads

In [48]:
overall_ads <- merged %>%
  filter(ad.category == 'foreclosure' | ad.category == 'land' | ad.category == 'mobile homes' | ad.category == 'real estate education' | ad.category == 'real estate investment' | ad.category == 'realtor' | ad.category == 'rental' | ad.category == 'rent-to-own' | ad.category == 'rv' | ad.category == 'search engine' | ad.category == 'senior living')

nrow(overall_ads)
count(overall_ads, treatment.ethnicity, treatment.gender)
# str(overall_ads)

treatment.ethnicity,treatment.gender,n
african american,female,1219
african american,male,1308
african-american,female,87
african-american,male,77
asian,female,1384
asian,male,1422
caucasian,female,1340
caucasian,male,1490
hispanic,female,1397
hispanic,male,1389


In [42]:
# aggregator_ads <- count(subset(merged, ad.category == "aggregator"), agent_id, treatment.gender, treatment.ethnicity, scraper)
res.overall <- aov(n ~ treatment.ethnicity * treatment.gender, data = count(overall_ads, agent_id, treatment.ethnicity, treatment.gender, scraper))
summary(res.overall)

                                       Df Sum Sq Mean Sq F value Pr(>F)  
treatment.ethnicity                     5    331   66.26   1.144 0.3353  
treatment.gender                        1    159  158.80   2.742 0.0981 .
treatment.ethnicity:treatment.gender    4    128   32.00   0.553 0.6972  
Residuals                            1058  61283   57.92                 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

In [45]:
count(filter(overall_ads, ad.category == 'listing'), agent_id, treatment.ethnicity, treatment.gender)

ERROR: Error: `ad.category` (`ad.category = "listing"`) must not be named, do you need `==`?


In [47]:
res.listings <- aov(n ~ treatment.ethnicity * treatment.gender,
                    data = count(filter(merged, ad.category == 'listing'), agent_id, treatment.ethnicity, treatment.gender))
summary(res.listings)

                                     Df Sum Sq Mean Sq F value Pr(>F)
treatment.ethnicity                   4   3.10  0.7742   0.967  0.430
treatment.gender                      1   0.44  0.4406   0.550  0.460
treatment.ethnicity:treatment.gender  3   2.22  0.7406   0.925  0.432
Residuals                            88  70.47  0.8008               