In [None]:
library(jsonlite)
library(writexl)
library(readxl)
library(dplyr)
library(car)
library(readr)
library(tidyr)
library(geosphere)
library(rgdal)
# library(jtools)

# Display limits for tables rendered through the repr package:
# at most 200 rows and 20 columns.
options(
  repr.matrix.max.rows = 200,
  repr.matrix.max.cols = 20
)

In [None]:
# Stream the JSON records in "bkp/out.json" into `data`, flattening nested
# records into columns. The connection is closed in `finally` so it is
# released even when stream_in() fails part-way through the file.
datafile <- file("bkp/out.json", open = "r")
data <- tryCatch(
  stream_in(datafile, flatten = TRUE),
  finally = close(datafile)
)

In [None]:
# Recode treatment ethnicity as a factor. The two spellings
# "african american" and "african-american" share one label, so they end up
# in a single level; values outside the listed spellings become NA.
data[["treatment.ethnicity"]] <- factor(
  data[["treatment.ethnicity"]],
  levels = c("caucasian", "african american", "african-american", "asian", "hispanic"),
  labels = c("caucasian", "african-american", "african-american", "asian", "hispanic")
)
# levels(data$treatment.ethnicity)

# Remaining categorical identifiers become factors.
data[["treatment.gender"]] <- as.factor(data[["treatment.gender"]])
data[["type"]] <- as.factor(data[["type"]])
data[["scraper"]] <- as.factor(data[["scraper"]])
data[["block_id"]] <- as.factor(data[["block_id"]])
data[["agent_id"]] <- as.factor(data[["agent_id"]])

# NOTE(review): no `tz` is given, so the session time zone is used — confirm
# that matches how the timestamps were recorded.
data[["time"]] <- as.POSIXct(data[["time"]])

In [None]:
# levels(data$treatment.ethnicity)

In [None]:
# Keep only the records of type 'ranking'. `%in%` never yields NA, so records
# with a missing `type` are dropped instead of turning into all-NA rows.
rankings <- data[data$type %in% 'ranking', ]

# Drop records whose `ranking` entry is NULL. `is.null()` applied to the
# whole column returns a single TRUE/FALSE and therefore filters nothing;
# each element has to be tested on its own.
has_ranking <- !vapply(rankings$ranking, is.null, logical(1), USE.NAMES = FALSE)
rankings <- rankings[has_ranking, ]

# Remove the ad.* columns that the ranking analysis does not use.
rankings$ad.image_path <- NULL
# rankings$ad.query <- NULL
rankings$ad.title <- NULL
rankings$ad.url <- NULL
rankings$ad.body <- NULL

In [None]:
#' Add rank columns to one ranking and coerce its listing fields to numbers.
#'
#' @param df Data frame for a single ranking (one element of the `ranking`
#'   list column).
#' @return `df` with two extra columns, `idx` (1-based row number) and
#'   `position` (natural log of `idx`). Whichever of `price`, `beds`, `baths`,
#'   `latitude` and `longitude` are present are converted with
#'   `parse_number()`; a `beds` value of "Studio" is converted to 0.
format_ranking <- function(df) {
    # seq_len() yields an empty index for a ranking with no rows, whereas
    # seq.int(0) yields c(1, 0) and makes the column assignment fail.
    df$idx <- seq_len(NROW(df))
    df$position <- log(df$idx)

    if ("beds" %in% colnames(df)) {
        df$beds[df$beds == "Studio"] <- "0"
    }

    # Every field is checked on its own, so a ranking that has `latitude`
    # but no `longitude` no longer fails.
    for (field in c("price", "beds", "baths", "latitude", "longitude")) {
        if (field %in% colnames(df)) {
            df[[field]] <- parse_number(df[[field]], na = c("", "NA"))
        }
    }

    df
}

# Reference point used for the distance-to-Chicago columns.
# NOTE(review): the values appear to be in (latitude, longitude) order,
# whereas geosphere functions take points as (longitude, latitude).
chicago_coords <- c(
  41.881832,
  -87.623177
)

## Trulia rankings

In [None]:
# Rankings recorded by TruliaScraper for the Chicago listing URL only.
truliaRankings <- filter(
  rankings,
  scraper == 'TruliaScraper',
  url == 'https://www.trulia.com/IL/Chicago/'
)

# Add idx/position and numeric listing fields to every ranking.
truliaRankings$ranking <- lapply(truliaRankings$ranking, format_ranking)

# Number of Trulia rankings kept.
nrow(truliaRankings)

In [None]:
# Time elapsed between the earliest and the latest Trulia ranking.
difftime(max(truliaRankings$time), min(truliaRankings$time))

In [None]:
# Expand the `ranking` list column so each ranking row becomes its own row.
unnestedTruliaRankings <- unnest(truliaRankings, ranking)

# Give the address column a syntactic name, then drop rows without one.
colnames(unnestedTruliaRankings)[colnames(unnestedTruliaRankings) == "street address"] <- "street_address"
unnestedTruliaRankings <- unnestedTruliaRankings %>% filter(!is.na(street_address))

# Haversine distance in metres from the Chicago reference point.
# distHaversine() takes points as (longitude, latitude). `chicago_coords` is
# stored as (latitude, longitude), so it is reversed, and the columns are
# selected longitude-first. Passing both points latitude-first returns a
# wrong distance without any error.
unnestedTruliaRankings$dist <- apply(
  unnestedTruliaRankings[, c('longitude', 'latitude')],
  1,
  function(lon_lat) {
    distHaversine(rev(chicago_coords), lon_lat)
  }
)

In [None]:
# str(unnestedTruliaRankings)
# Distinct source URLs left in the unnested Trulia data.
unique(unnestedTruliaRankings[["url"]])

In [None]:
# ANOVA of log rank position on ethnicity, gender, price and all of their
# interactions for the Trulia listings.
# NOTE(review): aov() uses sequential sums of squares, so with unbalanced
# cells the order of the terms affects the table — confirm this is intended.
res.trulia <- aov(
  formula = position ~ treatment.ethnicity * treatment.gender * price,
  data = unnestedTruliaRankings
)
summary(res.trulia)

In [None]:
# Fitted coefficients of the Trulia model.
coef(res.trulia)

In [None]:
# Mean `price` per `treatment.gender` level among Trulia rows that have a
# street address. A named summarise() replaces the deprecated
# summarise_at(..., funs()) form and yields the same `price` column.
unnestedTruliaRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(treatment.gender) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean `price` per `treatment.ethnicity` level among Trulia rows that have a
# street address. A named summarise() replaces the deprecated
# summarise_at(..., funs()) form and yields the same `price` column.
unnestedTruliaRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(treatment.ethnicity) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean `price` for every ethnicity x gender combination among Trulia rows
# that have a street address. A named summarise() replaces the deprecated
# summarise_at(..., funs()) form and yields the same `price` column.
unnestedTruliaRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(treatment.ethnicity, treatment.gender) %>%
  summarise(price = mean(price, na.rm = TRUE))

## Realtor.com

In [None]:
# Rankings recorded by the Realtor.com scraper. filter() is used, as in the
# Trulia cell, because base logical indexing turns every record with a
# missing `scraper` into an all-NA row.
# NOTE(review): the other scrapers are named '...Scraper'; confirm that
# 'RealtorRanking' is the value actually stored for Realtor.com.
realtorRankings <- rankings %>%
  filter(scraper == 'RealtorRanking')

# Add idx/position and numeric listing fields to every ranking.
realtorRankings$ranking <- lapply(realtorRankings$ranking, format_ranking)
nrow(realtorRankings)
# str(realtorRankings)

In [None]:
# Expand the `ranking` list column so each ranking row becomes its own row.
unnestedRealtorRankings <- unnest(realtorRankings, ranking)

# Give the address column a syntactic name, then drop rows without one.
colnames(unnestedRealtorRankings)[colnames(unnestedRealtorRankings) == "street address"] <- "street_address"
unnestedRealtorRankings <- unnestedRealtorRankings %>% filter(!is.na(street_address))
# unnestedRealtorRankings <- unnestedRealtorRankings %>% filter(!is.na(street_address)) %>% filter(idx <= 10)

# Haversine distance in metres from the Chicago reference point.
# distHaversine() takes points as (longitude, latitude). `chicago_coords` is
# stored as (latitude, longitude), so it is reversed, and the columns are
# selected longitude-first. Passing both points latitude-first returns a
# wrong distance without any error.
unnestedRealtorRankings$dist <- apply(
  unnestedRealtorRankings[, c('longitude', 'latitude')],
  1,
  function(lon_lat) {
    distHaversine(rev(chicago_coords), c(lon_lat[1], lon_lat[2]))
  }
)

In [None]:
# Number of distinct street addresses in the Realtor.com listings.
n_distinct(unnestedRealtorRankings$street_address)

In [None]:
# Latest and earliest Realtor.com ranking timestamps, then the time between them.
max(realtorRankings[["time"]])
min(realtorRankings[["time"]])
difftime(max(realtorRankings[["time"]]), min(realtorRankings[["time"]]))

In [None]:
# ANOVA of log rank position on ethnicity, gender, price and all of their
# interactions for the Realtor.com listings.
# NOTE(review): aov() uses sequential sums of squares, so with unbalanced
# cells the order of the terms affects the table — confirm this is intended.
res.realtor <- aov(
  formula = position ~ treatment.ethnicity * treatment.gender * price,
  data = unnestedRealtorRankings
)
summary(res.realtor)

In [None]:
# Fitted coefficients of the Realtor.com model.
coef(res.realtor)

In [None]:
# Overall mean listing price on Realtor.com. na.rm = TRUE matches the grouped
# price means elsewhere in this file; without it one unparsable price makes
# the whole result NA.
mean(unnestedRealtorRankings$price, na.rm = TRUE)

In [None]:
# Distinct source URLs in the unnested Trulia data.
# NOTE(review): this sits in the Realtor.com section but inspects the Trulia
# frame — possibly a copy of the earlier cell; confirm whether
# unnestedRealtorRankings was intended.
unique(unnestedTruliaRankings[["url"]])

In [None]:
# Mean `price` per `treatment.gender` level among Realtor.com rows that have
# a street address. A named summarise() replaces the deprecated
# summarise_at(..., funs()) form and yields the same `price` column.
unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(treatment.gender) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean `price` of the first-ranked listing (idx == 1) per
# `treatment.ethnicity` level on Realtor.com. A named summarise() replaces the
# deprecated summarise_at(..., funs()) form and yields the same `price` column.
unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
  filter(idx == 1) %>%
  group_by(treatment.ethnicity) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean `price` for every ethnicity x gender combination among Realtor.com
# rows that have a street address. A named summarise() replaces the deprecated
# summarise_at(..., funs()) form and yields the same `price` column.
unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(treatment.ethnicity, treatment.gender) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean `price` of the fourth-ranked listing (idx == 4) per
# `treatment.ethnicity` level on Realtor.com. A named summarise() replaces the
# deprecated summarise_at(..., funs()) form and yields the same `price` column.
unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
  filter(idx == 4) %>%
  group_by(treatment.ethnicity) %>%
  summarise(price = mean(price, na.rm = TRUE))

In [None]:
# Mean Realtor.com `price` for every (rank index, ethnicity) pair.
# The original passed summarise_at()-style arguments (`vars(price)`,
# `funs(...)`) to summarise(), which does not produce the `price` column that
# the lm() fit on `dat` needs. A named summarise() does. The result is
# ungrouped so later steps are not affected by leftover grouping.
dat <- unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
#   filter(treatment.ethnicity == "african-american") %>%
  group_by(idx, treatment.ethnicity) %>%
  summarise(price = mean(price, na.rm = TRUE)) %>%
  ungroup()

dat
# plot(dat)

In [None]:
# dat
# Linear fit of the summarised price against rank index.
fit <- lm(
  formula = price ~ idx,
  data = dat
)
# fit
# effect_plot(fit, pred = idx, interval = TRUE, plot.points = TRUE)

In [None]:
# Number of Realtor.com rows for every (street address, ethnicity) pair,
# restricted to rows that have a street address.
unnestedRealtorRankings %>%
  filter(!is.na(street_address)) %>%
  group_by(street_address, treatment.ethnicity) %>%
  count()

## Redfin

In [None]:
# Rankings recorded by RedfinScraper. filter() is used, as in the Trulia
# cell, because base logical indexing turns every record with a missing
# `scraper` into an all-NA row.
redfinRankings <- rankings %>%
  filter(scraper == 'RedfinScraper')

# Add idx/position and numeric listing fields, then expand to one row per
# ranking row and inspect the resulting structure.
redfinRankings$ranking <- lapply(redfinRankings$ranking, format_ranking)
unnestedRedfinRankings <- unnest(redfinRankings, ranking)
str(unnestedRedfinRankings)
# res.redfin <- aov(position ~ treatment.ethnicity * treatment.gender * price, data=unnestedRedfinRankings)
# summary(res.redfin)

In [None]:
# Load the "Neighborhoods_2012b" layer from the GIS folder.
# NOTE(review): rgdal has been retired; moving this read to another package
# would add a new dependency, so it is left as is.
chicago_neighborhoods <- readOGR(
  dsn = "gis info/Neighborhoods_2012b",
  layer = "Neighborhoods_2012b"
)