# Setup and data ingestion

In [81]:
library(conflicted)

# Data manipulation
library(tidyverse)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
library(reshape2)

# Data loading
library(jsonlite)
library(writexl)
library(readxl)

# Map + location
library(rgdal)
library(geosphere)
library(raster)
library(maps)
library(car)

[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::select over any other package


In [50]:
# Notebook display limits: render at most 50 rows and 30 columns of a table
options(
    repr.matrix.max.rows = 50,
    repr.matrix.max.cols = 30
)

In [51]:
# Build the path to `file` inside the output directory of the
# 2018-12-07_2019-03-19 run.
#
# file: file name (or vector of names) relative to that directory.
# Returns the corresponding path(s) under "output/2018-12-07_2019-03-19".
get_rel_file <- function(file) {
    run_dir <- file.path("output", "2018-12-07_2019-03-19")
    file.path(run_dir, file)
}

In [52]:
# Open the bzip2-compressed JSON export and stream its records into a data frame.
datafile <- bzfile(get_rel_file("out.json.bz2"), open='r')
# Close the connection in `finally` so it is released even when parsing fails;
# a plain close() after stream_in() would be skipped on error and leak it.
data <- tryCatch(
    stream_in(datafile, flatten=TRUE),
    finally = close(datafile)
)

 Imported 29863 records. Simplifying...


In [53]:
# Encode the treatment ethnicity as a factor. The two spellings
# "african american" and "african-american" are mapped onto the single
# level "african-american" through duplicated labels.
data[["treatment.ethnicity"]] <- factor(
    data[["treatment.ethnicity"]],
    levels = c("caucasian", "african american", "african-american", "asian", "hispanic"),
    labels = c("caucasian", "african-american", "african-american", "asian", "hispanic")
)
# levels(data$treatment.ethnicity)

# Remaining categorical columns become factors
data[["treatment.gender"]] <- as.factor(data[["treatment.gender"]])
data[["type"]] <- as.factor(data[["type"]])
data[["scraper"]] <- as.factor(data[["scraper"]])
data[["block_id"]] <- as.factor(data[["block_id"]])
data[["agent_id"]] <- as.factor(data[["agent_id"]])

# Timestamps become POSIXct date-times
data[["time"]] <- as.POSIXct(data[["time"]])

In [54]:
# Keep only the records of type 'ranking'.
rankings <- data[data$type == 'ranking', ]
# Drop records that carry no ranking table. `is.null()` applied to the whole
# column returns a single value (FALSE for any list), so negating it kept every
# row; each record's ranking has to be tested individually.
rankings <- rankings[!vapply(rankings$ranking, is.null, logical(1)), ]
# Remove the ad-related columns from the ranking records.
rankings$ad.image_path <- NULL
# rankings$ad.query <- NULL
rankings$ad.title <- NULL
rankings$ad.url <- NULL
rankings$ad.body <- NULL

In [55]:
# Normalise one nested ranking table.
#
# Adds `idx` (1-based row number) and `position` (natural log of `idx`), and
# converts the `price`, `beds`, `baths`, `latitude` and `longitude` columns to
# numbers with parse_number() when they are present. A bed count of "Studio"
# is recorded as 0.
#
# df: data frame with one row per listing (presumably in on-page order, since
#     the row number is used as the rank -- confirm against the scraper).
# Returns `df` with the added and converted columns. A table with no rows is
# returned with empty `idx`/`position` columns instead of raising an error.
format_ranking <- function(df) {
    # seq_len() yields an empty index for zero rows; seq.int(0) would yield
    # c(1, 0) and the assignment below would fail on an empty table.
    df$idx <- seq_len(NROW(df))
    df$position <- log(df$idx)
    if ("price" %in% colnames(df)) {
        df$price <- parse_number(df$price)
    }
    if ("beds" %in% colnames(df)) {
        df$beds[df$beds == "Studio"] <- "0"
        df$beds <- parse_number(df$beds, na = c("", "NA"))
    }
    if ("baths" %in% colnames(df)) {
        df$baths <- parse_number(df$baths)
    }
    # longitude is converted together with latitude; only latitude is checked
    if ("latitude" %in% colnames(df)) {
        df$latitude <- parse_number(df$latitude)
        df$longitude <- parse_number(df$longitude)
    }
    df
}

In [56]:
# Reference point for Chicago, stored in (latitude, longitude) order.
chicago_coords <- c(41.881832, -87.623177)

# Load the "Neighborhoods_2012b" layer (neighborhood polygons) from the local GIS folder.
chicago_neighborhoods <- readOGR("./gis info/Neighborhoods_2012b", "Neighborhoods_2012b")

OGR data source with driver: ESRI Shapefile 
Source: "/home/asplund3/experiments/auditor/gis info/Neighborhoods_2012b", layer: "Neighborhoods_2012b"
with 98 features
It has 4 fields


# Trulia neighborhood analysis

## Data setup

In [57]:
# Select the Trulia ranking records and normalise each nested result table
nestedTruliaRankings <- rankings %>%
    filter(scraper == 'TruliaScraper') %>%
    mutate(ranking = lapply(ranking, format_ranking))
nrow(nestedTruliaRankings)

In [59]:
# Earliest and latest Trulia record, and the time span between them
with(nestedTruliaRankings, min(time))
with(nestedTruliaRankings, max(time))
with(nestedTruliaRankings, max(time) - min(time))

[1] "2018-11-21 17:02:05 CST"

[1] "2019-03-19 17:56:43 CDT"

Time difference of 117.9963 days

In [60]:
# Expand the nested tables so that every listing becomes its own row
truliaRankings <- unnest(nestedTruliaRankings, ranking)
# Give the address column a syntactic name (no-op if the column is absent)
names(truliaRankings)[names(truliaRankings) == "street address"] <- "street_address"
# Discard listings without an address
truliaRankings <- filter(truliaRankings, !is.na(street_address))
nrow(truliaRankings)

## Chicago neighborhood analysis

In [61]:
# Trulia listings returned for the Chicago search page that have coordinates
chicagoTR <- truliaRankings %>%
    filter(url == 'https://www.trulia.com/IL/Chicago/') %>%
    filter(!is.na(latitude)) %>%
    filter(!is.na(longitude))
nrow(chicagoTR)
# Great-circle distance from the Chicago reference point to each listing.
# distHaversine() expects points as (longitude, latitude); chicago_coords is
# stored as (latitude, longitude), so it is reversed here, and the listing
# columns are passed longitude-first. Passing both in (latitude, longitude)
# order, as before, silently produced wrong distances. The call is vectorised
# over the listing matrix, so no row-wise apply() is needed.
chicagoTR$dist <- distHaversine(
    rev(chicago_coords),
    cbind(chicagoTR$longitude, chicagoTR$latitude)
)

# Spatial copy of the listings, used only to look up the neighborhood polygon
cTR <- chicagoTR
coordinates(cTR) <- ~longitude+latitude
proj4string(cTR) <- CRS("+init=epsg:4326")
# Reproject the points into the neighborhood layer's coordinate system
cTR <- spTransform(cTR, proj4string(chicago_neighborhoods))
# PRI_NEIGH of the polygon containing each listing (NA when outside all polygons)
chicagoTR$neighborhood <- sp::over(cTR, chicago_neighborhoods)$PRI_NEIGH

# chicagoTR[1:10,]

In [64]:
# (chicagoTR %>%
#     count(neighborhood, treatment.ethnicity, sort = TRUE))[1:10,]

Are there neighborhoods that are predominantly advertised to one race?

In [66]:
# Listings per neighborhood and treatment ethnicity, reshaped to one row per
# neighborhood: `total` holds the neighborhood's overall count and each
# ethnicity becomes its own column (0 where a combination never occurred).
chiNeighborhoodCounts <- chicagoTR %>%
    count(neighborhood, treatment.ethnicity) %>%
    group_by(neighborhood) %>%
    mutate(total=sum(n)) %>%
    ungroup() %>%
    spread(treatment.ethnicity, n, fill = 0)

# chiNeighborhoodCounts

In [67]:
# Neighborhoods with more than 16 listings in total
cnc <- filter(chiNeighborhoodCounts, total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the ethnicity
# counts against equal shares (1/4 each) for the four ethnicity columns
cnc$a <- apply(
    select(cnc, -total, -neighborhood),
    1,
    function(row_counts) {
        chisq.test(unlist(row_counts), p = rep(1/4, 4))$p.value
    }
)

# Smallest p-value and the neighborhood it belongs to
summarize(cnc, min = min(a), neighborhood = neighborhood[which.min(a)])

min,neighborhood
0.7468217,Douglas


In [39]:
# Neighborhoods left out of the chi-squared test (16 listings or fewer)
filter(chiNeighborhoodCounts, total <= 16)

neighborhood,total,caucasian,african-american,asian,hispanic
Avalon Park,7,1,2,2,2
Avondale,7,1,2,2,2
Beverly,5,2,1,1,1
Brighton Park,16,4,4,4,4
Bucktown,4,1,2,0,1
Dunning,16,4,4,4,4
Hegewisch,6,1,1,2,2
Hermosa,7,1,2,2,2
Lake View,14,4,4,2,4
Loop,1,0,0,1,0


Since the lowest p-value seen in a chi-squared test is 0.747, and there are no clear outliers among the neighborhoods with n ≤ 16, we find no evidence of neighborhood discrimination.

# Realtor.com neighborhood analysis

## Data setup

In [68]:
# Select the Realtor.com ranking records. filter() is used, as for the Trulia
# records, because it drops rows whose `scraper` is NA; base logical indexing
# would instead insert all-NA rows for them.
nestedRealtorRankings <- filter(rankings, scraper == 'RealtorRanking')
# Normalise each nested result table
nestedRealtorRankings$ranking <- lapply(nestedRealtorRankings$ranking, format_ranking)
nrow(nestedRealtorRankings)

In [69]:
# Latest and earliest Realtor.com record, and the time span between them
max(nestedRealtorRankings$time)
min(nestedRealtorRankings$time)
# Both ends come from nestedRealtorRankings: `realtorRankings` is only created
# in a later cell, so referencing it here fails (or reads stale state) when the
# notebook is run top to bottom.
max(nestedRealtorRankings$time) - min(nestedRealtorRankings$time)

[1] "2019-03-19 17:32:10 CDT"

[1] "2018-12-07 12:43:04 CST"

Time difference of 102.1591 days

In [70]:
# Expand the nested tables so that every listing becomes its own row
realtorRankings <- unnest(nestedRealtorRankings, ranking)
# Give the address column a syntactic name (no-op if the column is absent)
names(realtorRankings)[names(realtorRankings) == "street address"] <- "street_address"
# Discard listings without an address
realtorRankings <- filter(realtorRankings, !is.na(street_address))
# realtorRankings$dist <- apply(realtorRankings[,c('latitude', 'longitude')], 1, function(x) { distHaversine(chicago_coords, c(x[1], x[2])) })

nrow(realtorRankings)

## Chicago neighborhood analysis

In [71]:
# List the distinct `url` values present in the Realtor.com listings
unique(realtorRankings$url)

In [72]:
# Realtor.com listings returned for the Chicago search page that have both
# coordinates available
chicagoRR <- filter(
    realtorRankings,
    url == 'https://www.realtor.com/realestateandhomes-search/Chicago_IL',
    !is.na(latitude),
    !is.na(longitude)
)

nrow(chicagoRR)

In [73]:
# Spatial copy of the listings, used only to look up the neighborhood polygon;
# chicagoRR itself stays an ordinary data frame.
cRR <- chicagoRR
# Declare which columns hold the point coordinates (x = longitude, y = latitude)
coordinates(cRR) <- ~longitude+latitude
# The listing coordinates are tagged as EPSG:4326
proj4string(cRR) <- CRS("+init=epsg:4326")
# Reproject the points into the neighborhood layer's coordinate system
cRR <- spTransform(cRR, proj4string(chicago_neighborhoods))
# PRI_NEIGH of the polygon matched to each listing
chicagoRR$neighborhood <- sp::over(cRR, chicago_neighborhoods)$PRI_NEIGH
# Convert to character so the fallback below can store arbitrary names
chicagoRR$neighborhood <- as.character(chicagoRR$neighborhood)
# Listings with no matched neighborhood fall back to their `locality` value
chicagoRR$neighborhood[is.na(chicagoRR$neighborhood)] <- chicagoRR$locality[is.na(chicagoRR$neighborhood)]

# chicagoRR[1:10,]

In [75]:
# Listings per neighborhood and treatment ethnicity, reshaped to one row per
# neighborhood: `total` holds the neighborhood's overall count and each
# ethnicity becomes its own column (0 where a combination never occurred).
chiNeighborhoods <- chicagoRR %>%
    count(neighborhood, treatment.ethnicity) %>%
    group_by(neighborhood) %>%
    mutate(total=sum(n)) %>%
    ungroup() %>%
    spread(treatment.ethnicity, n, fill = 0)

# chiNeighborhoods[1:5,]

In [80]:
# Neighborhoods with more than 16 listings in total
cnc <- filter(chiNeighborhoods, total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the ethnicity
# counts against equal shares (1/4 each) for the four ethnicity columns
cnc$pval <- apply(
    select(cnc, -total, -neighborhood),
    1,
    function(row_counts) {
        chisq.test(unlist(row_counts), p = rep(1/4, 4))$p.value
    }
)

# Neighborhoods whose counts deviate from equal shares at the 0.1 level
filter(cnc, pval < 0.1)

neighborhood,total,caucasian,african-american,asian,hispanic,pval
Lake View,63,19,8,15,21,0.09919499


In [79]:
# Neighborhoods left out of the chi-squared test (16 listings or fewer),
# largest first, ties ordered by ascending caucasian count
cnc2 <- arrange(
    filter(chiNeighborhoods, total <= 16),
    desc(total),
    caucasian
)

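Possible redlining in the Lake View area: it is the only neighborhood with p < 0.1 (p ≈ 0.099). Note that this p-value is not corrected for testing many neighborhoods at once, so the result is suggestive rather than conclusive.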