# Setup and data ingestion

In [30]:
library(conflicted)
library(IRdisplay)

# Data manipulation
library(tidyverse)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
library(reshape2)

# Data loading
library(jsonlite)
library(writexl)
library(readxl)

# Map + location
library(rgdal)
library(geosphere)
library(raster)
library(maps)
library(car)


Please cite as: 

 Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.2. https://CRAN.R-project.org/package=stargazer 

[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::select over any other package


In [2]:
# Notebook display settings: limit how many rows and columns of a table are
# rendered in cell output.
options(repr.matrix.max.rows = 50)
options(repr.matrix.max.cols = 30)

In [3]:
# Build the path of `file` inside the output directory of the
# 2018-12-07 .. 2019-03-19 collection run.
#
# Args:
#   file: file name (character vector) relative to the run directory.
# Returns:
#   Character vector of paths of the form "output/<run>/<file>".
get_rel_file <- function(file) {
    run_dir <- "2018-12-07_2019-03-19"
    file.path("output", run_dir, file)
}

In [4]:
# Stream the bz2-compressed, line-delimited JSON export into a data frame.
# flatten=TRUE turns nested objects into dotted columns (e.g.
# treatment.ethnicity).
datafile <- bzfile(get_rel_file("out2.json.bz2"), open='r')
# `finally` closes the connection even when stream_in() fails part-way, so a
# malformed record cannot leave the file handle open for the rest of the session.
data <- tryCatch(
    stream_in(datafile, flatten=TRUE),
    finally = close(datafile)
)

 Imported 29863 records. Simplifying...


In [5]:
# Keep only records that carry both treatment attributes, then convert the
# categorical columns to factors and the timestamp to POSIXct.
data <- data %>%
  filter(!is.na(treatment.ethnicity), !is.na(treatment.gender)) %>%
  mutate(
    treatment.ethnicity = as.factor(treatment.ethnicity),
    treatment.gender = as.factor(treatment.gender),
    type = as.factor(type),
    scraper = as.factor(scraper),
    block_id = as.factor(block_id),
    agent_id = as.factor(agent_id),
    time = as.POSIXct(time)
  )

In [6]:
# Keep only the 'ranking' records. %in% (unlike ==) never returns NA, so a
# record with a missing type cannot slip through as an all-NA row.
rankings <- data[data$type %in% 'ranking', ]
# is.null() is not vectorised: applied to the whole list column it returns a
# single logical and therefore filters nothing. Test each record's ranking
# individually so records without a ranking table are actually dropped.
rankings <- rankings[!vapply(rankings$ranking, is.null, logical(1)), ]
# Drop the ad.* columns from the ranking subset.
rankings$ad.image_path <- NULL
rankings$ad.title <- NULL
rankings$ad.url <- NULL
rankings$ad.body <- NULL

In [7]:
# Normalise one ranking table (one row per listing, in display order).
#
# Adds two columns:
#   idx      - 1-based row position of the listing within the ranking
#   position - natural log of idx
# and converts whichever of price / beds / baths / latitude / longitude are
# present to numeric with parse_number(). A bed count of "Studio" is recoded
# to "0" before parsing.
#
# Args:
#   df: data frame holding a single ranking.
# Returns:
#   df with the added and converted columns.
format_ranking <- function(df) {
    # seq_len() rather than seq.int(): for an empty ranking seq.int(0) is
    # c(1, 0), which cannot be assigned to a 0-row data frame.
    df$idx <- seq_len(nrow(df))
    df$position <- log(df$idx)
    if ("price" %in% colnames(df)) {
        df$price <- parse_number(df$price)
    }
    if ("beds" %in% colnames(df)) {
        # %in% treats NA as "not Studio" instead of producing an NA subscript
        df$beds[df$beds %in% "Studio"] <- "0"
        df$beds <- parse_number(df$beds, na = c("", "NA"))
    }
    if ("baths" %in% colnames(df)) {
        df$baths <- parse_number(df$baths)
    }
    # Guard each coordinate on its own: the presence of latitude does not
    # guarantee that longitude exists (and vice versa).
    if ("latitude" %in% colnames(df)) {
        df$latitude <- parse_number(df$latitude)
    }
    if ("longitude" %in% colnames(df)) {
        df$longitude <- parse_number(df$longitude)
    }
    df
}

In [9]:
# Reference point in Chicago, stored as (latitude, longitude).
# NOTE(review): geosphere functions take points as (longitude, latitude), so
# this vector has to be reordered before being passed to them.
chicago_coords <- c(41.881832, -87.623177)

# Neighborhood boundary polygons read from the Neighborhoods_2012b shapefile.
chicago_neighborhoods <- readOGR(
    dsn = "./gis info/Neighborhoods_2012b",
    layer = "Neighborhoods_2012b"
)

OGR data source with driver: ESRI Shapefile 
Source: "/home/asplund3/experiments/auditor/gis info/Neighborhoods_2012b", layer: "Neighborhoods_2012b"
with 98 features
It has 4 fields


# Trulia neighborhood analysis

## Data setup

In [10]:
# Select the ranking records collected by the Trulia scraper and normalise
# every nested ranking table with format_ranking().
nestedTruliaRankings <- rankings %>%
    filter(scraper == "TruliaScraper") %>%
    mutate(ranking = lapply(ranking, format_ranking))
nrow(nestedTruliaRankings)

In [11]:
# First and last Trulia observation, and the span between them.
with(nestedTruliaRankings, min(time))
with(nestedTruliaRankings, max(time))
with(nestedTruliaRankings, max(time) - min(time))

[1] "2018-11-21 17:02:05 CST"

[1] "2019-03-19 17:56:43 CDT"

Time difference of 117.9963 days

In [12]:
# Expand the nested ranking tables to one row per listing, drop the type1
# column, give the address column a syntactic name, and keep only listings
# that have a street address.
truliaRankings <- nestedTruliaRankings %>% unnest(ranking)
truliaRankings[["type1"]] <- NULL
names(truliaRankings)[names(truliaRankings) == "street address"] <- "street_address"
truliaRankings <- filter(truliaRankings, !is.na(street_address))
nrow(truliaRankings)

## Chicago neighborhood analysis

In [13]:
# Trulia listings returned for the Chicago search page that have coordinates.
chicagoTR <- truliaRankings %>%
    filter(url == 'https://www.trulia.com/IL/Chicago/') %>%
    filter(!is.na(latitude)) %>%
    filter(!is.na(longitude))
nrow(chicagoTR)

# Great-circle distance (metres) from each listing to the Chicago reference
# point. distHaversine() expects points as (longitude, latitude);
# chicago_coords is stored as (latitude, longitude), so it is reversed here
# and the listing columns are passed in longitude-first order. Passing
# latitude first silently computes distances for the wrong locations.
# The call is vectorised over the rows of the coordinate matrix.
chicagoTR$dist <- distHaversine(
    rev(chicago_coords),
    as.matrix(chicagoTR[, c('longitude', 'latitude')])
)

# Build a spatial-points copy in WGS84, reproject it to the CRS of the
# neighborhood polygons, and record the PRI_NEIGH attribute of the polygon
# each listing falls in.
cTR <- chicagoTR
coordinates(cTR) <- ~longitude+latitude
proj4string(cTR) <- CRS("+init=epsg:4326")
cTR <- spTransform(cTR, proj4string(chicago_neighborhoods))
chicagoTR$neighborhood <- sp::over(cTR, chicago_neighborhoods)$PRI_NEIGH

# chicagoTR[1:10,]

In [33]:
# str(chicagoTR)

Are there neighborhoods that are predominantly advertised to one race?

In [37]:
# For listings with idx below 30, count how often each neighborhood appears
# per treatment ethnicity: one row per neighborhood, one column per
# ethnicity, plus the neighborhood's total across ethnicities.
chiNeighborhoodCounts <- filter(chicagoTR, idx < 30) %>%
    count(neighborhood, treatment.ethnicity) %>%
    group_by(neighborhood) %>%
    mutate(total = sum(n)) %>%
    ungroup() %>%
    spread(key = treatment.ethnicity, value = n, fill = 0)

# chiNeighborhoodCounts

In [40]:
# Restrict to neighborhoods seen more than 16 times in total.
cnc <- filter(chiNeighborhoodCounts, total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the four ethnicity
# counts against a uniform (1/4 each) distribution.
cnc$pval <- apply(
    select(cnc, -total, -neighborhood),
    1,
    function(counts) {
        chisq.test(unlist(counts), p = c(1/4, 1/4, 1/4, 1/4))$p.value
    }
)

# The five neighborhoods with the smallest p-values, most significant first.
cnc %>%
    top_n(n = -5, wt = pval) %>%
    arrange(pval)
# summarize(cnc, min = min(a), neighborhood = neighborhood[which.min(a)])

“Chi-squared approximation may be incorrect”

neighborhood,total,african-american,asian,caucasian,hispanic,pval
Douglas,62,18,12,16,16,0.7468217
Old Town,63,18,13,16,16,0.8471877
Grand Crossing,93,26,20,23,24,0.8479233
Chatham,62,18,14,14,16,0.8709254
Belmont Cragin,85,23,18,22,22,0.8745867


In [41]:
# Show the test results for a hand-picked set of neighborhoods.
filter(
    cnc,
    neighborhood %in% c("Calumet Heights", "Kenwood", "Washington Park", "West Lawn")
)

neighborhood,total,african-american,asian,caucasian,hispanic,pval
Calumet Heights,45,12,10,12,11,0.9701143
Kenwood,39,10,10,9,10,0.994455
Washington Park,83,20,21,22,20,0.9876664
West Lawn,47,12,11,12,12,0.9957922


In [34]:
# Same tabulation as for ethnicity, but by treatment gender.
# The gender table gets its own name: reusing chiNeighborhoodCounts would
# overwrite the ethnicity table that the following inspection cells (the
# Douglas lookup, the random sample, the low-total listing) still read.
chiNeighborhoodGenderCounts <- chicagoTR %>%
    filter(idx < 30) %>%
    count(neighborhood, treatment.gender) %>%
    group_by(neighborhood) %>%
    mutate(total=sum(n)) %>%
    ungroup() %>%
    spread(treatment.gender, n, fill = 0)

# Restrict to neighborhoods seen more than 16 times in total.
cnc <- chiNeighborhoodGenderCounts %>%
    filter(total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the two gender
# counts against an even split. Stored as `pval`, matching the ethnicity
# analyses.
cnc$pval <- cnc %>%
    select(-total, -neighborhood) %>%
    apply(1, function(counts) {
        chisq.test(unlist(counts), p=c(1/2,1/2))$p.value
    })

# Smallest p-value and the neighborhood it belongs to.
summarize(cnc, min = min(pval), neighborhood = neighborhood[which.min(pval)])

min,neighborhood
0.4912971,Streeterville


In [17]:
# Counts for the Douglas neighborhood.
filter(chiNeighborhoodCounts, neighborhood == "Douglas")

neighborhood,total,african-american,asian,caucasian,hispanic
Douglas,62,18,12,16,16


In [18]:
# Spot-check five randomly drawn neighborhoods.
sample_n(chiNeighborhoodCounts, size = 5)

neighborhood,total,african-american,asian,caucasian,hispanic
Gage Park,28,8,7,6,7
North Lawndale,22,5,6,5,6
Garfield Park,28,8,7,7,6
Austin,53,15,14,11,13
Brighton Park,16,4,4,4,4


In [19]:
# Neighborhoods excluded from the chi-squared tests (total of 16 or fewer).
filter(chiNeighborhoodCounts, total <= 16)

neighborhood,total,african-american,asian,caucasian,hispanic
Avalon Park,7,2,2,1,2
Avondale,7,2,2,1,2
Beverly,5,1,1,2,1
Brighton Park,16,4,4,4,4
Bucktown,4,2,0,1,1
Dunning,16,4,4,4,4
Hegewisch,6,1,2,1,2
Hermosa,7,2,2,1,2
Lake View,10,3,2,2,3
Loop,1,0,1,0,0


Since the lowest p-value seen in a chi-squared test is 0.747 across ethnicity (0.491 across gender), and there are no clear outliers among the neighborhoods with a total of 16 or fewer, we find no evidence of neighborhood discrimination.

# Realtor.com neighborhood analysis

## Data setup

In [20]:
# Select the Realtor.com ranking records and normalise every nested ranking
# table with format_ranking().
# filter() is used (as in the Trulia setup) instead of base `[` subsetting:
# with `==`, a missing scraper value would produce an all-NA row whose NULL
# ranking then breaks format_ranking().
# NOTE(review): the Trulia cell matches on 'TruliaScraper'; confirm that
# 'RealtorRanking' is the intended scraper label here.
nestedRealtorRankings <- rankings %>%
    filter(scraper == 'RealtorRanking')
nestedRealtorRankings$ranking <- lapply(nestedRealtorRankings$ranking, format_ranking)
nrow(nestedRealtorRankings)

In [21]:
# Last and first Realtor.com observation, and the span between them.
with(nestedRealtorRankings, max(time))
with(nestedRealtorRankings, min(time))
with(nestedRealtorRankings, max(time) - min(time))

[1] "2019-03-19 17:32:10 CDT"

[1] "2018-12-07 12:43:04 CST"

Time difference of 102.1591 days

In [22]:
# Expand the nested ranking tables to one row per listing, drop the type1
# column, give the address column a syntactic name, and keep only listings
# that have a street address.
realtorRankings <- nestedRealtorRankings %>% unnest(ranking)
realtorRankings[["type1"]] <- NULL
names(realtorRankings)[names(realtorRankings) == "street address"] <- "street_address"
realtorRankings <- filter(realtorRankings, !is.na(street_address))
# realtorRankings$dist <- apply(realtorRankings[,c('latitude', 'longitude')], 1, function(x) { distHaversine(chicago_coords, c(x[1], x[2])) })

nrow(realtorRankings)

## Chicago neighborhood analysis

In [23]:
# Distinct search URLs present in the Realtor.com data.
unique(realtorRankings[["url"]])

In [24]:
# Realtor.com listings returned for the Chicago search page that have both
# coordinates.
chicagoRR <- filter(
    realtorRankings,
    url == "https://www.realtor.com/realestateandhomes-search/Chicago_IL",
    !is.na(latitude),
    !is.na(longitude)
)

nrow(chicagoRR)

In [25]:
# Build a spatial-points copy in WGS84 and reproject it to the CRS of the
# neighborhood polygons.
cRR <- chicagoRR
coordinates(cRR) <- ~ longitude + latitude
proj4string(cRR) <- CRS("+init=epsg:4326")
cRR <- spTransform(cRR, proj4string(chicago_neighborhoods))

# Neighborhood name (PRI_NEIGH) of the polygon each listing falls in, as
# character; listings that match no polygon fall back to their locality.
chicagoRR$neighborhood <- as.character(sp::over(cRR, chicago_neighborhoods)$PRI_NEIGH)
chicagoRR$neighborhood[is.na(chicagoRR$neighborhood)] <-
    chicagoRR$locality[is.na(chicagoRR$neighborhood)]

# chicagoRR[1:10,]

In [26]:
# For listings with idx below 16, count how often each neighborhood appears
# per treatment ethnicity: one row per neighborhood, one column per
# ethnicity, plus the neighborhood's total across ethnicities.
chiNeighborhoods <- filter(chicagoRR, idx < 16) %>%
    count(neighborhood, treatment.ethnicity) %>%
    group_by(neighborhood) %>%
    mutate(total = sum(n)) %>%
    ungroup() %>%
    spread(key = treatment.ethnicity, value = n, fill = 0)

# chiNeighborhoods[1:5,]

In [28]:
# Restrict to neighborhoods seen more than 16 times in total.
cnc <- filter(chiNeighborhoods, total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the four ethnicity
# counts against a uniform (1/4 each) distribution.
cnc$pval <- apply(
    select(cnc, -total, -neighborhood),
    1,
    function(counts) {
        chisq.test(unlist(counts), p = c(1/4, 1/4, 1/4, 1/4))$p.value
    }
)

# The five neighborhoods with the smallest p-values, most significant first.
cnc %>%
    top_n(n = -5, wt = pval) %>%
    arrange(pval)

“Chi-squared approximation may be incorrect”

neighborhood,total,african-american,asian,caucasian,hispanic,pval
Lake View,62,7,15,19,21,0.05966718
South Shore,21,7,6,2,6,0.42193453
Rogers Park,21,8,3,5,5,0.48833822
Cicero,29,7,10,5,7,0.62398066
Austin,36,9,6,10,11,0.66950875


In [32]:
# Show the test results for a hand-picked set of neighborhoods, alphabetically.
arrange(
    filter(cnc, neighborhood %in% c("Lake View", "Loop", "Norwood Park", "Cicero", "Chatham")),
    neighborhood
)

neighborhood,total,african-american,asian,caucasian,hispanic,pval
Chatham,36,8,9,10,9,0.97392457
Cicero,29,7,10,5,7,0.62398066
Lake View,62,7,15,19,21,0.05966718
Loop,27,7,5,8,7,0.87233246
Norwood Park,20,5,4,6,5,0.94024249


In [97]:
# Neighborhoods excluded from the chi-squared tests (total of 16 or fewer),
# largest totals first, ties broken by the caucasian count.
chiNeighborhoods %>%
    filter(total <= 16) %>%
    arrange(desc(total), desc(caucasian))

neighborhood,total,african-american,asian,caucasian,hispanic
Chicago Lawn,16,6,2,4,4
Humboldt Park,16,7,4,3,2
West Loop,15,4,2,5,4
Auburn Gresham,15,5,4,3,3
Dunning,14,3,3,5,3
Jefferson Park,14,4,3,5,2
West Town,14,3,3,4,4
Old Town,14,5,4,2,3
Elmwood Park,13,3,3,3,4
Logan Square,13,3,3,3,4


In [35]:
# Rebuild the Chicago subset of the Realtor.com listings (both coordinates
# present).
chicagoRR <- filter(
    realtorRankings,
    url == "https://www.realtor.com/realestateandhomes-search/Chicago_IL",
    !is.na(latitude),
    !is.na(longitude)
)

# Spatial-points copy in WGS84, reprojected to the CRS of the neighborhood
# polygons.
cRR <- chicagoRR
coordinates(cRR) <- ~ longitude + latitude
proj4string(cRR) <- CRS("+init=epsg:4326")
cRR <- spTransform(cRR, proj4string(chicago_neighborhoods))

# Neighborhood name (PRI_NEIGH) per listing as character; listings that match
# no polygon fall back to their locality.
chicagoRR$neighborhood <- as.character(sp::over(cRR, chicago_neighborhoods)$PRI_NEIGH)
chicagoRR$neighborhood[is.na(chicagoRR$neighborhood)] <-
    chicagoRR$locality[is.na(chicagoRR$neighborhood)]

# Per-neighborhood counts by treatment gender for listings with idx below 16,
# plus the neighborhood's total.
chiNeighborhoods <- filter(chicagoRR, idx < 16) %>%
    count(neighborhood, treatment.gender) %>%
    group_by(neighborhood) %>%
    mutate(total = sum(n)) %>%
    ungroup() %>%
    spread(key = treatment.gender, value = n, fill = 0)

# Restrict to neighborhoods seen more than 16 times in total.
cnc <- filter(chiNeighborhoods, total > 16)

# Per neighborhood: chi-squared goodness-of-fit p-value of the two gender
# counts against an even split.
cnc$pval <- apply(
    select(cnc, -total, -neighborhood),
    1,
    function(counts) {
        chisq.test(unlist(counts), p = c(1/2, 1/2))$p.value
    }
)

# The five neighborhoods with the smallest p-values, most significant first.
cnc %>%
    top_n(n = -5, wt = pval) %>%
    arrange(pval)

neighborhood,total,female,male,pval
Loop,27,10,17,0.1779317
Evergreen Park,37,15,22,0.2498174
South Shore,21,8,13,0.2752335
Lake View,62,27,35,0.3096289
Cicero,29,12,17,0.3531604


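Possible redlining in the Lake View area: listings there were shown 7 times under the african-american treatment versus 15–21 times under the other ethnicities (chi-squared p ≈ 0.06, not corrected for the number of neighborhoods tested).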