# Setup and data ingestion

In [1]:
library(conflicted)

# Data manipulation
library(tidyverse)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
library(reshape2)

# Data loading
library(jsonlite)
library(writexl)
library(readxl)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0       ✔ purrr   0.3.2  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
[conflicted] Will prefer dplyr::filter over any other package
[conflicted] Will prefer dplyr::select over any other package


In [2]:
# Workbook settings
options(repr.matrix.max.rows=50, repr.matrix.max.cols=30)

In [3]:
#' Build the path to a file inside a scraper run's output directory.
#'
#' @param file File name (or character vector of names) inside the run directory.
#' @param run Name of the run directory under "output"; defaults to the
#'   directory this notebook previously hard-coded, so existing calls are
#'   unaffected.
#' @return Character vector of paths of the form output/<run>/<file>.
get_rel_file <- function(file, run = "2018-12-07_2019-03-19") {
    file.path("output", run, file)
}

In [4]:
# Stream the bzip2-compressed JSON export into a flattened data frame.
# The `finally` clause closes the connection whether or not stream_in()
# succeeds, so a parse error no longer leaves the file handle open.
datafile <- bzfile(get_rel_file("out.json.bz2"), open = "r")
data <- tryCatch(
    stream_in(datafile, flatten = TRUE),
    finally = close(datafile)
)

 Imported 29863 records. Simplifying...


In [5]:
# Declare every raw spelling of ethnicity as a level, then relabel the levels
# so that "african american" and "african-american" collapse into one.
# (Run levels(data$treatment.ethnicity) before/after to inspect the result.)
data$treatment.ethnicity <- factor(
    data$treatment.ethnicity,
    levels = c("caucasian", "african american", "african-american", "asian", "hispanic")
)
levels(data$treatment.ethnicity) <- c(
    "caucasian", "african-american", "african-american", "asian", "hispanic"
)

# The remaining categorical columns become factors; timestamps become POSIXct.
for (column in c("treatment.gender", "type", "scraper", "block_id", "agent_id")) {
    data[[column]] <- as.factor(data[[column]])
}
data$time <- as.POSIXct(data$time)

In [6]:
# Keep only the ranking records. which() drops rows whose type is NA instead
# of turning them into all-NA rows.
rankings <- data[which(data$type == "ranking"), ]

# is.null() applied to the whole list column returns a single FALSE, so the
# previous filter kept every row. Test each record's ranking individually.
has_ranking <- !vapply(rankings$ranking, is.null, logical(1), USE.NAMES = FALSE)
rankings <- rankings[has_ranking, ]

# Drop ad.* columns; ad.query is deliberately kept (its removal is commented out).
rankings$ad.image_path <- NULL
# rankings$ad.query <- NULL
rankings$ad.title <- NULL
rankings$ad.url <- NULL
rankings$ad.body <- NULL

In [7]:
#' Number the rows of one ranking page and coerce listing fields to numeric.
#'
#' Adds `idx` (1-based row number in the existing row order) and `position`
#' (natural log of `idx`). When present, `price`, `beds`, `baths`, `latitude`
#' and `longitude` are converted with readr::parse_number(); a `beds` value of
#' "Studio" is treated as 0 bedrooms.
#'
#' @param df Data frame holding the listings of a single ranking record.
#' @return `df` with the added and converted columns.
format_ranking <- function(df) {
    # seq_len() yields an empty vector for a zero-row page, whereas
    # seq.int(0) gives c(1, 0) and makes the assignment fail.
    df$idx <- seq_len(nrow(df))
    df$position <- log(df$idx)
    if ("price" %in% colnames(df)) {
        df$price <- parse_number(df$price)
    }
    if ("beds" %in% colnames(df)) {
        # Recode before parsing so studios become 0 rather than NA.
        df$beds[df$beds == "Studio"] <- "0"
        df$beds <- parse_number(df$beds, na = c("", "NA"))
    }
    if ("baths" %in% colnames(df)) {
        df$baths <- parse_number(df$baths)
    }
    if ("latitude" %in% colnames(df)) {
        # NOTE(review): longitude is assumed to accompany latitude - confirm.
        df$latitude <- parse_number(df$latitude)
        df$longitude <- parse_number(df$longitude)
    }
    return(df)
}

# Trulia rankings

## Data setup

In [8]:
# Pick out the Trulia records, run format_ranking() over each nested result
# page, and report how many records remain.
nestedTruliaRankings <- rankings %>%
    filter(scraper == 'TruliaScraper')
nestedTruliaRankings[["ranking"]] <- lapply(nestedTruliaRankings[["ranking"]], format_ranking)
nrow(nestedTruliaRankings)

In [9]:
min(nestedTruliaRankings$time)
max(nestedTruliaRankings$time)
max(nestedTruliaRankings$time) - min(nestedTruliaRankings$time)

[1] "2018-11-21 17:02:05 CST"

[1] "2019-03-19 17:56:43 CDT"

Time difference of 117.9963 days

In [10]:
# Expand to one row per listing, rename the address column to a name without
# a space, discard listings with no address, and report the row count.
truliaRankings <- nestedTruliaRankings %>% unnest(ranking)
names(truliaRankings)[names(truliaRankings) == "street address"] <- "street_address"
truliaRankings <- filter(truliaRankings, !is.na(street_address))
nrow(truliaRankings)

In [11]:
# str(truliaRankings)

## Marketwide analysis

### Effect of demographics and price on index

#### All results

In [12]:
# Three-way ANOVA of result index on gender, ethnicity and price (with all
# interactions) over every Trulia listing.
# NOTE(review): the factors are entered as gender * ethnicity here but as
# ethnicity * gender in the Top-N cells that follow. summary() of an aov fit
# reports sequential sums of squares, so term order can change the per-term
# rows in an unbalanced design - confirm the ordering is intentional.
res.trulia <- aov(idx ~ treatment.gender * treatment.ethnicity * price, data=truliaRankings)
summary(res.trulia)

                                             Df Sum Sq Mean Sq F value Pr(>F)
treatment.gender                              1      2    1.70   0.050  0.824
treatment.ethnicity                           3     35   11.67   0.342  0.795
price                                         1     34   34.32   1.006  0.316
treatment.gender:treatment.ethnicity          3     16    5.41   0.159  0.924
treatment.gender:price                        1     15   14.79   0.434  0.510
treatment.ethnicity:price                     3     17    5.74   0.168  0.918
treatment.gender:treatment.ethnicity:price    3     14    4.64   0.136  0.939
Residuals                                  9536 325233   34.11               

#### Top 20

In [13]:
res.trulia <- aov(idx ~ treatment.ethnicity * treatment.gender * price, data=filter(truliaRankings, idx <= 20))
summary(res.trulia)

                                             Df Sum Sq Mean Sq F value   Pr(>F)
treatment.ethnicity                           3      5    1.51   0.108 0.955379
treatment.gender                              1      0    0.28   0.020 0.887266
price                                         1    165  165.18  11.799 0.000595
treatment.ethnicity:treatment.gender          3      2    0.78   0.056 0.982622
treatment.ethnicity:price                     3      0    0.08   0.006 0.999366
treatment.gender:price                        1      0    0.22   0.016 0.900260
treatment.ethnicity:treatment.gender:price    3      1    0.24   0.017 0.996966
Residuals                                  8986 125797   14.00                 
                                              
treatment.ethnicity                           
treatment.gender                              
price                                      ***
treatment.ethnicity:treatment.gender          
treatment.ethnicity:price                    

#### Top 10

In [14]:
res.trulia <- aov(idx ~ treatment.ethnicity * treatment.gender * price, data=filter(truliaRankings, idx <= 10))
summary(res.trulia)

                                             Df Sum Sq Mean Sq F value Pr(>F)  
treatment.ethnicity                           3      0    0.00   0.000 1.0000  
treatment.gender                              1      0    0.00   0.000 0.9999  
price                                         1     47   46.86   5.668 0.0173 *
treatment.ethnicity:treatment.gender          3      0    0.00   0.000 1.0000  
treatment.ethnicity:price                     3      0    0.16   0.019 0.9963  
treatment.gender:price                        1      0    0.41   0.050 0.8238  
treatment.ethnicity:treatment.gender:price    3      1    0.21   0.026 0.9945  
Residuals                                  8436  69745    8.27                 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

#### Top 5

In [15]:
res.trulia <- aov(idx ~ treatment.ethnicity * treatment.gender * price, data=filter(truliaRankings, idx <= 5))
summary(res.trulia)

                                             Df Sum Sq Mean Sq F value Pr(>F)
treatment.ethnicity                           3      0  0.0025   0.001  1.000
treatment.gender                              1      0  0.0000   0.000  0.999
price                                         1      1  0.8629   0.431  0.512
treatment.ethnicity:treatment.gender          3      0  0.0026   0.001  1.000
treatment.ethnicity:price                     3      1  0.3935   0.196  0.899
treatment.gender:price                        1      0  0.1988   0.099  0.753
treatment.ethnicity:treatment.gender:price    3      5  1.6052   0.801  0.493
Residuals                                  4210   8437  2.0040               

#### Top 3

In [16]:
res.trulia <- aov(idx ~ treatment.ethnicity * treatment.gender * price, data=filter(truliaRankings, idx <= 3))
summary(res.trulia)

                                             Df Sum Sq Mean Sq F value Pr(>F)
treatment.ethnicity                           3    0.0  0.0000   0.000  1.000
treatment.gender                              1    0.0  0.0000   0.000  1.000
price                                         1    1.5  1.5201   2.269  0.132
treatment.ethnicity:treatment.gender          3    0.0  0.0001   0.000  1.000
treatment.ethnicity:price                     3    0.1  0.0282   0.042  0.989
treatment.gender:price                        1    0.1  0.1050   0.157  0.692
treatment.ethnicity:treatment.gender:price    3    0.8  0.2760   0.412  0.744
Residuals                                  2522 1689.5  0.6699               

### Effect of demographics and index on price

#### All results

In [17]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=truliaRankings)
summary(res.trulia)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 8.108e+12 2.703e+12   0.192
treatment.gender                            1 2.108e+11 2.108e+11   0.015
idx                                         1 1.415e+13 1.415e+13   1.006
treatment.ethnicity:treatment.gender        3 2.152e+12 7.172e+11   0.051
treatment.ethnicity:idx                     3 8.673e+12 2.891e+12   0.206
treatment.gender:idx                        1 5.839e+12 5.839e+12   0.415
treatment.ethnicity:treatment.gender:idx    3 5.955e+12 1.985e+12   0.141
Residuals                                9536 1.341e+17 1.406e+13        
                                         Pr(>F)
treatment.ethnicity                       0.902
treatment.gender                          0.903
idx                                       0.316
treatment.ethnicity:treatment.gender      0.985
treatment.ethnicity:idx                   0.893
treatment.gender:idx                      0.51

#### Top 20

In [18]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=filter(truliaRankings, idx <= 20))
summary(res.trulia)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 1.091e+13 3.637e+12   0.255
treatment.gender                            1 5.981e+10 5.981e+10   0.004
idx                                         1 1.680e+14 1.680e+14  11.800
treatment.ethnicity:treatment.gender        3 3.580e+12 1.193e+12   0.084
treatment.ethnicity:idx                     3 2.669e+12 8.897e+11   0.062
treatment.gender:idx                        1 1.764e+11 1.764e+11   0.012
treatment.ethnicity:treatment.gender:idx    3 1.338e+12 4.460e+11   0.031
Residuals                                8986 1.279e+17 1.424e+13        
                                           Pr(>F)    
treatment.ethnicity                      0.857506    
treatment.gender                         0.948321    
idx                                      0.000595 ***
treatment.ethnicity:treatment.gender     0.968875    
treatment.ethnicity:idx                  0.979582    
treatment.

#### Top 10

In [19]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=filter(truliaRankings, idx <= 10))
summary(res.trulia)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 1.153e+13 3.842e+12   0.253
treatment.gender                            1 1.036e+11 1.036e+11   0.007
idx                                         1 8.595e+13 8.595e+13   5.668
treatment.ethnicity:treatment.gender        3 3.750e+12 1.250e+12   0.082
treatment.ethnicity:idx                     3 2.408e+12 8.028e+11   0.053
treatment.gender:idx                        1 5.646e+11 5.646e+11   0.037
treatment.ethnicity:treatment.gender:idx    3 1.617e+12 5.389e+11   0.036
Residuals                                8436 1.279e+17 1.516e+13        
                                         Pr(>F)  
treatment.ethnicity                      0.8590  
treatment.gender                         0.9341  
idx                                      0.0173 *
treatment.ethnicity:treatment.gender     0.9696  
treatment.ethnicity:idx                  0.9839  
treatment.gender:idx              

#### Top 5

In [20]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=filter(truliaRankings, idx <= 5))
summary(res.trulia)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 3.842e+12 1.281e+12   0.084
treatment.gender                            1 3.139e+10 3.139e+10   0.002
idx                                         1 6.584e+12 6.584e+12   0.431
treatment.ethnicity:treatment.gender        3 3.725e+12 1.242e+12   0.081
treatment.ethnicity:idx                     3 8.049e+12 2.683e+12   0.175
treatment.gender:idx                        1 1.783e+12 1.783e+12   0.117
treatment.ethnicity:treatment.gender:idx    3 3.649e+13 1.216e+13   0.796
Residuals                                4210 6.437e+16 1.529e+13        
                                         Pr(>F)
treatment.ethnicity                       0.969
treatment.gender                          0.964
idx                                       0.512
treatment.ethnicity:treatment.gender      0.970
treatment.ethnicity:idx                   0.913
treatment.gender:idx                      0.73

#### Top 3

In [21]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=filter(truliaRankings, idx <= 3))
summary(res.trulia)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 1.542e+13 5.140e+12   0.341
treatment.gender                            1 9.006e+11 9.006e+11   0.060
idx                                         1 3.420e+13 3.420e+13   2.270
treatment.ethnicity:treatment.gender        3 1.099e+13 3.663e+12   0.243
treatment.ethnicity:idx                     3 3.394e+12 1.131e+12   0.075
treatment.gender:idx                        1 1.083e+12 1.083e+12   0.072
treatment.ethnicity:treatment.gender:idx    3 1.371e+13 4.568e+12   0.303
Residuals                                2522 3.800e+16 1.507e+13        
                                         Pr(>F)
treatment.ethnicity                       0.796
treatment.gender                          0.807
idx                                       0.132
treatment.ethnicity:treatment.gender      0.866
treatment.ethnicity:idx                   0.973
treatment.gender:idx                      0.78

### Effect of demographics on price

#### All results

In [22]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=truliaRankings)
summary(res.trulia)

                                       Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                     3 8.108e+12 2.703e+12   0.192  0.902
treatment.gender                        1 2.108e+11 2.108e+11   0.015  0.903
treatment.ethnicity:treatment.gender    3 2.082e+12 6.940e+11   0.049  0.985
Residuals                            9544 1.341e+17 1.405e+13               

#### Top 20

In [23]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=filter(truliaRankings, idx <= 20))
summary(res.trulia)

                                       Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                     3 1.091e+13 3.637e+12   0.255  0.858
treatment.gender                        1 5.981e+10 5.981e+10   0.004  0.948
treatment.ethnicity:treatment.gender    3 3.407e+12 1.136e+12   0.080  0.971
Residuals                            8994 1.281e+17 1.424e+13               

#### Top 10

In [24]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=filter(truliaRankings, idx <= 10))
summary(res.trulia)

                                       Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                     3 1.153e+13 3.842e+12   0.253  0.859
treatment.gender                        1 1.036e+11 1.036e+11   0.007  0.934
treatment.ethnicity:treatment.gender    3 3.747e+12 1.249e+12   0.082  0.970
Residuals                            8444 1.280e+17 1.516e+13               

#### Top 5

In [25]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=filter(truliaRankings, idx <= 5))
summary(res.trulia)

                                       Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                     3 3.842e+12 1.281e+12   0.084  0.969
treatment.gender                        1 3.139e+10 3.139e+10   0.002  0.964
treatment.ethnicity:treatment.gender    3 3.727e+12 1.242e+12   0.081  0.970
Residuals                            4218 6.442e+16 1.527e+13               

#### Top 3

In [26]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=filter(truliaRankings, idx <= 3))
summary(res.trulia)

                                       Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                     3 1.542e+13 5.140e+12   0.342  0.795
treatment.gender                        1 9.006e+11 9.006e+11   0.060  0.807
treatment.ethnicity:treatment.gender    3 1.099e+13 3.663e+12   0.244  0.866
Residuals                            2530 3.806e+16 1.504e+13               

#### Top Result

In [27]:
res.trulia <- aov(price ~ treatment.ethnicity * treatment.gender, data=filter(truliaRankings, idx == 1))
summary(res.trulia)

                                      Df    Sum Sq   Mean Sq F value Pr(>F)
treatment.ethnicity                    3 7.126e+12 2.375e+12   0.143  0.934
treatment.gender                       1 2.789e+12 2.789e+12   0.167  0.682
treatment.ethnicity:treatment.gender   3 7.448e+12 2.483e+12   0.149  0.930
Residuals                            838 1.396e+16 1.665e+13               

### Demographic comparisons

#### By price

In [28]:
truliaRankings %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,price
caucasian,female,571665.7
caucasian,male,599231.2
african-american,female,634535.6
african-american,male,608319.2
asian,female,565348.9
asian,male,517509.8
hispanic,female,592218.1
hispanic,male,601952.8


In [29]:
truliaRankings %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,price
caucasian,586083.4
african-american,621212.3
asian,541128.9
hispanic,596918.9


In [30]:
truliaRankings %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.gender,price
female,591297.8
male,581728.4


##### Top 10

In [31]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,price
caucasian,female,573353.0
caucasian,male,630397.3
african-american,female,650640.3
african-american,male,641412.9
asian,female,572382.9
asian,male,519303.7
hispanic,female,603364.3
hispanic,male,637316.7


In [32]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,price
caucasian,603081.5
african-american,645941.1
asian,545465.6
hispanic,619680.6


In [33]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.gender,price
female,600329.0
male,607000.8


##### Top 5

In [34]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,price
caucasian,female,603720.7
caucasian,male,581419.7
african-american,female,680908.8
african-american,male,589554.0
asian,female,517724.7
asian,male,584308.5
hispanic,female,594548.0
hispanic,male,622878.8


In [35]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,price
caucasian,592098.1
african-american,634385.6
asian,551522.0
hispanic,608149.0


In [36]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.gender,price
female,599597.6
male,593843.6


#### By size

In [37]:
truliaRankings %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,beds
caucasian,female,3.91019
caucasian,male,3.977383
african-american,female,3.963875
african-american,male,3.958665
asian,female,3.934319
asian,male,3.915371
hispanic,female,3.924466
hispanic,male,3.928918


In [38]:
truliaRankings %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,beds
caucasian,3.944908
african-american,3.961228
asian,3.925084
hispanic,3.92671


In [39]:
truliaRankings %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.gender,beds
female,3.9335
male,3.945626


##### Top 10

In [40]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,beds
caucasian,female,3.96732
caucasian,male,4.037578
african-american,female,4.023454
african-american,male,4.01636
asian,female,3.989339
asian,male,3.974943
hispanic,female,3.972281
hispanic,male,3.993737


In [41]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,beds
caucasian,4.003198
african-american,4.019833
asian,3.982379
hispanic,3.983122


In [42]:
truliaRankings %>%
    filter(idx <= 10) %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.gender,beds
female,3.98821
male,4.006363


##### Top 5

In [43]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,beds
caucasian,female,3.812227
caucasian,male,3.895397
african-american,female,3.885106
african-american,male,3.906122
asian,female,3.876068
asian,male,3.872727
hispanic,female,3.868085
hispanic,male,3.870293


In [44]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.ethnicity,beds
caucasian,3.854701
african-american,3.895833
asian,3.874449
hispanic,3.869198


In [45]:
truliaRankings %>%
    filter(idx <= 5) %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(beds), mean, na.rm = TRUE)

treatment.gender,beds
female,3.860665
male,3.886532


# Realtor.com

## Data setup

In [46]:
# Select the Realtor.com records with filter(), as the Trulia setup does.
# Unlike `[` with a logical index, filter() drops rows whose scraper is NA
# rather than returning all-NA rows whose ranking would be NULL.
# NOTE(review): the other sections match on '<Site>Scraper'; confirm that
# 'RealtorRanking' is the value actually stored in `scraper`.
nestedRealtorRankings <- filter(rankings, scraper == 'RealtorRanking')
nestedRealtorRankings$ranking <- lapply(nestedRealtorRankings$ranking, format_ranking)
nrow(nestedRealtorRankings)

In [48]:
max(nestedRealtorRankings$time)
min(nestedRealtorRankings$time)
max(nestedRealtorRankings$time) - min(nestedRealtorRankings$time)

[1] "2019-03-19 17:32:10 CDT"

[1] "2018-12-07 12:43:04 CST"

Time difference of 102.1591 days

In [49]:
# Expand to one row per listing, rename the address column to a name without
# a space, discard listings with no address, and report the row count.
realtorRankings <- nestedRealtorRankings %>% unnest(ranking)
names(realtorRankings)[names(realtorRankings) == "street address"] <- "street_address"
realtorRankings <- filter(realtorRankings, !is.na(street_address))
# realtorRankings$dist <- apply(realtorRankings[,c('latitude', 'longitude')], 1, function(x) { distHaversine(chicago_coords, c(x[1], x[2])) })

nrow(realtorRankings)

## Marketwide analysis

### Effect of demographics and price on index

In [50]:
res.realtor <- aov(idx ~ treatment.ethnicity * treatment.gender * price, data=realtorRankings)
summary(res.realtor)

                                             Df Sum Sq Mean Sq F value Pr(>F)
treatment.ethnicity                           3    100      33   0.511 0.6750
treatment.gender                              1     19      19   0.291 0.5894
price                                         1  11627   11627 177.771 <2e-16
treatment.ethnicity:treatment.gender          3    152      51   0.775 0.5077
treatment.ethnicity:price                     3    623     208   3.175 0.0231
treatment.gender:price                        1     16      16   0.249 0.6179
treatment.ethnicity:treatment.gender:price    3    292      97   1.487 0.2159
Residuals                                  6545 428076      65               
                                              
treatment.ethnicity                           
treatment.gender                              
price                                      ***
treatment.ethnicity:treatment.gender          
treatment.ethnicity:price                  *  
treatment.gender

In [51]:
res.realtor <- aov(price ~ treatment.ethnicity * treatment.gender * idx, data=realtorRankings)
summary(res.realtor)

                                           Df    Sum Sq   Mean Sq F value
treatment.ethnicity                         3 7.062e+10 2.354e+10   0.211
treatment.gender                            1 3.262e+11 3.262e+11   2.929
idx                                         1 1.978e+13 1.978e+13 177.670
treatment.ethnicity:treatment.gender        3 3.946e+11 1.315e+11   1.181
treatment.ethnicity:idx                     3 2.752e+11 9.174e+10   0.824
treatment.gender:idx                        1 5.834e+11 5.834e+11   5.240
treatment.ethnicity:treatment.gender:idx    3 1.778e+11 5.927e+10   0.532
Residuals                                6545 7.288e+14 1.114e+11        
                                         Pr(>F)    
treatment.ethnicity                      0.8886    
treatment.gender                         0.0870 .  
idx                                      <2e-16 ***
treatment.ethnicity:treatment.gender     0.3153    
treatment.ethnicity:idx                  0.4805    
treatment.gender:idx  

In [52]:
# coefficients(res.realtor)

In [53]:
length(unique(realtorRankings$street_address))

In [54]:
# Overall mean listing price. na.rm = TRUE matches the grouped price summaries
# in this notebook, so a single missing price no longer turns the result into NA.
mean(realtorRankings$price, na.rm = TRUE)

In [55]:
unique(realtorRankings$url)

### Demographic comparison

In [56]:
realtorRankings %>%
    filter(!is.na(street_address)) %>%
    group_by(treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.gender,price
female,326619.0
male,312426.9


In [57]:
realtorRankings %>%
    filter(!is.na(street_address)) %>%
    filter(idx == 1) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,price
caucasian,381750.9
african-american,384445.9
asian,363263.2
hispanic,354308.7


In [58]:
realtorRankings %>%
    filter(!is.na(street_address)) %>%
    group_by(treatment.ethnicity, treatment.gender) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,treatment.gender,price
caucasian,female,336797.4
caucasian,male,301855.5
african-american,female,337578.5
african-american,male,312484.4
asian,female,312410.5
asian,male,320450.2
hispanic,female,320422.4
hispanic,male,314870.6


In [59]:
realtorRankings %>%
    filter(!is.na(street_address)) %>%
    filter(idx == 4) %>%
    group_by(treatment.ethnicity) %>%
    summarise_at(vars(price), mean, na.rm = TRUE)

treatment.ethnicity,price
caucasian,407150.2
african-american,366912.9
asian,412190.3
hispanic,389553.5


In [61]:
# dat <- realtorRankings %>%
#     filter(!is.na(street_address)) %>%
#   filter(treatment.ethnicity == "african-american") %>%
#     group_by(idx, treatment.ethnicity) %>%
#     summarise_at(vars(price), mean, na.rm = TRUE)

# dat
# plot(dat)

# Redfin

In [63]:
# redfinRankings <- rankings[rankings$scraper == 'RedfinScraper', ]
# redfinRankings$ranking <- lapply(redfinRankings$ranking, format_ranking)
# unnestedRedfinRankings <- unnest(redfinRankings, ranking)
# str(unnestedRedfinRankings)
# res.redfin <- aov(position ~ treatment.ethnicity * treatment.gender * price, data=unnestedRedfinRankings)
# summary(res.redfin)