In [2]:
library(geosphere)
library(igraph)
library(dplyr)
library(blockTools)
library(foreach)
library(doSNOW)
library(rlang)
library(Matrix)

airbnb_listings <- read.csv('Airbnb_Listings_Miami.csv', row.names=1)

data_directory <- '/nfs/sloanlab004/projects/covid_mobility_proj/data/PROCESSED_DATA/isr_market/'


Attaching package: ‘igraph’


The following objects are masked from ‘package:stats’:

    decompose, spectrum


The following object is masked from ‘package:base’:

    union



Attaching package: ‘dplyr’


The following objects are masked from ‘package:igraph’:

    as_data_frame, groups, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: iterators

Loading required package: snow


Attaching package: ‘rlang’


The following object is masked from ‘package:igraph’:

    is_named




In [37]:
Mode <- function(vector) { 
  as.numeric(names(sort(-table(vector)))[1])
}

imputeMode <- function(vector) { 
  vector[is.na(vector)] <- Mode(vector)
  return(vector)
}

imputeMean <- function(vector) { 
  vector[is.na(vector)] <- mean(vector, na.rm=T)
  return(vector)
}

create_df_for_blocking <- function(df, var_name) { 
  df %>%
    group_by(!!var_name) %>% 
    summarise(n = n(),
              avg_reviews_scaled = mean(reviews_scaled),
              avg_satisfaction_scaled = mean(satisfaction_scaled),
              avg_accom_scaled = mean(accomm_scaled),
              avg_bed_scaled = mean(bed_scaled),
              avg_bath_scaled = mean(bath_scaled),
              avg_minstay_scaled = mean(minstay_scaled),
              avg_lat = mean(latitude),
              avg_lon = mean(longitude),
              pct_private_room = mean(`Private room`),
              pct_shared_room = mean(`Shared room`),
              pct_entire_home = mean(`Entire home/apt`),
              sd_reviews_scaled = sd(reviews_scaled),
              sd_satisfaction_scaled = sd(satisfaction_scaled),
              sd_accom_scaled = sd(accomm_scaled),
              sd_bed_scaled = sd(bed_scaled),
              sd_bath_scaled = sd(bath_scaled),
              sd_minstay_scaled = sd(minstay_scaled),
              sd_lat = sd(latitude),
              sd_lon = sd(longitude)) %>% 
    ungroup() -> df
  
  return(df)
}

generate_blocks <- function(blocking_df, var_name) {
  blocks <- blockTools::block(as.data.frame(blocking_df), id.vars=var_name, 
                              block.vars = c('n', 
                                             'avg_reviews_scaled',
                                             'avg_satisfaction_scaled',
                                             'avg_bed_scaled',
                                             'avg_bath_scaled',
                                             'avg_minstay_scaled',
                                             'avg_lat',
                                             'avg_lon',
                                             'pct_private_room',
                                             'pct_shared_room'))
}

create_df_for_blocking_wash <- function(df, var_name) { 
  df %>%
    group_by(!!var_name) %>% 
    summarise(n = n(),
              avg_reviews_scaled = mean(reviews_scaled),
              avg_satisfaction_scaled = mean(satisfaction_scaled),
              avg_accom_scaled = mean(accomm_scaled),
              avg_bed_scaled = mean(bed_scaled),
              avg_minstay_scaled = mean(minstay_scaled),
              avg_lat = mean(latitude),
              avg_lon = mean(longitude),
              pct_private_room = mean(`Private room`),
              pct_shared_room = mean(`Shared room`),
              pct_entire_home = mean(`Entire home/apt`),
              sd_reviews_scaled = sd(reviews_scaled),
              sd_satisfaction_scaled = sd(satisfaction_scaled),
              sd_accom_scaled = sd(accomm_scaled),
              sd_bed_scaled = sd(bed_scaled),
              sd_minstay_scaled = sd(minstay_scaled),
              sd_lat = sd(latitude),
              sd_lon = sd(longitude)) %>% 
    ungroup() -> df
  
  return(df)
}

generate_blocks_wash <- function(blocking_df, var_name) {
  blocks <- blockTools::block(as.data.frame(blocking_df), id.vars=var_name, 
                              block.vars = c('n', 
                                             'avg_reviews_scaled',
                                             'avg_satisfaction_scaled',
                                             'avg_bed_scaled',
                                             'avg_minstay_scaled',
                                             'avg_lat',
                                             'avg_lon',
                                             'pct_private_room',
                                             'pct_shared_room'))
}

hajek_probabilities <- function(dataframe, clusters, graph, threshold, treatment_prob, n_iter, n_clusters, blocks) { 
  n_effective_treatment <- matrix(0L, nrow = nrow(dataframe), ncol = nrow(dataframe))
  n_effective_control <- matrix(0L, nrow = nrow(dataframe), ncol = nrow(dataframe))
  
  cl <- makeSOCKcluster(n_clusters)
  registerDoSNOW(cl)
  tmp <- foreach(i=1:n_iter, .combine = '+', .packages=c('blockTools', 'igraph', 
                                                     'Matrix', 'dplyr'),
                 .multicombine= TRUE,
                 .maxcombine = 2) %dopar% {
    n_communities <- length(unique(clusters))
    #vertsInAssignments <- membership(clusters) %in% which(assignments == 1)
    assignments <- rep(0, n_communities)
    assignments[as.numeric(
      as.character(unlist(blockTools::assignment(blocks)[[1]][[1]]['Treatment 1'])))] <- 1
    vertsInAssignments <- clusters %in% which(assignments == 1)
    dataframe$cluster_membership <- clusters
    dataframe$pct_treated_neighbors <- sapply(1:vcount(graph), FUN=function(i){
      neigh <- neighbors(graph, i)
      pct <- sum(vertsInAssignments[neigh])/length(neigh)
      ifelse(is.nan(pct), assignments[dataframe$cluster_membership[i]], pct)
    })
    dataframe %>% mutate(
      effective_treatment = ifelse(assignments[cluster_membership] == 1 & pct_treated_neighbors >= threshold, 1, 0),
      effective_control = ifelse(assignments[cluster_membership] == 0 & pct_treated_neighbors <= (1-threshold), 1, 0)
    ) -> dataframe
    
    n_effective_treatment = dataframe$effective_treatment %o% dataframe$effective_treatment
    n_effective_control = dataframe$effective_control %o% dataframe$effective_control
    n_effective_treatment_control = dataframe$effective_treatment %o% dataframe$effective_control
    
    as(bdiag(n_effective_control, n_effective_treatment, n_effective_treatment_control), "sparseMatrix")
  }
  stopCluster(cl)
  
  prob_effective_treatment <- tmp[1:nrow(dataframe), 1:nrow(dataframe)]/n_iter
  prob_effective_control <- tmp[(nrow(dataframe)+1):(2*nrow(dataframe)), (nrow(dataframe)+1):(2*nrow(dataframe))]/n_iter
  prob_effective_treatment_control <- tmp[(2*nrow(dataframe)+1):(3*nrow(dataframe)), 
                                          (2*nrow(dataframe)+1):(3*nrow(dataframe))]/n_iter
  
  return(list(prob_effective_treatment, prob_effective_control, prob_effective_treatment_control))
}

In [4]:
mile_in_meters <- 1609.34

airbnb_listings$accommodates <- imputeMode(airbnb_listings$accommodates)
airbnb_listings$bedrooms <- imputeMode(airbnb_listings$bedrooms)
airbnb_listings$bathrooms <- imputeMode(airbnb_listings$bathrooms)
airbnb_listings$minstay[airbnb_listings$minstay > 30] <- 30
airbnb_listings$minstay <- imputeMode(airbnb_listings$minstay)
airbnb_listings$overall_satisfaction <- imputeMean(airbnb_listings$overall_satisfaction)

airbnb_listings$accommodates[is.na(airbnb_listings$accommodates)] <- Mode(airbnb_listings$accommodates)

In [None]:
distances <- matrix(, nrow=nrow(airbnb_listings), ncol=nrow(airbnb_listings))
room_types <- matrix(, nrow=nrow(airbnb_listings), ncol = nrow(airbnb_listings))
accommodates <- matrix(, nrow=nrow(airbnb_listings), ncol = nrow(airbnb_listings))


for (i in 1:nrow(airbnb_listings)) {
  for (j in i:nrow(airbnb_listings)) {
    distances[i, j] <- distHaversine(c(airbnb_listings$longitude[i], 
                                       airbnb_listings$latitude[i]), 
                                     c(airbnb_listings$longitude[j], airbnb_listings$latitude[j]))
    distances[j, i] <- distances[i, j]
    room_types[i, j] <- ifelse(airbnb_listings$room_type[i] == airbnb_listings$room_type[j], 1, 0)
    room_types[j, i] <- room_types[i, j]
    accommodates[i, j] <- ifelse(abs(airbnb_listings$accommodates[i] - airbnb_listings$accommodates[j]) <= 1, 1, 0)
    accommodates[j, i] <- accommodates[i, j]
  }
  if(i %% 1000 == 0) {
      message(as.character(i))
  }
}

In [None]:
save(accommodates, distances, room_types, file=paste0(
         c(data_directory, 'parsed_data.Rdata'),
         sep='', collapse=''
     ))

In [16]:
load(paste0(
         c(data_directory, 'parsed_data.Rdata'),
         sep='', collapse=''
     ))

In [32]:
set.seed(15840)

In [33]:
### Create graph and communities for distance only graph
distances_binary <- ifelse(distances <= mile_in_meters, 1, 0)
graph_distance_only <- graph_from_adjacency_matrix(distances_binary, mode = 'undirected', diag = FALSE)
clusters_distance_only <- membership(cluster_louvain(graph_distance_only))

### Create graph and communities for distance plus same room type graph
distances_plus_same_type <- distances_binary + room_types
distances_plus_same_type_binary <- ifelse(distances_plus_same_type == 2, 1, 0)
graph_distance_room_type <- graph_from_adjacency_matrix(distances_plus_same_type_binary, mode='undirected',
                                                        diag = FALSE)
clusters_distance_room_type <- membership(cluster_louvain(graph_distance_room_type))

### Create graph and communities for distance plus same room type plus same size graph
distances_plus_same_type_plus_accom <- distances_plus_same_type + accommodates
distances_plus_same_type_plus_accom_binary <- ifelse(distances_plus_same_type_plus_accom == 3, 1, 0)
graph_distance_room_type_accom <- graph_from_adjacency_matrix(distances_plus_same_type_plus_accom_binary, 
                                                              mode='undirected', diag = FALSE)
clusters_distance_room_type_accom <- membership(cluster_louvain(graph_distance_room_type_accom))

clusters_distance_room_type_accom_fg <- cluster_fast_greedy(graph_distance_room_type_accom)

In [34]:
clusters_distance_room_type_accom_100 <- cutree(as.hclust(clusters_distance_room_type_accom_fg), k=100)
clusters_distance_room_type_accom_200 <- cutree(as.hclust(clusters_distance_room_type_accom_fg), k=200)
clusters_distance_room_type_accom_500 <- cutree(as.hclust(clusters_distance_room_type_accom_fg), k=500)
clusters_distance_room_type_accom_1000 <- cutree(as.hclust(clusters_distance_room_type_accom_fg), k=1000)

graph_distance_room_type_accom_rw_01 <- rewire(graph_distance_room_type_accom, each_edge(prob = 0.01))
graph_distance_room_type_accom_rw_02 <- rewire(graph_distance_room_type_accom, each_edge(prob = 0.02))
graph_distance_room_type_accom_rw_05 <- rewire(graph_distance_room_type_accom, each_edge(prob = 0.05))
graph_distance_room_type_accom_rw_10 <- rewire(graph_distance_room_type_accom, each_edge(prob = 0.1))
graph_distance_room_type_accom_rw_15 <- rewire(graph_distance_room_type_accom, each_edge(prob = 0.15))

clusters_distance_room_type_accom_rw_01 <- membership(cluster_louvain(graph_distance_room_type_accom_rw_01))
clusters_distance_room_type_accom_rw_02 <- membership(cluster_louvain(graph_distance_room_type_accom_rw_02))
clusters_distance_room_type_accom_rw_05 <- membership(cluster_louvain(graph_distance_room_type_accom_rw_05))
clusters_distance_room_type_accom_rw_10 <- membership(cluster_louvain(graph_distance_room_type_accom_rw_10))
clusters_distance_room_type_accom_rw_15 <- membership(cluster_louvain(graph_distance_room_type_accom_rw_15))

In [35]:
airbnb_listings %>%
  mutate(`Entire home/apt` = ifelse(as.character(room_type) == 'Entire home/apt', 1, 0),
         `Private room` = ifelse(as.character(room_type) == 'Private room', 1, 0),
         `Shared room` = ifelse(as.character(room_type) == 'Shared room', 1, 0)) %>%
  dplyr::select(room_id, host_id, reviews, overall_satisfaction, 
                accommodates, bedrooms, bathrooms, price, minstay, latitude, longitude,
                `Entire home/apt`, `Private room`, `Shared room`) -> airbnb_listings

airbnb_listings %>% 
  mutate(reviews_scaled = as.numeric(scale(reviews, scale=TRUE)),
         satisfaction_scaled = as.numeric(scale(overall_satisfaction, scale=TRUE)),
         accomm_scaled = as.numeric(scale(overall_satisfaction, scale=TRUE)),
         bed_scaled = as.numeric(scale(bedrooms, scale=TRUE)),
         bath_scaled = as.numeric(scale(bathrooms, scale=TRUE)),
         minstay_scaled = as.numeric(scale(minstay, scale=TRUE))) -> airbnb_listings

In [36]:
df_for_blocking <- airbnb_listings
df_for_blocking$cluster_assignment <- clusters_distance_room_type_accom
df_for_blocking$cluster_assignment_100 <- clusters_distance_room_type_accom_100
df_for_blocking$cluster_assignment_200 <- clusters_distance_room_type_accom_200
df_for_blocking$cluster_assignment_500 <- clusters_distance_room_type_accom_500
df_for_blocking$cluster_assignment_1000 <- clusters_distance_room_type_accom_1000
df_for_blocking$cluster_assignment_rw_01 <- clusters_distance_room_type_accom_rw_01
df_for_blocking$cluster_assignment_rw_02 <- clusters_distance_room_type_accom_rw_02
df_for_blocking$cluster_assignment_rw_05 <- clusters_distance_room_type_accom_rw_05
df_for_blocking$cluster_assignment_rw_10 <- clusters_distance_room_type_accom_rw_10
df_for_blocking$cluster_assignment_rw_15 <- clusters_distance_room_type_accom_rw_15

df_for_blocking_base <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment'))
df_for_blocking_base_100 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_100'))
df_for_blocking_base_200 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_200'))
df_for_blocking_base_500 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_500'))
df_for_blocking_base_1000 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_1000'))
df_for_blocking_base_rw_01 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_rw_01'))
df_for_blocking_base_rw_02 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_rw_02'))
df_for_blocking_base_rw_05 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_rw_05'))
df_for_blocking_base_rw_10 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_rw_10'))
df_for_blocking_base_rw_15 <- create_df_for_blocking(df_for_blocking, sym('cluster_assignment_rw_15'))

save(df_for_blocking_base, df_for_blocking_base_100, df_for_blocking_base_200,
     df_for_blocking_base_500, df_for_blocking_base_1000, df_for_blocking_base_rw_01,
     df_for_blocking_base_rw_02, df_for_blocking_base_rw_05, df_for_blocking_base_rw_10,
     df_for_blocking_base_rw_15, file=paste0(
         c(data_directory, 'df_for_blocking.Rdata'),
         sep='', collapse=''
     ))

In [37]:
blocks_base <- generate_blocks(df_for_blocking_base, 'cluster_assignment')
blocks_base_100 <- generate_blocks(df_for_blocking_base_100, 'cluster_assignment_100')
blocks_base_200 <- generate_blocks(df_for_blocking_base_200, 'cluster_assignment_200')
blocks_base_500 <- generate_blocks(df_for_blocking_base_500, 'cluster_assignment_500')
blocks_base_1000 <- generate_blocks(df_for_blocking_base_1000, 'cluster_assignment_1000')
blocks_base_rw_01 <- generate_blocks(df_for_blocking_base_rw_01, 'cluster_assignment_rw_01')
blocks_base_rw_02 <- generate_blocks(df_for_blocking_base_rw_02, 'cluster_assignment_rw_02')
blocks_base_rw_05 <- generate_blocks(df_for_blocking_base_rw_05, 'cluster_assignment_rw_05')
blocks_base_rw_10 <- generate_blocks(df_for_blocking_base_rw_10, 'cluster_assignment_rw_10')
blocks_base_rw_15 <- generate_blocks(df_for_blocking_base_rw_15, 'cluster_assignment_rw_15')

In [62]:
hajek_probabilities_50_50_drta_base <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom, 
                                                      graph_distance_room_type_accom, .5, .5, 200, 20, blocks_base)

hajek_probabilities_75_50_drta_base <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom, 
                                                 graph_distance_room_type_accom, .75, .5, 200, 20, blocks_base)

hajek_probabilities_95_50_drta_base <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom, 
                                                      graph_distance_room_type_accom, .95, .5, 100, 20, blocks_base)

save(hajek_probabilities_50_50_drta_base, hajek_probabilities_75_50_drta_base, 
     hajek_probabilities_95_50_drta_base, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base.Rdata'),
         sep='', collapse=''
     ))

In [63]:
hajek_probabilities_50_50_drta_base_100 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_100, 
                                                      graph_distance_room_type_accom, .5, .5, 200, 20, blocks_base_100)

hajek_probabilities_75_50_drta_base_100 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_100, 
                                                 graph_distance_room_type_accom, .75, .5, 200, 20, blocks_base_100)

hajek_probabilities_95_50_drta_base_100 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_100, 
                                                      graph_distance_room_type_accom, .95, .5, 100, 20, blocks_base_100)

save(hajek_probabilities_50_50_drta_base_100, hajek_probabilities_75_50_drta_base_100, 
     hajek_probabilities_95_50_drta_base_100, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_100.Rdata'),
         sep='', collapse=''
     ))

In [64]:
hajek_probabilities_50_50_drta_base_200 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_200, 
                                                      graph_distance_room_type_accom, .5, .5, 200, 20, blocks_base_200)

hajek_probabilities_75_50_drta_base_200 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_200, 
                                                 graph_distance_room_type_accom, .75, .5, 200, 20, blocks_base_200)

hajek_probabilities_95_50_drta_base_200 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_200, 
                                                      graph_distance_room_type_accom, .95, .5, 100, 20, blocks_base_200)

save(hajek_probabilities_50_50_drta_base_200, hajek_probabilities_75_50_drta_base_200, 
     hajek_probabilities_95_50_drta_base_200, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_200.Rdata'),
         sep='', collapse=''
     ))

In [65]:
hajek_probabilities_50_50_drta_base_500 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_500, 
                                                      graph_distance_room_type_accom, .5, .5, 200, 20, blocks_base_500)

hajek_probabilities_75_50_drta_base_500 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_500, 
                                                 graph_distance_room_type_accom, .75, .5, 200, 20, blocks_base_500)

hajek_probabilities_95_50_drta_base_500 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_500, 
                                                      graph_distance_room_type_accom, .95, .5, 100, 20, blocks_base_500)

save(hajek_probabilities_50_50_drta_base_500, hajek_probabilities_75_50_drta_base_500, 
     hajek_probabilities_95_50_drta_base_500, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_500.Rdata'),
         sep='', collapse=''
     ))

In [66]:
hajek_probabilities_50_50_drta_base_1000 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_1000, 
                                                      graph_distance_room_type_accom, .5, .5, 200, 20, blocks_base_1000)

hajek_probabilities_75_50_drta_base_1000 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_1000, 
                                                 graph_distance_room_type_accom, .75, .5, 200, 20, blocks_base_1000)

hajek_probabilities_95_50_drta_base_1000 <- hajek_probabilities(airbnb_listings, clusters_distance_room_type_accom_1000, 
                                                      graph_distance_room_type_accom, .95, .5, 100, 20, blocks_base_1000)

save(hajek_probabilities_50_50_drta_base_1000, hajek_probabilities_75_50_drta_base_1000, 
     hajek_probabilities_95_50_drta_base_1000, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_1000.Rdata'),
         sep='', collapse=''
     ))

In [67]:
hajek_probabilities_50_50_drta_base_rw_01 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_01, 
                                                                 graph_distance_room_type_accom_rw_01, 
                                                                 .5, .5, 200, 20, blocks_base_rw_01)

hajek_probabilities_75_50_drta_base_rw_01 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_01, 
                                                                 graph_distance_room_type_accom_rw_01, 
                                                                 .75, .5, 200, 20, blocks_base_rw_01)

hajek_probabilities_95_50_drta_base_rw_01 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_01, 
                                                                 graph_distance_room_type_accom_rw_01, 
                                                                 .95, .5, 100, 20, blocks_base_rw_01)

save(hajek_probabilities_50_50_drta_base_rw_01, hajek_probabilities_75_50_drta_base_rw_01, 
     hajek_probabilities_95_50_drta_base_rw_01, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_rw_01.Rdata'),
         sep='', collapse=''
     ))

In [68]:
hajek_probabilities_50_50_drta_base_rw_02 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_02, 
                                                                 graph_distance_room_type_accom_rw_02, 
                                                                 .5, .5, 200, 20, blocks_base_rw_02)

hajek_probabilities_75_50_drta_base_rw_02 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_02, 
                                                                 graph_distance_room_type_accom_rw_02, 
                                                                 .75, .5, 200, 20, blocks_base_rw_02)

hajek_probabilities_95_50_drta_base_rw_02 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_02, 
                                                                 graph_distance_room_type_accom_rw_02, 
                                                                 .95, .5, 100, 20, blocks_base_rw_02)

save(hajek_probabilities_50_50_drta_base_rw_02, hajek_probabilities_75_50_drta_base_rw_02, 
     hajek_probabilities_95_50_drta_base_rw_02, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_rw_02.Rdata'),
         sep='', collapse=''
     ))

In [69]:
hajek_probabilities_50_50_drta_base_rw_05 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_05, 
                                                                 graph_distance_room_type_accom_rw_05, 
                                                                 .5, .5, 200, 20, blocks_base_rw_05)

hajek_probabilities_75_50_drta_base_rw_05 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_05, 
                                                                 graph_distance_room_type_accom_rw_05, 
                                                                 .75, .5, 200, 20, blocks_base_rw_05)

hajek_probabilities_95_50_drta_base_rw_05 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_05, 
                                                                 graph_distance_room_type_accom_rw_05, 
                                                                 .95, .5, 100, 20, blocks_base_rw_05)

save(hajek_probabilities_50_50_drta_base_rw_05, hajek_probabilities_75_50_drta_base_rw_05, 
     hajek_probabilities_95_50_drta_base_rw_05, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_rw_05.Rdata'),
         sep='', collapse=''
     ))

In [70]:
hajek_probabilities_50_50_drta_base_rw_10 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_10, 
                                                                 graph_distance_room_type_accom_rw_10, 
                                                                 .5, .5, 200, 20, blocks_base_rw_10)

hajek_probabilities_75_50_drta_base_rw_10 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_10, 
                                                                 graph_distance_room_type_accom_rw_10, 
                                                                 .75, .5, 200, 20, blocks_base_rw_10)

hajek_probabilities_95_50_drta_base_rw_10 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_10, 
                                                                 graph_distance_room_type_accom_rw_10, 
                                                                 .95, .5, 100, 20, blocks_base_rw_10)

save(hajek_probabilities_50_50_drta_base_rw_10, hajek_probabilities_75_50_drta_base_rw_10, 
     hajek_probabilities_95_50_drta_base_rw_10, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_rw_10.Rdata'),
         sep='', collapse=''
     ))

In [71]:
hajek_probabilities_50_50_drta_base_rw_15 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_15, 
                                                                 graph_distance_room_type_accom_rw_15, 
                                                                 .5, .5, 200, 20, blocks_base_rw_15)

hajek_probabilities_75_50_drta_base_rw_15 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_15, 
                                                                 graph_distance_room_type_accom_rw_15, 
                                                                 .75, .5, 200, 20, blocks_base_rw_15)

hajek_probabilities_95_50_drta_base_rw_15 <- hajek_probabilities(airbnb_listings, 
                                                                 clusters_distance_room_type_accom_rw_15, 
                                                                 graph_distance_room_type_accom_rw_15, 
                                                                 .95, .5, 100, 20, blocks_base_rw_15)

save(hajek_probabilities_50_50_drta_base_rw_15, hajek_probabilities_75_50_drta_base_rw_15, 
     hajek_probabilities_95_50_drta_base_rw_15, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_rw_15.Rdata'),
         sep='', collapse=''
     ))

In [59]:
airbnb_listings$prob_treat_50_thresh_base <- diag(hajek_probabilities_50_50_drta_base[[1]])
airbnb_listings$prob_control_50_thresh_base <- diag(hajek_probabilities_50_50_drta_base[[2]])
airbnb_listings$prob_treat_75_thresh_base <- diag(hajek_probabilities_75_50_drta_base[[1]])
airbnb_listings$prob_control_75_thresh_base <- diag(hajek_probabilities_75_50_drta_base[[2]])
airbnb_listings$prob_treat_95_thresh_base <- diag(hajek_probabilities_95_50_drta_base[[1]])
airbnb_listings$prob_control_95_thresh_base <- diag(hajek_probabilities_95_50_drta_base[[2]])

airbnb_listings$prob_treat_50_thresh_base_100 <- diag(hajek_probabilities_50_50_drta_base_100[[1]])
airbnb_listings$prob_control_50_thresh_base_100 <- diag(hajek_probabilities_50_50_drta_base_100[[2]])
airbnb_listings$prob_treat_75_thresh_base_100 <- diag(hajek_probabilities_75_50_drta_base_100[[1]])
airbnb_listings$prob_control_75_thresh_base_100 <- diag(hajek_probabilities_75_50_drta_base_100[[2]])
airbnb_listings$prob_treat_95_thresh_base_100 <- diag(hajek_probabilities_95_50_drta_base_100[[1]])
airbnb_listings$prob_control_95_thresh_base_100 <- diag(hajek_probabilities_95_50_drta_base_100[[2]])

airbnb_listings$prob_treat_50_thresh_base_200 <- diag(hajek_probabilities_50_50_drta_base_200[[1]])
airbnb_listings$prob_control_50_thresh_base_200 <- diag(hajek_probabilities_50_50_drta_base_200[[2]])
airbnb_listings$prob_treat_75_thresh_base_200 <- diag(hajek_probabilities_75_50_drta_base_200[[1]])
airbnb_listings$prob_control_75_thresh_base_200 <- diag(hajek_probabilities_75_50_drta_base_200[[2]])
airbnb_listings$prob_treat_95_thresh_base_200 <- diag(hajek_probabilities_95_50_drta_base_200[[1]])
airbnb_listings$prob_control_95_thresh_base_200 <- diag(hajek_probabilities_95_50_drta_base_200[[2]])

airbnb_listings$prob_treat_50_thresh_base_500 <- diag(hajek_probabilities_50_50_drta_base_500[[1]])
airbnb_listings$prob_control_50_thresh_base_500 <- diag(hajek_probabilities_50_50_drta_base_500[[2]])
airbnb_listings$prob_treat_75_thresh_base_500 <- diag(hajek_probabilities_75_50_drta_base_500[[1]])
airbnb_listings$prob_control_75_thresh_base_500 <- diag(hajek_probabilities_75_50_drta_base_500[[2]])
airbnb_listings$prob_treat_95_thresh_base_500 <- diag(hajek_probabilities_95_50_drta_base_500[[1]])
airbnb_listings$prob_control_95_thresh_base_500 <- diag(hajek_probabilities_95_50_drta_base_500[[2]])

airbnb_listings$prob_treat_50_thresh_base_1000 <- diag(hajek_probabilities_50_50_drta_base_1000[[1]])
airbnb_listings$prob_control_50_thresh_base_1000 <- diag(hajek_probabilities_50_50_drta_base_1000[[2]])
airbnb_listings$prob_treat_75_thresh_base_1000 <- diag(hajek_probabilities_75_50_drta_base_1000[[1]])
airbnb_listings$prob_control_75_thresh_base_1000 <- diag(hajek_probabilities_75_50_drta_base_1000[[2]])
airbnb_listings$prob_treat_95_thresh_base_1000 <- diag(hajek_probabilities_95_50_drta_base_1000[[1]])
airbnb_listings$prob_control_95_thresh_base_1000 <- diag(hajek_probabilities_95_50_drta_base_1000[[2]])

airbnb_listings$prob_treat_50_thresh_base_rw_01 <- diag(hajek_probabilities_50_50_drta_base_rw_01[[1]])
airbnb_listings$prob_control_50_thresh_base_rw_01 <- diag(hajek_probabilities_50_50_drta_base_rw_01[[2]])
airbnb_listings$prob_treat_75_thresh_base_rw_01 <- diag(hajek_probabilities_75_50_drta_base_rw_01[[1]])
airbnb_listings$prob_control_75_thresh_base_rw_01 <- diag(hajek_probabilities_75_50_drta_base_rw_01[[2]])
airbnb_listings$prob_treat_95_thresh_base_rw_01 <- diag(hajek_probabilities_95_50_drta_base_rw_01[[1]])
airbnb_listings$prob_control_95_thresh_base_rw_01 <- diag(hajek_probabilities_95_50_drta_base_rw_01[[2]])

airbnb_listings$prob_treat_50_thresh_base_rw_02 <- diag(hajek_probabilities_50_50_drta_base_rw_02[[1]])
airbnb_listings$prob_control_50_thresh_base_rw_02 <- diag(hajek_probabilities_50_50_drta_base_rw_02[[2]])
airbnb_listings$prob_treat_75_thresh_base_rw_02 <- diag(hajek_probabilities_75_50_drta_base_rw_02[[1]])
airbnb_listings$prob_control_75_thresh_base_rw_02 <- diag(hajek_probabilities_75_50_drta_base_rw_02[[2]])
airbnb_listings$prob_treat_95_thresh_base_rw_02 <- diag(hajek_probabilities_95_50_drta_base_rw_02[[1]])
airbnb_listings$prob_control_95_thresh_base_rw_02 <- diag(hajek_probabilities_95_50_drta_base_rw_02[[2]])

airbnb_listings$prob_treat_50_thresh_base_rw_05 <- diag(hajek_probabilities_50_50_drta_base_rw_05[[1]])
airbnb_listings$prob_control_50_thresh_base_rw_05 <- diag(hajek_probabilities_50_50_drta_base_rw_05[[2]])
airbnb_listings$prob_treat_75_thresh_base_rw_05 <- diag(hajek_probabilities_75_50_drta_base_rw_05[[1]])
airbnb_listings$prob_control_75_thresh_base_rw_05 <- diag(hajek_probabilities_75_50_drta_base_rw_05[[2]])
airbnb_listings$prob_treat_95_thresh_base_rw_05 <- diag(hajek_probabilities_95_50_drta_base_rw_05[[1]])
airbnb_listings$prob_control_95_thresh_base_rw_05 <- diag(hajek_probabilities_95_50_drta_base_rw_05[[2]])

airbnb_listings$prob_treat_50_thresh_base_rw_10 <- diag(hajek_probabilities_50_50_drta_base_rw_10[[1]])
airbnb_listings$prob_control_50_thresh_base_rw_10 <- diag(hajek_probabilities_50_50_drta_base_rw_10[[2]])
airbnb_listings$prob_treat_75_thresh_base_rw_10 <- diag(hajek_probabilities_75_50_drta_base_rw_10[[1]])
airbnb_listings$prob_control_75_thresh_base_rw_10 <- diag(hajek_probabilities_75_50_drta_base_rw_10[[2]])
airbnb_listings$prob_treat_95_thresh_base_rw_10 <- diag(hajek_probabilities_95_50_drta_base_rw_10[[1]])
airbnb_listings$prob_control_95_thresh_base_rw_10 <- diag(hajek_probabilities_95_50_drta_base_rw_10[[2]])

airbnb_listings$prob_treat_50_thresh_base_rw_15 <- diag(hajek_probabilities_50_50_drta_base_rw_15[[1]])
airbnb_listings$prob_control_50_thresh_base_rw_15 <- diag(hajek_probabilities_50_50_drta_base_rw_15[[2]])
airbnb_listings$prob_treat_75_thresh_base_rw_15 <- diag(hajek_probabilities_75_50_drta_base_rw_15[[1]])
airbnb_listings$prob_control_75_thresh_base_rw_15 <- diag(hajek_probabilities_75_50_drta_base_rw_15[[2]])
airbnb_listings$prob_treat_95_thresh_base_rw_15 <- diag(hajek_probabilities_95_50_drta_base_rw_15[[1]])
airbnb_listings$prob_control_95_thresh_base_rw_15 <- diag(hajek_probabilities_95_50_drta_base_rw_15[[2]])

In [60]:
save(airbnb_listings, 
     clusters_distance_room_type_accom, 
     graph_distance_room_type_accom, 
     clusters_distance_room_type, 
     graph_distance_room_type, 
     graph_distance_only, 
     clusters_distance_only,
     blocks_base,
     blocks_base_100,
     blocks_base_200,
     blocks_base_500,
     blocks_base_1000,
     blocks_base_rw_01,
     blocks_base_rw_02,
     blocks_base_rw_05,
     blocks_base_rw_10,
     blocks_base_rw_15, 
     df_for_blocking, 
     df_for_blocking_base,
     df_for_blocking_base_100,
     df_for_blocking_base_200,
     df_for_blocking_base_500,
     df_for_blocking_base_1000,
     df_for_blocking_base_rw_01,
     df_for_blocking_base_rw_02,
     df_for_blocking_base_rw_05,
     df_for_blocking_base_rw_10,
     df_for_blocking_base_rw_15,
     clusters_distance_room_type_accom_100,
     clusters_distance_room_type_accom_200,
     clusters_distance_room_type_accom_500,
     clusters_distance_room_type_accom_1000, 
     graph_distance_room_type_accom_rw_01,
     graph_distance_room_type_accom_rw_02,
     graph_distance_room_type_accom_rw_05,
     graph_distance_room_type_accom_rw_10,
     graph_distance_room_type_accom_rw_15,
     clusters_distance_room_type_accom_rw_01,
     clusters_distance_room_type_accom_rw_02,
     clusters_distance_room_type_accom_rw_05,
     clusters_distance_room_type_accom_rw_10,
     clusters_distance_room_type_accom_rw_15,
     clusters_distance_room_type_accom_fg,
     file=paste0(
         c(data_directory, 'data_for_simulation_blocked.Rdata'),
         sep='', collapse=''
     ))

In [29]:
airbnb_listings_washington <- read.csv('Airbnb_Listings_Washington.csv')

In [31]:
airbnb_listings_washington$accommodates <- imputeMode(airbnb_listings_washington$accommodates)
airbnb_listings_washington$bedrooms <- imputeMode(airbnb_listings_washington$bedrooms)
airbnb_listings_washington$bathrooms <- imputeMode(airbnb_listings_washington$bathrooms)
airbnb_listings_washington$minstay[airbnb_listings_washington$minstay > 30] <- 30
airbnb_listings_washington$minstay <- imputeMode(airbnb_listings_washington$minstay)
airbnb_listings_washington$overall_satisfaction <- imputeMean(airbnb_listings_washington$overall_satisfaction)

airbnb_listings_washington$accommodates[is.na(airbnb_listings_washington$accommodates)] <- Mode(airbnb_listings_washington$accommodates)

In [19]:
distances_washington <- matrix(, nrow=nrow(airbnb_listings_washington), ncol=nrow(airbnb_listings_washington))
room_types_washington <- matrix(, nrow=nrow(airbnb_listings_washington), ncol = nrow(airbnb_listings_washington))
accommodates_washington <- matrix(, nrow=nrow(airbnb_listings_washington), ncol = nrow(airbnb_listings_washington))


for (i in 1:nrow(airbnb_listings_washington)) {
  for (j in i:nrow(airbnb_listings_washington)) {
    distances_washington[i, j] <- distHaversine(c(airbnb_listings_washington$longitude[i], 
                                       airbnb_listings_washington$latitude[i]), 
                                     c(airbnb_listings_washington$longitude[j], 
                                       airbnb_listings_washington$latitude[j]))
    distances_washington[j, i] <- distances_washington[i, j]
    room_types_washington[i, j] <- ifelse(airbnb_listings_washington$room_type[i] == 
                                          airbnb_listings_washington$room_type[j], 1, 0)
    room_types_washington[j, i] <- room_types_washington[i, j]
    accommodates_washington[i, j] <- ifelse(abs(airbnb_listings_washington$accommodates[i] - 
                                                airbnb_listings_washington$accommodates[j]) <= 1, 1, 0)
    accommodates_washington[j, i] <- accommodates_washington[i, j]
  }
  if(i %% 1000 == 0) {
      message(as.character(i))
  }
}

1000

2000

3000

4000



In [22]:
save(accommodates_washington, distances_washington, room_types_washington, file=paste0(
         c(data_directory, 'parsed_data_washington.Rdata'),
         sep='', collapse=''
     ))

In [41]:
set.seed(158402)

In [42]:
### Create graph and communities for distance only graph
distances_binary_washington <- ifelse(distances_washington <= mile_in_meters, 1, 0)
graph_distance_only_washington <- graph_from_adjacency_matrix(distances_binary_washington, mode = 'undirected', diag = FALSE)
clusters_distance_only_washington <- membership(cluster_louvain(graph_distance_only_washington))

### Create graph and communities for distance plus same room type graph
distances_plus_same_type_washington <- distances_binary_washington + room_types_washington
distances_plus_same_type_binary_washington <- ifelse(distances_plus_same_type_washington == 2, 1, 0)
graph_distance_room_type_washington <- graph_from_adjacency_matrix(distances_plus_same_type_binary_washington, 
                                                                   mode='undirected',
                                                        diag = FALSE)
clusters_distance_room_type_washington <- membership(cluster_louvain(graph_distance_room_type_washington))

### Create graph and communities for distance plus same room type plus same size graph
distances_plus_same_type_plus_accom_washington <- distances_plus_same_type_washington + accommodates_washington
distances_plus_same_type_plus_accom_binary_washington <- ifelse(distances_plus_same_type_plus_accom_washington == 3, 1, 0)
graph_distance_room_type_accom_washington <- graph_from_adjacency_matrix(distances_plus_same_type_plus_accom_binary_washington, 
                                                              mode='undirected', diag = FALSE)
clusters_distance_room_type_accom_washington <- membership(cluster_louvain(graph_distance_room_type_accom_washington))

In [43]:
airbnb_listings_washington %>%
  mutate(`Entire home/apt` = ifelse(as.character(room_type) == 'Entire home/apt', 1, 0),
         `Private room` = ifelse(as.character(room_type) == 'Private room', 1, 0),
         `Shared room` = ifelse(as.character(room_type) == 'Shared room', 1, 0)) %>%
  dplyr::select(room_id, host_id, reviews, overall_satisfaction, 
                accommodates, bedrooms, price, minstay, latitude, longitude,
                `Entire home/apt`, `Private room`, `Shared room`) -> airbnb_listings_washington

airbnb_listings_washington %>% 
  mutate(reviews_scaled = as.numeric(scale(reviews, scale=TRUE)),
         satisfaction_scaled = as.numeric(scale(overall_satisfaction, scale=TRUE)),
         accomm_scaled = as.numeric(scale(overall_satisfaction, scale=TRUE)),
         bed_scaled = as.numeric(scale(bedrooms, scale=TRUE)),
         minstay_scaled = as.numeric(scale(minstay, scale=TRUE))) -> airbnb_listings_washington

ERROR: Error in ifelse(as.character(room_type) == "Entire home/apt", 1, 0): object 'room_type' not found


In [49]:
save(airbnb_listings_washington, 
     clusters_distance_room_type_accom_washington, 
     graph_distance_room_type_accom_washington, 
     clusters_distance_room_type_washington, 
     graph_distance_room_type_washington, 
     graph_distance_only_washington, 
     clusters_distance_only_washington,
     blocks_base_washington,
     df_for_blocking_washington,
     file=paste0(
         c(data_directory, 'data_for_simulation_blocked_washington.Rdata'),
         sep='', collapse=''
     ))

In [44]:
df_for_blocking_washington <- airbnb_listings_washington
df_for_blocking_washington$cluster_assignment <- clusters_distance_room_type_accom_washington

df_for_blocking_base_washington <- create_df_for_blocking_wash(df_for_blocking_washington, sym('cluster_assignment'))

save(df_for_blocking_base_washington, file=paste0(
         c(data_directory, 'df_for_blocking_washington.Rdata'),
         sep='', collapse=''
     ))

In [45]:
blocks_base_washington <- generate_blocks_wash(df_for_blocking_base_washington, 'cluster_assignment')

In [46]:
hajek_probabilities_50_50_drta_base_wash <- hajek_probabilities(airbnb_listings_washington, 
                                                                clusters_distance_room_type_accom_washington, 
                                                                graph_distance_room_type_accom_washington, 
                                                                .5, .5, 200, 20, blocks_base_washington)

hajek_probabilities_75_50_drta_base_wash <- hajek_probabilities(airbnb_listings_washington, 
                                                                clusters_distance_room_type_accom_washington, 
                                                                graph_distance_room_type_accom_washington, 
                                                                .75, .5, 200, 20, blocks_base_washington)

hajek_probabilities_95_50_drta_base_wash <- hajek_probabilities(airbnb_listings_washington, 
                                                                clusters_distance_room_type_accom_washington, 
                                                                graph_distance_room_type_accom_washington, 
                                                                .95, .5, 100, 20, blocks_base_washington)

save(hajek_probabilities_50_50_drta_base_wash, hajek_probabilities_75_50_drta_base_wash, 
     hajek_probabilities_95_50_drta_base_wash, file=paste0(
         c(data_directory, 'hajek_probabilities_blocked_base_wash.Rdata'),
         sep='', collapse=''
     ))

In [51]:
airbnb_listings_washington$prob_treat_50_thresh_base <- diag(hajek_probabilities_50_50_drta_base_wash[[1]])
airbnb_listings_washington$prob_control_50_thresh_base <- diag(hajek_probabilities_50_50_drta_base_wash[[2]])
airbnb_listings_washington$prob_treat_75_thresh_base <- diag(hajek_probabilities_75_50_drta_base_wash[[1]])
airbnb_listings_washington$prob_control_75_thresh_base <- diag(hajek_probabilities_75_50_drta_base_wash[[2]])
airbnb_listings_washington$prob_treat_95_thresh_base <- diag(hajek_probabilities_95_50_drta_base_wash[[1]])
airbnb_listings_washington$prob_control_95_thresh_base <- diag(hajek_probabilities_95_50_drta_base_wash[[2]])

In [52]:
save(airbnb_listings_washington, 
     clusters_distance_room_type_accom_washington, 
     graph_distance_room_type_accom_washington, 
     clusters_distance_room_type_washington, 
     graph_distance_room_type_washington, 
     graph_distance_only_washington, 
     clusters_distance_only_washington,
     blocks_base_washington,
     df_for_blocking_washington,
     file=paste0(
         c(data_directory, 'data_for_simulation_blocked_washington.Rdata'),
         sep='', collapse=''
     ))