# Wikipedia Thanks-Receiver Study Randomization 
[J. Nathan Matias](https://twitter.com/natematias)
April 27, 2019

This code takes as input data described in the [randomization data format](https://docs.google.com/document/d/1plhoDbQryYQ32vZMXu8YmlLSp30QTdup43k6uTePOT4/edit?usp=drive_web&ouid=117701977297551627494) and produces randomizations for the thanks-receiver study.

In [7]:
## Session setup: printing options, package loading, and a record of the
## package versions used for this run.

## scipen biases printing towards fixed (non-scientific) notation;
## digits sets the number of significant digits printed.
options("scipen"=9, "digits"=4)
library(ggplot2)
library(rlang)
library(tidyverse)
library(viridis)
## Packages used for blocking (block, createBlockIDs) and assignment (block_ra)
library(blockTools)
library(blockrand)
library(DeclareDesign)
## Plot size options; presumably read by the Jupyter R kernel (repr) -- confirm
options(repr.plot.width=7, repr.plot.height=3.5)
## Print R, platform and attached-package versions for reproducibility
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.5 LTS

Matrix products: default
BLAS: /usr/lib/libblas/libblas.so.3.6.0
LAPACK: /usr/lib/lapack/liblapack.so.3.6.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] DeclareDesign_0.12.0 estimatr_0.14        fabricatr_0.6.0     
 [4] randomizr_0.16.1     blockrand_1.3        blockTools_0.6-3    
 [7] viridis_0.5.1        viridisLite_0.3.0    forcats_0.3.0       
[10] stringr_1.3.1        dplyr_0.7.8          purrr_0.2.5         
[13] readr_1.2.1          tid

# Load Input Dataframe

In [None]:
## Directory holding the study datasets, and the name of the CSV to load.
## "FILENAME" is a placeholder to be replaced before running this cell.
data.path <- "~/Tresors/CivilServant/projects/wikipedia-integration/gratitude-study/datasets/power_analysis"
filename <- "FILENAME"

## Read the input CSV into the dataframe used by the rest of the notebook
recipient.df <- read.csv(
  file = file.path(data.path, filename)
)

### Remove values more than 2.58 standard deviations above the mean (upper bound of the 99% interval)

In [None]:
## CALCULATE MEANS AND SDs before making any removals
df.labor.mean <- mean(recipient.df$labor_hours_84_days_pre_sample)
df.labor.sd   <- sd(recipient.df$labor_hours_84_days_pre_sample)

## Upper cutoff: 2.58 standard deviations above the mean. Computed once so
## the reported count and the rows actually removed use the same threshold.
df.labor.cutoff <- df.labor.mean + 2.58 * df.labor.sd

## Report how many rows exceed the cutoff. nrow() takes a single argument,
## so the count is computed on its own and then pasted into the message.
print(paste("Removing",
            nrow(subset(recipient.df,
                        labor_hours_84_days_pre_sample > df.labor.cutoff)),
            "observations because labor_hours_84_days_pre_sample is an outlier."))

## Keep only the rows at or below the cutoff
recipient.df.final <- subset(recipient.df,
                             labor_hours_84_days_pre_sample <= df.labor.cutoff)

# Review and Generate Variables

In [None]:
## Add the newcomer field: TRUE where prev_experience is 0
recipient.df.final$newcomer <- recipient.df.final$prev_experience == 0

## Update the has_email field to a logical.
## read.csv() parses a column that contains only True/False values as a
## logical vector, and a logical compared with "True" is always FALSE
## (TRUE is coerced to "TRUE"). Only convert when the column is not already
## logical; this also keeps the cell safe to re-run.
if (!is.logical(recipient.df.final$has_email)) {
  recipient.df.final$has_email <- recipient.df.final$has_email == "True"
}

## PREVIOUS EXPERIENCE: number of rows at each prev_experience level
print("prev_experience")
print(summary(factor(recipient.df.final$prev_experience)))
cat("\n")

## SHOW LABOR HOURS BY EXPERIENCE GROUP:
print("Aggregate labor_hours_84_days_pre_sample")
print(aggregate(recipient.df.final[c("labor_hours_84_days_pre_sample")],
          FUN=mean, by = list(recipient.df.final$prev_experience)))
cat("\n")

## Cross-tabulate has_email against newcomer, with column proportions only.
## NOTE(review): CrossTable() is not defined by any package loaded at the top
## of this notebook; presumably gmodels::CrossTable -- confirm and load it.
print("NEWCOMERS AND EMAILS")
print("--------------------")
print(CrossTable(recipient.df.final$has_email, recipient.df.final$newcomer, 
       prop.r = FALSE, prop.c=TRUE, prop.t = FALSE, prop.chisq = FALSE))

# VARIABLE: num_prev_thanks_pre_sample
print("num_prev_thanks_pre_sample")
print(summary(recipient.df.final$num_prev_thanks_pre_sample))
cat("\n")
    
## SHOW PREVIOUS THANKS BY EXPERIENCE GROUP:
print("num_prev_thanks_pre_sample by prev_experience")
print(aggregate(recipient.df.final[c("num_prev_thanks_pre_sample")],
      FUN=mean, by = list(recipient.df.final$prev_experience)))
cat("\n")

# Generate Randomization Blocks

In [None]:
## BLOCKING VARIABLES
## These are the covariate columns used everywhere else in this notebook.
## NOTE(review): the previous names (labor_hours_90_pre_treatment,
## num_prev_thanks_pre_treatment) appear nowhere else in this file --
## confirm against the randomization data format before running.
bv <- c("labor_hours_84_days_pre_sample", "num_prev_thanks_pre_sample")
#bv <- c("labor_hours_84_days_pre_sample", "num_prev_thanks_pre_sample", "has_email")

## Number of units per block (one per treatment arm)
block.size <- 2

## Fail early, with a readable message, if a required column is absent
stopifnot(all(c("id", "prev_experience", bv) %in% names(recipient.df.final)))

## TODO: CHECK TO SEE IF I CAN DO BALANCED RANDOMIZATION
## WITHIN BLOCKS LARGER THAN 2
## Block units within each prev_experience group on Mahalanobis distance
## over the blocking variables. `groups` takes the NAME of the grouping
## column in `data`, not the vector of group values.
blockobj <- block(data = recipient.df.final,
                  n.tr = block.size,
                  groups = "prev_experience",
                  id.vars = "id",
                  block.vars = bv,
                  distance = "mahalanobis"
                  )
## CHECK DISTANCES
print(blockobj)

## Attach each unit's block identifier and the block size to the dataframe
recipient.df.final$randomization_block_id <- createBlockIDs(blockobj,
                                   data = recipient.df.final,
                                   id.var = "id")
recipient.df.final$randomization_block_size <- block.size

# Generate Randomizations

In [11]:
## Fix the random number generator state so that the random assignment
## drawn after this point can be reproduced exactly.
## NOTE(review): presumably this cell must be re-run immediately before the
## assignment cell each time, or the draw will differ -- confirm workflow.
## Seed generated by Brooklyn Integers
# https://www.brooklynintegers.com/int/1377549523/
set.seed(1377549523)

In [None]:
## Randomly assign units to one of two arms within each randomization block.
## `prob` is the single two-arm assignment probability applied in every
## block; `block_prob` instead expects a vector with one probability per
## block, so a scalar 0.5 is not valid when there is more than one block.
assignments <- block_ra(blocks = recipient.df.final$randomization_block_id,
                        prob = 0.5, num_arms = 2)

## Store the assigned arm alongside each unit
recipient.df.final$randomization_arm <- assignments

### Check Balance

In [None]:
## TODO

# Output and Archive Randomizations