# Wikipedia Thanks-Receiver Study Randomization 
[J. Nathan Matias](https://twitter.com/natematias)
April 27, 2019

This code takes as input data described in the [randomization data format](https://docs.google.com/document/d/1plhoDbQryYQ32vZMXu8YmlLSp30QTdup43k6uTePOT4/edit?usp=drive_web&ouid=117701977297551627494) and produces randomizations for the Thanks Recipient study.

## TODO
* Remove outliers on a per-language basis, not on an overall basis

In [23]:
# Session-wide number formatting: penalize scientific notation and print
# four significant digits.
options(scipen = 9, digits = 4)

# Packages are attached in the same order as before so masking is unchanged.
library(ggplot2)
library(rlang)
library(tidyverse)
library(viridis)
library(blockTools)
library(blockrand)
library(gmodels)       # contains CrossTable
library(DeclareDesign)
library(DescTools)     # contains Freq

# Default plot dimensions for notebook output.
options(repr.plot.width = 7, repr.plot.height = 3.5)

# Record the R and package versions used for this run.
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.5 LTS

Matrix products: default
BLAS: /usr/lib/libblas/libblas.so.3.6.0
LAPACK: /usr/lib/lapack/liblapack.so.3.6.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] DescTools_0.99.28    DeclareDesign_0.12.0 estimatr_0.14       
 [4] fabricatr_0.6.0      randomizr_0.16.1     gmodels_2.18.1      
 [7] blockrand_1.3        blockTools_0.6-3     viridis_0.5.1       
[10] viridisLite_0.3.0    forcats_0.3.0        stringr_1.3.1       
[13] dplyr_0.7.8          pur

# Load Input Dataframe

In [24]:
# Directory and file name of the input CSV for this randomization run.
data.path <- "/home/civilservant/Tresors/CivilServant/projects/wikipedia-integration/gratitude-study/Data Drills/thankee/historical_output"
filename <- "all-thankees-historical-20190520.csv"

# Read the input into the working data frame.
input.file <- file.path(data.path, filename)
recipient.df <- read.csv(input.file)

### Adjust Column Names to Match Thankee Randomization Specification (REMOVE IN FINAL RUN)
**IMPORTANT**: THESE COLUMNS **DO NOT HAVE THE SAME DEFINITION**. WE ARE JUST MAKING THIS TRANSFORMATION IN ORDER TO SET UP THE CODE IN THE ABSENCE OF THANKEE RANDOMIZATION DATA

In [25]:
#recipient.df$user_id

In [27]:
# Strip the "bin_" prefix from the experience labels and store the remaining
# integer as a factor.
recipient.df$prev_experience <- factor(as.integer(gsub("bin_", "", recipient.df$prev_experience)))

# Sequential row identifier. seq_len() returns an empty vector for a zero-row
# data frame, whereas seq(0) would return c(1, 0) and break the assignment.
recipient.df$anonymized_id <- seq_len(nrow(recipient.df))

### Remove observations more than 2.58 standard deviations above the mean (upper-tail outliers)

In [28]:
## Mean and SD are computed on the full sample, before any rows are dropped
df.labor.mean <- mean(recipient.df$labor_hours_84_days_pre_sample)
df.labor.sd   <- sd(recipient.df$labor_hours_84_days_pre_sample)

## Upper cutoff: 2.58 standard deviations above the mean
labor.cutoff <- df.labor.mean + 2.58 * df.labor.sd

## Report how many rows exceed the cutoff, then keep only the rows at or below it
n.outliers <- nrow(subset(recipient.df,
                          labor_hours_84_days_pre_sample > labor.cutoff))
print(paste("Removing", n.outliers, "outliers",
            "observations because labor_hours_84_days_pre_sample is an outlier."))

recipient.df.final <- subset(recipient.df,
                             labor_hours_84_days_pre_sample <= labor.cutoff)

[1] "Removing 34 outliers observations because labor_hours_84_days_pre_sample is an outlier."


# Review and Generate Variables

In [29]:
# Mean of labor_hours_84_days_pre_sample within each prev_experience level.
labor.by.experience <- aggregate(
    recipient.df.final["labor_hours_84_days_pre_sample"],
    by = list(recipient.df.final$prev_experience),
    FUN = mean)
print(labor.by.experience)

  Group.1 labor_hours_84_days_pre_sample
1       0                          8.397
2      90                         10.924
3     180                         17.186
4     365                         17.591
5     730                         19.267
6    1460                         20.824
7    2920                         24.288


In [30]:
## Add the newcomer field: TRUE when prev_experience is the 0 level
recipient.df.final$newcomer <- recipient.df.final$prev_experience == 0

## Update the has_email field to a logical.
## Convert only when the column is not already logical: comparing a logical
## column with "True" gives FALSE for every row, so an unguarded conversion
## would wipe the field when this cell is re-run or when the column was
## already parsed as logical on input.
if (!is.logical(recipient.df.final$has_email)) {
    recipient.df.final$has_email <- recipient.df.final$has_email == "True"
}

## PREVIOUS EXPERIENCE
print("prev_experience")
print(summary(factor(recipient.df.final$prev_experience)))
cat("\n")

## SHOW LABOR HOURS BY EXPERIENCE GROUP:
print("Aggregate labor_hours_84_days_pre_sample")
print(aggregate(recipient.df.final[c("labor_hours_84_days_pre_sample")],
          FUN=mean, by = list(recipient.df.final$prev_experience)))
cat("\n")

print("NEWCOMERS AND EMAILS")
print("--------------------")
## CrossTable prints the table itself; it is called bare (as in the balance
## checks later in this notebook) so its returned list is not printed as well.
CrossTable(recipient.df.final$has_email, recipient.df.final$newcomer, 
       prop.r = FALSE, prop.c=TRUE, prop.t = FALSE, prop.chisq = FALSE)

# VARIABLE: num_prev_thanks_pre_sample
print("num_prev_thanks_pre_sample")
print(summary(recipient.df.final$num_prev_thanks_pre_sample))
cat("\n")

## SHOW PREVIOUS THANKS BY EXPERIENCE GROUP:
print("num_prev_thanks_pre_sample by prev_experience")
print(aggregate(recipient.df.final[c("num_prev_thanks_pre_sample")],
      FUN=mean, by = list(recipient.df.final$prev_experience)))
cat("\n")

[1] "prev_experience"
   0   90  180  365  730 1460 2920 
 799   78   69  100  139  195  186 

[1] "Aggregate labor_hours_84_days_pre_sample"
  Group.1 labor_hours_84_days_pre_sample
1       0                          8.397
2      90                         10.924
3     180                         17.186
4     365                         17.591
5     730                         19.267
6    1460                         20.824
7    2920                         24.288

[1] "NEWCOMERS AND EMAILS"
[1] "--------------------"

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1566 

 
                             | recipient.df.final$newcomer 
recipient.df.final$has_email |     FALSE |      TRUE | Row Total | 
-----------------------------|-----------|-----------|-----------|
                       FALSE |        43 |         5 |        48 | 
                             |     0.0

# Generate Randomization Blocks

In [31]:
# Grouping factor with one level per "<lang> <prev_experience>" combination.
lang.experience.labels <- paste(recipient.df.final$lang,
                                recipient.df.final$prev_experience,
                                sep = " ")
recipient.df.final$lang_prev_experience <- factor(lang.experience.labels)

# List the columns now present in the data frame.
names(recipient.df.final)

In [32]:
# Number of rows in each lang_prev_experience level.
summary(recipient.df.final[["lang_prev_experience"]])

In [33]:
## BLOCKING VARIABLES
## Covariates used to match units within each group.
bv <- c("labor_hours_84_days_pre_sample", "num_prev_thanks_pre_sample")
#bv <- c("labor_hours_84_days_pre_sample", "num_prev_thanks_pre_sample", "has_email")

## Value passed to block() as n.tr and recorded on every row below.
block.size <- 2

## TODO: CHECK TO SEE IF I CAN DO BALANCED RANDOMIZATION
## WITHIN BLOCKS LARGER THAN 2
## Blocks are formed separately within each lang_prev_experience group,
## using Mahalanobis distance on the blocking variables.
blockobj <- block(
    data       = recipient.df.final,
    id.vars    = "anonymized_id",
    groups     = "lang_prev_experience",
    block.vars = bv,
    n.tr       = block.size,
    distance   = "mahalanobis"
)

## CHECK DISTANCES
#print(blockobj)

## Attach each row's block ID, matched on anonymized_id, plus the block size.
recipient.df.final$randomization_block_id <- createBlockIDs(
    blockobj,
    data   = recipient.df.final,
    id.var = "anonymized_id"
)
recipient.df.final$randomization_block_size <- block.size

### Identify Incomplete Blocks and Remove Units in Incomplete Blocks From the Experiment

In [34]:
# Frequency table of block IDs: one row per block ID with its unit count.
block.freq <- Freq(factor(recipient.df.final$randomization_block_id))

# Blocks containing a single unit are incomplete. The `level` column holds the
# block ID as a label; when it is a factor, as.integer() alone would return
# the internal factor codes rather than the IDs, so convert through
# as.character() first. This is also correct when `level` is already character.
incomplete.blocks <- as.integer(as.character(subset(block.freq, freq == 1)$level))
incomplete.blocks

In [35]:
# removed.observations <- subset(recipient.df.final, (
#     randomization_block_id %in% incomplete.blocks)==TRUE)

# recipient.df.final <- 
#     subset(recipient.df.final, (
#         randomization_block_id %in% incomplete.blocks)!=TRUE)

# print(paste("Removed", nrow(removed.observations), "units placed in incomplete blocks."))

# Generate Randomizations

In [36]:
## Fixed seed so the random assignment below can be reproduced.
## Seed generated by Brooklyn Integers
# https://www.brooklynintegers.com/int/1377549523/
# NOTE(review): this notebook was run on R 3.5.1; R 3.6.0 changed the default
# sample() algorithm, so reproducing these assignments on newer R may need
# RNGkind(sample.kind = "Rounding") -- confirm before re-running.
set.seed(seed = 1377549523)

In [37]:
# Randomly assign condition 0 or 1 within each randomization block, then
# store the assignment on the data frame.
assignments <- block_ra(
    blocks     = recipient.df.final$randomization_block_id,
    conditions = c(0, 1),
    num_arms   = 2
)
recipient.df.final$randomization_arm <- assignments

### Check Balance

In [38]:
# Mean pre-sample labor hours in each randomization arm.
print("Aggregating labor hours by treatment")
labor.by.arm <- aggregate(
    recipient.df.final["labor_hours_84_days_pre_sample"],
    by = list(recipient.df.final$randomization_arm),
    FUN = mean)
print(labor.by.arm)

# Row proportions of each arm within every language.
# The argument expressions are kept verbatim because CrossTable uses them as
# the table's dimension labels.
print("CrossTable of lang by treatment")
CrossTable(recipient.df.final$lang, recipient.df.final$randomization_arm,
           prop.chisq = FALSE, prop.t = FALSE, prop.c = FALSE, prop.r = TRUE)

# Row proportions of each arm within every language x experience group.
print("CrossTable of lang_prev_experience by treatment")
CrossTable(recipient.df.final$lang_prev_experience, recipient.df.final$randomization_arm,
           prop.chisq = FALSE, prop.t = FALSE, prop.c = FALSE, prop.r = TRUE)



[1] "Aggregating labor hours by treatment"
  Group.1 labor_hours_84_days_pre_sample
1       0                          13.87
2       1                          13.92
[1] "CrossTable of lang by treatment"

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|-------------------------|

 
Total Observations in Table:  1566 

 
                        | recipient.df.final$randomization_arm 
recipient.df.final$lang |         0 |         1 | Row Total | 
------------------------|-----------|-----------|-----------|
                     ar |       196 |       196 |       392 | 
                        |     0.500 |     0.500 |     0.250 | 
------------------------|-----------|-----------|-----------|
                     de |       195 |       197 |       392 | 
                        |     0.497 |     0.503 |     0.250 | 
------------------------|-----------|-----------|-----------|
                     fa |       197 |       196 |       3

# Output and Archive Randomizations

In [40]:
# Output file name is stamped with today's date (YYYYMMDD).
run.date <- format(Sys.Date(), format = "%Y%m%d")
randomization.filename <- paste0("thanks-recipient-randomizations-", run.date, ".csv")

# Write the full data frame, including block IDs and arms, next to the input.
output.file <- file.path(data.path, randomization.filename)
write.csv(recipient.df.final, file = output.file)

In [43]:
#colnames(recipient.df.final)