In [None]:
# Mount Google Drive inside the Colab runtime; the R cells in this notebook
# read their inputs from, and write their figures to, paths under this mount.
from google.colab import drive

# The mount call prompts for authorization.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [1]:
# activate R magic
# Loads the rpy2 IPython extension, which the `%%R` cells in this notebook rely
# on. The rpy2 package must be importable from the kernel's environment: the
# output recorded under this cell is the ModuleNotFoundError raised when it is
# not installed.
%load_ext rpy2.ipython


ModuleNotFoundError: No module named 'rpy2'

In [None]:
%%R

## Power calculations for McNemar's test on paired binary outcomes.
##
## Exactly one of n, paid, psi, power and sig.level must be NULL; that quantity
## is solved for from the other four.
##
## Arguments:
##   n           number of pairs.
##   paid        the smaller of the two discordant probabilities (0 < paid < 0.5).
##   psi         ratio of the larger to the smaller discordant probability (> 1).
##   sig.level   significance level.
##   power       power of the test.
##   alternative "two.sided" (default) or "one.sided".
##   method      "normal" (closed-form approximation, default), "exact"
##               (unconditional) or "cond.exact" (conditional on the expected
##               number of discordant pairs).
##
## Returns a list of class "power.htest" holding n, paid, psi, sig.level, power,
## alternative, note and method.
##
## method = "exact" calls power_binom_test(), which is not defined in this
## block and must already be available in the R session (presumably from the
## MESS package -- confirm).
power_mcnemar_test <- function(n = NULL, paid = NULL, psi = NULL, sig.level = 0.05, power = NULL,
                               alternative = c("two.sided", "one.sided"),
                               method = c("normal", "exact", "cond.exact")) {

    if (sum(sapply(list(n, paid, psi, power, sig.level), is.null)) != 1)
        stop("exactly one of 'n', 'paid', 'psi', 'power', and 'sig.level' must be NULL")
    if (!is.null(sig.level) && !is.numeric(sig.level) || any(0 > sig.level | sig.level > 1))
        stop("'sig.level' must be numeric in [0, 1]")
    if (!is.null(paid)) {
        if (any(paid <= 0) || any(paid >= 0.5)) {
            stop("paid is the smallest discordant probability and must be 0<paid<0.5")
        }
    }
    if (!is.null(psi)) {
        ## psi == 1 is rejected as well, so the message says "greater than 1".
        if (any(psi <= 1)) {
            stop("psi must be greater than 1 since it is the ratio of the larger discordant probability to the smaller discordant probability")
        }
        ## When paid is NULL the comparison is empty and any() is FALSE.
        if (any((psi + 1)*paid > 1)) {
            stop("psi cannot be so big that the sum of the discordant probabilities exceed 1: ie., (1+psi)*paid>1")
        }
    }
    alternative <- match.arg(alternative)
    method <- match.arg(method)
    tside <- switch(alternative, one.sided = 1, two.sided = 2)

    ## Conditional power (conditional on the expected number of discordant
    ## pairs bc). The critical values come from Binomial(bc, 0.5) at the
    ## requested sig.level; with the defaults (two-sided, 0.05) they are the
    ## 0.025 and 0.975 quantiles. For a one-sided test only the lower tail is
    ## used, matching the "less" alternative of the unconditional calculation.
    f <- function(n, paid, psi, sig.level, power) {
        bc <- ceiling(paid * n * (1+psi))
        p.small <- 1/(1+psi)
        lower <- pbinom(qbinom(sig.level/tside, size=bc, prob=0.5)-1, size=bc, prob=p.small)
        if (tside == 1)
            return(lower)
        lower + 1 - pbinom(qbinom(1 - sig.level/tside, size=bc, prob=0.5), size=bc, prob=p.small)
    }

    ## Unconditional power: binomial-test power for each possible number of
    ## discordant pairs, weighted by the probability of observing that number.
    fexact <- function(n, paid, psi, sig.level, power, alt=alternative) {
        sum(dbinom(seq(n), size=n, prob=paid*(1+psi))*power_binom_test(seq(n), p0=.5, pa=1/(1+psi),
                                                                       power=power, sig.level=sig.level,
                                                                       alternative=ifelse(alt=="two.sided", "two.sided", "less"))$power)
    }

    ## p.body is an unevaluated expression for the power; it is evaluated below
    ## in whichever scope supplies the unknown quantity.
    if (method=="normal") {
        p.body <- quote( pnorm (
            (sqrt(n * paid) * (psi-1) - qnorm(sig.level/tside, lower.tail=FALSE)*sqrt(psi+1)) / sqrt((psi+1) - paid*(psi-1)^2)))
    } else if (method=="exact") {
        p.body <- quote( fexact(n, paid, psi, sig.level, power) )
    } else {
        p.body <- quote( f(n, paid, psi, sig.level, power) )
    }

    ## Either evaluate the power directly or root-find the single missing input.
    if (is.null(power)) {
        power <- eval(p.body)
    } else if (is.null(n)) {
        n <- uniroot(function(n) eval(p.body) - power, c(ceiling(log(sig.level)/log(.5)), 1e+07))$root
    } else if (is.null(paid))
        paid <- uniroot(function(paid) eval(p.body) - power, c(1e-10, 1/(1+psi)-1e-10))$root
    else if (is.null(psi))
        psi <- uniroot(function(psi) eval(p.body) - power, c(1+1e-10, 1/paid-1-1e-10))$root
    else if (is.null(sig.level))
        sig.level <- uniroot(function(sig.level) eval(p.body) -
            power, c(1e-10, 1 - 1e-10))$root
    else stop("internal error", domain = NA)
    NOTE <- "n is number of pairs"
    METHOD <- paste("McNemar paired comparison of proportions", ifelse(method=="normal", "approximate", "exact") ,"power calculation")
    structure(list(n = n, paid = paid, psi = psi, sig.level = sig.level,
        power = power, alternative = alternative, note = NOTE,
        method = METHOD), class = "power.htest")

}

In [None]:
%%R
# Read the two input tables from the mounted Drive folder: the results table
# (datatable) and the table of test-set sizes (testsetsizes).
datatable <- read.csv("/content/drive/My Drive/NLP Power Analysis/Glue meta - Sheet1.csv")
testsetsizes <- read.csv("/content/drive/My Drive/NLP Power Analysis/GlueTaskTestSetSizes - Sheet1.csv")

# biostatUZH is attached for the sampleSizeMcNemar() calls made further down.
# If it is not installed yet:
# install.packages("biostatUZH", repos="http://R-Forge.R-project.org")
library(biostatUZH)

# For one row of `datatable`, decide whether the task's test set is large
# enough to detect the reported improvement with 80% power under five
# different assumptions.
#
# x: one row as handed over by apply(datatable, 1, f). Positions 1, 3, 4, 5 and
#    6 are read as the dataset name, the baseline score (in %), the new-model
#    score (in %), the SOTA-comparison flag and the claimed-improvement flag.
#
# Returns a length-5 vector in the order (binomial, Lach mid, Lach min,
# Lach max, prior). Rows that are not a positive, claimed SOTA improvement
# yield five NAs so they can be dropped with na.omit().
f <- function(x) {
  test.set.size <- as.numeric(subset(testsetsizes, Dataset == x[1])$size)
  baseline <- as.numeric(x[3]) / 100.0
  newmodel <- as.numeric(x[4]) / 100.0
  sota.comparison <- x[5]
  claim <- x[6]

  if (!(claim == "Y" && newmodel - baseline > 0 && sota.comparison == "Y")) {
    return(c(NA, NA, NA, NA, NA))
  }

  # Two-sample proportion test: is the power at the observed accuracies above 0.8?
  res <- power.prop.test(n = test.set.size, p1 = baseline, p2 = newmodel,
                         power = NULL, sig.level = 0.05)$power > 0.8

  # Find the new-model accuracy at which the sampleSizeMcNemar() entry named
  # `column` equals the test-set size, and check that the reported accuracy
  # reaches it.
  lach.detectable <- function(column, lower) {
    uniroot(function(p2) sampleSizeMcNemar(p1 = baseline, p2 = p2, power = 0.8)[column] - test.set.size,
            c(lower, 1.0 - 10^-6))$root <= newmodel
  }
  lach.mid <- lach.detectable("N_l mid", baseline)
  lach.min <- lach.detectable("N_l min", baseline)
  lach.max <- lach.detectable("N_l max", baseline + 10^-6)

  # Same question using power_mcnemar_test(), with the discordant probabilities
  # derived from the baseline accuracy and a candidate effect size.
  # NOTE(review): the three numeric constants look like fitted coefficients for
  # the share of items on which both models agree -- confirm their source.
  prior <- uniroot(function(treatment.effect) {
    non.overlapping <- 1.0 - (.41422307 + 0.58187475 * baseline - 0.46624237 * treatment.effect)
    paid <- (non.overlapping - treatment.effect) / 2
    psi <- (non.overlapping + treatment.effect) / (non.overlapping - treatment.effect)
    power_mcnemar_test(n = test.set.size, paid = paid, psi = psi, sig.level = 0.05, power = NULL,
                       alternative = "two.sided", method = "normal")$power - .8
  }, c(10^-6, 1.0 - baseline - 10^-6))$root + baseline <= newmodel

  # Five values, matching the NA branch above and the five column names
  # assigned to the aggregated result.
  c(res, lach.mid, lach.min, lach.max, prior)
}

# Run f on every row of datatable, turn the result into one row per input row,
# and drop the rows that contain NA.
powers <- na.omit(
  t(as.data.frame(apply(datatable, 1, f)))
)
print(nrow(powers))

colnames(powers) <- c("power.binomial", "power.lach.mid", "power.lach.min", "power.lac.max", "prior")
print(powers)

# Column sums divided by the number of remaining rows.
percentages <- colSums(powers, na.rm = TRUE) / nrow(powers)
print(percentages)
# print(powers)
# library(ggplot2)
# ggplot(powers, aes(x=power.binomial)) + geom_histogram(binwidth=.05, alpha=.5, position="identity")

# ggplot(powers, aes(x=power, colour=power > 0.8)) + geom_density()



In [None]:
%%R
library(ggplot2)

# Bar chart of the share of comparisons judged adequately powered, written to a PDF.
pdf(file = "/content/drive/My Drive/NLP Power Analysis/percentage_powered.pdf")
par(mar=c(8, 4.1, 4.1, 2.1))
# The bar labels must follow the column order of `percentages`:
# binomial, Lach mid, Lach min, Lach max, prior.
barplot(height=percentages*100.0, ylim=c(0,100), ylab="% Powered", xlab="",
        names=c("Binomial", "Lach (Mid)", "Lach (Min)", "Lach (Max)", "Prior"),
        density=c(5,10,20,30,7), angle=c(0,45,90,11,36), col="brown", las=2)
dev.off()

In [None]:
%%R
# Attach the tidyverse here: %>%, group_by() and summarise() are used in this
# cell, and no earlier cell loads them.
library(tidyverse)

# Keep only rows flagged both as a claimed improvement and as a SOTA comparison.
claim <- datatable[datatable[,"Claimed.Improvement..Bolded.or.in.text."] == "Y",]
claim <- claim[claim[,"SOTA.Comparison"] == "Y",]

# Effect size = reported model accuracy minus the previous best.
effect.sizes <- claim[, "Model.Accuracy"] - claim[,"Previous.Best"]
claim$effect.size <- effect.sizes

# Mean, standard deviation, standard error and count of the effect size per task.
print(claim %>%
    group_by(Task) %>%
    summarise(mean=mean(effect.size), sd=sd(effect.size), se=sd(effect.size)/sqrt(n()), n=n()))


In [None]:
%%R
# Install ggpubr only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("ggpubr", quietly = TRUE)) {
    install.packages("ggpubr")
}
library(ggpubr)

# pdf(file = )
# Standard error of x: its sample standard deviation divided by the square
# root of the number of elements.
stdErr <- function(x) {
  n.obs <- length(x)
  sd(x) / sqrt(n.obs)
}
print(colnames(datatable))

# Work on a copy of the full table with the effect size (model accuracy minus
# previous best) added as a column.
newdata <- datatable
effect.sizes <- newdata[, "Model.Accuracy"] - newdata[,"Previous.Best"]
newdata$effect.size <- effect.sizes

print(effect.sizes)
print(mean(effect.sizes))
print(stdErr(effect.sizes))

# The manual scales below need cbPalette, so it is defined in this cell rather
# than relying on a cell that runs later.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Histogram of the effect sizes on the density scale with a density curve on top.
ggplot(newdata, aes(x=effect.size)) +
 geom_histogram(aes(y=..density..), colour="black", bins=40, fill="peachpuff") +
 geom_density(alpha=.2, colour="chocolate3") +
 theme_pubr(base_size=18) +
 scale_fill_manual(values=cbPalette) +
 scale_colour_manual(values=cbPalette) +
 xlab("Accuracy Improvement") +
 ylab("Density")


ggsave("/content/drive/My Drive/NLP Power Analysis/effect_size_distribution.pdf")




In [None]:
%%R
library(tidyverse)
library(ggpubr)

# Two eight-colour palettes: one starting with grey, one starting with black.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Effect size against the previous best, one point per row coloured by task,
# with a single black linear fit drawn across all tasks (group = 1).
ggplot(newdata, aes(y = effect.size, x = Previous.Best, color = Task)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", na.rm = TRUE, fullrange = TRUE,
              aes(group = 1), colour = "black") +
  theme_pubr(base_size = 18) +
  scale_fill_manual(values = cbPalette) +
  scale_colour_manual(values = cbPalette) +
  xlab("Baseline Accuracy") +
  ylab("Accuracy Improvement")

ggsave("/content/drive/My Drive/NLP Power Analysis/effect_size_distribution_per_task.pdf")


In [None]:
%%R
# library(lmer4)
# Linear regression of the effect size on the previous best score and the task,
# fitted only to rows flagged "Y" for both the claimed-improvement and the
# SOTA-comparison columns. Relies on `newdata` (with its effect.size column)
# and on %>% / filter() already being available in the R session.
fit <- lm(effect.size ~ Previous.Best + Task, data=newdata %>% filter(Claimed.Improvement..Bolded.or.in.text. == "Y") %>% filter(SOTA.Comparison == "Y") )
print(summary(fit)) # show results

# One (Previous.Best, Task) pair per task at which to evaluate the fitted model.
# NOTE(review): the scores are presumably the current state-of-the-art values
# for each task -- confirm their source. Task and Previous.Best are left as
# global variables by these assignments.
Task <- c("SST-2", "MRPC", "QNLI", "RTE", "MNLI-m", "MNLI-mm", "QQP", "WNLI")
Previous.Best <- c(97.2, 92, 97.5, 91.7, 91.6, 91.3, 91.0, 94.5)
sota.data <- data.frame(Previous.Best, Task)

# Predicted effect size for each of the pairs above.
predict(fit,sota.data)

# install.packages("stargazer")
# Format the fitted model as a table titled "Results".
library(stargazer)
stargazer(fit, title="Results", align=TRUE)

In [None]:
%%R

# Leave-one-out power and significance for one row of the filtered table.
#
# x: one row as handed over by apply(); positions 1, 3, 4 and 5 are read as the
#    task name, the baseline score (in %), the new-model score (in %) and the
#    claim flag, and the LAST element as the row's Id, i.e. its row number in
#    the filtered frame (claimed improvement == "Y" and SOTA comparison == "Y").
#    NOTE(review): the earlier definition of f in this notebook reads the claim
#    flag from position 6 and the SOTA flag from position 5 -- confirm which
#    column is intended here (the caller pre-filters on both flags).
#
# Returns a length-5 vector:
#   (regressed effect in %, observed effect in %, power, task, p-value).
# Power and p-value are NA when the regressed effect is not positive; all five
# entries are NA when the row is not a positive claim.
f <- function(x) {
  test.set.size <- as.numeric(subset(testsetsizes, Dataset == x[1])$size)
  baseline <- as.numeric(x[3]) / 100.0
  newmodel <- as.numeric(x[4]) / 100.0
  i <- as.numeric(tail(x, n=1))
  claim <- x[5]

  if (!(claim == "Y" && (newmodel - baseline > 0))) {
    return(c(NA, NA, NA, NA, NA))
  }

  # Hold out this row. Id counts rows of the FILTERED frame, so the filter has
  # to be applied before the row is dropped; indexing the unfiltered newdata
  # with it would remove an unrelated row.
  eligible <- newdata %>%
    filter(Claimed.Improvement..Bolded.or.in.text. == "Y") %>%
    filter(SOTA.Comparison == "Y")
  fit <- lm(effect.size ~ Previous.Best + Task, data = eligible[-i, ])

  # Effect predicted for this task at this baseline, converted from % to a proportion.
  Task <- c(x[1])
  Previous.Best <- c(baseline*100.0)
  sota.data <- data.frame(Previous.Best, Task)
  predicted.effect <- predict(fit, sota.data) / 100.0

  if (!(predicted.effect > 0)) {
    return(c((predicted.effect)* 100.0, (newmodel - baseline)* 100.0, NA, x[1], NA))
  }

  n <- test.set.size

  # Power is computed at the regressed (predicted) effect.
  # NOTE(review): the numeric constants look like fitted coefficients for the
  # share of items on which both models agree -- confirm their source.
  non.overlapping <- 1.0 - (.41422307 + 0.58187475 * baseline - 0.46624237 * predicted.effect)
  paid <- as.numeric((non.overlapping - predicted.effect) / 2)
  psi <- (non.overlapping + predicted.effect) / (non.overlapping - predicted.effect)
  power <- power_mcnemar_test(n = n, paid = paid, psi = psi, sig.level = 0.05, power = NULL,
                              alternative = "two.sided", method = "normal")$power

  # for significance use the true effect
  true.effect <- newmodel - baseline
  non.overlapping <- 1.0 - (.41422307 + 0.58187475 * baseline - 0.46624237 * (true.effect))
  paid <- as.numeric((non.overlapping - true.effect) / 2)
  p21 <- as.numeric((non.overlapping + true.effect) / 2)
  p11 <- as.numeric(baseline - p21)
  p00 <- as.numeric(1.0 - (p11 + p21 + paid))
  # matrix() fills column-wise, so paid and p21 land in the off-diagonal cells,
  # which are the discordant counts mcnemar.test() compares.
  contingency <- matrix(as.integer(c(p00, paid, p21, p11) * n), nrow=2, ncol=2)
  res <- mcnemar.test(contingency)

  c(predicted.effect * 100.0, (newmodel - baseline)* 100.0, power, x[1], res$p.value)
}

# Apply f to every claimed SOTA improvement. Id is the row number within the
# filtered frame and is passed to f as the last element of each row.
powers <- as.data.frame(na.omit(t(as.data.frame(apply(newdata %>% filter(Claimed.Improvement..Bolded.or.in.text. == "Y") %>% filter(SOTA.Comparison == "Y") %>% mutate(Id=row_number()), 1, f)))))
colnames(powers) <- c("Regressed Effect", "Observed Effect", "Power", "Task", "P")

library(ggpubr)
# The palette with grey:
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# The palette with black:
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

rownames(powers) <- NULL
print(powers)

# f returns mixed text/number vectors, so these columns arrive as text (or as
# factors, depending on the R version). Going through as.character() converts
# both representations correctly; indexing as.numeric(levels(.)) only works
# for factors and yields NA for character columns.
powers$Power <- as.numeric(as.character(powers$Power))
powers$P <- as.numeric(as.character(powers$P))

print("Percent underpowered")
print(mean(powers$Power < .8))

print("Power levels")
print(mean(powers$Power))
print(stdErr(powers$Power))

print("Percent Significant")
print(mean(powers$P < .05))

print("p-value average")
print(mean(powers$P))
print(stdErr(powers$P))


print("Percent powed and significant")
print(mean(powers$P < .05 & powers$Power >.8))


#

In [None]:
%%R
# df <- data.frame(Powe,
#                 len=c(1 - 0.4561404, 0.5087719,  0.3684211))
# head(df)
# Per-row flags. "powered" means power above 0.8, the same threshold used in
# the combined flag below.
powers$powered <- powers$Power > .8
powers$significant <- powers$P < .05
powers$significant.and.powered <- powers$P < .05 & powers$Power > .8
stargazer(powers, align=TRUE)

