# Libraries

In [106]:
source("tools.R")

# Prior likelihoods

### Preparing the data

In [107]:
# Load the prior dataset (the path is kept exactly as the file is named on disk).
data_prior <- read_csv("data/for_composititional_analysis_prior.csv")

# Reduce to one row per distinct (id, pid, scenario) combination; used further
# down as a lookup from id to participant and scenario.
dict_prior <- summarize(group_by(data_prior, id, pid, scenario))

"Missing column names filled in: 'X1' [1]"Parsed with column specification:
cols(
  X1 = col_integer(),
  id = col_integer(),
  pid = col_character(),
  scenario = col_character(),
  x = col_integer(),
  y = col_double()
)


In [108]:
# Read the prior LML table: a wide layout with an unnamed index column
# (labelled X1 by readr) and one numeric column per kernel composition.
lmls_prior <- read_csv("output/lmls_prior.csv")

"Missing column names filled in: 'X1' [1]"Parsed with column specification:
cols(
  X1 = col_integer(),
  l = col_double(),
  `l*p` = col_double(),
  `l*p+r` = col_double(),
  `l*r` = col_double(),
  `l*r*p` = col_double(),
  `l*r+p` = col_double(),
  `l+p` = col_double(),
  `l+r` = col_double(),
  `l+r*p` = col_double(),
  `l+r+p` = col_double(),
  p = col_double(),
  `p*r` = col_double(),
  `p+r` = col_double(),
  r = col_double()
)


In [109]:
# Peek at the first rows of the wide LML table.
lmls_prior %>% head

X1,l,l*p,l*p+r,l*r,l*r*p,l*r+p,l+p,l+r,l+r*p,l+r+p,p,p*r,p+r,r
1,-713.6954,-0.08438396,-146.6802,-851.89829,-851.89688,-322.6189,791.0454,821.78723,927.2291,-323.47298,-323.0216,804.48813,-548.3215,792.3592
10,-1097.0542,-1097.054,-10196.9489,-68.44092,-2035.82151,141.9539,148.872,148.07899,-10196.9489,1468.08628,1537.9242,-51685.97435,1516.7239,1540.7145
100,-1196.2895,-1196.289,60.4096,-94.76571,-98.1411,815.7817,-245.216,899.05122,818.8646,65.02978,-761.1522,69.69232,994.7514,855.1832
101,-884.5586,-884.5586,-2511.4764,-21.2577,-21.48428,-369.3005,-437.512,90.20008,1129.5959,98.89978,-905.7997,1191.32516,1159.0688,1128.8089
102,-1428.1828,-263.1303,-132.9282,-266.09369,-60.38163,-996.5167,-1428.1828,82.7037,169.0308,-831.25674,-1717.5405,-581.34877,-169.7323,-39.2825
103,-1055.6152,-1055.62,-7682.7321,-57.30613,-2035.23194,1579.088,150.1733,150.17337,-7682.7321,149.92483,-182.1528,-33875.75641,-175.4362,1616.266


In [110]:
# Read the per-fit metadata: cid, composition and the two flag columns
# white_added and second_exception (both parsed as character).
metadata_prior <- read_csv("output/metadata_prior.csv")

Parsed with column specification:
cols(
  cid = col_integer(),
  composition = col_character(),
  white_added = col_character(),
  second_exception = col_character()
)


In [111]:
# Peek at the first rows of the metadata table.
metadata_prior %>% head

cid,composition,white_added,second_exception
1,l,False,False
1,p,False,False
1,r,False,False
1,l+p,False,False
1,l+r,False,False
1,p+r,False,False


In [112]:
# Build a single join key by concatenating cid and composition (no separator).
metadata_prior <- mutate(metadata_prior,
                         cid_composition = paste0(cid, composition))

In [113]:
# Drop columns that are not needed once the cid_composition key exists.
# NOTE(review): the column specification printed when this file was read lists
# cid, composition, white_added and second_exception but no pid, so the first
# assignment appears to have no effect — confirm before removing it.
metadata_prior$pid <- NULL
metadata_prior$composition <- NULL

In [114]:
# Kernel compositions in the order used throughout the analysis; the same
# vector is reused later as the factor levels for the plots.
kernels <- c("l", "p", "r", "l+p", "l+r", "p+r", "l*r", "l*p", "p*r", "l+r+p", "l+r*p", "l*r+p", "l*p+r", "l*r*p")

# Reshape from wide (one column per kernel) to long (one row per id x kernel).
# `!!kernels` injects the names as a literal character vector, so the columns
# are selected strictly by name and the selection cannot be confused with a
# data column that happens to be called `kernels`.
lmls_prior <- lmls_prior %>%
                gather(kernel, lml, !!kernels)

# After gather() the columns are: unnamed CSV index, kernel, lml.
# The index column holds the curve id.
colnames(lmls_prior) <- c('id', 'kernel', 'lml')

# Left join: attach pid and scenario to every LML row via the curve id.
lmls_prior <- merge(x = lmls_prior, y = dict_prior, by = "id", all.x = TRUE)

In [115]:
# Same key format as on the metadata side: id immediately followed by kernel.
lmls_prior <- mutate(lmls_prior, cid_composition = paste0(id, kernel))

Joining

In [116]:
# Left join the metadata onto the LML rows through the shared key.
lmls_prior <- merge(lmls_prior, metadata_prior,
                    by = "cid_composition", all.x = TRUE)

In [117]:
# The helper key has served its purpose; remove it again.
lmls_prior[["cid_composition"]] <- NULL

Analyzing

In [118]:
# Row counts for white_added 'True' and 'False', then the total number of rows.
nrow(filter(lmls_prior, white_added == 'True'))

nrow(filter(lmls_prior, white_added == 'False'))

nrow(lmls_prior)

In [119]:
# Row counts for second_exception 'True' and 'False', then the total.
nrow(filter(lmls_prior, second_exception == 'True'))

nrow(filter(lmls_prior, second_exception == 'False'))

nrow(lmls_prior)

In [120]:
# Percentage (two decimals) of rows where lml equals -999999999 or the
# second_exception flag is 'True'.
# NOTE(review): -999999999 is presumably a failure placeholder written by the
# fitting step — confirm.
paste0(
    round(
        nrow(filter(lmls_prior, lml == -999999999 | second_exception == 'True')) /
            nrow(lmls_prior) * 100,
        2),
    '%')

In [121]:
# Number of distinct pids that have at least one row flagged second_exception.
nrow(distinct(filter(lmls_prior, second_exception == 'True'), pid))

### Summarising

In [122]:
# Keep only usable rows (second_exception 'False' and lml different from
# -999999999), then rescale within each (pid, scenario) group:
#   lml_minus_min = lml - min(lml)                     -> 0 for the group minimum
#   lml_standard  = lml_minus_min / max(lml_minus_min) -> 1 for the group maximum
# The group minimum of lml_minus_min is exactly 0, so dividing by its maximum
# is the same as dividing by max - min. A group whose retained LMLs are all
# equal has a zero range and therefore yields NaN.
# ungroup() removes the (pid, scenario) grouping so it does not silently carry
# over into later verbs applied to this table.
lmls_prior_f <- lmls_prior %>%
                    filter(second_exception == 'False' & lml != -999999999) %>%
                    group_by(pid, scenario) %>%
                    mutate(lml_minus_min = lml - min(lml),
                           lml_standard = lml_minus_min / max(lml_minus_min)) %>%
                    ungroup()

In [125]:
# Peek at the filtered table with the two added columns
# (lml_minus_min, lml_standard).
lmls_prior_f %>% head

id,kernel,lml,pid,scenario,cid,white_added,second_exception,lml_minus_min,lml_standard
100,l,-1196.28948,a017,Gym members,100,False,False,7.786741e-06,3.553901e-09
100,l*p,-1196.28948,a017,Gym members,100,False,False,0.0,0.0
100,l*p+r,60.4096,a017,Gym members,100,True,False,1256.699,0.5735626
100,l*r,-94.76571,a017,Gym members,100,True,False,1101.524,0.5027399
100,l*r*p,-98.1411,a017,Gym members,100,True,False,1098.148,0.5011994
100,l*r+p,815.78171,a017,Gym members,100,False,False,2012.071,0.9183175


In [124]:
# Bar chart of one value per kernel with error bars.
#
# prop_data: data frame with columns kernel, value, lo_ci and hi_ci.
# title:     plot title (centred).
# hide_x:    if TRUE, blank the x-axis title.
# hide_y:    if TRUE, blank the y-axis title.
# Returns the ggplot object.
plot_lmls <- function(prop_data, title, hide_x=FALSE, hide_y=FALSE) {
    # Error bars spanning lo_ci..hi_ci (coerced to numeric).
    ci_bars <- geom_errorbar(aes(ymin=as.numeric(lo_ci), ymax=as.numeric(hi_ci)),
                             width=.3,
                             position=position_dodge(.9))

    # Tweaks applied on top of theme_few(): vertical kernel labels,
    # 12pt serif text, centred title.
    text_tweaks <- theme(axis.text.x = element_text(angle = 90,  vjust = 0.5, hjust=0),
                         text = element_text(size=12, family="serif"),
                         plot.title = element_text(hjust = 0.5))

    # The y-axis label is left at its default; no ylim / coord limits are set.
    fig <- ggplot(prop_data, aes(x=kernel, y=value)) +
        geom_bar(stat="identity") +
        ci_bars +
        ggthemes::theme_few() +
        text_tweaks +
        labs(title = title, x = "Kernel composition")

    if (hide_x) {
        fig <- fig + theme(axis.title.x=element_blank())
    }
    if (hide_y) {
        fig <- fig + theme(axis.title.y=element_blank())
    }

    fig
}

### Plots

In [337]:
# Summary per kernel x scenario: mean standardized LML and the two bounds of a
# 95% one-sample t confidence interval.
to_plot <- summarize(
    group_by(lmls_prior_f, kernel, scenario),
    value = mean(lml_standard),
    lo_ci = t.test(lml_standard, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml_standard, conf.level=0.95)$conf.int[2])

# Fix the kernel order on the x axis to the order of `kernels`.
to_plot$kernel <- factor(to_plot$kernel, levels=kernels)

# One subset per scenario.
lmls_temperature <- filter(to_plot, scenario == "Temperature")
lmls_rain        <- filter(to_plot, scenario == "Rain")
lmls_sales       <- filter(to_plot, scenario == "Sales")
lmls_gym         <- filter(to_plot, scenario == "Gym members")
lmls_salary      <- filter(to_plot, scenario == "Salary")
lmls_fb          <- filter(to_plot, scenario == "FB Friends")

# One panel per scenario. The x-axis title is hidden on every panel, the
# y-axis title on all but the first two.
p1 <- plot_lmls(lmls_temperature, "Temperature", hide_x=TRUE)
p2 <- plot_lmls(lmls_rain, "Rain", hide_x=TRUE)
p3 <- plot_lmls(lmls_sales, "Sales", hide_x=TRUE, hide_y=TRUE)
p4 <- plot_lmls(lmls_gym, "Gym members", hide_x=TRUE, hide_y=TRUE)
p5 <- plot_lmls(lmls_salary, "Salary", hide_x=TRUE, hide_y=TRUE)
p6 <- plot_lmls(lmls_fb, "FB Friends", hide_x=TRUE, hide_y=TRUE)

# Write the six panels to an SVG file (multiplot is called with cols=3).
svg("Images/part_2/kernels_priors_lmls.svg", width=8, height=4)
multiplot(p1, p2, p3, p4, p5, p6, cols=3)
dev.off()

In [329]:
# Summary per kernel x scenario: mean raw LML and the two bounds of a 95%
# one-sample t confidence interval.
to_plot <- summarize(
    group_by(lmls_prior_f, kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

# Fix the kernel order on the x axis to the order of `kernels`.
to_plot$kernel <- factor(to_plot$kernel, levels=kernels)

# One subset per scenario.
lmls_temperature <- filter(to_plot, scenario == "Temperature")
lmls_rain        <- filter(to_plot, scenario == "Rain")
lmls_sales       <- filter(to_plot, scenario == "Sales")
lmls_gym         <- filter(to_plot, scenario == "Gym members")
lmls_salary      <- filter(to_plot, scenario == "Salary")
lmls_fb          <- filter(to_plot, scenario == "FB Friends")

# One panel per scenario. The x-axis title is hidden on every panel, the
# y-axis title on all but the first two.
p1 <- plot_lmls(lmls_temperature, "Temperature", hide_x=TRUE)
p2 <- plot_lmls(lmls_rain, "Rain", hide_x=TRUE)
p3 <- plot_lmls(lmls_sales, "Sales", hide_x=TRUE, hide_y=TRUE)
p4 <- plot_lmls(lmls_gym, "Gym members", hide_x=TRUE, hide_y=TRUE)
p5 <- plot_lmls(lmls_salary, "Salary", hide_x=TRUE, hide_y=TRUE)
p6 <- plot_lmls(lmls_fb, "FB Friends", hide_x=TRUE, hide_y=TRUE)

# Write the six panels to an SVG file (multiplot is called with cols=3).
svg("Images/part_2/kernels_priors_lmls_2.svg", width=8, height=4)
multiplot(p1, p2, p3, p4, p5, p6, cols=3)
dev.off()

# Full-Bayesian Posterior analysis

### Preparing the data

In [86]:
# Read the posterior LML results. This file is already in long format
# (id, kernel, lml) and its two flag columns are parsed as logical.
lmls_posterior <- read_csv("output/full-bayesian-posterior/results_posterior_test_lmls.csv")

Parsed with column specification:
cols(
  id = col_integer(),
  kernel = col_character(),
  lml = col_double(),
  white_added = col_logical(),
  second_exception = col_logical()
)


In [87]:
# Attach the 'scenario' column to the posterior LML rows.

data_posterior <- read_csv("data/for_composititional_analysis_posterior.csv")

# One row per distinct (id, pid, scenario) combination.
dict_posterior <- summarize(group_by(data_posterior, id, pid, scenario))

# The pid stored in the file is discarded; pid is rebuilt from id below.
dict_posterior$pid <- NULL

lmls_posterior <- merge(lmls_posterior, dict_posterior, by = "id", all.x = TRUE)

# 'Participant id' (pid) derived from id: ids 1-6 map to 1, 7-12 to 2, and so on.
lmls_posterior$pid <- (lmls_posterior$id - 1) %/% 6 + 1

"Missing column names filled in: 'X1' [1]"Parsed with column specification:
cols(
  X1 = col_integer(),
  id = col_integer(),
  pid = col_character(),
  scenario = col_character(),
  x = col_integer(),
  y = col_double(),
  condition = col_character()
)


### Analyzing

In [95]:
# --- white_added: TRUE count, FALSE count, total rows ---
print('White added')

nrow(filter(lmls_posterior, white_added == 'TRUE'))

nrow(filter(lmls_posterior, white_added == 'FALSE'))

nrow(lmls_posterior)

# --- second_exception: TRUE count, FALSE count, total rows ---

print('Second exception')
nrow(filter(lmls_posterior, second_exception == 'TRUE'))

nrow(filter(lmls_posterior, second_exception == 'FALSE'))

nrow(lmls_posterior)

# --- share of rows with an LML error (-999999999) or a second exception ---

print('Proportion of LML error and Second exception over the total')

paste0(
    round(
        nrow(filter(lmls_posterior, lml == -999999999 | second_exception == 'TRUE')) /
            nrow(lmls_posterior) * 100,
        2),
    '%')

[1] "White added"


[1] "Second exception"


[1] "Proportion of LML error and Second exception over the total"


In [89]:
# Removing unusable data, and standardizing.
#
# Rows with a second exception or with lml equal to -999999999 are dropped.
# Within each (pid, scenario) group:
#   lml_minus_min = lml - min(lml)                     -> 0 for the group minimum
#   lml_standard  = lml_minus_min / max(lml_minus_min) -> 1 for the group maximum
# The group minimum of lml_minus_min is exactly 0, so dividing by its maximum
# is the same as dividing by max - min. A group whose retained LMLs are all
# equal has a zero range and therefore yields NaN.
# ungroup() removes the (pid, scenario) grouping so it does not silently carry
# over into later verbs applied to this table.

lmls_posterior_f <- lmls_posterior %>%
                    filter(second_exception == 'FALSE' & lml != -999999999) %>%
                    group_by(pid, scenario) %>%
                    mutate(lml_minus_min = lml - min(lml),
                           lml_standard = lml_minus_min / max(lml_minus_min)) %>%
                    ungroup()

### Plotting

In [104]:
# Summary per kernel x scenario: mean standardized LML and the two bounds of a
# 95% one-sample t confidence interval.
to_plot <- summarize(
    group_by(lmls_posterior_f, kernel, scenario),
    value = mean(lml_standard),
    lo_ci = t.test(lml_standard, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml_standard, conf.level=0.95)$conf.int[2])

# Fix the kernel order on the x axis to the order of `kernels`.
to_plot$kernel <- factor(to_plot$kernel, levels=kernels)

# One subset per scenario.
lmls_temperature <- filter(to_plot, scenario == "Temperature")
lmls_rain        <- filter(to_plot, scenario == "Rain")
lmls_sales       <- filter(to_plot, scenario == "Sales")
lmls_gym         <- filter(to_plot, scenario == "Gym members")
lmls_salary      <- filter(to_plot, scenario == "Salary")
lmls_fb          <- filter(to_plot, scenario == "FB Friends")

# One panel per scenario. The x-axis title is hidden on every panel, the
# y-axis title on all but the first two.
p1 <- plot_lmls(lmls_temperature, "Temperature", hide_x=TRUE)
p2 <- plot_lmls(lmls_rain, "Rain", hide_x=TRUE)
p3 <- plot_lmls(lmls_sales, "Sales", hide_x=TRUE, hide_y=TRUE)
p4 <- plot_lmls(lmls_gym, "Gym members", hide_x=TRUE, hide_y=TRUE)
p5 <- plot_lmls(lmls_salary, "Salary", hide_x=TRUE, hide_y=TRUE)
p6 <- plot_lmls(lmls_fb, "FB Friends", hide_x=TRUE, hide_y=TRUE)

# Write the six panels to an SVG file (multiplot is called with cols=3).
svg("Images/part_2/kernels_posterior_lmls.svg", width=8, height=4)
multiplot(p1, p2, p3, p4, p5, p6, cols=3)
dev.off()

In [103]:
# Summary per kernel x scenario: mean raw LML and the two bounds of a 95%
# one-sample t confidence interval.
to_plot <- summarize(
    group_by(lmls_posterior_f, kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

# Fix the kernel order on the x axis to the order of `kernels`.
to_plot$kernel <- factor(to_plot$kernel, levels=kernels)

# One subset per scenario.
lmls_temperature <- filter(to_plot, scenario == "Temperature")
lmls_rain        <- filter(to_plot, scenario == "Rain")
lmls_sales       <- filter(to_plot, scenario == "Sales")
lmls_gym         <- filter(to_plot, scenario == "Gym members")
lmls_salary      <- filter(to_plot, scenario == "Salary")
lmls_fb          <- filter(to_plot, scenario == "FB Friends")

# One panel per scenario. The x-axis title is hidden on every panel, the
# y-axis title on all but the first two.
p1 <- plot_lmls(lmls_temperature, "Temperature", hide_x=TRUE)
p2 <- plot_lmls(lmls_rain, "Rain", hide_x=TRUE)
p3 <- plot_lmls(lmls_sales, "Sales", hide_x=TRUE, hide_y=TRUE)
p4 <- plot_lmls(lmls_gym, "Gym members", hide_x=TRUE, hide_y=TRUE)
p5 <- plot_lmls(lmls_salary, "Salary", hide_x=TRUE, hide_y=TRUE)
p6 <- plot_lmls(lmls_fb, "FB Friends", hide_x=TRUE, hide_y=TRUE)

# Write the six panels to an SVG file (multiplot is called with cols=3).
svg("Images/part_2/kernels_posterior_lmls_2.svg", width=8, height=4)
multiplot(p1, p2, p3, p4, p5, p6, cols=3)
dev.off()

In [99]:
# How many retained posterior rows have an LML below -10000?
nrow(filter(lmls_posterior_f, lml < -10000))

# Best on each condition

### Prior

In [135]:
# Mean raw LML per kernel x scenario (prior), with 95% t-interval bounds.
prior_a <- summarize(
    group_by(lmls_prior_f, kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

### Posterior

In [136]:
# Mean raw LML per kernel x scenario (posterior), with 95% t-interval bounds.
posterior_a <- summarize(
    group_by(lmls_posterior_f, kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

### Tables

In [156]:
# For each scenario: position, name and mean LML of the kernel with the
# highest mean (first one in case of ties, as returned by which.max).
summarize(
    group_by(prior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

scenario,max_index,max_kernel,max_lml
FB Friends,8,l+r,125.2687
Gym members,14,r,606.5293
Rain,14,r,255.654
Salary,14,r,1081.6803
Sales,8,l+r,-881.9682
Temperature,14,r,474.7388


In [157]:
# For each scenario: position, name and mean LML of the kernel with the
# highest mean (first one in case of ties, as returned by which.max).
summarize(
    group_by(posterior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

scenario,max_index,max_kernel,max_lml
FB Friends,5,l*r*p,-217.165903
Gym members,8,l+r,-6.6087
Rain,8,l+r,28.158404
Salary,8,l+r,-4.190274
Sales,4,l*r,-327.823858
Temperature,14,r,106.528731


### Consistency

In [165]:
# Best kernel (highest LML) for every (id, pid, scenario) in the prior data.
prior_maxs <- summarize(
    group_by(lmls_prior_f, id, pid, scenario),
    max_index = which.max(lml),
    max_kernel = kernel[max_index],
    max_lml = lml[max_index])

In [166]:
# Best kernel (highest LML) for every (id, pid, scenario) in the posterior data.
posterior_maxs <- summarize(
    group_by(lmls_posterior_f, id, pid, scenario),
    max_index = which.max(lml),
    max_kernel = kernel[max_index],
    max_lml = lml[max_index])

In [169]:
# Left join on id: every prior row is kept; shared column names get .x (prior)
# and .y (posterior) suffixes.
merged_maxs <- merge(prior_maxs, posterior_maxs, by = "id", all.x = TRUE)

In [176]:
# Fraction of merged rows whose best kernel is the same in prior and posterior.
# Rows without a posterior match (NA after the left join) are never counted as
# equal but remain in the denominator.
nrow(filter(merged_maxs, max_kernel.x == max_kernel.y)) / nrow(merged_maxs)

# Excluding {l, p, r}

### Tables

In [181]:
# Same summaries as before, but without the three single kernels l, p and r.
# Values are mean raw LML with 95% t-interval bounds.
prior_a <- summarize(
    group_by(filter(lmls_prior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

posterior_a <- summarize(
    group_by(filter(lmls_posterior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             kernel, scenario),
    value = mean(lml),
    lo_ci = t.test(lml, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml, conf.level=0.95)$conf.int[2])

print('Kernel with the maximum average per scenario')
print('Prior table:')
# Per scenario: position, name and mean of the top kernel.
summarize(
    group_by(prior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

print('Posterior data:')
summarize(
    group_by(posterior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

[1] "Kernel with the maximum average per scenario"
[1] "Prior table:"


scenario,max_index,max_kernel,max_lml
FB Friends,7,l+r,125.2687
Gym members,11,p+r,518.0173
Rain,8,l+r*p,213.3836
Salary,11,p+r,809.7401
Sales,7,l+r,-881.9682
Temperature,10,p*r,443.1288


[1] "Posterior data:"


scenario,max_index,max_kernel,max_lml
FB Friends,4,l*r*p,-217.165903
Gym members,7,l+r,-6.6087
Rain,7,l+r,28.158404
Salary,7,l+r,-4.190274
Sales,3,l*r,-327.823858
Temperature,7,l+r,54.054911


### Tables (using standard)

In [183]:
# As the previous tables (single kernels l, p and r excluded), but computed on
# the standardized LML instead of the raw LML.
prior_a <- summarize(
    group_by(filter(lmls_prior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             kernel, scenario),
    value = mean(lml_standard),
    lo_ci = t.test(lml_standard, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml_standard, conf.level=0.95)$conf.int[2])

posterior_a <- summarize(
    group_by(filter(lmls_posterior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             kernel, scenario),
    value = mean(lml_standard),
    lo_ci = t.test(lml_standard, conf.level=0.95)$conf.int[1],
    hi_ci = t.test(lml_standard, conf.level=0.95)$conf.int[2])

print('Kernel with the maximum average per scenario')
print('Prior table:')
# Per scenario: position, name and mean of the top kernel.
summarize(
    group_by(prior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

print('Posterior data:')
summarize(
    group_by(posterior_a, scenario),
    max_index = which.max(value),
    max_kernel = kernel[max_index],
    max_lml = value[max_index])

[1] "Kernel with the maximum average per scenario"
[1] "Prior table:"


scenario,max_index,max_kernel,max_lml
FB Friends,7,l+r,0.9469077
Gym members,8,l+r*p,0.8893989
Rain,10,p*r,0.9329891
Salary,11,p+r,0.8224981
Sales,7,l+r,0.9694123
Temperature,10,p*r,0.9404126


[1] "Posterior data:"


scenario,max_index,max_kernel,max_lml
FB Friends,4,l*r*p,0.9560027
Gym members,11,p+r,0.820073
Rain,7,l+r,0.8092954
Salary,11,p+r,0.7124758
Sales,4,l*r*p,0.9987226
Temperature,11,p+r,0.7849605


### Persistence percentage

In [179]:
# Best composite kernel per (id, pid, scenario): single kernels l, p and r are
# removed before taking the maximum.
prior_maxs <- summarize(
    group_by(filter(lmls_prior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             id, pid, scenario),
    max_index = which.max(lml),
    max_kernel = kernel[max_index],
    max_lml = lml[max_index])

posterior_maxs <- summarize(
    group_by(filter(lmls_posterior_f, kernel != 'l' & kernel != 'p' & kernel != 'r'),
             id, pid, scenario),
    max_index = which.max(lml),
    max_kernel = kernel[max_index],
    max_lml = lml[max_index])

# Left join on id: every prior row is kept.
merged_maxs <- merge(prior_maxs, posterior_maxs, by = "id", all.x = TRUE)

# Fraction of merged rows with the same best kernel before and after. Rows
# without a posterior match are never counted as equal but stay in the
# denominator.
nrow(filter(merged_maxs, max_kernel.x == max_kernel.y)) / nrow(merged_maxs)

# Real-world data