## Load File

In [1]:
# Optional housekeeping, deliberately left disabled:
#rm(list = ls())   # would wipe the workspace
# setwd("C:/Users/Luise/Documents/jupyterprep/")   # would switch directory

# Read the experiment csv; its first row supplies the column names.
filename <- "exp1_a.csv"
data <- read.csv(file = filename, header = TRUE)

# The first two columns (X and X.1) are irrelevant, so remove them.
data$X.1 <- NULL
data$X <- NULL

# Store the relevant grouping columns as factors.
data$dip <- factor(data$dip)
data$qua <- factor(data$qua)
data$ppt <- factor(data$ppt)

In [None]:
# Show five randomly drawn rows of `data` as a quick look at the import.
library(dplyr)
sample_n(tbl = data, size = 5)

## Missing Values

In [None]:
# How many missing values does each column of `data` contain?
# vapply() always yields an integer vector with one count per column.
na_count <- vapply(data, function(x) sum(is.na(x)), integer(1))

# One-column data frame: the row names are the column names of `data`,
# the `na_count` column holds the number of NAs found in that column.
na <- data.frame(na_count)
na

In [None]:
# Keep only the rows of `data` in which no column is missing.
data_na <- data[which(complete.cases(data)), ]

In [None]:
# Keep only the columns of `data` that contain no missing value at all
# (a column survives when its NA count is zero).
data_na <- data[, colSums(is.na(data)) == 0]

In [None]:
# Copy `data` and overwrite every missing cell with 0.
data_na <- replace(data, is.na(data), 0)

In [None]:
# Fill NA with the column mean.
# Only numeric/logical columns are imputed: mean() of a factor or character
# column returns NA with a warning, so those columns are left untouched.
data_na <- data
can_average <- vapply(
  data_na,
  function(x) is.numeric(x) || is.logical(x),
  logical(1)
)
# which() on an all-FALSE mask is empty, so the loop is skipped cleanly when
# there is nothing to impute (1:ncol() would iterate over c(1, 0) instead).
for (i in which(can_average)) {
  is_missing <- is.na(data_na[[i]])
  data_na[[i]][is_missing] <- mean(data_na[[i]], na.rm = TRUE)
}

## Initial plots & data summary

In [None]:
# per-column summary of the loaded data frame
summary(data)

In [None]:
# Mean of every column per participant (grouped by `ppt`); show the first rows.
# Note: mean() of the factor columns gives NA (with a warning).
head(
  aggregate(
    x = data,
    by = list(data$ppt),
    FUN = mean,
    simplify = TRUE
  )
)
# Finer grouping, kept for reference:
#aggregate(data, by=list(data$dip,data$qua,data$ppt), mean,simplify = TRUE)

In [None]:
# Frequency of each level of the categorical variable `qua`.
# Base R alternative: table(data$qua)
library(plyr)
count(df = data, vars = "qua")

In [None]:
# Pairwise overview (distributions and correlation) of the two continuous
# variables `dv` and `dv_dp`.
library(GGally)
ggpairs(data[c("dv", "dv_dp")])

In [None]:
# combine categorical variables in ggplot - example for per ppt plots
# bars of dv over qua, filled by dip and placed side by side (dodge),
# with one panel per participant (facet_wrap on ppt)
library(ggplot2)
ggplot(data, aes(x = qua, y = dv))+
  geom_bar(
    aes(fill = dip), stat = "identity", color = "white",
    position = position_dodge(0.9)
    )+
  facet_wrap(~ppt)

# more syntax: points plus a red line, fixed axis limits, custom axis labels,
# a centred bold title and a larger base font size
# NOTE(review): `numit` is not defined anywhere in the visible notebook -
# presumably it comes from an earlier session; define it before running this.
ggplot(data, aes(x = dv_dp, y = dv)) +
    geom_point(size=3) + geom_line(color="red") +
    xlim(0, 6) + ylim(0.49, 1) +
    xlab("PC") + ylab("d'") +
    ggtitle(paste("Random stuff!",numit)) +
    theme(text = element_text(size=20),plot.title = element_text(hjust = 0.5, face="bold"))

## Regular Expressions

In [None]:
# Small character vector to search in.
text <- c("A", "text is", "a text!")

# Keep only the elements that contain the pattern; with value = FALSE the
# indices of the matching elements are returned instead.
grep(pattern = "text", x = text, value = TRUE, perl = TRUE)

# Remove the first occurrence of the pattern from each element
# (for more regex expressions see the python cookbook).
sub(pattern = "text", replacement = "", x = text, perl = TRUE)


## Prep for training

In [None]:
# Fix the RNG state so the random draws below are reproducible.
set.seed(1)

# Attach a random binary outcome (0 or 1, drawn with replacement) to each row.
data$logdv <- sample(c(0, 1), size = nrow(data), replace = TRUE)

# Randomly pick 80% of the row indices (rounded) for the training set.
train_ind <- sample(seq_len(nrow(data)), size = round(nrow(data) * 0.8))

# The sampled rows form the training set; all remaining rows the test set.
test <- data[-train_ind, ]
train <- data[train_ind, ]

## Basic statistical tests - continuous DV/categorical IVs

The first cell below simulates datasets for the different designs (independent and repeated measures). Note that the type of sum of squares used varies between the R packages: SPSS and SAS use type 3 by default, whereas in R it is generally type 1 or 2. The type controls what happens with the variance shared by predictors.

In [None]:
# independent measures: one observation per subject
nppt <- 100
data <- data.frame(
  dv = rnorm(nppt, 0, 1),                    # continuous dv
  subj = factor(seq(1, nppt)),               # subject id
  cond1 = factor(rep(c(1, 2), nppt / 2)),    # grouping variable 1 (2 levels)
  cond2 = factor(rep(seq(1, 5), nppt / 5))   # grouping variable 2 (5 levels)
)

# repeated measures: every subject contributes `ntrials` trials to every
# combination of cond1 x cond2
nppt_rm <- 10 # subj in rm design
ntrials <- 20 # trials per cond in rm design
ncond1 <- 2   # levels of iv 1
ncond2 <- 3   # levels of iv 2
data_rm <- data.frame(
  # continuous dv, one value per subject x trial x cond1 x cond2 cell
  dv = rnorm(nppt_rm * ntrials * ncond1 * ncond2, 0, 1),
  # subject varies slowest inside each cond1 x cond2 block
  subj = factor(rep(
    rep(seq_len(nppt_rm), each = ntrials),
    times = ncond1 * ncond2
  )),
  # trial number cycles fastest
  trial = factor(rep(seq_len(ntrials), times = nppt_rm * ncond1 * ncond2)),
  # cond1 is constant within a block of all subjects x trials
  cond1 = factor(rep(
    rep(seq_len(ncond1), each = ntrials * nppt_rm),
    times = ncond2
  )),
  # cond2 varies slowest
  cond2 = factor(rep(seq_len(ncond2), each = ntrials * nppt_rm * ncond1))
)

# ensure that there is exactly 1 observation per combination of all vars.
# table() lists every possible combination (also empty ones), so the design is
# fully crossed only if every cell count equals 1; comparing the summed
# frequencies with the row count would be TRUE for any data frame of that size.
counts <- as.data.frame(
  table(data_rm[c("subj", "trial", "cond1", "cond2")]),
  responseName = "freq"
)
all(counts$freq == 1)

t-tests - independent samples

In [None]:
# introduce an effect into simulated data: shift condition 1 up by 0.5.
# The same mask, built from `data` itself, is used on both sides of the
# assignment so that exactly the shifted rows are read and written.
in_cond1 <- data$cond1 == 1
data$dv[in_cond1] <- data$dv[in_cond1] + 0.5

# assumption of normality (if not determined visually), tested per group
shapiro.test(data$dv[data$cond1 == 1])
shapiro.test(data$dv[data$cond1 == 2])

# assumption of equal variances
var.test(data$dv ~ data$cond1)

# parametric
# alternative call: t.test(data$dv[data$cond1==1], data$dv[data$cond1==2])
t.test(data$dv ~ data$cond1, var.equal = TRUE)
# for one sided tests: alternative = c("two.sided", "less", "greater")
# for one sample tests: mu = 0
# for welch test when vars unequal: var.equal = FALSE

# non-parametric
wilcox.test(data$dv ~ data$cond1)

t-tests - dependent samples

In [None]:
# aggregate data across trials: one mean dv per subject and cond1 level
df <- aggregate(dv ~ subj + cond1, data = data_rm, FUN = mean)
#count(df, c('subj','cond1'))

# introduce effect: shift condition 1 up by 0.2
df$dv[df$cond1 == 1] <- df$dv[df$cond1 == 1] + 0.2

# sort by condition and subject so that the i-th value of each condition
# belongs to the same subject; the paired tests rely on this alignment
df <- df[order(df$cond1, df$subj), ]
dv_cond1 <- df$dv[df$cond1 == 1]
dv_cond2 <- df$dv[df$cond1 == 2]

# pairwise differences normally distributed?
shapiro.test(dv_cond1 - dv_cond2)

# parametric: dependent samples require the paired test; without
# paired = TRUE the two conditions are treated as independent groups
t.test(dv_cond1, dv_cond2, paired = TRUE)

# non-parametric: Wilcoxon signed-rank test on the paired values
wilcox.test(dv_cond1, dv_cond2, paired = TRUE)

Simulated p-value distributions for t-tests

In [None]:
# adjust
nsim <- 1000 # number of simulated experiments per scenario
nppt <- 100
data <- data.frame(
  dv = rnorm(nppt, 0, 1),                    # continuous dv
  subj = factor(seq(1, nppt)),               # subj
  cond1 = factor(rep(c(1, 2), nppt / 2)),    # grouping variable 1
  cond2 = factor(rep(seq(1, 5), nppt / 5))   # grouping variable 2
)

# group membership never changes between simulations, so compute it once
in_cond1 <- data$cond1 == 1
in_cond2 <- data$cond1 == 2

# quick resampling of the dv (no relationship with iv)
# the result vector is preallocated instead of being grown with c()
pval_noeffect <- numeric(nsim)
for (i in seq_len(nsim)) {
  data$dv <- rnorm(nppt, 0, 1) # continuous dv
  pval_noeffect[i] <- t.test(data$dv[in_cond1], data$dv[in_cond2])$p.value
}
hist(pval_noeffect)

# quick resampling of the dv (relationship with iv):
# the two groups are drawn with means 0 and 1 and a common sd of 4
pval_effect <- numeric(nsim)
for (i in seq_len(nsim)) {
  data$dv[in_cond1] <- rnorm(sum(in_cond1), 0, 4) # continuous dv
  data$dv[in_cond2] <- rnorm(sum(in_cond2), 1, 4) # continuous dv
  pval_effect[i] <- t.test(data$dv[in_cond1], data$dv[in_cond2])$p.value
}
hist(pval_effect)

One-Way ANOVA (aov & ezANOVA)

In [None]:
# One-way between-subjects ANOVA with base aov(): dv explained by cond2.
res.aov <- aov(formula = dv ~ cond2, data = data)
summary(res.aov)

# The same analysis through ezANOVA, which also runs assumption tests.
library(ez)
anova <- ezANOVA(
  data = data,
  dv = dv,
  wid = subj,
  between = cond2
)
print(anova)

One-Way RM ANOVA (aov & ezANOVA)

In [None]:
# Collapse over trials: one mean dv per subject and cond2 level.
df <- aggregate(dv ~ subj + cond2, data = data_rm, FUN = mean)

# Repeated-measures ANOVA with base aov(); the Error() term declares cond2 as
# varying within subj.
res.aov <- aov(formula = dv ~ cond2 + Error(subj / cond2), data = df)
summary(res.aov)

# The same analysis through ezANOVA, which also runs assumption tests.
library(ez)
anova <- ezANOVA(
  data = df,
  dv = dv,
  wid = subj,
  within = cond2
)
print(anova)

Two-Way ANOVA (aov & ezANOVA)

In [None]:
# Two-way between-subjects ANOVA with base aov(): both main effects of cond1
# and cond2 plus their interaction.
res.aov <- aov(formula = dv ~ cond1 * cond2, data = data)
summary(res.aov)

# The same analysis through ezANOVA, which also runs assumption tests.
library(ez)
anova <- ezANOVA(
  data = data,
  dv = dv,
  wid = subj,
  between = c(cond1, cond2)
)
print(anova)

Two-Way RM ANOVA (aov & ezANOVA)

In [None]:
# Collapse over trials: one mean dv per subject and cond1 x cond2 cell.
df <- aggregate(dv ~ subj + cond1 + cond2, data = data_rm, FUN = mean)
#count(df, c('cond1','cond2'))

# Two-way repeated-measures ANOVA with base aov(); the Error() term declares
# both factors and their interaction as varying within subj.
res.aov <- aov(
  formula = dv ~ cond1 * cond2 + Error(subj / (cond1 * cond2)),
  data = df
)
summary(res.aov)

# The same analysis through ezANOVA, which also runs assumption tests.
library(ez)
anova <- ezANOVA(
  data = df,
  dv = dv,
  wid = subj,
  within = c(cond1, cond2)
)
print(anova)