# Contrast coding
---

This notebook explores the differences between contrast coding schemes.

### Libraries

In [16]:
using DataFrames, Missings
using StatsBase, StatsModels, GLM, Random 
#using Plots, StatsPlots, Plots.PlotMeasures, Colors, ColorSchemes
#using LinearAlgebra, Distributions, FreqTables
#using StatsBase, HypothesisTests 

In [1]:
using DataFrames, StatsModels, Random

## Data

In [2]:
# Fix the RNG seed so the simulated data below are reproducible.
Random.seed!(1)
# Dimensions of matrices (`n` is the number of rows of `X_df`; `m` is not used in this cell)
n = 100
m = 250

# Number of column covariates (not used in this cell)
q = 20

# Generate data with three categorical variables (catvar1 takes integer values 1:5,
# catvar2 and catvar3 are strings) and 4 numerical variables (auto-named x1..x4).
X_df = hcat(DataFrame(catvar1=rand(1:5, n), catvar2=rand(["A", "B", "C"], n),
        catvar3=rand(["D", "E"], n)), DataFrame(rand(n,4),:auto))

Unnamed: 0_level_0,catvar1,catvar2,catvar3,x1,x2,x3,x4
Unnamed: 0_level_1,Int64,String,String,Float64,Float64,Float64,Float64
1,1,A,E,0.0305704,0.223649,0.198196,0.98829
2,2,C,E,0.594606,0.823901,0.489737,0.582697
3,4,A,E,0.00710816,0.0176562,0.220826,0.143616
4,4,A,E,0.997248,0.569468,0.741995,0.885134
5,5,B,E,0.456335,0.193274,0.457017,0.564806
6,1,B,E,0.89749,0.649155,0.141094,0.258384
7,4,C,D,0.611742,0.0403184,0.188614,0.834546
8,4,C,E,0.391296,0.376364,0.815911,0.984715
9,4,C,E,0.46407,0.370864,0.158952,0.175703
10,1,A,E,0.222976,0.729914,0.887117,0.0184104


In [None]:
unique(X_df.catvar2)

In [None]:
# String-building scratch cell: evaluates to "Hello, world."
greet = "Hello"
whom = "world"
string(greet, ", ", whom, ".")

In [None]:
# Build the right-hand-side terms of a formula given as a plain string.
f = "y ~ catvar1 + catvar2 + x1 + x2 + x3 + x4"
y, xs = split(f, "~")
# Strip the padding around each name before wrapping it: `term(" catvar1 ")` would create
# a term whose symbol contains the spaces and therefore matches no column of the data.
sum(term.(strip.(split(xs, "+"))))

In [4]:
"""
    @mlmFormula(ex)

Turn an additive expression such as `a + b + x1` into the sum of the corresponding terms.

The expression is converted to text, all whitespace is removed, and the expansion calls
`term` on each `+`-separated piece and adds the results with `sum`.
"""
macro mlmFormula(ex)
    # `filter` walks the string character by character, so identifiers containing
    # multi-byte (Unicode) characters are handled; indexing with `1:length(name)` is only
    # valid for ASCII text and throws `StringIndexError` otherwise.
    name = filter(!isspace, string(ex))
    return :(sum(term.(split($name, "+"))))
end

@mlmFormula(catvar1 + catvar2 + catvar3 + x1 + x2 + x3 + x4)


catvar1(unknown)
catvar2(unknown)
catvar3(unknown)
x1(unknown)
x2(unknown)
x3(unknown)
x4(unknown)

In [9]:
"""
    design_matrix(; f, df::DataFrame, cntrstArray)

Build the model matrix for the terms `f` evaluated on `df`.

`cntrstArray` is a collection of tuples. In each tuple every element except the last is
used as a key (a column name given as a `Symbol`) and the last element is the contrast
coding stored for those keys. The resulting `Dict{Symbol, AbstractContrasts}` is passed
to `modelmatrix` as `hints`.
"""
function design_matrix(; f, df::DataFrame, cntrstArray)
    hints = Dict{Symbol, AbstractContrasts}()
    for spec in cntrstArray
        coding_pos = length(spec)
        # Every position before the last one names a column sharing the same coding.
        for k in 1:(coding_pos - 1)
            hints[spec[k]] = spec[coding_pos]
        end
    end
    return modelmatrix(f, df; hints=hints)
end

design_matrix(f=@mlmFormula(catvar1 + catvar2 + catvar3 + x1 + x2 + x3 + x4),df=X_df,
               cntrstArray=[(:catvar1,:catvar3, DummyCoding()) (:catvar2,  EffectsCoding()) ]  )

100×11 Matrix{Float64}:
 0.0  0.0  0.0  0.0  -1.0  -1.0  1.0  …  0.223649   0.198196   0.98829
 1.0  0.0  0.0  0.0   0.0   1.0  1.0     0.823901   0.489737   0.582697
 0.0  0.0  1.0  0.0  -1.0  -1.0  1.0     0.0176562  0.220826   0.143616
 0.0  0.0  1.0  0.0  -1.0  -1.0  1.0     0.569468   0.741995   0.885134
 0.0  0.0  0.0  1.0   1.0   0.0  1.0     0.193274   0.457017   0.564806
 0.0  0.0  0.0  0.0   1.0   0.0  1.0  …  0.649155   0.141094   0.258384
 0.0  0.0  1.0  0.0   0.0   1.0  0.0     0.0403184  0.188614   0.834546
 0.0  0.0  1.0  0.0   0.0   1.0  1.0     0.376364   0.815911   0.984715
 0.0  0.0  1.0  0.0   0.0   1.0  1.0     0.370864   0.158952   0.175703
 0.0  0.0  0.0  0.0  -1.0  -1.0  1.0     0.729914   0.887117   0.0184104
 0.0  1.0  0.0  0.0  -1.0  -1.0  1.0  …  0.920536   0.421533   0.439957
 0.0  1.0  0.0  0.0   1.0   0.0  1.0     0.105447   0.58961    0.715972
 1.0  0.0  0.0  0.0   1.0   0.0  1.0     0.373212   0.46241    0.988754
 ⋮                          ⋮         ⋱ 

In [None]:
typeof(contrasts)

In [None]:
# Convert the data frame to a prediction (design) matrix.
# X = Matrix(contr(X_df, [:catvar1, :catvar2], ["treat", "sum"]))

# # Generate X matrix
# contrasts = Dict(:catvar1 => DummyCoding(base = 1), :catvar2 => EffectsCoding(base = "A"))
# # contrasts = Dict(:catvar1 => StatsModels.FullDummyCoding())
# frml = @formula(y ~  catvar1 + catvar2 + x1 + x2 + x3 + x4).rhs
# # # mf = ModelFrame(frml, df_data)
# X = modelmatrix(frml, X_df, hints = contrasts);
# lm(frml, df_data, contrasts = contrasts)

# `catvar1` holds integers (levels 1:5), so its base level must be the integer 1: the
# string "1" is not one of its levels and is rejected when the contrasts are built.
X = modelmatrix(@formula(y ~  catvar1 + catvar2 + x1 + x2 + x3 + x4).rhs, X_df,
    hints= Dict(:catvar1 => DummyCoding(base = 1), :catvar2 => EffectsCoding(base = "A")))


# X = new_Matrix(@fmrl y ~  catvar1 + catvar2 + x1 + x2 + x3 + x4, X_df, 
#     Dict(:catvar1 => DummyCoding(base = "1"), :catvar2 => EffectsCoding(base = "A"))) 

In [None]:
# Score and age distributions for each group.
# NOTE(review): `Normal` is provided by Distributions, whose `using` line is commented out
# in the Libraries cell — confirm it is loaded before running this cell.
d_A = Normal(30, 7); d_B = Normal(40, 7); d_age_A =  Normal(35, 5); d_age_B =  Normal(50, 2);

# Generate the toy data: 80 rows, the first 40 for group "A" and the last 40 for group "B".
# The columns have different element types, so `hcat` yields a matrix of mixed values that
# is converted back to typed columns when the data frame is built below.
m_data = hcat(vcat(rand(d_A, 40), rand(d_B, 40)), # column 1: score
              vcat(repeat(["A"], 40), repeat(["B"], 40)), # column 2: group
              vcat(repeat(["male"], 20), repeat(["female"], 20), repeat(["male"], 20), repeat(["female"], 20)), # column 3: sex
              repeat(["w", "nw"], 40),
                # column 4: race, alternating "w"/"nw"
                # (shuffled alternative: shuffle(repeat(["w", "nw"], 40)))
              Int.(vcat((round.(rand(d_age_A, 40))), (round.(rand(d_age_B, 40)))))  ) # column 5: age, rounded to integers
# Randomly permute the rows.
m_data = m_data[shuffle(1:size(m_data,1)), :];

# Generate the data frame, converting each column to a concrete element type.
df_data = DataFrame(group = string.(vec(m_data[:, 2])), sex = string.(vec(m_data[:, 3])), 
                    race = string.(vec(m_data[:, 4])), age = Int.(vec(m_data[:, 5])), 
                    score = float.(vec(m_data[:, 1])))
first(df_data, 4)

## Two levels - one covariate

### Dummy coding

In [None]:
# Dummy (treatment) coding of `group`, with "A" as the base level.
contrasts = Dict(:group => DummyCoding(; base="A"))
frml = @formula(score ~ 1 + group)

# Keep the design matrix in `mX`, then fit the linear model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Mean score per group. The labels printed below assume that row 1 is group A and
# row 2 is group B.
gdf = groupby(sort(df_data, :group), :group)
df_group_mean = combine(gdf, :score => mean)
display(df_group_mean)

let μ = df_group_mean.score_mean
    println("Mean group A = ", round(μ[1], digits = 4))
    println("Mean group B - Mean group A = ", round(μ[2] - μ[1], digits = 4))
end

### Effect coding

In [None]:
# Effects (sum-to-zero) coding of `group`, with "A" as the base level.
contrasts = Dict(:group => EffectsCoding(; base="A"))
frml = @formula(score ~ 1 + group)

# Keep the design matrix in `mX`, then fit the linear model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Hand-computed quantities for comparison with the effects-coded fit: the mean of the two
# group means and half of their difference.
let μ = df_group_mean.score_mean
    grand = (μ[1] + μ[2]) * 0.5
    half_diff = (μ[2] - μ[1]) * 0.5
    println("Average score across the group \n μ(μ_group_B, μ_group_A) = ", round(grand, digits = 4))
    println("Average difference between the 2 groups divided by 2 \n (μ_group_B - μ_group_A)*0.5 = ",
        round(half_diff, digits = 4))
end

## Two levels - two covariates

### Dummy coding - no interaction

In [None]:
# Dummy coding of both factors: base levels are group "A" and sex "female".
contrasts = Dict(
    :group => DummyCoding(; base="A"),
    :sex => DummyCoding(; base="female"),
)
frml = @formula(score ~ 1 + group + sex)

# Keep the design matrix in `mX`, then fit the additive model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Cell means by (group, sex), then marginal means by sex; `gdf` is left holding the
# grouping by sex.
gdf = groupby(sort(df_data, [:group, :sex]), [:group, :sex])
df_group_sex_mean = combine(gdf, :score => mean)
gdf = groupby(sort(df_data, [:group, :sex]), :sex)
df_sex_mean = combine(gdf, :score => mean)
display(df_group_sex_mean)

let cell = df_group_sex_mean.score_mean,
    grp = df_group_mean.score_mean,
    sx = df_sex_mean.score_mean

    println("Mean female group A = ", round(cell[1], digits = 4),
            ", under the assumption of additivity.")
    println("Mean group B - Mean group A = ", round(grp[2] - grp[1], digits = 4))
    println("Mean Male - Mean Female = ", round(sx[2] - sx[1], digits = 4))
end

### Dummy coding - with interaction

In [None]:
# Dummy coding of both factors (bases: group "A", sex "female"), now with the
# group-by-sex interaction in the formula.
contrasts = Dict(
    :group => DummyCoding(; base="A"),
    :sex => DummyCoding(; base="female"),
)
frml = @formula(score ~ 1 + group + sex + group*sex)

# Keep the design matrix in `mX`, then fit the model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Hand-computed cell-mean contrasts for comparison with the dummy-coded interaction fit.
# `c` indexes the (group, sex) cell means in the row order of `df_group_sex_mean`.
let c = df_group_sex_mean.score_mean
    println("Mean female group A = ", round(c[1], digits = 4))
    println("Mean female group B - Mean female group A = ", round(c[3] - c[1], digits = 4))
    println("Mean male group A - Mean female group A = ", round(c[2] - c[1], digits = 4))
    println("(Mean male group B - Mean female group B) - (Mean male group A - Mean female group A) = ",
        round((c[4] - c[3]) - (c[2] - c[1]), digits = 4))
end

### Effect coding - no interaction

In [None]:
# Effects coding of both factors: base levels are group "A" and sex "female".
contrasts = Dict(
    :group => EffectsCoding(; base="A"),
    :sex => EffectsCoding(; base="female"),
)
frml = @formula(score ~ 1 + group + sex)

# Keep the design matrix in `mX`, then fit the additive model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Hand-computed quantities for comparison with the additive effects-coded fit: the mean
# of the group means and half of each marginal difference.
let grp = df_group_mean.score_mean, sx = df_sex_mean.score_mean
    println("Mean(Mean group B, Mean group A) = ",
        round((grp[1] + grp[2]) * 0.5, digits = 4))
    println("[Mean group B - Mean group A]*0.5 = ",
        round((grp[2] - grp[1]) * 0.5, digits = 4))
    println("[Mean male - Mean female]*0.5 = ",
        round((sx[2] - sx[1]) * 0.5, digits = 4))
end

### Effect coding - with interaction

In [None]:
# Effects coding of both factors (bases: group "A", sex "female"), now with the
# group-by-sex interaction in the formula.
contrasts = Dict(
    :group => EffectsCoding(; base="A"),
    :sex => EffectsCoding(; base="female"),
)
frml = @formula(score ~ 1 + group + sex + group*sex)

# Keep the design matrix in `mX`, then fit the model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
df_group_sex_mean

In [None]:
# Hand-computed quantities for comparison with the effects-coded interaction fit.
# `c` indexes the (group, sex) cell means in the row order of `df_group_sex_mean`.
let c = df_group_sex_mean.score_mean
    println("Mean(Mean female group B, Mean female group A, Mean male group B, Mean male group A) = ",
        round(mean(c), digits = 4))

    # Half of the B - A difference within each sex, averaged over the two sexes.
    group_halves = [0.5*(c[4] - c[2]), 0.5*(c[3] - c[1])]
    println("Mean[(Mean B|female - Mean A|female)*0.5, (Mean B|male - Mean A|male)*0.5] = ",
        round(mean(group_halves), digits = 4))

    # Half of the male - female difference within each group, averaged over the two groups.
    sex_halves = [0.5*(c[4] - c[3]), 0.5*(c[2] - c[1])]
    println("Mean[(Mean male|B - Mean female|B)*0.5, (Mean male|A - Mean female|A)*0.5] = ",
        round(mean(sex_halves), digits = 4))

    # Half of the difference between the two within-group half-differences.
    println("0.5*[(Mean male|B - Mean female|B)*0.5 - (Mean male|A - Mean female|A)*0.5] = ",
        round((0.5*(0.5*(c[4] - c[3]) - 0.5*(c[2] - c[1]))), digits = 4))
end

## Two levels - three covariates

### Effect coding - with interaction

In [None]:
# Effects coding of all three factors (bases: group "A", sex "female", race "nw"),
# with every pairwise interaction in the formula.
contrasts = Dict(
    :group => EffectsCoding(; base="A"),
    :sex => EffectsCoding(; base="female"),
    :race => EffectsCoding(; base="nw"),
)
frml = @formula(score ~ 1 + group + sex + race + group*sex + group*race + sex*race)

# Keep the design matrix in `mX`, then fit the model with the same coding.
mX = modelmatrix(frml, df_data; hints=contrasts)
lm(frml, df_data; contrasts=contrasts)

In [None]:
# Mean score for each grouping used in the three-covariate comparison. The three-way
# tables contain the same cells and differ only in the sort/grouping key order, which
# fixes the row order that later cells rely on when reshaping.
gdf = groupby(sort(df_data, [:race]), [:race])
df_race_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:group, :race]), [:group, :race])
df_group_race_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:sex, :race]), [:sex, :race])
df_sex_race_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:group, :race, :sex]), [:group, :race, :sex])
df_group_race_sex_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:sex, :race, :group]), [:sex, :race, :group])
df_sex_race_group_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:group, :sex, :race]), [:group, :sex, :race])
df_group_sex_race_mean = combine(gdf, :score => mean)

gdf = groupby(sort(df_data, [:race, :sex, :group]), [:race, :sex, :group])
df_race_sex_group_mean = combine(gdf, :score => mean);

In [None]:
# Hand-computed quantities for comparison with the three-covariate effects-coded fit.
println("Mean(Mean sex vs group , Mean race vs group) = ",
    round(mean(vcat(df_group_sex_mean.score_mean, df_group_race_mean.score_mean)), digits = 4))

# Main effects: each table is sorted so that the factor of interest is the last sort key.
# Reshaping the 8 cell means to 2×4 puts the two levels of that factor in the rows, and
# the weights [-0.5; 0.5] take half of their difference within each remaining cell.
println("Mean(main effect group for sex and race) = ",
    round(mean(permutedims(reshape(vec(df_sex_race_group_mean.score_mean), 2, 4))*[-0.5;0.5]), digits = 4))
    # round(mean([mean([0.5*(df_AvB_sex_mean.score_mean[4]-df_AvB_sex_mean.score_mean[2]),
    #                 0.5*(df_AvB_sex_mean.score_mean[3]-df_AvB_sex_mean.score_mean[1])]),
    #                 mean([0.5*(df_AvB_race_mean.score_mean[4]-df_AvB_race_mean.score_mean[2]),
    #                 0.5*(df_AvB_race_mean.score_mean[3]-df_AvB_race_mean.score_mean[1])])]), digits = 4))

println("Mean(main effect sex for group and race) = ",
    round(mean(permutedims(reshape(vec(df_group_race_sex_mean.score_mean), 2, 4))*[-0.5;0.5]), digits = 4))

println("Mean(main effect race for sex and group) = ",
    round(mean(permutedims(reshape(vec(df_group_sex_race_mean.score_mean), 2, 4))*[-0.5;0.5]), digits = 4))

# Interactions: reshaping to 4×2 puts one level of the first sort key in each column and
# the 2×2 cells of the other two factors in the rows; the weights
# [0.25; -0.25; -0.25; 0.25] form the interaction contrast of those two factors.
println("Mean(Interaction effect group - sex for race) = ",
    round(mean(permutedims(reshape(vec(df_race_sex_group_mean.score_mean), 4, 2))*[0.25;-0.25;-0.25;0.25]),
            digits = 6))

println("Mean(Interaction effect group - race for sex) = ",
    round(mean(permutedims(reshape(vec(df_sex_race_group_mean.score_mean), 4, 2))*[0.25;-0.25;-0.25;0.25]),
            digits = 4))

# `df_group_sex_race_mean` is sorted by group first, so each column is one group and the
# contrast is between sex and race; the label previously repeated "group - race for sex".
println("Mean(Interaction effect sex - race for group) = ",
    round(mean(permutedims(reshape(vec(df_group_sex_race_mean.score_mean), 4, 2))*[0.25;-0.25;-0.25;0.25]),
            digits = 4))

## One two-level categorical covariate and one continuous covariate

When a continuous variable contains only integer values, it is preferable to build the schema manually and apply it to the formula.

>"Compute all the invariants necessary to fit a model with terms. A schema is a dict that maps Terms to their concrete instantiations (either CategoricalTerms or ContinuousTerms. "Hints" may optionally be supplied in the form of a Dict mapping term names (as Symbols) to term or contrast types. If a hint is not provided for a variable, the appropriate term type will be guessed based on the data type from the data column: any numeric data is assumed to be continuous, and any non-numeric data is assumed to be categorical."




In [None]:
# Concrete terms carrying the coding information used to build the schema.

# Continuous terms, created from the observed [minimum, maximum] of each column
# (alternative: pass the whole column, e.g. `concrete_term(term(:age), df_data.age)`).
cont_age = concrete_term(term(:age), collect(extrema(df_data.age)))
cont_score = concrete_term(term(:score), collect(extrema(df_data.score)))

# Categorical terms with effects coding and an explicit base level; the levels are the
# distinct values of each column in order of first appearance. Replace `EffectsCoding`
# with `DummyCoding` here to obtain dummy-coded terms instead.
cat_group = CategoricalTerm(
    :group,
    StatsModels.ContrastsMatrix(EffectsCoding(; base="A"), unique(df_data.group)),
)
cat_sex = CategoricalTerm(
    :sex,
    StatsModels.ContrastsMatrix(EffectsCoding(; base="female"), unique(df_data.sex)),
)
cat_race = CategoricalTerm(
    :race,
    StatsModels.ContrastsMatrix(EffectsCoding(; base="nw"), unique(df_data.race)),
)

In [None]:
# Hand-built schema: one entry per column, mapping the plain term to its concrete term.
sch1 = StatsModels.Schema(
    term(:age) => cont_age,
    term(:score) => cont_score,
    term(:group) => cat_group,
    term(:sex) => cat_sex,
    term(:race) => cat_race,
)

### Effect coding - no interaction

In [None]:
# Work on a copy so `df_data` keeps the raw ages; `age` is replaced by its deviation
# from the sample mean.
df_data2 = copy(df_data)
df_data2[!, :age] = df_data2.age .- mean(df_data2.age);

In [None]:
# Model formula: group, age and their interaction (`group*age`).
frml =  @formula(score ~ 1 + group + age + group*age )
# Apply the hand-built schema `sch1` to the formula, so the terms use the codings stored
# there.
frml = apply_schema(frml, sch1)
# Design matrix.
# NOTE(review): this is built from `df_data` (raw age), whereas the linear-model cell
# further down fits on `df_data2` (centred age) — confirm which one is intended.
# NOTE(review): `contrasts` is whatever an earlier cell last assigned; presumably it has
# no effect once the schema has been applied — verify.
mX = modelmatrix(frml, df_data, hints = contrasts);

In [None]:
# Optional rescaling of design-matrix columns (disabled):
# mX[:,2] = mX[:,2].*0.5
# mX[:,4] = mX[:,4].*0.5
# Refit from the raw design-matrix columns: columns 2 and 3 of `mX` become x1 and x2,
# and the score becomes x3 (names assigned by `:auto`).
dfTest = DataFrame(hcat(mX[:,[2,3]],df_data.score ), :auto);
# The trailing `;` suppresses the display of the fitted model.
lm( @formula(x3 ~ 1 + x1 + x2 + x1*x2 ), dfTest);

In [None]:
# Linear model
out =  lm(frml, df_data2)

In [None]:
df_group_mean

In [None]:
# Scatter plot of score against age for the first two groups of `gdf`.
# NOTE(review): `gdf` is reassigned in several cells, so which groups are plotted depends
# on the cell run last — presumably the grouping by :group is intended; confirm.
# NOTE(review): `scatter`/`scatter!` come from a plotting package; the `using Plots` line
# in the Libraries cell is commented out — confirm it is loaded.
scatter(gdf[1].age, gdf[1].score)
scatter!(gdf[2].age, gdf[2].score, legend = false)

In [None]:
mean(df_group_mean.score_mean) + sum(coef(out)[2])

In [None]:
(mean(gdf[2].score)+mean(gdf[1].score))/2

In [None]:
[a , b]

In [None]:
gdf =  groupby(sort(df_data, :group), :group);
# a = cor(gdf[1].score, gdf[1].age); b =  cor(gdf[2].score, gdf[2].age)
# (a+b)/2

In [None]:
b-a

In [None]:
# Half of the difference between the two group means.
# Uses `df_group_mean` (per-group means computed in the dummy-coding section) because
# `df_AvB_mean`, built by the same expression, is only defined in a later cell.
println("[Mean group B - Mean group A]*0.5 = ",
    round((df_group_mean.score_mean[2]-df_group_mean.score_mean[1])*0.5, digits = 4))

In [None]:
# Half of the B - A difference within each sex, averaged over the two sexes.
# `df_AvB_sex_mean` is not defined in this notebook; `df_group_sex_mean` is the
# (group, sex) table of cell means that these row indices refer to.
mean([0.5*(df_group_sex_mean.score_mean[4]-df_group_sex_mean.score_mean[2]), 0.5*(df_group_sex_mean.score_mean[3]-df_group_sex_mean.score_mean[1])])

In [None]:
# Main effect of group averaged over the sex × race cells.
# `df_sex_group_mean` is not defined in this notebook. The 2×4 reshape needs the eight
# (sex, race, group) cell means, i.e. `df_sex_race_group_mean`, which is what the
# identical expression in the three-covariate section uses.
# NOTE(review): confirm this is the table that was intended.
round(mean(permutedims(reshape(vec(df_sex_race_group_mean.score_mean), 2, 4))*[-0.5;0.5]), digits = 4)

In [None]:
# Mean of the two group means plus a hard-coded offset.
# Uses `df_group_mean` (per-group means computed in the dummy-coding section) because
# `df_AvB_mean`, built by the same expression, is only defined in a later cell.
# NOTE(review): 7.908 looks like a coefficient copied from an earlier model fit — confirm
# and, if so, read it from the fitted model instead.
round((df_group_mean.score_mean[1]+df_group_mean.score_mean[2])*0.5, digits = 4) + 7.908

In [None]:
# Per-group mean score (same computation as `df_group_mean`); `gdf` is left holding the
# grouping by :group.
gdf =  groupby(sort(df_data, :group), :group); df_AvB_mean = combine(gdf, :score => mean )


# `df_AvB_race_mean` and `df_AvB_sex_mean` are not defined in this notebook; the
# (group, race) and (group, sex) tables are `df_group_race_mean` and `df_group_sex_mean`.
println("Mean(Mean sex vs group , Mean race vs group) = ",
    round(mean(vcat(df_group_race_mean.score_mean, df_group_sex_mean.score_mean)), digits = 4))

### Effect coding - with interaction

In [None]:
# Formula with group, sex, age and all pairwise interactions, with the hand-built
# schema `sch1` applied to it.
frml = apply_schema(
    @formula(score ~ 1 + group + sex + age + group*sex +  age*group + age*sex),
    sch1,
)

In [None]:
# frml = @formula(score ~ 1 + group + sex + race + group*sex + group*race + sex*race)
# mf = ModelFrame(frml, df_data)
# mX = modelmatrix(frml, df_data, hints = contrasts)
lm(frml, df_data)

In [None]:
# Mean of the four (group, sex) cell means plus a hard-coded offset.
# `df_AvB_sex_mean` is not defined in this notebook; `df_group_sex_mean` is the
# (group, sex) table of cell means.
# NOTE(review): 7.908 looks like a coefficient copied from an earlier model fit — confirm.
mean(df_group_sex_mean.score_mean)+7.908

##  Example in R 

In [None]:
using RCall

In [None]:
# Simulate the example data in R: 1000 "Male" and 1000 "Female" rows with columns
# Income and Age drawn by `mvrnorm` (Income mean 2 vs 3; Income-Age covariance .8 vs .3),
# stacked into one data frame whose Gender column is a factor.
R"""
library(MASS)
# Generage fake data for male
data_male <- data.frame(mvrnorm(n=1000,mu=c(2,0),Sigma=rbind(c(1,.8),c(.8,1)),empirical=TRUE ) )
colnames(data_male)<-c('Income','Age')
data_male$Gender = 'Male'
# Generate fake data for female
data_female <- data.frame(mvrnorm(n=1000,mu=c(3,0),Sigma=rbind(c(1,.3),c(.3,1)),empirical=TRUE ))
colnames(data_female) <- c('Income','Age')
data_female$Gender = 'Female'
# Combine data
data <-rbind(data_female,data_male)
data$Gender<-as.factor(data$Gender)
"""
# Bring the R data frame `data` into Julia and keep it as `dfTest3`.
dfTest3 = @rget data;
# Column names of the imported data, shown to check the transfer.
names(dfTest3)

In [None]:
unique(dfTest3.Gender)

In [None]:
# Income regressed on gender, age and their interaction, with effects coding for
# `Gender` (base level "Female"). No hand-built schema is applied here.
frml = @formula(Income ~ Gender + Age + Gender*Age)
lm(frml, dfTest3; contrasts=Dict(:Gender => EffectsCoding(; base="Female")))