# Nested design - ANOVA

Initial commands

In [1]:
# Numerical / tabular / plotting stack used by the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): IPython documents IPython.display.set_matplotlib_formats as deprecated
# (since 7.23) in favour of matplotlib_inline.backend_inline.set_matplotlib_formats —
# confirm against the installed IPython version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf') # enable both PNG (raster) and PDF (vector) output for inline figures
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['text.usetex'] = True # render figure text through LaTeX; presumably needs a local LaTeX installation — TODO confirm
from scipy.stats import f # F distribution, used for critical values and p-values of the ANOVA tests

Reading data

In [2]:
# Load the measurements: one column per (technician, rat) pair, one row per repeated measurement.
data=pd.read_excel('ANOVA_nested.xlsx')
display(data)
alpha=0.01 #significance level
a=2 #number of technicians (for i=1,..,a)
b=3 #number of rats per technician (for j=1,..,b)
n=10 #number of measurements per each rat (for k=1,...,n)

# The following cells address observations purely by position (row k, column i*b+j),
# so a mismatch between the declared design and the table would silently give wrong sums.
assert data.shape==(n,a*b), 'Expected {} rows x {} columns, got {}'.format(n,a*b,data.shape)
# The grand mean is later taken as the mean of column means, which is only valid for a
# complete (balanced) table without missing measurements.
assert not data.isna().any().any(), 'Missing measurements: the design is assumed to be balanced'

Unnamed: 0,A1,A2,A3,B1,B2,B3
0,1.119,1.045,0.9873,1.3883,1.3952,1.2574
1,1.2996,1.1418,0.9873,1.104,0.9714,1.0295
2,1.5407,1.2569,0.8714,1.1581,1.3972,1.1941
3,1.5084,0.6191,0.9452,1.319,1.5369,1.0759
4,1.6181,1.4823,1.1186,1.1803,1.3727,1.3249
5,1.5962,0.8991,1.2909,0.8738,1.2909,0.9494
6,1.2617,0.8365,1.1502,1.387,1.1874,1.1041
7,1.2288,1.2898,1.1635,1.301,1.1374,1.1575
8,1.3471,1.1821,1.151,1.3925,1.0647,1.294
9,1.0206,0.9177,0.9367,1.0832,0.9486,1.4543


Hypotheses:

H0_A: Protein intake in the livers of experimental rats does not depend on the person performing the experiments.

H1_A: Protein intake in the livers of experimental rats depends on the person performing the experiments.

H0_B: Protein intake in the livers of experimental rats does not depend on the individual rat among those measured by a given technician.

H1_B: Protein intake in the livers of experimental rats depends on the individual rat among those measured by a given technician.

In [3]:
# Mean of every column, i.e. of every (technician i, rat j) combination.
MEAN_Yij=data.mean()
# Mean per technician: average of the b consecutive rat means belonging to technician i.
# Built for any number of technicians a (previously hard-coded for exactly two) and
# with a unique index 0..a-1 instead of repeated 0 labels.
MEAN_Yi=pd.Series([MEAN_Yij.iloc[i*b:(i+1)*b].mean() for i in range(a)])
# Grand mean, taken as the mean of the column means (same value as data.mean().mean()).
# NOTE(review): this equals the overall mean only if every column has the same number
# of measurements — confirm the design is balanced.
Y_=MEAN_Yij.mean()
display(pd.DataFrame(MEAN_Yij,columns=['Yij_']))
display(pd.DataFrame(MEAN_Yi,columns=['Yi_']))
print('Average Y_ is {:.3f}'.format(Y_))

Unnamed: 0,Yij_
A1,1.35402
A2,1.06703
A3,1.06021
B1,1.21872
B2,1.23024
B3,1.18411


Unnamed: 0,Yi_
0,1.16042
0,1.211023


Average Y_ is 1.186


Calculating sum of squares

In [4]:
# Sums of squares and degrees of freedom for the nested design, computed with
# vectorized numpy operations instead of element-by-element loops.
# Column i*b+j of the table holds the n measurements of rat j of technician i.
obs=data.iloc[:n,:a*b].to_numpy(dtype=float)           #observations, shape (n, a*b)
rat_means=MEAN_Yij.iloc[:a*b].to_numpy(dtype=float)    #mean per (technician, rat), length a*b
tech_means=MEAN_Yi.iloc[:a].to_numpy(dtype=float)      #mean per technician, length a
tech_means_per_rat=np.repeat(tech_means,b)             #technician mean aligned with each rat column

#total sum of squares
SST=((obs-Y_)**2).sum()
npsT=a*b*n-1

print('Value of SST is {:.3f}'.format(SST))
print('Number of degrees of freedom T is {:.0f}'.format(npsT))

#sum of squares due to factor A (Brad vs. Janet)
#each technician mean stands for b*n observations
SSA=b*n*((tech_means-Y_)**2).sum()
npsA=a-1

print('Value of SSA is {:.3f}'.format(SSA))
print('Number of degrees of freedom A is {:.0f}'.format(npsA))

#sum of squares due to factor B, which is nested in factor A
#each rat mean stands for n observations and is compared with its own technician's mean
SSB_A=n*((rat_means-tech_means_per_rat)**2).sum()
npsB_A=a*(b-1)

print('Value of SSB_A is {:.3f}'.format(SSB_A))
print('Number of degrees of freedom B_A is {:.0f}'.format(npsB_A))

#sum of squares due to randomness
#rat_means is broadcast across the n rows, so every observation is compared with its column mean
SSE=((obs-rat_means)**2).sum()
npsE=a*b*(n-1)

print('Value of SSE is {:.3f}'.format(SSE))
print('Number of degrees of freedom E is {:.0f}'.format(npsE))

#test: the total must split exactly into the three components (up to rounding error)
print(SST-(SSA+SSB_A+SSE))
print(npsT==npsA+npsB_A+npsE)
assert np.isclose(SST,SSA+SSB_A+SSE), 'Sum-of-squares partition does not add up'
assert npsT==npsA+npsB_A+npsE, 'Degrees of freedom do not add up'

Value of SST is 2.558
Number of degrees of freedom T is 59
Value of SSA is 0.038
Number of degrees of freedom A is 1
Value of SSB_A is 0.574
Number of degrees of freedom B_A is 4
Value of SSE is 1.946
Number of degrees of freedom E is 54
4.440892098500626e-16
True


Calculating statistic FA and FB

In [5]:
#mean squares (each sum of squares divided by its degrees of freedom)
MSA=SSA/npsA
MSB_A=SSB_A/npsB_A
MSE=SSE/npsE

#statistics FA and FB
# NOTE(review): FA is tested against MSE here, which corresponds to treating the rats as a
# fixed factor. If the rats are regarded as a random sample, the usual nested-design test
# for A is MSA/MSB_A with (npsA, npsB_A) degrees of freedom — confirm which model is intended.
FA=MSA/MSE
FB_A=MSB_A/MSE
#nu1,nu2 (parameters of Fisher's F distribution)
nu1_A=npsA
nu1_B_A=npsB_A
nu2=npsE

print('Value of statistic FA is {:.3f}'.format(FA))
print('Value of statistic FB_A is {:.3f}'.format(FB_A))

#critical values: upper alpha quantile of the F distribution
FcritA=f.ppf(1-alpha,nu1_A,nu2)
print('Value of FcritA is {:.3f}'.format(FcritA))

FcritB_A=f.ppf(1-alpha,nu1_B_A,nu2)
print('Value of FcritB_A is {:.3f}'.format(FcritB_A))

#actual risk for the rejection of the H0 (FA)
#the survival function gives the upper-tail probability directly and avoids the
#loss of precision of 1-cdf when the p-value is very small
p_valA=f.sf(FA,nu1_A,nu2)
print('Actual risk for wrong decision if we accept the H1_A (FA) is {:.6f}'.format(p_valA))

#actual risk for the rejection of the H0 (FB_A)
p_valB_A=f.sf(FB_A,nu1_B_A,nu2)
print('Actual risk for wrong decision if we accept the H1_B (FB_A) is {:.6f}'.format(p_valB_A))

Value of statistic FA is 1.066
Value of statistic FB_A is 3.982
Value of FcritA is 7.129
Value of FcritB_A is 3.688
Actual risk for wrong decision if we accept the H1_A (FA) is 0.306487
Actual risk for wrong decision if we accept the H1_B (FB_A) is 0.006660


Since the p-value for the first hypothesis (FA) is higher than the 1 % significance level, we cannot reject H0_A: we cannot claim that protein intake in the livers of experimental rats depends on the person performing the experiments. Accepting H1_A would carry a 30.65 % risk of a wrong decision.

Since the p-value for the second hypothesis (FB_A) is smaller than the 1 % significance level, we reject H0_B and can say that protein intake in the livers of experimental rats depends on the individual rat among those measured by a given technician. The actual risk of a wrong decision if we accept the second alternative hypothesis is even lower than 1 %; it is 0.67 %.