In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, f
from statsmodels.multivariate.manova import MANOVA

In [2]:
#1
def mean_vector(data):
    """Return the sample mean of every column of ``data`` as a 1 x p frame.

    The mean is computed in matrix form, (1' X) / n, where 1 is a column of
    ones and n is the number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        One observation per row, one numeric variable per column.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'mean'`` whose columns are ``data``'s columns.
    """
    # DataFrame.dot aligns the left operand's columns with the right operand's
    # index. Dropping the row labels first keeps the product valid for frames
    # that were sliced or filtered and no longer carry a 0..n-1 index.
    rows = data.reset_index(drop=True)
    ones = pd.DataFrame({'mean': np.ones(len(rows))})
    return ones.transpose().dot(rows) / len(rows)

In [3]:
def cov_matrix(data):
    """Return the unbiased sample covariance matrix of the columns of ``data``.

    Computed as (X - 1 x_bar')' (X - 1 x_bar') / (n - 1), where x_bar is the
    column mean from ``mean_vector`` and n is the number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        One observation per row, one numeric variable per column.

    Returns
    -------
    pandas.DataFrame
        A p x p frame indexed and labelled by ``data``'s columns.
    """
    # Row labels are irrelevant to the covariance, but pandas aligns on them
    # in both the subtraction and the dot product below. Resetting them keeps
    # the computation correct for sliced or filtered frames.
    rows = data.reset_index(drop=True)
    # mean_vector returns a single-row frame; taking that row as a Series lets
    # the subtraction broadcast it over every observation (matched by column).
    center = mean_vector(rows).iloc[0]
    deviations = rows - center
    return deviations.transpose().dot(deviations) / (len(rows) - 1)

In [4]:
#Multivariate profile analysis assuming equal covariances
def multi_prof_analysis(df1,df2,C):
    """Two-sample profile analysis with Hotelling's T^2, assuming equal covariances.

    Tests H0: C (mu1 - mu2) = 0 using

        T^2 = d' C' [ (1/n1 + 1/n2) C Sp C' ]^{-1} C d,   d = x1_bar - x2_bar,

    where Sp is the pooled covariance matrix. The statistic is converted to
    F = T^2 (n1 + n2 - q - 1) / ((n1 + n2 - 2) q), which is referred to an
    F distribution with (q, n1 + n2 - q - 1) degrees of freedom.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two samples: one observation per row, the same variables as columns.
    C : pandas.DataFrame or array-like, shape (q, p)
        Contrast matrix with one contrast per row. A DataFrame whose column
        labels are exactly the data's column labels is matched by label;
        anything else is matched to the data columns by position.

    Returns
    -------
    dict
        ``{'t2': ..., 'f-value': ..., 'p-value': ...}``.

    Raises
    ------
    ValueError
        If the samples do not share the same variables, or C does not have one
        column per variable.
    """
    # Row labels carry no information here, but pandas aligns on them; drop
    # them so samples taken as slices of a larger frame work unchanged.
    sample1 = df1.reset_index(drop=True)
    sample2 = df2.reset_index(drop=True)
    variables = sample1.columns
    if len(sample2.columns) != len(variables) or set(sample2.columns) != set(variables):
        raise ValueError('df1 and df2 must contain the same variables (columns)')
    sample2 = sample2.reindex(columns=variables)  # same variable order as df1

    # Honour C's labels when they name the data's variables; otherwise treat C
    # positionally so the data columns may carry any names.
    if (isinstance(C, pd.DataFrame) and len(C.columns) == len(variables)
            and set(C.columns) == set(variables)):
        C = C.reindex(columns=variables)
    contrast = np.atleast_2d(np.asarray(C, dtype=float))
    if contrast.shape[1] != len(variables):
        raise ValueError('C must have one column per variable in df1/df2')

    n1 = len(sample1)
    n2 = len(sample2)
    q = contrast.shape[0]

    mean_diff = np.asarray(mean_vector(sample1) - mean_vector(sample2), dtype=float).ravel()
    S1 = np.asarray(cov_matrix(sample1), dtype=float)
    S2 = np.asarray(cov_matrix(sample2), dtype=float)
    Sp = ((n1-1)*S1+(n2-1)*S2)/(n1+n2-2) #pooled covariance matrix

    contrast_diff = contrast.dot(mean_diff)                          # C d
    contrast_cov = (1/n1+1/n2)*contrast.dot(Sp).dot(contrast.T)      # covariance of C d
    # Solve the linear system instead of forming the inverse explicitly.
    t2 = float(contrast_diff.dot(np.linalg.solve(contrast_cov, contrast_diff)))

    fvalue = t2*(n1+n2-q-1)/((n1+n2-2)*q)
    pvalue = float(f.sf(fvalue, q, n1+n2-q-1))
    return {'t2': t2, 'f-value': fvalue, 'p-value': pvalue}

In [5]:
#2
# Read the fixed-width file: three measurement columns followed by the gender
# label. The columns are named while reading rather than afterwards.
turtle = pd.read_fwf(
    'turtle.dat',
    header=None,
    widths=[5, 5, 4, 8],
    names=[0, 1, 2, 'gender'],
)
turtle

Unnamed: 0,0,1,2,gender
0,98,81,38,female
1,103,84,38,female
2,103,86,42,female
3,105,86,42,female
4,109,88,44,female
5,123,92,50,female
6,123,95,46,female
7,133,99,51,female
8,133,102,51,female
9,133,102,51,female


In [6]:
# Measurements of the female turtles only. Rows are chosen by the gender label
# instead of hard-coded row positions, so the selection does not depend on the
# ordering or size of the file.
female = (
    turtle.loc[turtle['gender'] == 'female']
    .drop(columns='gender')
    .reset_index(drop=True)
)
female

Unnamed: 0,0,1,2
0,98,81,38
1,103,84,38
2,103,86,42
3,105,86,42
4,109,88,44
5,123,92,50
6,123,95,46
7,133,99,51
8,133,102,51
9,133,102,51


In [7]:
# Measurements of the male turtles only, chosen by the gender label instead of
# hard-coded row positions. reset_index returns a new frame numbered from 0, so
# nothing is modified in place on a slice of `turtle`.
male = (
    turtle.loc[turtle['gender'] == 'male']
    .drop(columns='gender')
    .reset_index(drop=True)
)
male

Unnamed: 0,0,1,2
0,93,74,37
1,94,78,35
2,96,80,35
3,101,84,39
4,102,85,38
5,103,81,37
6,104,83,39
7,106,83,39
8,107,82,38
9,112,89,40


In [8]:
#(a) Are the profiles parallel?
# Contrast matrix of successive differences between adjacent variables:
# row 0 is (1st - 2nd), row 1 is (2nd - 3rd).
C_par = pd.DataFrame(
    [
        [1, -1, 0],
        [0, 1, -1],
    ]
)
C_par

Unnamed: 0,0,1,2
0,1,-1,0
1,0,1,-1


In [9]:
# Parallelism test: H0 is C_par (mu_female - mu_male) = 0.
multi_prof_analysis(df1=female, df2=male, C=C_par)
#p-value is about 0.0015, so reject the null hypothesis.
#The profiles are not parallel.

{'t2': 15.423169058017201,
 'f-value': 7.543941387073631,
 'p-value': 0.0014947765346784245}

In [10]:
#(b) Are the profiles coincident?
# A single contrast that sums the three variables (a 1 x 3 row of ones).
C_coin = pd.DataFrame(np.ones((1, 3)))
C_coin

Unnamed: 0,0,1,2
0,1.0,1.0,1.0


In [11]:
# Coincidence test: H0 is 1' (mu_female - mu_male) = 0.
multi_prof_analysis(df1=female, df2=male, C=C_coin)
#p-value is about 8.9e-06, so reject the null hypothesis.
#Profiles that are not parallel cannot be coincident,
#so, as expected, the profiles are not coincident.

{'t2': 24.964840464171786,
 'f-value': 24.96484046417179,
 'p-value': 8.894702339275906e-06}

In [12]:
#(c) Repeat (a) and (b) using Python packages.
#Repeat (a)
#Data preprocessing: apply the parallelism contrasts to every observation,
#then put the gender label back next to the two contrast scores.
diff = turtle.drop(columns='gender').dot(C_par.T)
diff.columns = ['x12', 'x23']
para = diff.join(turtle[['gender']])
para

Unnamed: 0,x12,x23,gender
0,17,43,female
1,19,46,female
2,17,44,female
3,19,44,female
4,21,44,female
5,31,42,female
6,28,49,female
7,34,48,female
8,31,51,female
9,31,51,female


In [13]:
#Test for parallel profile
# MANOVA of the two contrast scores with gender as the only factor.
para_test = MANOVA.from_formula(formula='x12 + x23 ~ gender', data=para)
print(para_test.mv_test())
#gender effect: F = 7.5439 on (2, 45) df, p-value = 0.0015
#Reject the null hypothesis
#Same F value and p-value as the hand-computed test in #2(a)

                   Multivariate linear model
                                                               
---------------------------------------------------------------
       Intercept         Value  Num DF  Den DF  F Value  Pr > F
---------------------------------------------------------------
          Wilks' lambda  0.0175 2.0000 45.0000 1260.1630 0.0000
         Pillai's trace  0.9825 2.0000 45.0000 1260.1630 0.0000
 Hotelling-Lawley trace 56.0072 2.0000 45.0000 1260.1630 0.0000
    Roy's greatest root 56.0072 2.0000 45.0000 1260.1630 0.0000
---------------------------------------------------------------
                                                               
---------------------------------------------------------------
            gender         Value  Num DF  Den DF F Value Pr > F
---------------------------------------------------------------
             Wilks' lambda 0.7489 2.0000 45.0000  7.5439 0.0015
            Pillai's trace 0.2511 2.0000 45.0000  7.5439 0.

In [14]:
#Repeat (b)
#univariate x1+x2+x3: one total per turtle, split by gender
# NOTE(review): this rebinds `female` and `male`, which held the measurement
# DataFrames earlier in the notebook, to Series of row totals. Cells that
# expect the frames presumably need them rebuilt before being re-run - confirm.
female = turtle.loc[turtle['gender'] == 'female', turtle.columns[:3]].sum(axis=1)
male = turtle.loc[turtle['gender'] == 'male', turtle.columns[:3]].sum(axis=1)

In [15]:
#Test for coincident profile
# Pooled-variance two-sample t-test on the totals (we assumed equal covariances).
ttest_ind(a=female, b=male, equal_var=True)
#p-value=8.8947e-06
#Reject the null hypothesis
#Same p-value as the hand-computed test in #2(b)

Ttest_indResult(statistic=4.996482809354179, pvalue=8.894702339275784e-06)

In [16]:
#(d) Test equality of the two population mean vectors at alpha=0.05 using one-way MANOVA
#One-way MANOVA assumes that:
#1. The random samples from the different populations are independent.
#2. All populations share a common covariance matrix.
#3. Each population is multivariate normal.
# Column names that can be used as identifiers in a model formula.
cols = ['x%d' % i for i in range(1, 4)] + ['gender']
cols

['x1', 'x2', 'x3', 'gender']

In [17]:
# Relabel the columns so they can be referenced by name in the MANOVA formula.
# NOTE(review): this changes `turtle` in place. Earlier cells that multiply it
# by the contrast matrices were run with the labels 0, 1, 2 and presumably
# need the data reloaded before they can be re-run - confirm.
turtle.columns = pd.Index(cols)
turtle

Unnamed: 0,x1,x2,x3,gender
0,98,81,38,female
1,103,84,38,female
2,103,86,42,female
3,105,86,42,female
4,109,88,44,female
5,123,92,50,female
6,123,95,46,female
7,133,99,51,female
8,133,102,51,female
9,133,102,51,female


In [18]:
# One-way MANOVA of the three measurements with gender as the only factor.
man = MANOVA.from_formula(formula='x1+x2+x3 ~ gender', data=turtle)
print(man.mv_test())
#gender effect: F = 23.0782 on (3, 44) df, p-value = 0.0000 < 0.05
#Reject the null hypothesis
#At significance level 0.05, the two population mean vectors are not equal

                   Multivariate linear model
                                                               
---------------------------------------------------------------
       Intercept         Value  Num DF  Den DF  F Value  Pr > F
---------------------------------------------------------------
          Wilks' lambda  0.0144 3.0000 44.0000 1001.2534 0.0000
         Pillai's trace  0.9856 3.0000 44.0000 1001.2534 0.0000
 Hotelling-Lawley trace 68.2673 3.0000 44.0000 1001.2534 0.0000
    Roy's greatest root 68.2673 3.0000 44.0000 1001.2534 0.0000
---------------------------------------------------------------
                                                               
---------------------------------------------------------------
            gender         Value  Num DF  Den DF F Value Pr > F
---------------------------------------------------------------
             Wilks' lambda 0.3886 3.0000 44.0000 23.0782 0.0000
            Pillai's trace 0.6114 3.0000 44.0000 23.0782 0.