In [1]:
!pip install hotelling



In [2]:
import numpy as np
import pandas as pd
import hotelling.stats
from scipy.stats import f, chi2, ttest_ind
from statsmodels.stats import multivariate as mv

In [3]:
#1
def mean_vector(data):
    ones = pd.DataFrame({'mean': np.ones(len(data))})
    mean = ones.transpose().dot(data)/len(data)
    return mean

In [4]:
def cov_matrix(data):
    mean = mean_vector(data)
    mean_rep = pd.concat([mean]*len(data))
    mean_rep.columns = data.columns
    mean_rep.reset_index(inplace = True, drop = True)
    cov = (data-mean_rep).transpose().dot(data-mean_rep)/(len(data)-1)
    return cov

In [5]:
#Hotelling's T^2 test
def my_hotelling(df,mu):
    n=len(df.index) #number of observations
    p=len(df.columns) #number of variables
    mu_reshape=pd.DataFrame(mu.reshape(1,3), index=['mean'], columns=df.columns)
    diff=mean_vector(df)-mu_reshape
    inv_cov = pd.DataFrame(np.linalg.inv(cov_matrix(df)), index=df.columns, columns=df.columns)
    t2 = n*((diff.dot(inv_cov)).dot(diff.T)).iloc[0,0]
    fvalue = (n-p)*t2/((n-1)*p)
    pvalue = f.sf(fvalue, p, n-p)
    return {'t2': t2, 'f-value': fvalue, 'p-value': pvalue}

In [6]:
#2
#(a) Using python code in #1
college = pd.read_fwf("college.dat", header = None)
college.columns = ['X1','X2','X3'] #X1: Social Science and History, X2: Verbal, X3: Science
college

Unnamed: 0,X1,X2,X3
0,468,41,26
1,428,39,26
2,514,53,21
3,547,67,33
4,614,61,27
...,...,...,...
82,614,70,23
83,527,49,30
84,474,41,16
85,441,47,26


In [7]:
#array of means to test
mu0=np.array([500,50,30])
mu0

array([500,  50,  30])

In [8]:
my_hotelling(college,mu0)
#result: p-value=2.828e-23 < 0.05
#reject null hypothesis

{'t2': 223.31017568489185,
 'f-value': 72.70563859508106,
 'p-value': 2.82809706246472e-23}

In [9]:
#(b) Using Python package
hotelling.stats.hotelling_t2(college,mu0)
#result: p-value=2.828e-23 < 2.713
#reject null hypothesis (same result)

(223.31017568489145,
 72.70563859508093,
 2.8280970624648855e-23,
              X1          X2          X3
 X1  5808.059342  597.835204  222.029671
 X2   597.835204  126.053729   23.388532
 X3   222.029671   23.388532   23.111735)

In [10]:
#(c)
def conf_region(df,n,p,alpha):
    S=cov_matrix(df)
    eigenval = np.linalg.eig(S)[0] #eigenvalues
    eigenvec = np.linalg.eig(S)[1] #eigenvectors
    lengths=np.sqrt((n-1)*p/(n*(n-p))*f.isf(alpha,p,n-p))*np.sqrt(eigenval)
    val=np.sqrt((n-1)*p/(n*(n-p))*f.isf(alpha,p,n-p))*np.multiply(eigenvec,np.sqrt(eigenval).reshape(p,1))
    axes=np.vstack((val,-val))
    return 'half-lengths:', lengths, 'directions for the axes:', axes

In [11]:
conf_region(college,87,3,0.05)

('half-lengths:',
 array([23.72999755,  2.47276833,  1.18250011]),
 'directions for the axes:',
 array([[-2.35853725e+01, -2.46154906e+00, -8.85304423e-01],
        [-2.55791539e-01,  2.45938874e+00, -2.36837177e-02],
        [-4.50521377e-02,  6.69323257e-03,  1.18162262e+00],
        [ 2.35853725e+01,  2.46154906e+00,  8.85304423e-01],
        [ 2.55791539e-01, -2.45938874e+00,  2.36837177e-02],
        [ 4.50521377e-02, -6.69323257e-03, -1.18162262e+00]]))

In [12]:
#(d)
def sim_conf_int(df,n,p,alpha,coeff):
    a = pd.DataFrame(coeff, index=df.columns, columns=['mean'])
    x_bar = mean_vector(df).T
    S=cov_matrix(df)
    term1 = a.T.dot(x_bar).iloc[0,0]
    term2 = np.sqrt((n-1)*p/(n*(n-p))*f.isf(alpha,p,n-p)*(a.T.dot(S).dot(a)).iloc[0,0])
    interval=[term1-term2, term1+term2]
    return interval

In [13]:
coeff1=[1,-2,1]
sim_conf_int(college,87,3,0.05,coeff1)

[422.05125244523714, 462.6154142214296]

In [14]:
#3
#(a) Find simultaneous confidence interval
stiff = pd.read_fwf("stiff.dat", header = None)
stiff2 = stiff.iloc[:,0:4]
stiff2

Unnamed: 0,0,1,2,3
0,1889,1651,1561,1778
1,2403,2048,2087,2197
2,2119,1700,1815,2222
3,1645,1627,1110,1533
4,1976,1916,1614,1883
5,1712,1712,1439,1546
6,1943,1685,1271,1671
7,2104,1820,1717,1874
8,2983,2794,2412,2581
9,1745,1600,1384,1508


In [15]:
coeff2=[1,2,-1,-2]
sim_conf_int(stiff2,30,4,0.05,coeff2)

[123.10266537479538, 769.097334625204]

In [16]:
#(b) Repeat under large sample assumption
def sim_conf_int_large(df,n,p,alpha,coeff):
    a = pd.DataFrame(coeff, index=df.columns, columns=['mean'])
    x_bar = mean_vector(df).T
    S=cov_matrix(df)
    term1 = a.T.dot(x_bar).iloc[0,0]
    term2 = np.sqrt(chi2.isf(alpha,p))*np.sqrt((a.T.dot(S).dot(a)/n).iloc[0,0])
    interval=[term1-term2, term1+term2]
    return interval

In [17]:
sim_conf_int_large(stiff2,30,4,0.05,coeff2)

[161.68206247064148, 730.5179375293578]

In [18]:
#4
def prof_analysis(df,C):
    n = len(df.index)
    q = len(C.index)
    x_bar = mean_vector(df)
    S = cov_matrix(df)
    t2 = n*((C.dot(x_bar.T)).T.dot(np.linalg.inv(C.dot(S).dot(C.T))).dot(C.dot(x_bar.T))).iloc[0,0]
    fvalue = t2*(n-q)/((n-1)*q)
    pvalue = f.sf(fvalue, q, n-q)
    return {'t2': t2, 'f-value': fvalue, 'p-value': pvalue}

In [19]:
#5
#(a) Test for flat means
C_flat = pd.DataFrame([[1,-1,0,0],[0,1,-1,0],[0,0,1,-1]])
C_flat

Unnamed: 0,0,1,2,3
0,1,-1,0,0
1,0,1,-1,0
2,0,0,1,-1


In [20]:
prof_analysis(stiff2,C_flat)
#result: p-value=1.721e-13 < 0.05
#reject null hypothesis

{'t2': 254.72120597667998,
 'f-value': 79.05140875138345,
 'p-value': 1.7219150170540697e-13}

In [21]:
#(b) Test for linear trend
C_linear = pd.DataFrame([[1,-2,1,0],[0,1,-2,1]])
C_linear

Unnamed: 0,0,1,2,3
0,1,-2,1,0
1,0,1,-2,1


In [22]:
prof_analysis(stiff2,C_linear)
#result: p-value=9.560e-13 < 0.05
#reject null hypothesis

{'t2': 180.3800161429685,
 'f-value': 87.0800077931572,
 'p-value': 9.560459481929842e-13}

In [23]:
#(c) Repeat using Python package
#Repeat (a)
flat = stiff2.dot(C_flat.T)
flat.columns = ['x1-x2','x2-x3','x3-x4']
flat

Unnamed: 0,x1-x2,x2-x3,x3-x4
0,238,90,-217
1,355,-39,-110
2,419,-115,-407
3,18,517,-423
4,60,302,-269
5,0,273,-107
6,258,414,-400
7,284,103,-157
8,189,382,-169
9,145,216,-124


In [24]:
mv.test_mvmean(flat)
#result: p-value=1.721e-13 < 0.05
#reject null hypothesis (same result)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 79.05140875138369
pvalue = 1.721915017054007e-13
df = (3, 27)
t2 = 254.72120597668078
distr = 'F'
tuple = (79.05140875138369, 1.721915017054007e-13)

In [25]:
#Repeat (b)
linear = stiff2.dot(C_linear.T)
linear.columns=['x1-2x2+x3','x2-2x3+x4']
linear

Unnamed: 0,x1-2x2+x3,x2-2x3+x4
0,148,307
1,394,71
2,534,292
3,-499,940
4,-242,571
5,-273,380
6,-156,814
7,181,260
8,-193,551
9,-71,340


In [26]:
mv.test_mvmean(linear)
#result: p-value=9.560e-13 < 0.05
#reject null hypothesis (same result)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 87.08000779315768
pvalue = 9.560459481929198e-13
df = (2, 28)
t2 = 180.38001614296948
distr = 'F'
tuple = (87.08000779315768, 9.560459481929198e-13)