In [1]:
import numpy as np
import pandas as pd
from multiple_factor import DGP2, Inferece2
from joblib import Parallel, delayed
import multiprocessing

# load covariates data
data = pd.read_csv("FactorialData/educationData2008.csv")
cols = ['Total']
cols += list(data.iloc[:,26:32].columns)
cols += list(data.iloc[:,34:36].columns)
cols += ['teachers']
X = data[cols].to_numpy()

class DGP3(DGP2):
    
    def __init__(self, num_factor, num_sample, X, tau=0, match_more=False, design='MT'):
        self.total = X
        self.covariates = X[:,:-1]
        super().__init__(num_factor, num_sample, self.covariates.shape[1], tau, match_more, design)
        
    def generate_X(self):
        idx = np.random.choice(len(self.total), self.n, replace=False)
        total = self.total[idx]
        X = total[:,:-1]
        self.Y0 = total[:,-1]
        return X

    def generate_Y(self):
        X = self.X - np.mean(self.X, axis=0)
        if self.D.shape[1] > 1:
            gamma = 2*self.D[:,1] - 1
            Y = gamma*X.dot(np.linspace(1,.2,9)) \
                + (np.mean(self.D[:,1:],axis=1) + self.D[:,0])*self.tau + self.Y0
        else:
            Y = self.D[:,0]*self.tau + self.Y0
        return Y

In [2]:
def reject_prob(X, num_factor, sample_size, tau=0, ntrials=1000, more=False, design='MT'):
    phi_tau = np.zeros(ntrials)
    for i in range(ntrials):
        dgp = DGP3(num_factor, sample_size, X, tau, more, design)
        Y, D, tuple_idx = dgp.Y, dgp.D, dgp.tuple_idx
        inf = Inferece2(Y, D, tuple_idx, design)
        phi_tau[i] = inf.phi_tau
    return np.mean(phi_tau)

def risk(X, num_factor, sample_size, tau=0, ntrials=1000, more=False, design='MT'):
    mse = np.zeros(ntrials)
    for i in range(ntrials):
        dgp = DGP3(num_factor, sample_size, X, tau, more, design)
        Y, D, tuple_idx = dgp.Y, dgp.D, dgp.tuple_idx
        ate = np.mean(Y[D[:,0]==1]) - np.mean(Y[D[:,0]==0])
        mse[i] = (ate - tau)**2
    return np.mean(mse)

def reject_prob_parrell(X, num_factor, sample_size, tau=0, ntrials=1000, more=False, design='MT'):
    if design == 'MT2':
        more = True
        design = 'MT'
    X = X/np.std(X,axis=0)/np.sqrt(12)
    if design == 'S4':
        X = X[:, np.random.permutation(X.shape[1])]
    def process(qk):
        dgp = DGP3(num_factor, sample_size, X, tau, more, design)
        Y, D, tuple_idx = dgp.Y, dgp.D, dgp.tuple_idx
        inf = Inferece2(Y, D, tuple_idx, design)
        return inf.phi_tau
    num_cores = multiprocessing.cpu_count()
    ret = Parallel(n_jobs=num_cores)(delayed(process)(i) for i in range(ntrials))
    return np.mean(ret)

def risk_parrell(X, num_factor, sample_size, tau=0, ntrials=1000, more=False, design='MT'):
    if design == 'MT2':
        more = True
        design = 'MT'
    X = X/np.std(X,axis=0)/np.sqrt(12)
    if design == 'S4':
        X = X[:, np.random.permutation(X.shape[1])]
    def process(qk):
        dgp = DGP3(num_factor, sample_size, X, tau, more, design)
        Y, D, tuple_idx = dgp.Y, dgp.D, dgp.tuple_idx
        ate = np.mean(Y[D[:,0]==1]) - np.mean(Y[D[:,0]==0])
        return (ate - tau)**2
    num_cores = multiprocessing.cpu_count()
    ret = Parallel(n_jobs=num_cores)(delayed(process)(i) for i in range(ntrials))
    return np.mean(ret)

In [3]:
print(risk_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0.05, 100, design='MT'))
print(risk_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0.05, 100, design='MP-B'))
print(risk_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0.05, 100, design='S4'))
print(risk_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0.05, 100, design='RE'))

print(reject_prob_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0, 100, design='MT'))
print(reject_prob_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0, 100, design='MT2'))
print(reject_prob_parrell(X+1e-5*np.random.normal(size=X.shape), 5, 1280, 0, 100, design='S4'))

0.00031544765277139483
0.00045992975912504167
0.0010314106733879146
0.00045285408519983437
0.02
0.06
0.07


In [4]:
# add a few noise to break the tie for S4
X = X + 1e-5*np.random.normal(size=X.shape)
n = 1000

designs = ['MT', 'C', 'MT2', 'S4', 'MP-B', 'RE']
mse = [risk_parrell(X, 5, 1280, 0, n, design=d) for d in designs]
mse2 = [risk_parrell(X, 5, 1280, 0.05, n, design=d) for d in designs]
print(mse/mse[0])
print(mse2/mse2[0])

designs = ['MT', 'MT2', 'C', 'S4']
size = [reject_prob_parrell(X, 5, 1280, 0, n, design=d) for d in designs]
power = [reject_prob_parrell(X, 5, 1280, 0.05, n, design=d) for d in designs]

print(size)
print(power)



[1.         2.36948114 1.20217173 2.67258079 1.92456228 1.75330399]
[1.         2.60018969 1.42461574 3.43026565 1.90330011 2.00043216]
[0.013, 0.047, 0.055, 0.05]
[0.657, 0.718, 0.462, 0.462]


[1.         2.36948114 1.20217173 2.67258079 1.92456228 1.75330399]
[1.         2.60018969 1.42461574 3.43026565 1.90330011 2.00043216]
[0.013, 0.047, 0.055, 0.05]
[0.657, 0.718, 0.462, 0.462]