In [None]:
import math
import pandas as pd
import numpy as np
import igraph 
import itertools
from itertools import chain, combinations
import matplotlib.pyplot as plt
from  sklearn.linear_model import LinearRegression
from scipy.stats import norm
from scipy.stats import chi2
import warnings
from state_space_estimation.roles import roles
from scipy.stats import multivariate_normal as mn
%matplotlib inline

tol = 0.00001

In [None]:
class dag():
    '''
    Weighted directed graph backed by a square pandas adjacency matrix.

    ``m.loc[i, j] != 0`` is read as an edge ``i -> j`` with that weight, i.e.
    rows are parents and columns are children. Nodes whose name ends in
    ``'_1'`` are treated as the one-period lag of the node with the same name
    minus that suffix.
    '''

    LAG_SUFFIX = '_1'

    def __init__(self, m):
        '''
        m is a pandas DataFrame containing an adjacency matrix
        (identical row and column labels, in the same order).
        '''
        if list(m.columns) != list(m.index):
            raise ValueError('Invalid adjacency matrix (columns and rows are not the same)')
        self.m = m
        self.nodes = self.m.columns.values
        self.lags = np.array([n for n in self.nodes if str(n).endswith(self.LAG_SUFFIX)])
        # Deliberately shadows the bound method with its boolean result:
        # other cells read ``d.directed`` as an attribute.
        self.directed = self.directed()


    def parents(self, n):
        '''
        n is a node in the graph
        return the labels of all nodes with an edge into n
        '''
        if n not in self.nodes:
            raise ValueError('n is not in graph')
        return self.m.loc[self.m.loc[:,n] != 0, n].index.values


    def children(self, n):
        '''
        n is a node in the graph
        return the labels of all nodes n has an edge into
        '''
        if n not in self.nodes:
            raise ValueError('n is not in graph')
        return self.m.loc[:,self.m.loc[n,:] != 0].columns.values


    def directed(self):
        '''
        Return whether the adjacency matrix describes a DAG.
        NOTE(review): relies on a module named `utils` providing `is_dag`;
        it is not imported in the visible cells -- confirm it is in scope.
        '''
        return utils.is_dag(self.m.values)


    def depth(self, n):
        '''
        n is a node in the graph
        return the length of the longest path from a root node to n
        (0 for a root node)
        '''
        if not self.directed:
            raise ValueError('Cannot compute depth, graph is undirected')
        parents = self.parents(n)
        if len(parents) == 0:
            return 0
        return 1 + max(self.depth(p) for p in parents)


    def root_nodes(self):
        '''Nodes without parents.'''
        return np.array([n for n in self.nodes if len(self.parents(n)) == 0])


    def isolated_nodes(self):
        '''Nodes with neither parents nor children.'''
        return np.array([n for n in self.nodes
                         if len(self.parents(n)) == 0 and len(self.children(n)) == 0])


    def connected_roots(self):
        '''Nodes without parents that have at least one child.'''
        return np.array([n for n in self.nodes
                         if len(self.parents(n)) == 0 and len(self.children(n)) > 0])


    def structure(self):
        '''Copy of the adjacency matrix with every non-zero weight set to 1.'''
        M = self.m.copy()
        M[M != 0] = 1
        return M


    def shd(self, d):
        '''
        d is another dag over the same nodes
        return the 'shd' entry reported by utils.count_accuracy, or None
        when that call raises ValueError.
        NOTE(review): `utils` is not imported in the visible cells.
        '''
        try:
            return utils.count_accuracy(d.structure().values, self.structure().values)['shd']
        except ValueError:
            # Graph is not directed
            return None


    def impute(self, values):
        '''
        values is a pandas Series indexed by the graph's nodes (same order as
        the adjacency matrix) in which unknown entries are NaN.

        Each NaN entry is replaced by the weighted sum of its parents, working
        from shallow to deep nodes so parents are filled before children.
        Returns a new Series; the argument is left untouched.
        '''
        if not self.directed:
            print('Cannot impute values as graph is not directed')
            raise ValueError('Cannot compute depth, graph is undirected')
        values = values.copy()
        missing = list(values.index[values.isna()])
        depths = {node: self.depth(node) for node in missing}
        # sorted() is stable, so nodes of equal depth keep their original order.
        for node in sorted(missing, key=depths.get):
            values[node] = np.dot(self.m.loc[:,node], values.fillna(0))
        return values


    def calculate_irf(self, x_0, T=250, verbose=False):
        '''
        Simulate T periods starting from the shock vector x_0.

        x_0 is a Series indexed by the graph's nodes; NaN entries are imputed
        from the graph, lagged nodes are forced to 0 in the first period.
        Returns a DataFrame indexed by t = 0..T-1 with the lag columns removed.
        x_0 itself is not modified.
        '''
        if verbose:
            print('Simulating irf...')
        x_0 = x_0.copy()
        for lag in self.lags:
            x_0[lag] = 0
        rows = [self.impute(x_0)]
        for t in range(T-1):
            if verbose:
                print('Simulating t={} of {} ({}%)'.format(t+1, T, np.round(100 * ((t+1)/T), decimals=2)))
            nr = pd.Series(np.full(len(self.nodes), np.nan), index=self.nodes)
            for lag in self.lags:
                # Drop the suffix itself; str.rstrip('_1') would strip every
                # trailing '_' or '1' character (e.g. 'a1_1' -> 'a').
                nr[lag] = rows[-1][lag[:-len(self.LAG_SUFFIX)]]
            rows.append(self.impute(nr))
        irf = pd.DataFrame(rows, columns=self.nodes)
        irf.index = pd.RangeIndex(len(rows), name='t')
        return irf.drop(columns=self.lags)


    def plot_irf(self, irf, layout=None):
        '''
        Plot every column of irf in its own subplot with a red zero line.
        Defaults to the smallest square grid that fits all columns.
        Returns the matplotlib.pyplot module so callers can call .show().
        '''
        if layout is None:
            side = math.ceil(math.sqrt(len(irf.columns)))
            layout = (side, side)
        axes = irf.plot(subplots=True, layout=layout,
                        color="black", legend=False)
        for ax, name in zip(axes.flatten(), irf.columns.values):
            ax.axhline(y=0, color="red")
            ax.set_title(name)
        return plt


    def plot_structure(self):
        '''
        Build an igraph.Graph with one edge per non-zero weight, labelled
        with the node names; pass the result to igraph.plot.
        '''
        M = self.m.values
        g = igraph.Graph.Adjacency((M != 0.0).tolist())
        g.es['weight'] = M[M.nonzero()]
        g.vs['label'] = self.nodes
        g.vs['color'] = 'white'
        g.vs['size'] = 45
        return g

In [None]:
# Load the simulated RBC series, discard the two shock columns and the first
# remaining column, and strip spaces from the column names.
data = pd.read_csv('../data/rbc_100k.csv')
data = data.drop(columns=['eps_z', 'eps_g'])
data.columns = [name.replace(" ", "") for name in data.columns]
data = data.iloc[:, 1:]

# Demean every series.
data = data.apply(lambda series: series - series.mean())

# Append one-period lags (suffix '_1'); the first row has no lag, so drop it.
shift_vars = data.columns.values
shift = data[shift_vars].shift()
shift.columns = ['{}_1'.format(name) for name in shift.columns]
data = pd.concat([data, shift], axis=1).iloc[1:, :]

data_array = data.values

In [None]:
# Column positions of the lagged ('_1') variables: these may not be children.
bl_children = np.where(np.isin(data.columns.values, shift.columns.values))[0]
# No variable is blacklisted as a parent.
bl_parents  = None 
d = data_array.shape[1]
# One (lower, upper) pair per (i, j) entry, repeated twice (2*d*d pairs in
# total). (0, 0) pins an entry to zero: the diagonal, any row in bl_parents
# and any column in bl_children. Everything else is bounded below by 0 only.
# NOTE(review): the two copies presumably match the optimiser's split of W
# into positive and negative parts -- confirm against notears_linear.
bnds = [
        (0, 0)
        if i == j
        else (0, 0) 
        if bl_parents is not None and i in bl_parents
        else (0, 0) 
        if bl_children is not None and j in bl_children
        else (0, None) 
        for _ in range(2)
        for i in range(d)
        for j in range(d)
    ] 

In [None]:
pd.DataFrame(np.array(bnds))

In [None]:
def grid_search():
    '''
    Fit notears_linear for every combination of lambda1, w_threshold and
    loss type and score each estimate against the ground-truth graph.

    Reads the notebook globals `data_array`, `data`, `bnds`, `gt_dag` and
    `linear`, so the cells defining them must have run first.

    Returns:
        pd.DataFrame with one row per grid point and the columns
        'SHD', 'root_nodes', 'directed', 'dag', 'params'.
    '''
    lambda1s = np.linspace(1e1, 1e-50, 10)
    w_thresholds = np.linspace(0, 0.99, 10)
    loss_types = ['l2', 'logistic', 'poisson']
    bndss = [bnds]  # was the undefined name `dag_bnds`
    grid = list(itertools.product(lambda1s, w_thresholds, loss_types, bndss))
    gridsize = len(grid)

    records = []
    for i, (lambda1, w_threshold, loss_type, grid_bnds) in enumerate(grid, start=1):
        W_est = linear.notears_linear(data_array,
                                      lambda1=lambda1,
                                      w_threshold=w_threshold,
                                      loss_type=loss_type,
                                      bnds=grid_bnds,
                                      max_iter=1000,
                                      h_tol=1e-8,
                                      rho_max=1e+20)
        est = dag(pd.DataFrame(W_est, index=data.columns.values, columns=data.columns.values))
        # dag.shd calls .structure() on its argument, so it needs the dag
        # wrapper of the ground truth, not the raw `gt` DataFrame.
        shd = est.shd(gt_dag)

        records.append({
            'SHD': shd,
            'root_nodes': est.root_nodes(),
            'directed': est.directed,
            'dag': est,
            'params': {
                'lambda1': lambda1,
                'w_threshold': w_threshold,
                'loss_type': loss_type,
                'bnds': grid_bnds
            }
        })
        print('Completed iteration {} of {} ({}%)\n SHD: {}'.format(i, gridsize, np.round(i/gridsize*100, 2), shd))

    print('FINISHED')
    # DataFrame.append was removed in pandas 2.0; build the frame once.
    return pd.DataFrame(records, columns=['SHD', 'root_nodes', 'directed', 'dag', 'params'])

In [None]:
results = grid_search()

In [None]:
results = results[results['directed']].sort_values(by='SHD')

In [None]:
# Single NOTEARS fit on the lag-augmented data, using the bounds list `bnds`.
# NOTE(review): h_tol=1e-30 is far below double-precision resolution, so the
# acyclicity tolerance may never be reached before max_iter/rho_max -- confirm.
W_est = linear.notears_linear(data_array, 
                              lambda1=1e-1, 
                              w_threshold=0.001,
                              loss_type='poisson', 
                              max_iter=1000, 
                              h_tol=1e-30, 
                              rho_max=1e+30,
                              bnds=bnds)
                              # verbose=True)
# Wrap the estimated weights in a labelled adjacency matrix and a dag object.
adj_df = pd.DataFrame(W_est, index=data.columns.values, columns=data.columns.values)
d = dag(adj_df)

In [None]:
g = d.plot_structure()
igraph.plot(g)

In [None]:
# Ground-truth adjacency matrix: gt.loc[parent, child] = weight.
# The dimension lives in its own name so the estimated dag `d` (used by later
# cells) is not overwritten with an integer.
n_vars = len(data.columns)
gt = pd.DataFrame(np.zeros((n_vars, n_vars)), columns=data.columns, index=data.columns)

# child -> {parent: weight}
gt_edges = {
    'y': {'z': 1.37278195, 'g': 0.15452990, 'k_1': 0.01074088},
    'c': {'z': 0.35193460, 'g': -0.10362034, 'k_1': 0.03140616},
    'k': {'z': 1.01252958, 'g': 0.04465323, 'k_1': 0.95566049},
    'l': {'z': 0.154009373, 'g': 0.072779801, 'k_1': -0.009885726},
    'r': {'z': 0.16661011, 'g': 0.01875479, 'k_1': -0.01036630},
    'w': {'z': 1.79625183, 'g': -0.15452990, 'k_1': 0.08541297},
    'i': {'z': 1.02084736, 'g': 0.04502005, 'k_1': -0.02066529},
    'z': {'z_1': 0.9702133},
    'g': {'g_1': 0.989444},
}
for child, child_parents in gt_edges.items():
    for parent, weight in child_parents.items():
        gt.loc[parent, child] = weight

gt_dag = dag(gt)

In [None]:
g = gt_dag.plot_structure()
igraph.plot(g)

In [None]:
# Shock vector: NaN marks values to be imputed from the graph.
x_0 = pd.Series(np.full(len(gt_dag.nodes), np.nan), index=gt_dag.nodes)
shock_amt = 0.66
# The index holds node names, so positional access must go through .iloc;
# plain integer keys on a label index are deprecated in pandas.
x_0.iloc[4] = shock_amt
x_0.iloc[9:] = 0

In [None]:
# Simulate on a copy so x_0 can be reused for the estimated graph.
tru_irf = gt_dag.calculate_irf(x_0.copy(), T=250)
# Plot the response just computed (the original referenced `irf`, which is
# not defined at this point in the notebook).
gt_dag.plot_irf(tru_irf)
plt.show()

In [None]:
# Impulse response of the estimated graph; pass a copy so the shared shock
# vector is not modified by the simulation.
est_irf = d.calculate_irf(x_0.copy(), T=250, verbose = True)
d.plot_irf(est_irf)
plt.show()

In [None]:
max([d.depth(n) for n in d.nodes])

In [None]:
d.m

In [None]:
from scipy import stats, linalg
from itertools import chain, combinations, product
from sklearn.linear_model import LinearRegression, Lasso

In [None]:
# Load the observed series, discard the DATE column and the first remaining
# column, and strip spaces from the column names.
data = pd.read_csv('../data/real_data.csv')
data = data.drop(columns=['DATE'])
data.columns = [name.replace(" ", "") for name in data.columns]
data = data.iloc[:, 1:]

# data = data.iloc[:1000,:]
# Demean every series.
data = data.apply(lambda series: series - series.mean())
# data = data.applymap(lambda x: x + np.random.normal(scale=1))

# Append one-period lags (suffix '_1'); the first row has no lag, so drop it.
shift_vars = data.columns.values
shift = data[shift_vars].shift()
shift.columns = ['{}_1'.format(name) for name in shift.columns]
data = pd.concat([data, shift], axis=1).iloc[1:, :]

In [None]:
def partial_correlation(y, x, z, data):
    '''
    Partial correlation between columns y and x of data given the columns z.

    Inputs:
        y, x: column labels
        z: sequence of column labels to condition on (may be empty)
        data: pd.DataFrame
    Performs:
        With an empty z, the plain Pearson correlation. Otherwise y and x are
        each regressed on z by least squares without an intercept and the
        Pearson correlation of the two residual series is returned.
    Returns:
        float
    '''
    if len(z) == 0:
        return stats.pearsonr(data[y], data[x])[0]

    # The previous LinearRegression(fit_intercept=False, normalize=True) call
    # fails on scikit-learn >= 1.2 (kwarg removed); `normalize` was ignored
    # whenever fit_intercept was False, so plain least squares is equivalent.
    regressors = data[z].to_numpy(dtype=float)
    targets = data[[y, x]].to_numpy(dtype=float)
    coef, *_ = np.linalg.lstsq(regressors, targets, rcond=None)
    resid = targets - regressors @ coef

    return stats.pearsonr(resid[:, 0], resid[:, 1])[0]

In [None]:
def powerset(iterable):
    '''
    All subsets (as tuples, ordered by size) of iterable that never contain
    a variable together with its own lag.

    The first half of iterable is taken to be the variables; each variable v
    is paired with the name v + '_1'. Every element of iterable must belong
    to one of these pairs.
    '''
    items = list(iterable)
    half = np.int64(len(iterable)/2)
    partner = {}
    for name in iterable[:half]:
        lagged = name+'_1'
        partner[name] = lagged
        partner[lagged] = name

    kept = []
    for size in range(len(items)+1):
        for subset in combinations(items, size):
            if not any(partner[member] in subset for member in subset):
                kept.append(subset)
    return kept
        

def choose_states(data, verbose=False):
    '''
    For every candidate state set from powerset(data.columns), compute the
    partial correlation of each ordered pair of remaining contemporaneous
    variables ("controls") given the states.

    Inputs:
        data: pd.DataFrame whose second half of columns are the lags
        verbose: print each state set as it is processed
    Returns:
        pd.DataFrame with columns 'states', 'mean_pcorr' (mean absolute
        partial correlation), 'min_pcorr', 'max_pcorr' and 'pcorrs'.
    '''
    columns = data.columns.values
    lags = columns[np.int64(len(columns)/2):]
    records = []
    for states in powerset(columns):
        controls = [n for n in columns if n not in states and n not in lags]
        if len(controls) < 2:
            # NOTE(review): this stops at the first state set that leaves
            # fewer than two controls, although later sets (which include
            # lags) can leave more -- confirm `break` rather than `continue`.
            break
        if verbose:
            print('Calculating partial correlations for {}'.format(list(states)))
        # product(..., repeat=2) includes (v, v) and both orderings of a pair.
        pcorrs = [{'y': y,
                   'x': x,
                   'z': states,
                   'pcorr': partial_correlation(y, x, list(states), data=data)}
                  for y, x in product(controls, repeat=2)]
        values = [p['pcorr'] for p in pcorrs]
        records.append({'states': list(states),
                        'mean_pcorr': np.mean(np.abs(values)),
                        'min_pcorr': min(values),
                        'max_pcorr': max(values),
                        'pcorrs': pcorrs})
    # DataFrame.append was removed in pandas 2.0; build the frame once.
    return pd.DataFrame(records, columns=['states', 'mean_pcorr', 'min_pcorr', 'max_pcorr', 'pcorrs'])

In [None]:
results = choose_states(data, verbose=True)
results

In [None]:
results['nstates'] = results['states'].apply(lambda x: len(x))
results.sort_values(by=['mean_pcorr', 'nstates'], ascending=[False, True])

In [None]:
results[results['mean_pcorr'] == 1]

In [None]:
results[(np.abs(results['min_pcorr']) > 0.999) & (results['max_pcorr'] > 0.999)].sort_values(by='mean_pcorr')

In [None]:
# Score = mean |partial correlation| per state. len(results['states']) is the
# number of rows, not the size of each state set, so use per-row lengths;
# an empty state set yields NaN instead of a division by zero.
results['score'] = (pd.to_numeric(results['mean_pcorr'])
                    / results['states'].apply(len).replace(0, np.nan))

In [None]:
results.sort_values(by='states')

In [None]:
from sklearn.linear_model import Lasso, LinearRegression

In [None]:
Y = data.iloc[:,:9]
X = data.iloc[:,9:]

In [None]:
# `normalize` was removed in scikit-learn 1.2 and was ignored whenever
# fit_intercept=False, so dropping it leaves the fit unchanged.
model = Lasso(alpha=0.1, max_iter=10000, fit_intercept=False)
model.fit(X, Y)

In [None]:
# Each row is coefs for y_i
lasso_results = pd.DataFrame(model.coef_, columns=X.columns, index=Y.columns)

In [None]:
lasso_results.loc[:,(lasso_results != 0).any()].columns.values

In [None]:
def lasso_grid_search(X, Y, a_min=-5, a_max=5, T=100, verbose=True, **kwargs):
    '''
    Fit a multi-output Lasso of Y on X for T penalties spaced
    logarithmically between 10**a_min and 10**a_max.

    Inputs:
        X, Y: pd.DataFrame of regressors / targets
        a_min, a_max: exponents of the smallest / largest alpha
        T: number of alphas
        verbose: print progress for each alpha
        **kwargs: forwarded to sklearn.linear_model.Lasso
    Returns:
        pd.DataFrame with columns 'alpha', 'states' (regressors with a
        non-zero coefficient for at least one target), 'coefs' (targets x
        regressors DataFrame) and 'model' (the fitted estimator).
    '''
    records = []
    for i, alpha in enumerate(np.logspace(a_min, a_max, T), start=1):
        if verbose:
            print('Beginning iteration {} of {} ({}%) --- alpha = {}'.format(i, T, (i/T)*100, alpha))
        model = Lasso(alpha=alpha, **kwargs)
        model.fit(X, Y)
        coefs = pd.DataFrame(model.coef_, columns=X.columns, index=Y.columns)
        states = coefs.loc[:,(coefs != 0).any()].columns.values
        records.append({'alpha': alpha, 'states': states, 'coefs': coefs, 'model': model})
    # DataFrame.append was removed in pandas 2.0; build the frame once.
    return pd.DataFrame(records, columns=['alpha', 'states', 'coefs', 'model'])

In [None]:
# `normalize` is no longer accepted by Lasso (removed in scikit-learn 1.2) and
# had no effect with fit_intercept=False, so it is not forwarded.
lasso_results = lasso_grid_search(X, Y, T=100, max_iter=100000, fit_intercept=False)

In [None]:
for states in lasso_results['states']:
    print(states)

In [None]:
# NOTE(review): this module-level function has the same body as
# dag.calculate_irf. It refers to `self`, which is not a parameter here, so
# calling it raises NameError; `model` and `states` are never used.
# The intended behaviour cannot be determined from this notebook.
def calculate_irf(model, x_0, states, lags, T=250, verbose=False):
    if verbose:
        print('Simulating irf...')
    # Zero the lagged entries of the initial vector (modifies x_0 in place).
    for lag in lags:
        x_0[lag] = 0
    irf = pd.DataFrame([self.impute(x_0)], columns=self.nodes)
    for t in range(T-1):
        if verbose:
            print('Simulating t={} of {} ({}%)'.format(t+1, T, np.round(100 * ((t+1)/T), decimals=2)))
        nr = pd.Series(np.full(len(self.nodes), np.nan), index=self.nodes)
        for lag in self.lags:
            # rstrip('_1') removes every trailing '_' or '1' character, not
            # just the '_1' suffix.
            nr[lag] = irf.iloc[-1,:][lag.rstrip('_1')]
        nr = self.impute(nr)    
        irf = irf.append(nr, ignore_index=True)
    irf.loc[:,'t'] = range(T)
    irf.set_index('t', inplace=True)
    irf.drop(self.lags, axis=1, inplace=True)
    return irf
    

def plot_irfs(states, lags, data, T=250, **kwargs):
    Y = data[~np.union1d(states, lags)]
    X = data[states]
    model = LinearRegression(**kwargs)
    model.fit(X, Y)
    irf = pd.DataFrame(columns=data.columns)
    irf 
    for t in range(T):
        

In [None]:
T = 250
states = np.array(['k_1', 'g_1', 'z_1'])
X_0 = np.array([[0, 0.66 , 0]]) # initial values of states
controls = data.columns.values[:9]
lags = data.columns.values[9:]
# Note: For now assume states are lags (can relax later i.e. z_1 -> z -> controls)

# Regress every contemporaneous variable on the chosen lagged states.
Y = data[controls]
X = data[states]
# `normalize` was removed in scikit-learn 1.2 and was ignored with
# fit_intercept=False, so it is dropped.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)

# Period 0: controls predicted from the shocked states; all other lags are 0.
controls_init = pd.Series(model.predict(X_0).reshape(len(controls)), index=controls)
lags_init = pd.Series(np.zeros(len(lags)), index=lags)
for i in range(len(states)):
    lags_init[states[i]] = X_0[0, i]

# Series.append / DataFrame.append were removed in pandas 2.0: collect the
# rows and build the frame once.
rows = [pd.concat([controls_init, lags_init])]
for t in range(1, T):
    # Last period's controls become this period's lags (matched by position).
    L = rows[-1][controls]
    L.index = lags
    S = np.array([L[states]])
    C = pd.Series(model.predict(S).reshape(len(controls)), index=controls)
    rows.append(pd.concat([C, L]))

irf = pd.DataFrame(rows)
irf.index = pd.RangeIndex(T, name='t')
irf = irf.drop(columns=lags)

side = math.ceil(math.sqrt(len(irf.columns)))
layout = (side, side)
axes = irf.plot(subplots=True, layout=layout, 
                color="black", legend=False)
for ax, name in zip(axes.flatten(), irf.columns.values):
    ax.axhline(y=0, color="red")
    ax.set_title(name)
plt.show()

In [None]:
T = 250
states = np.array(['k_1'])
X_0 = np.array([[1]]) # initial values of states
controls = data.columns.values[:9]
lags = data.columns.values[9:]
# Note: For now assume states are lags (can relax later i.e. z_1 -> z -> controls)

# Regress every contemporaneous variable on the single lagged state.
Y = data[controls]
X = data[states]
# `normalize` was removed in scikit-learn 1.2 and was ignored with
# fit_intercept=False, so it is dropped.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)

# Period 0: controls predicted from the shocked state; all other lags are 0.
controls_init = pd.Series(model.predict(X_0).reshape(len(controls)), index=controls)
lags_init = pd.Series(np.zeros(len(lags)), index=lags)
for i in range(len(states)):
    lags_init[states[i]] = X_0[0, i]

# Series.append / DataFrame.append were removed in pandas 2.0: collect the
# rows and build the frame once.
rows = [pd.concat([controls_init, lags_init])]
for t in range(1, T):
    # Last period's controls become this period's lags (matched by position).
    L = rows[-1][controls]
    L.index = lags
    S = np.array([L[states]])
    C = pd.Series(model.predict(S).reshape(len(controls)), index=controls)
    rows.append(pd.concat([C, L]))

irf = pd.DataFrame(rows)
irf.index = pd.RangeIndex(T, name='t')
irf = irf.drop(columns=lags)

side = math.ceil(math.sqrt(len(irf.columns)))
layout = (side, side)
axes = irf.plot(subplots=True, layout=layout, 
                color="black", legend=False)
for ax, name in zip(axes.flatten(), irf.columns.values):
    ax.axhline(y=0, color="red")
    ax.set_title(name)
plt.show()

In [None]:
# Shock vector for the ground-truth graph: NaN marks values to be imputed.
x_0 = pd.Series(np.full(len(gt_dag.nodes), np.nan), index=gt_dag.nodes)
shock_amt = 1.04
# The index holds node names, so positional access must go through .iloc.
x_0.iloc[5] = shock_amt
x_0.iloc[9:] = 0
tru_irf = gt_dag.calculate_irf(x_0.copy(), T=250)
# Plot the response just computed; the original plotted `irf`, the
# regression-based response left over from the previous cell.
gt_dag.plot_irf(tru_irf)
plt.show()

In [None]:
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings


def srivastava(data):
    '''
    Inputs:
        data: np.ndarray
            Observations in rows, variables in columns (the sample
            covariance is computed from it here)
    Performs:
        Compute the T3* statistic of Srivastava (2005) for testing
        whether the covariance matrix is diagonal
    Returns:
        float (0 when there is a single variable)
    '''
    n = data.shape[0]
    p = data.shape[1]
    if p <= 1: # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    S = np.cov(data.T)
    variances = np.diag(S)
    a2_hat = np.sum(np.square(variances))
    a4_hat = np.sum(np.power(variances, 4))
    a20_hat = (n/(p*(n+2)))*a2_hat
    a40_hat = (1/p)*a4_hat
    g3_hat = (n/(n-1))*((np.trace(np.dot(S,S))-(1/n)*(np.trace(S))**2)/a2_hat)

    # The square root below can receive a negative argument; silence the
    # warning and detect the resulting NaN instead.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        scale = np.sqrt(1-(1/p)*(a40_hat/(a20_hat**2)))
        T3_hat = (n/2)*((g3_hat-1)/scale)
        if np.isnan(T3_hat): # Could have sqrt of negative, replace as in Wang et al.
            fallback_scale = np.sqrt(1-(a4_hat/(a2_hat**2)))
            T3_hat = (n/2)*((g3_hat-1)/fallback_scale)

    return T3_hat


def schott(data):
    '''
    Inputs:
        data: np.ndarray
            Observations in rows, variables in columns (the correlation
            matrix is computed from it here)
    Performs:
        Compute the statistic of Schott (2005) for testing whether the
        correlation matrix is diagonal: the sum of squared above-diagonal
        correlations, centred and scaled
    Returns:
        float (0 when there is a single variable)
    '''
    n = data.shape[0]
    m = data.shape[1]
    if m <= 1: # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    R = np.corrcoef(data.T)
    upper = np.triu(R, k=1)
    t_nm = np.sum(np.square(upper)) - ((m*(m-1))/(2*n))
    s_nm = (m*(m-1)*(n-1))/((n**2)*(n+2))
    return t_nm/np.sqrt(s_nm)


def get_resids(roles, data):
    '''
    Inputs:
        roles: state_space_estimation.roles
            Supplies the column positions of the conditioning set
            (lag_2_endo_states_idx, lag_exo_states_idx) and of the targets
            (lag_endo_states_idx, lag_controls_idx, exo_states_idx)
        data: pd.DataFrame
    Performs:
        Regress the target columns on the conditioning columns (least
        squares with an intercept) and collect the residuals. With an empty
        conditioning set the raw target columns are returned.
    Returns:
        np.ndarray of shape (n_rows, n_targets)
    '''
    # Use numpy indexing instead of pandas for large performance increase
    # (At the expense of some increased code complexity)
    values = data.values

    # np.append on an empty list yields a float array, which cannot be used
    # as an index; force integer positions.
    # Conditioning sets
    cset = np.append(roles.lag_2_endo_states_idx, roles.lag_exo_states_idx).astype(int)

    # Targets
    tar = np.append(np.append(roles.lag_endo_states_idx, roles.lag_controls_idx),
                    roles.exo_states_idx).astype(int)

    targets = values[:,tar]
    if cset.shape[0] == 0:
        return targets

    # Ordinary least squares with an intercept. This replaces
    # LinearRegression(fit_intercept=True, normalize=False), whose
    # `normalize` kwarg was removed in scikit-learn 1.2; residuals are the same.
    design = np.column_stack([np.ones(values.shape[0]), values[:,cset]])
    coef, *_ = np.linalg.lstsq(design, targets, rcond=None)
    return targets - design @ coef


def test(roles, data, method='schott', alpha=0.05, tol=1e-20):
    '''
    Inputs:
        roles: state_space_estimation.roles
            Model upon which to conduct constraint tests
        data: pd.DataFrame
        method: one of ('srivastava', 'schott')
            Testing strategy to use
        alpha: float
            Significance level
        tol: float
            Accepted but not used by this implementation
    Performs:
        Compute the residuals of the model specified by roles and test
        whether they are uncorrelated. 'srivastava' is treated as a
        one-sided test, 'schott' as a two-sided test, both against the
        standard normal distribution.
    Returns:
        dict with the test statistic 't', its p-value 'p' and the decision
        'valid' (False when the test rejects)
    Raises:
        ValueError for an unknown method
    '''
    resid = get_resids(roles, data)

    if method not in ('srivastava', 'schott'):
        raise ValueError('method {} not found'.format(method))

    if method == 'srivastava':
        t = srivastava(resid)
        p = 1 - stats.norm.cdf(t)
        rejected = t > stats.norm.ppf(1-(alpha)) # One-sided test
    else:
        t = schott(resid)
        p = 2*(1 - stats.norm.cdf(np.abs(t)))
        rejected = np.abs(t) > stats.norm.ppf(1-(alpha/2)) # Two-sided test

    return {'t': t, 'p': p, 'valid': not rejected}

In [None]:
data = pd.read_csv('../data/rbc.csv')
# data.set_index('DATE', inplace=True)

In [None]:
model = roles(['z', 'g'], ['k'], ['y', 'c', 'l', 'r', 'w', 'i'], data.columns.values)
sample = data.sample(1000, replace=True)
print(test(model, sample, method='srivastava'))
print(test(model, sample, method='schott'))

In [None]:
data

In [None]:
cset = ['k_2', 'z_1', 'g_1']
# cset = ['k_1', 'z', 'g']
tar = ['k_1', 'y_1', 'c_1', 'l_1', 'r_1', 'w_1', 'i_1', 'z', 'g']
# tar = ['k', 'y', 'c', 'l', 'r', 'w', 'i']
resid = get_resids(model, data)
pd.DataFrame(np.corrcoef(resid.T), columns=tar, index=tar)

In [None]:
# Bootstrap sample, then regress the targets on the conditioning set by hand.
sample = data.sample(1000, replace=True)
cset = ['k_2', 'z_1', 'g_1']
tar = ['k_1', 'y_1', 'c_1', 'l_1', 'r_1', 'w_1', 'i_1', 'z', 'g']

# `normalize` was removed in scikit-learn 1.2; False was the default behaviour.
lm = LinearRegression(fit_intercept=True)
lm.fit(sample.loc[:,cset], sample.loc[:,tar])
resid = sample.loc[:,tar] - lm.predict(sample.loc[:,cset]) 

# Are the residuals of the first f targets numerically zero?
f = len(model.controls) + len(model.endo_states)
np.var(resid.values[:,:f].flatten()) < 1e-20

In [None]:
from itertools import chain, combinations

def potential_states(n_states):
    '''
    Inputs:
        n_states: int or None
            Total number of states (exogenous + endogenous) per model;
            None means one fewer than the number of variables
    Performs:
        Generate a state_space_estimation.roles object for every split of
        the variables (first half of the global `data` columns) into
        exogenous states, endogenous states and controls with exactly that
        many states
    Returns:
        generator
    '''
    columns = data.columns.values
    variables = columns[:int(len(columns)/2)]
    if n_states is None:
        limit = len(variables)-1
    else:
        limit = n_states

    for n_exo in range(limit+1):
        for exo in combinations(variables, n_exo):
            remaining = [v for v in variables if v not in exo]
            for endo in combinations(remaining, limit-n_exo):
                controls = [v for v in remaining if v not in endo]
                yield roles(exo, endo, controls, data.columns.values)

In [None]:
# Run the Srivastava test on every 3-state model.
# test() returns a dict {'t', 'p', 'valid'}; tuple-unpacking it would yield
# the key names, so read the entries explicitly.
records = []
models = potential_states(3)
for model in models:
    outcome = test(model, data, method='srivastava', tol=1e-5)
    records.append({'exo_states': model.exo_states,
                    'endo_states': model.endo_states,
                    'controls': model.controls,
                    't': outcome['t'],
                    'p': outcome['p'],
                    'valid': outcome['valid']})
# DataFrame.append was removed in pandas 2.0; build the frame once.
results = pd.DataFrame(records, columns=['exo_states', 'endo_states', 'controls', 't', 'p', 'valid'])
results[results['valid'].astype(bool)]

In [None]:
# Run the Schott test on every 3-state model.
# test() returns a dict {'t', 'p', 'valid'}; tuple-unpacking it would yield
# the key names, so read the entries explicitly.
records = []
models = potential_states(3)
for model in models:
    outcome = test(model, data, method='schott', tol=1e-5)
    records.append({'exo_states': model.exo_states,
                    'endo_states': model.endo_states,
                    'controls': model.controls,
                    't': outcome['t'],
                    'p': outcome['p'],
                    'valid': outcome['valid']})
# DataFrame.append was removed in pandas 2.0; build the frame once.
results = pd.DataFrame(records, columns=['exo_states', 'endo_states', 'controls', 't', 'p', 'valid'])
results[results['valid'].astype(bool)]

In [None]:
# Find the smallest number of states for which at least one model passes.
records = []
nstates = 0
# potential_states yields nothing once nstates exceeds the number of
# variables, so bound the search instead of looping forever.
max_states = int(len(data.columns.values)/2)
while not records and nstates <= max_states:
    models = potential_states(nstates)
    for model in models:
        # NOTE(review): the original passed method='custom', which test()
        # rejects with ValueError; 'schott' (test()'s default) is used here --
        # confirm the intended method.
        outcome = test(model, data, method='schott', tol=1e-8)
        if outcome['valid']:
            records.append({'exo_states': model.exo_states,
                            'endo_states': model.endo_states,
                            'controls': model.controls,
                            't': outcome['t'],
                            'p': outcome['p'],
                            'valid': outcome['valid']})
    # Advance once per model size (the original incremented once per model).
    nstates += 1
results = pd.DataFrame(records, columns=['exo_states', 'endo_states', 'controls', 't', 'p', 'valid'])
results[results['valid'].astype(bool)]

In [None]:
results = pd.DataFrame(columns = ['exo_states', 'endo_states', 'controls', 't1', 't2', 'valid' ])
results.shape

In [None]:
# Monte-Carlo rejection rate under independence: k replications of n draws of
# three independent standard normals, one-sided rejection at level alpha.
# NOTE(review): `test_resids` is not defined anywhere in the visible notebook,
# so this cell raises NameError as written; only t1 is used afterwards.
# `m` is assigned but not used in this cell.
m = 3
n = 1000
k = 1000
alpha = 0.05
rejected = 0
for i in range(k):
    sample1 = np.array([norm.rvs(size=n), norm.rvs(size=n), norm.rvs(size=n)]).T 
    sample2 = np.array([norm.rvs(size=n), norm.rvs(size=n), norm.rvs(size=n)]).T
    t1, t2, valid = test_resids(sample1, sample2, method='srivastava')
    # if not valid:
    if t1 > norm.ppf(1-alpha):
        rejected += 1
    
# Share of replications in which the statistic exceeded the critical value.
print(rejected/k)

In [None]:
# Check the closed form for the number of correlation pairs that involve at
# least one of the l restricted variables against a direct count.
m = 17 # total number of variables
l = 17 # number of restricted variables
k = (l/2)*(2*m-l-1)
X = np.ones((m, m), dtype=int)
# All above-diagonal pairs minus those inside the unrestricted block.
counted = np.triu(X, k=1).sum() - np.triu(X[:(m-l),:(m-l)], k=1).sum()
assert counted == k

In [None]:
def schott(data, tol=1e-5):
    '''
    Schott (2005) statistic for a diagonal correlation matrix, computed from
    data with observations in rows and variables in columns.

    Returns 0 when there is a single variable, or when every entry of the
    correlation matrix exceeds 1 - tol in absolute value.
    '''
    n = data.shape[0]
    m = data.shape[1]
    if m <= 1: # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    R = np.corrcoef(data.T)
    # Check for collinearity
    # Residuals are so small computation is impossible, but this
    # is an indication we have found the correct model...
    if (np.abs(R) > 1-tol).all():
        return 0

    upper = np.triu(R, k=1)
    t_nm = np.sum(np.square(upper)) - ((m*(m-1))/(2*n))
    s_nm = (m*(m-1)*(n-1))/((n**2)*(n+2))
    return t_nm/np.sqrt(s_nm)
    

def custom(data, l, tol=1e-5):
    '''
    Schott-type statistic restricted to the correlation pairs that involve
    at least one of the last l columns of data.

    Inputs:
        data: np.ndarray, observations in rows, variables in columns
        l: number of restricted variables (the last l columns)
        tol: tolerance for treating the restricted variables as collinear
    Returns:
        float; 0 when the test is not meaningful (a single variable or no
        pairs to test) or when the restricted variables are mutually
        collinear within tol.
    '''
    n = data.shape[0]   # sample size
    m = data.shape[1]   # total number of variables
    k = (l/2)*(2*m-l-1) # number of elements of R considered
    if m <= 1 or k <= 0: # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    R = np.corrcoef(data.T)
    # Collinearity check on the restricted block. Only off-diagonal entries
    # are informative: the diagonal is always 1, so including it made the
    # check trivially true (and the statistic always 0) for l == 1.
    block = R[(m-l):, (m-l):]
    off_diag = block[~np.eye(block.shape[0], dtype=bool)]
    if off_diag.size > 0 and (np.abs(off_diag) > 1-tol).all():
        # Residuals are so small computation is impossible, but this
        # is an indication we have found the correct model...
        return 0

    t_nm = (np.sum(np.square(np.triu(R, k=1)))
            - np.sum(np.square(np.triu(R[:(m-l),:(m-l)], k=1)))
            - k/n)
    s_nm = (2*k*(n-1))/((n**2)*(n+2))
    return t_nm/np.sqrt(s_nm)

In [None]:
# Compare the restricted statistic (last 2 residual columns) with Schott's
# statistic on the same bootstrap sample.
sample = data.sample(1000, replace=True)
# get_resids returns a single residual array, so it cannot be unpacked into
# two names.
resid = get_resids(model, sample)
print(custom(resid, 2))
print(schott(resid))

In [None]:
# Labelled correlation matrix of the first residual block.
# NOTE(review): the get_resids defined in this notebook takes no `ntests`
# argument and returns a single array, so this call fails as written; it
# appears to target an earlier version of that function -- confirm.
names1 = ['z_1', 'g_1', 'k', 'y', 'c', 'l', 'r', 'w', 'i']
names2 = ['k_1', 'z', 'g']
resid1, resid2 = get_resids(model, sample, ntests=4)
R = np.corrcoef(resid1.T)
pd.DataFrame(R, columns=names1, index=names1)

In [None]:
# Variance of the residuals from regressing the contemporaneous variables on
# the states, on a bootstrap sample.
sample = data.sample(1000, replace=True)
X = sample.loc[:,['k_1', 'z', 'g']]
Y = sample.loc[:,['k', 'y', 'c', 'l', 'r', 'w', 'i']]
# `normalize` was removed in scikit-learn 1.2; False was the default behaviour.
lm = LinearRegression(fit_intercept=True)
lm.fit(X, Y)
resid = lm.predict(X) - Y
np.var(resid.values.flatten())

In [None]:
np.array([0]).all()

In [None]:
from scipy.stats import multivariate_normal as mn

In [None]:
def custom(data, l, tol=1e-5):
    '''
    Schott-type statistic restricted to the correlation pairs that involve
    at least one of the last l columns of data (observations in rows).

    The squared above-diagonal correlations are summed with the block of the
    first m - l variables removed, then centred and scaled using the number
    of pairs k. Returns 0 when there is a single variable or no pair to
    test. `tol` is accepted but not used.
    '''
    n = data.shape[0]   # sample size
    m = data.shape[1]   # total number of variables
    k = (l/2)*(2*m-l-1) # number of elements of R considered
    if not (m > 1 and k > 0): # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    R = np.corrcoef(data.T)
    free = m-l
    included = np.triu(R, k=1)
    # Same shape as R: the unrestricted block in the top-left corner, zeros
    # everywhere else.
    top = np.hstack([np.triu(R[:free,:free], k=1), np.zeros((free, l))])
    excluded = np.vstack([top, np.zeros((l, m))])
    t_nm = (np.sum(np.square(included - excluded))) - k/n
    s_nm = (2*k*(n-1))/((n**2)*(n+2))
    return t_nm/np.sqrt(s_nm)

In [None]:
n = 1000
cor = 0.8
m = 6
l = 2

cov = np.identity(m-l)
cov[cov == 0] = cor
cor_sample = mn.rvs(cov=cov, size=n).reshape(n, -1)
uncor_sample = mn.rvs(cov=np.identity(l), size=n).reshape(n, -1)
sample = np.concatenate([cor_sample, uncor_sample], axis=1)

print(custom(sample, l))
print(schott(sample))

In [None]:
import pandas as pd
import numpy as np

# Monte-Carlo check of the restricted statistic: the first m - l variables are
# equicorrelated, the last l are independent, so the tested pairs are all
# uncorrelated and the rejection rate should be close to alpha.
cors = [0.1, 0.2, 0.5, 0.8]
ms = [4, 8, 12]
ls = [2, 4, 6, 10]
n = 1000
k = 1000
alpha = 0.05

crit_val = norm.ppf(1-(alpha/2)) # Two-sided test
records = []

for c, m, l in itertools.product(cors, ms, ls):
    if l >= m:
        continue
    # The covariance matrix does not change across replications.
    cov = np.identity(m-l)
    cov[cov == 0] = c
    rejected = 0
    for i in range(k):
        cor_sample = mn.rvs(cov=cov, size=n).reshape(n, -1)
        uncor_sample = mn.rvs(cov=np.identity(l), size=n).reshape(n, -1)
        sample = np.concatenate([cor_sample, uncor_sample], axis=1)

        t = custom(sample, l)
        # if not valid:
        if np.abs(t) > crit_val:
            rejected += 1

    emp_alpha = rejected/k

    # delta is the absolute gap between nominal and empirical size (the
    # original `np.abs(alpha)-emp_alpha` took the absolute value of alpha only).
    records.append({'cor': c, 'm': m, 'l': l, 'n': n, 'k': k, 'alpha': alpha,
                    'emperical_alpha': emp_alpha, 'delta': np.abs(alpha-emp_alpha)})

# DataFrame.append was removed in pandas 2.0; build the frame once.
results = pd.DataFrame(records)
results

In [None]:
results.sort_values(by='delta', ascending=False).loc[:,['cor', 'm', 'l', 'delta']]

In [None]:
for i, j, k in itertools.product([1, 2], [3, 4], [5, 6]):
    print(i, j, k)

In [None]:
# Joint rejection rate of two independent samples. Here the first l columns
# are equicorrelated (0.5) and the last m - l are independent; custom(., l)
# leaves out only the pairs within the first m - l columns.
m = 6
l = 4
n = 1000
k = 1000
alpha = 0.05
rejected = 0
for i in range(k):
    cov = np.identity(l)
    cov[cov == 0] = 0.5
    cor_sample1 = mn.rvs(cov=cov, size=n)
    cor_sample2 = mn.rvs(cov=cov, size=n)
    uncor_sample1 = mn.rvs(cov=np.identity(m-l), size=n)
    uncor_sample2 = mn.rvs(cov=np.identity(m-l), size=n)
    sample1 = np.concatenate([cor_sample1, uncor_sample1], axis=1)
    sample2 = np.concatenate([cor_sample2, uncor_sample2], axis=1)
        
    t1 = custom(sample1, l)
    t2 = custom(sample2, l)
    # if not valid:
    # alpha/4: the level is split across two tests, each two-sided.
    if np.abs(t1) > norm.ppf(1-(alpha/4)) or np.abs(t2) > norm.ppf(1-(alpha/4)):
        rejected += 1
    
# Share of replications in which at least one of the two tests rejected.
print(rejected/k)

In [None]:
def srivastava(data):
    '''
    Inputs:
        data: np.ndarray
            Observations in rows, variables in columns (the sample
            covariance is computed from it here)
    Performs:
        Compute the T3* statistic of Srivastava (2005) for testing
        whether the covariance matrix is diagonal
    Returns:
        float (0 when there is a single variable)
    '''
    n = data.shape[0]
    p = data.shape[1]
    if p <= 1: # Test isn't meaningful, so do not exclude the model on this basis
        return 0

    S = np.cov(data.T)
    variances = np.diag(S)
    a2_hat = np.sum(np.square(variances))
    a4_hat = np.sum(np.power(variances, 4))
    a20_hat = (n/(p*(n+2)))*a2_hat
    a40_hat = (1/p)*a4_hat
    g3_hat = (n/(n-1))*((np.trace(np.dot(S,S))-(1/n)*(np.trace(S))**2)/a2_hat)

    # The square root below can receive a negative argument; silence the
    # warning and detect the resulting NaN instead.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        scale = np.sqrt(1-(1/p)*(a40_hat/(a20_hat**2)))
        T3_hat = (n/2)*((g3_hat-1)/scale)
        if np.isnan(T3_hat): # Could have sqrt of negative, replace as in Wang et al.
            fallback_scale = np.sqrt(1-(a4_hat/(a2_hat**2)))
            T3_hat = (n/2)*((g3_hat-1)/fallback_scale)

    return T3_hat

In [None]:
# Rejection rate of the Srivastava statistic (two-sided, level alpha) for
# m equicorrelated normals, one row per correlation value in cs.
m = 9
n = 100
k = 1000
cs = [0] # np.linspace(0, 0.2, 11)
alpha = 0.05
crit_val = norm.ppf(1-(alpha/2))
records = []
for c in cs:
    # The covariance matrix does not change across replications.
    cov = np.identity(m)
    cov[cov == 0] = c
    rejected = 0
    for i in range(k):
        sample = mn.rvs(cov=cov, size=n)
        t = srivastava(sample)
        if np.abs(t) > crit_val:
            rejected += 1
    records.append({'correlation': c, 'power': rejected/k})

# DataFrame.append was removed in pandas 2.0; build the frame once.
results = pd.DataFrame(records)
results

In [None]:
from math import comb as ncr

# Sum of C(i, j) * C(9, i) * C(9 - i, j) over i = 0..3 and j = 0..i.
count = sum(
    ncr(i, j)*ncr(9, i)*ncr(9-i, j)
    for i in range(0, 3+1)
    for j in range(0, i+1)
)
count

In [None]:
ncr(3,1)*ncr(9,1)*ncr(9,2)

In [None]:
18 + 144 + 672