# Individualized models as a function of variance decomposition

Assume each individual time series follows a stationary AR model:

$$
\begin{align*}
y_{i,t} &= \alpha_i + \phi y_{i,t-1} + e_{i,t}, \hspace{2mm} |\phi| < 1, \hspace{2mm} e_{i,t} \sim N(0,1) \\
&= \sum_{j=0}^t \phi^{t-j}\big( \alpha_i + e_{i,j}\big)
\end{align*}
$$

The limiting ($t \to \infty$) expected value and variance are:

$$
\begin{align*}
E[y_{i,t}] &= \frac{\alpha_i}{1-\phi} \\
Var[y_{i,t}] &= \frac{1}{1-\phi^2}
\end{align*}
$$

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import plotnine
from plotnine import *

In [4]:
# N=100;K=10;alpha=0;phi=0.25
def dgp_panel(N,K,alpha,phi):
    """Simulate a panel of K AR(1) series sharing one intercept and one AR coefficient.

    Every column follows y_t = alpha + phi*y_{t-1} + e_t, with standard-normal
    shocks drawn from NumPy's global random state. The recursion is started
    at y_0 = alpha + e_0 and that starting row is dropped from the output.

    Parameters
    ----------
    N : int
        Number of time points returned per series.
    K : int
        Number of series (columns).
    alpha : scalar
        Intercept, repeated K times.
    phi : scalar
        AR(1) coefficient.

    Returns
    -------
    numpy.ndarray of shape (N, K); row t holds period t+1 of every series.
    """
    intercept = np.repeat(alpha, K).reshape((1, K))
    shocks = np.random.randn(N + 1, K)  # one extra row of shocks for t=0
    panel = np.zeros_like(shocks)
    panel[0] = intercept + shocks[0]
    for t in range(N):
        panel[t + 1] = intercept + phi * panel[t] + shocks[t + 1]
    # Drop the t=0 starting row
    return panel[1:]

In [198]:
# Monte Carlo check: average (over 100000 simulated series with alpha=0)
# of the raw sum of squares across the N retained time points.
phi = 0.75
N = 10
Ysamp = dgp_panel(N=N, K=100000, alpha=0, phi=phi)
print((Ysamp ** 2).sum(axis=0).mean())
# print(np.sum((Ysamp-Ysamp.mean(0))**2,0).mean())
# Ysamp.var(0,ddof=0).mean()*N

21.246806648839502


In [241]:
# Variance-ratio sanity check: the individual offsets (Amat) are multiplied
# by zero, so Ymat equals Xmat and the ratio below is exactly 1.
N, K = 100, 1000
Xmat = np.random.randn(N, K)
Amat = 0 * np.random.randn(1, K)
Ymat = Xmat + Amat
# Xmat.var() is already a scalar over all entries, so .mean() leaves it unchanged
np.var(Xmat).mean() / np.var(Ymat)

1.0

0.9896999923382773

In [None]:


"""
GENERATE AN INDIVIDUAL TIME SERIES
"""
def dgp_ts(n,alpha,phi):
    """Simulate one AR(1) path y_t = alpha + phi*y_{t-1} + e_t of length n.

    Shocks are standard normal draws from NumPy's global random state.
    The recursion starts from y_0 = alpha + e_0, which is not returned.

    Parameters
    ----------
    n : int
        Number of observations to return.
    alpha : scalar
        Intercept.
    phi : scalar
        AR(1) coefficient.

    Returns
    -------
    numpy.ndarray of length n holding y_1, ..., y_n.
    """
    shocks = np.random.randn(n + 1)  # n shocks plus one for the t=0 start
    level = alpha + shocks[0]  # y_0
    path = np.zeros(n)
    # Each value depends on the previous one, so the path is built sequentially
    for t in range(n):
        level = alpha + phi * level + shocks[t + 1]
        path[t] = level
    return path
# # Notice mean is alpha/(1-phi)
# np.random.seed(1)
# n = 1000
# plotnine.options.figure_size = (4, 3)
# alpha=2; phi=0.72
# (ggplot(pd.DataFrame({'y':dgp_ts(n, alpha, phi),'idx':list(range(n))}),aes(x='idx',y='y')) + 
# theme_bw() + geom_line() + geom_hline(yintercept=alpha/(1-phi),color='red'))


"""
GENERATE A GROUP OF TIME SERIES: y_{it} = alpha + phi*y_{it-1} + e
k: # of individuals
sd_alpha: how much variation to draw around alpha ~ N(0, sd_alpha^2)
phi: the AR coefficients
"""
def gen_group(nmin, nmax, k, sd_alpha, phi):
    """Simulate k individual AR(1) series and stack them in long format.

    Parameters
    ----------
    nmin, nmax : int
        Bounds for each series length, passed to ``np.random.randint``;
        the upper bound ``nmax`` is exclusive.
    k : int
        Number of individuals.
    sd_alpha : scalar
        Standard deviation of the individual intercepts, drawn as
        ``sd_alpha * N(0, 1)``.
    phi : scalar
        AR(1) coefficient shared by all individuals.

    Returns
    -------
    pandas.DataFrame with a fresh 0..n-1 index and columns ``idt``
    (individual number), ``y`` (simulated value) and ``varname``
    (the constant string ``'y'``).
    """
    n_seq = np.random.randint(nmin, nmax, k)
    alpha_seq = 0 + sd_alpha*np.random.randn(k)
    frames = [
        pd.DataFrame({'idt':i, 'y':dgp_ts(n, alpha_seq[i], phi)})
        for i, n in enumerate(n_seq)
    ]
    # `drop` must be passed by keyword: pandas >= 2.0 only accepts `level`
    # positionally in reset_index.
    df = pd.concat(frames).reset_index(drop=True).assign(varname='y')
    return df


In [None]:
# https://biostat.duke.edu/sites/biostat.duke.edu/files/Longitudinal%20Data%20Analysis%20-%20RM%20ANOVA.pdf
# Simulation settings, in order: nmin, nmax, k, sd_alpha, phi.
# gen_group draws lengths with randint(nmin, nmax), whose upper bound is
# exclusive, so (25, 26) gives every series exactly 25 points; sd_alpha=0
# makes every individual intercept zero.
nmin, nmax, k, sd_alpha, phi = 25, 26, 25, 0, 0.5
def rm_anova(nmin, nmax, k, sd_alpha, phi):
    """Simulate one panel and split its total sum of squares into three parts.

    The panel comes from ``gen_group(nmin, nmax, k, sd_alpha, phi)``.

    Returns
    -------
    tuple ``(sst, sss, ssr)``:
        sst -- size-weighted squared deviations of the per-time-point means
               from the grand mean,
        sss -- size-weighted squared deviations of the per-individual means
               from the grand mean,
        ssr -- remainder: total sum of squares minus ``sss + sst``.
    """
    panel = gen_group(nmin, nmax, k, sd_alpha, phi)
    # Position of each observation within its own series, starting at 1
    panel['time'] = panel.groupby('idt').cumcount() + 1
    y = panel['y']
    grand_mean = y.mean()
    # Total sum of squares: sample variance times (n - 1)
    tss = y.var() * (len(y) - 1)
    # Variation of the time-point means around the grand mean
    by_time = panel.groupby('time')['y']
    sst = np.sum(by_time.size() * (by_time.mean() - grand_mean) ** 2)
    # Variation of the individual means around the grand mean
    by_idt = panel.groupby('idt')['y']
    sss = np.sum(by_idt.size() * (by_idt.mean() - grand_mean) ** 2)
    # Whatever is left over is attributed to the residual
    ssr = tss - (sss + sst)
    return sst, sss, ssr

# Repeat the simulated decomposition 1500 times, then average each component
sim = pd.DataFrame(
    np.array([rm_anova(nmin, nmax, k, sd_alpha, phi) for _ in range(1500)]),
    columns=['sst', 'sss', 'ssr'],
)
sim.mean(axis=0)

In [None]:
# NOTE(review): unfinished scratch expression -- the numerator is an empty
# tuple, so this raises TypeError when phi is a number. Fill in or remove.
()/phi**2

In [None]:
# Average of the 'sss' column over all simulation draws
sim['sss'].mean()

In [None]:
# df_ssr = df.merge(df_sst,'left','time').merge(df_sss,'left','idt')
# ssr = np.sum( (df_ssr.y - df_ssr.ybar_idt - df_ssr.ybar_time + ybar)**2 )
# print('TSS: %0.1f, SST: %0.1f, SSS: %0.1f, SSR: %0.1f' % (tss,sst,sss,ssr))

In [None]:
# Sum of squared (y - individual mean - time mean + grand mean) terms.
# NOTE(review): df_ssr is only built in commented-out code and ybar is a
# local of rm_anova, so this cell needs those names defined first -- confirm.
np.sum( (df_ssr.y - df_ssr.ybar_idt - df_ssr.ybar_time + ybar)**2 )

In [None]:
# Sum of squared deviations of 100 standard-normal draws.
# NOTE(review): the mean subtracted here comes from a second, independent
# sample of 100 draws, not from the sample being centred -- confirm intent.
np.sum(((np.random.randn(100)-np.random.randn(100).mean())**2))

In [None]:
# Median of tmp's SSw values.
# NOTE(review): tmp is not defined in the visible part of the notebook --
# presumably a decomp_var-style result table; confirm.
tmp.SSw.median()

In [None]:
# Mean of 1000 chi-square draws with tmp.dof_w[0] degrees of freedom.
# NOTE(review): presumably a reference value to compare against SSw; tmp is
# not defined in the visible part of the notebook -- confirm.
stats.chi2.rvs(df=tmp.dof_w[0],size=1000).mean()

In [None]:
# Scratch version of the within/between sum-of-squares decomposition.
# NOTE(review): relies on notebook globals df, cn_vv, cn_gg and cn_val that
# are not defined in the visible cells -- confirm before running.
# (i) Calculate the within group sum of squares: squared deviations from each
#     (cn_vv, cn_gg) group mean, then summed per cn_vv
res_w = df.copy().groupby([cn_vv,cn_gg]).apply(lambda x: 
       pd.Series({'SSw':np.sum((x[cn_val]-x[cn_val].mean())**2)})).reset_index()
res_w = res_w.groupby(cn_vv).SSw.sum().reset_index()
# (ii) Calculate the between group sum of squares: group size times the squared
#      gap between the group mean (xbar) and the per-cn_vv mean (mu)
res_b = df.copy().groupby([cn_vv,cn_gg]).apply(lambda x: 
                 pd.Series({'xbar':x[cn_val].mean(),'n':x[cn_val].shape[0]})).reset_index()
res_b = res_b.merge(df.groupby(cn_vv)[cn_val].mean().reset_index().rename(columns={cn_val:'mu'}))
res_b = res_b.assign(SSb=lambda x: x.n*(x.xbar - x.mu)**2).groupby(cn_vv).SSb.sum().reset_index()
# (iii) Combine the two pieces: SStot is defined as SSw + SSb
res_tot = res_w.merge(res_b).assign(SStot=lambda x: x.SSw+x.SSb)
# (iv) Display the table; under the null of no difference between groups the
#      ratio of SSb to SSw should follow an F-distribution after DoF adjustment
res_tot

In [None]:
# Per-cn_vv counts: n = number of rows, k = number of distinct cn_gg values.
# (The result is displayed rather than stored; the `tmp = ` assignment is left disabled.)
# tmp = 
df.groupby(cn_vv).apply(lambda x: pd.Series({'n':x.shape[0], 'k':x[cn_gg].unique().shape[0]})).reset_index()

In [None]:
# Display res_w.
# NOTE(review): presumably the within-group sum-of-squares table from the scratch cell -- confirm.
res_w

In [None]:
def decomp_var(df, cn_gg, cn_vv, cn_val):
    """One-way ANOVA decomposition of ``cn_val`` by group, computed per variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing the three columns named below.
    cn_gg : str
        Column holding the group label.
    cn_vv : str
        Column holding the variable name; a separate decomposition is
        produced for each distinct value.
    cn_val : str
        Column holding the numeric values.

    Returns
    -------
    pandas.DataFrame with one row per ``cn_vv`` value and columns
    ``cn_vv``, ``SSw`` (within-group sum of squares), ``SSb`` (between-group
    sum of squares), ``SStot`` (= SSw + SSb), ``n`` (rows), ``k`` (distinct
    groups), ``dof_b`` (= k - 1), ``dof_w`` (= n - k), ``Fstat``, ``pval``
    and ``gg`` (the name passed as ``cn_gg``).
    """
    cell_grp = df.groupby([cn_vv, cn_gg])[cn_val]
    # (i) Within-group sum of squares: squared deviations from each
    #     (variable, group) mean, summed per variable
    sq_dev = (df[cn_val] - cell_grp.transform('mean')) ** 2
    res_w = sq_dev.groupby(df[cn_vv]).sum().rename('SSw').reset_index()
    # (ii) Between-group sum of squares: group size times the squared gap
    #      between the group mean and the variable's overall mean
    cells = cell_grp.agg(['mean', 'size']).reset_index()
    cells = cells.rename(columns={'mean': 'xbar', 'size': 'n'})
    mu = df.groupby(cn_vv)[cn_val].mean().rename('mu').reset_index()
    cells = cells.merge(mu, on=cn_vv)
    cells['SSb'] = cells['n'] * (cells['xbar'] - cells['mu']) ** 2
    res_b = cells.groupby(cn_vv)['SSb'].sum().reset_index()
    # (iii) Total sum of squares is the sum of the two components
    res_tot = res_w.merge(res_b, on=cn_vv)
    res_tot['SStot'] = res_tot['SSw'] + res_tot['SSb']
    # (iv) Under the null of no difference between groups, the ratio of mean
    #      squares follows an F-distribution with (k - 1, n - k) degrees of freedom
    by_var = df.groupby(cn_vv)[cn_gg]
    n_obs = by_var.size().rename('n').reset_index()
    # dropna=False: a missing group label is counted as one level
    n_grp = by_var.nunique(dropna=False).rename('k').reset_index()
    res_tot = res_tot.merge(n_obs, how='left', on=cn_vv)
    res_tot = res_tot.merge(n_grp, how='left', on=cn_vv)
    res_tot['dof_b'] = res_tot['k'] - 1
    res_tot['dof_w'] = res_tot['n'] - res_tot['k']
    res_tot['Fstat'] = (res_tot['SSb'] / res_tot['dof_b']) / (res_tot['SSw'] / res_tot['dof_w'])
    # The survival function keeps precision in the upper tail, where
    # 1 - cdf would round to exactly 0 for very large F statistics.
    res_tot['pval'] = stats.f.sf(res_tot['Fstat'], res_tot['dof_b'], res_tot['dof_w'])
    res_tot['gg'] = cn_gg
    return res_tot