# Production Technology

The dataset contains `N = 441` firms observed over `T = 12` years, 1968-1979. There variables are: 
* `lcap`: Log of capital stock, $k_{it}$ 
* `lemp`: log of employment, $\ell_{it}$ 
* `ldsa`: log of deflated sales, $y_{it}$
* `year`: the calendar year of the observation, `year` $ = 1968, ..., 1979$, 
* `firmid`: anonymized indicator variable for the firm, $i = 1, ..., N$, with $N=441$. 

In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd 
import numpy as np
import seaborn as sns

import linear_panel_class as lm 
# content in lm is approximately the same as in the exercises
# only major change is that 'robust' is added as argument in estimate()
# to compute robust standard errors.

# Converting data to numpy format 

In [3]:
# load data and extract odd years 
dat = pd.read_csv('firms.csv')
df = dat[dat['year'] % 2 != 0]
df.year.unique()


array([1969, 1971, 1973, 1975, 1977, 1979])

In [4]:
df.sample(5)

Unnamed: 0,firmid,year,lcap,lemp,ldsa
1817,152,1973,1.316333,1.510195,1.672504
4781,399,1973,1.235907,0.859317,1.252015
403,34,1975,1.720145,1.4624,1.454235
1153,97,1969,0.320204,0.341388,0.251231
4451,371,1979,-0.417007,-0.217618,0.192382


In [5]:
df.describe()

Unnamed: 0,firmid,year,lcap,lemp,ldsa
count,2646.0,2646.0,2646.0,2646.0,2646.0
mean,221.0,1974.0,3.740325e-08,-8.049887e-09,1.080877e-09
std,127.32947,3.416296,1.306042,1.176767,1.231411
min,1.0,1969.0,-3.81368,-3.38278,-3.54692
25%,111.0,1971.0,-0.8989662,-0.784196,-0.928677
50%,221.0,1974.0,-0.113989,-0.113923,-0.100676
75%,331.0,1977.0,0.9105642,0.7858111,0.8524479
max,441.0,1979.0,4.103687,3.324889,3.913391


In [6]:
# define N & T in data 
N = df.firmid.unique().size
T = df.year.unique().size
assert df.shape[0] == N*T, f'Error: data is not a balanced panel'
print(f'Data has N={N} and T={T}')

Data has N=441 and T=6


Extract data from `pandas` to `numpy` arrays. 

In [7]:
y = df.ldsa.values.reshape((N*T,1))
y_label = ['ldsa']

ones = np.ones((N*T,1))
l = df.lemp.values.reshape((N*T,1))
k = df.lcap.values.reshape((N*T,1))
x = np.hstack([ones, l, k])
x_label = ['intercept', 'lemp', 'lcap']

# Estimate FE and FD

In [8]:
### print FE estimation
# create transformation matrix
def demeaning_matrix(T):
    Q_T =  np.eye(T) - np.ones((T,T))/T
    return Q_T

# transform the data
Q_T = demeaning_matrix(T)
y_dot = lm.perm(Q_T, y)
x_dot = lm.perm(Q_T, x)

# remove columns with only zeros
x_dot, x_dot_label = lm.remove_zero_columns(x_dot, x_label)

# estimate 
fe_result = lm.estimate(y_dot, x_dot, transform='fe', T=T, robust=False)

# print 
lm.print_table((y_label, x_dot_label), fe_result, title="Fixed Effects", floatfmt='.4f')

---------------------------------------------
Fixed Effects
Dependent variable: ['ldsa']

        Beta      Se    t-values
----  ------  ------  ----------
lemp  0.7069  0.0221     32.0114
lcap  0.1424  0.0197      7.2402
R² = 0.468
σ² = 0.019
Robust standard errors: False


In [9]:
### print FD estimation
# Create transformation matrix
def fd_matrix(T):
    # Initialize a (T-1) x T matrix filled with zeros
    D_T = np.zeros((T-1, T))
    
    # Fill the matrix according to the first-difference structure
    for i in range(T-1):
        D_T[i, i] = -1
        D_T[i, i+1] = 1
    
    return D_T #(T-1)xT

D_T = fd_matrix(T) # (T-1)xT matrix
y_diff = lm.perm(D_T,y)
x_diff = lm.perm(D_T,x)

# remove columns with only zeros
x_diff, x_diff_label = lm.remove_zero_columns(x_diff, x_label)

# estimate 
fd_result = lm.estimate(y_diff, x_diff, transform = 'fd', T = T-1)

# print
lm.print_table((y_label, x_diff_label), fd_result, title="First Difference", floatfmt='.4f')

---------------------------------------------
First Difference
Dependent variable: ['ldsa']

        Beta      Se    t-values
----  ------  ------  ----------
lemp  0.7253  0.0248     29.2665
lcap  0.0547  0.0235      2.3307
R² = 0.313
σ² = 0.022
Robust standard errors: False


# Check for autocorrelation in FE- and FD-residuals 

In [13]:
### check for autocorrelation in FE-residuals (Woolridge p. 275)
# define residuals (+ lagged), remove lagged observation
u_hat = fe_result.get('u_hat') # first element is t=1, last is t=T.

u_hat_L1 = np.roll(u_hat, 1)
u_hat_L1[0] = np.nan # remove first element in lagged, since not observed

valid_indices = ~np.isnan(u_hat_L1)
u_hat_clean = u_hat[valid_indices].reshape(1,N*T-1).T
u_hat_L1_clean = u_hat_L1[valid_indices].reshape(1,N*T-1).T

# perform POLS with robust s.e. 
fe_robust = lm.estimate(u_hat_clean, u_hat_L1_clean, transform='', robust=True)

# print results
lm.print_table((['u_hat'], ['u_hat_L1']), fe_robust, title="H0: the time-demeaned errors are serially CORRELATED", floatfmt='.4f')

# -> we reject the null, thus this suggest that we can use homoskedastic errros, and under FE.1-3 that FE is efficient. 

(array([0.03025055]), array([nan])) (array([0.01834289]), array([0.03025055])) (array([0.04233755]), array([0.01834289])) (array([0.10463153]), array([0.04233755])) (array([0.00049441]), array([0.10463153])) (array([-0.19605693]), array([0.00049441])) (array([-0.03748765]), array([-0.19605693])) (array([0.01389959]), array([-0.03748765])) (array([0.05251268]), array([0.01389959])) (array([0.08148897]), array([0.05251268])) (array([-0.03827055]), array([0.08148897])) (array([-0.07214304]), array([-0.03827055])) (array([0.03371805]), array([-0.07214304])) (array([-0.09576814]), array([0.03371805])) (array([0.14322625]), array([-0.09576814])) (array([-0.08379069]), array([0.14322625])) (array([-0.04478676]), array([-0.08379069])) (array([0.04740129]), array([-0.04478676])) (array([0.12293434]), array([0.04740129])) (array([0.02123748]), array([0.12293434])) (array([-0.00948049]), array([0.02123748])) (array([-0.08074742]), array([-0.00948049])) (array([0.02592448]), array([-0.08074742])) 

In [52]:
### Check for autocorrelation in FD residuals (Woolridge, p. 282)
# define residuals (+ lagged)
u_hat = fd_result.get('u_hat')  

u_hat_L1 = np.roll(u_hat, 1)
u_hat_L1[0] = np.nan

valid_indices = ~np.isnan(u_hat_L1)

u_hat_clean = u_hat[valid_indices].reshape(1,N*(T-1)-1).T
u_hat_L1_clean = u_hat_L1[valid_indices].reshape(1,N*(T-1)-1).T

# perform POLS 
fe_robust = lm.estimate(u_hat_clean, u_hat_L1_clean, transform='')

# print results 
lm.print_table((['u_hat'], ['u_hat_L1']), fe_robust, title="H0: the time-demeaned errors are serially CORRELATED", floatfmt='.4f')

# NULL: the error term should be serially uncorrelated
# finding of significant serial correlation in the error warrants computing the robust variance matrix for the FD estimator 
# -> we reject the null, which suggest that the error term is serially CORRELATED. Thus, we should use HETEROSKEDASTIC ERRORS


---------------------------------------------
H0: the time-demeaned errors are serially CORRELATED
Dependent variable: ['u_hat']

             Beta      Se    t-values
--------  -------  ------  ----------
u_hat_L1  -0.2166  0.0208    -10.4049
R² = 0.047
σ² = 0.021
Robust standard errors: False


# Exogenity test for FE

In [53]:
### do FE exogenity test (Woolridge, p. 285)
# Sort by firmid and year
df_sorted = df.sort_values(['firmid', 'year'])

# Step 1: Redefine N and T based on the data
N = df_sorted['firmid'].nunique()  
T = df_sorted['year'].nunique()   

# Step 2: Create the demeaning matrix for T periods
def demeaning_matrix(T):
    Q_T =  np.eye(T) - np.ones((T, T)) / T
    return Q_T

Q_T = demeaning_matrix(T)

# Step 3: Demean the data for each firm
# Reshape and apply demeaning by firm using groupby
def demean_by_firm(df, col_name, Q_T):
    firm_groups = df.groupby('firmid')[col_name]
    demeaned_values = firm_groups.transform(lambda x: Q_T @ x.values)
    return demeaned_values

df_sorted['lcap_demeaned'] = demean_by_firm(df_sorted, 'lcap', Q_T)
df_sorted['lemp_demeaned'] = demean_by_firm(df_sorted, 'lemp', Q_T)
df_sorted['ldsa_demeaned'] = demean_by_firm(df_sorted, 'ldsa', Q_T)

# Step 4: Calculate lead variables on the demeaned data
df_sorted['lcap_lead'] = df_sorted.groupby('firmid')['lcap_demeaned'].shift(-1)
df_sorted['lemp_lead'] = df_sorted.groupby('firmid')['lemp_demeaned'].shift(-1)

# Step 5: Drop last year for each firm to avoid missing lead values
last_year = df_sorted['year'].max()
cond = df_sorted['year'] != last_year
df_ss = df_sorted[cond]
T = df_ss['year'].nunique() # update T  

# Step 6: Convert the relevant columns to numpy arrays
y = df_ss['ldsa_demeaned'].values.reshape((-1, 1))
y_label = ['ldsa']

ones = np.ones((df_ss.shape[0], 1))
l = df_ss['lemp_demeaned'].values.reshape((-1, 1))
k = df_ss['lcap_demeaned'].values.reshape((-1, 1))

x = np.hstack([ones, l, k])
x_label = ['intercept', 'lemp', 'lcap']

l_lead = df_ss['lemp_lead'].values.reshape((-1, 1))
k_lead = df_ss['lcap_lead'].values.reshape((-1, 1))

# Step 7: Estimate FE adding different leads of x to the regrossers
w_dict = {'lemp_lead': l_lead, 'lcap_lead': k_lead}

print('H0: Exogenity. \nAfter controlling for x, then lead variable of x should have no partial effect.')
for key, value in w_dict.items():
    x_w = np.hstack([x, value])
    x_w_label = x_label + [key]

    # Assuming lm.perm() performs the within transformation
    y_dot = lm.perm(Q_T, y)
    x_dot = lm.perm(Q_T, x_w)

    # Remove columns with all zeros if needed
    x_dot, x_dot_label = lm.remove_zero_columns(x_dot, x_w_label)

    # Estimate the fixed effects model
    fe_result_temp = lm.estimate(y_dot, x_dot, transform='fe', T=T, robust=False)

    # Print results
    lm.print_table((y_label, x_dot_label), fe_result_temp, title="Fixed Effects", floatfmt='.4f')

H0: Exogenity. 
After controlling for x, then lead variable of x should have no partial effect.
---------------------------------------------
Fixed Effects
Dependent variable: ['ldsa']

             Beta      Se    t-values
---------  ------  ------  ----------
lemp       0.6633  0.0271     24.4330
lcap       0.1302  0.0215      6.0510
lemp_lead  0.0718  0.0256      2.8015
R² = 0.453
σ² = 0.015
Robust standard errors: False
---------------------------------------------
Fixed Effects
Dependent variable: ['ldsa']

             Beta      Se    t-values
---------  ------  ------  ----------
lemp       0.6628  0.0254     26.0652
lcap       0.0865  0.0239      3.6217
lcap_lead  0.1148  0.0264      4.3531
R² = 0.457
σ² = 0.015
Robust standard errors: False


Basically, we get that we FE.1. is violated. Lets just state this in the text, and then assume that it holds, going further.