# Problem Set 5

# QUESTION 3


In [1]:
from scipy.stats import norm
import numpy as np
from statsmodels.api import add_constant
from linearmodels.iv import IV2SLS
import pandas as pd

In [2]:
# True parameters of the data-generating process used in every simulation.
alpha_1, alpha_2 = 1, 1  # coefficients on z_1 and z_2 in the equation for x
beta_0, beta_1 = 1, 1  # intercept and slope in the equation for y


In [3]:
def gen_data(alpha_1, alpha_2, beta_0, beta_1, n=150):
    """Draw one simulated sample from the endogenous-regressor model.

    The sample follows

        x = alpha_1*z_1 + alpha_2*z_2 + v_1 + v_3
        y = beta_0 + beta_1*x + v_1 + v_2

    where v_1, v_2, v_3, z_1 and z_2 are separate standard-normal draws.
    v_1 enters both equations, which is what makes x endogenous in the
    equation for y.

    Parameters
    ----------
    alpha_1, alpha_2 : float
        Coefficients on the instruments z_1 and z_2 in the equation for x.
    beta_0, beta_1 : float
        Intercept and slope of the equation for y.
    n : int, default 150
        Number of observations to draw.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns 'const', 'y', 'x', 'z_1', 'z_2' (in that
        order); 'const' is a column of ones.
    """
    # The draw order is fixed so that a given NumPy seed always yields the
    # same sample.
    v_1 = norm.rvs(size=n)
    v_2 = norm.rvs(size=n)
    v_3 = norm.rvs(size=n)
    z_1 = norm.rvs(size=n)
    z_2 = norm.rvs(size=n)
    # Element-wise array arithmetic instead of per-index Python loops.
    x = alpha_1 * z_1 + alpha_2 * z_2 + v_1 + v_3
    y = beta_0 + beta_1 * x + v_1 + v_2
    # Leading float column of ones named 'const', i.e. the layout that
    # add_constant(..., has_constant='add') gives for a DataFrame.
    return pd.DataFrame({'const': np.ones(n), 'y': y, 'x': x,
                         'z_1': z_1, 'z_2': z_2})
    
    

In [4]:
# Draw a single sample to inspect the layout of the simulated data.
df = gen_data(alpha_1, alpha_2, beta_0, beta_1)

In [5]:
# Preview the first rows: constant, outcome, regressor and instruments.
print(df.head())

   const         y         x       z_1       z_2
0    1.0  3.682892  0.531768  0.357804 -1.250682
1    1.0  2.944834  0.649838 -1.060947  0.881529
2    1.0 -4.602047 -2.875818 -1.570189 -0.548478
3    1.0  4.466517  1.782875 -0.497630 -0.437629
4    1.0  9.218863  2.833364 -0.754017  1.013377


In [6]:
 data = gen_data(alpha_1, alpha_2, beta_0, beta_1)
# OLS:
res_ols = IV2SLS(data.y, data[['const','x']], None, None).fit(cov_type='unadjusted')
# IV:
res_iv = IV2SLS(data.y, data[['const']], data.x, data.z_1).fit(cov_type='unadjusted')
# 2SLS
res_2sls = IV2SLS(data.y, data[['const']], data.x, data[['z_1', 'z_2']]).fit(cov_type='unadjusted')



In [7]:
# OLS coefficient on x for the single draw fitted above.
res_ols.params['x']

1.2597542745333117

In [8]:
# Estimated standard error of the OLS coefficient on x for the same draw.
res_ols.std_errors['x']

0.05002880344665273

In [9]:
 from linearmodels.iv import compare

In [10]:
# Tabulate the OLS, IV and 2SLS fits of the single draw side by side.
print(compare({'OLS':res_ols, 'IV':res_iv, '2SLS': res_2sls}))

                         Model Comparison                        
                                OLS             IV           2SLS
-----------------------------------------------------------------
Dep. Variable                     y              y              y
Estimator                       OLS        IV-2SLS        IV-2SLS
No. Observations                150            150            150
Cov. Est.                unadjusted     unadjusted     unadjusted
R-squared                    0.8087         0.7865         0.7645
Adj. R-squared               0.8074         0.7850         0.7629
F-statistic                  634.06         110.45         144.09
P-value (F-stat)             0.0000         0.0000         0.0000
const                        0.9385         0.9297         0.9261
                           (9.4219)       (8.8297)       (8.3778)
x                            1.2598         1.0510         0.9653
                           (25.181)       (10.509)       (12.004)
Instrument

In [11]:
def it_regressions(print_option = False):
    """Run one Monte Carlo replication: draw a sample and fit OLS, IV and 2SLS.

    A fresh sample is drawn with ``gen_data`` using the module-level
    ``alpha_1``, ``alpha_2``, ``beta_0`` and ``beta_1``. Three models for y
    are then fitted with ``cov_type='unadjusted'``:

    * 'ols'  -- y on const and x, with no instruments;
    * 'iv'   -- x instrumented with z_1 only;
    * '2sls' -- x instrumented with z_1 and z_2.

    Parameters
    ----------
    print_option : bool, default False
        When truthy, print the linearmodels comparison table of the three
        fits.

    Returns
    -------
    dict
        Keys 'ols', 'iv', '2sls'; each value is the two-element list
        ``[coefficient on x, standard error of that coefficient]``.
    """
    data = gen_data(alpha_1, alpha_2, beta_0, beta_1)
    const = data[['const']]
    models = {
        'ols': IV2SLS(data.y, data[['const', 'x']], None, None),
        'iv': IV2SLS(data.y, const, data.x, data.z_1),
        '2sls': IV2SLS(data.y, const, data.x, data[['z_1', 'z_2']]),
    }
    fits = {name: model.fit(cov_type='unadjusted')
            for name, model in models.items()}
    if print_option:
        # Imported lazily: only needed when the table is requested.
        from linearmodels.iv import compare
        print(compare({'OLS': fits['ols'], 'IV': fits['iv'], '2SLS': fits['2sls']}))
    return {name: [res.params['x'], res.std_errors['x']]
            for name, res in fits.items()}
    
    

Now we do our simulations.

In [12]:
def simulations(iter = 1000):
    """Repeat ``it_regressions`` and collect the estimates of every replication.

    Parameters
    ----------
    iter : int, default 1000
        Number of replications. The name shadows the ``iter`` builtin inside
        this function; it is kept because it is part of the call signature.

    Returns
    -------
    dict
        ``{'ols': {'beta': [...], 'std': [...]}, 'iv': {...}, '2sls': {...}}``
        where 'beta' holds the coefficient on x and 'std' its estimated
        standard error, one entry per replication.
    """
    keys = ['ols', 'iv', '2sls']
    acc_dict = {key: {'beta': [], 'std': []} for key in keys}
    for sim in range(iter):
        if sim % 100 == 0:
            # Scale to percent before rounding so that float noise from a
            # later multiplication cannot leak into the printed progress.
            print(str(round(100 * sim / iter, 2)) + "%")
        iter_dict = it_regressions()
        for key in keys:
            beta_hat, std_err = iter_dict[key]
            acc_dict[key]['beta'].append(beta_hat)
            acc_dict[key]['std'].append(std_err)
    print("Simulations concluded")
    return acc_dict
        
    

In [13]:
# Run the Monte Carlo experiment with the default number of replications.
results = simulations()

0.0%
10.0%
20.0%
30.0%
40.0%
50.0%
60.0%
70.0%
80.0%
90.0%
Simulations concluded


In [14]:
# One frame per estimator, with columns 'beta' (coefficient on x) and 'std'
# (its standard error) and one row per replication.
OLS_results, IV_results, _2SLS_results = (
    pd.DataFrame(results[name]) for name in ('ols', 'iv', '2sls')
)

## (a)

In [15]:
# Distribution of the OLS estimates and standard errors across replications.
print(OLS_results.describe())

              beta          std
count  1000.000000  1000.000000
mean      1.249438     0.053995
std       0.056004     0.004458
min       1.076705     0.041925
25%       1.213479     0.051004
50%       1.247786     0.053872
75%       1.287006     0.056968
max       1.422150     0.070218


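The OLS estimator converges to a well-defined limit, but that limit is not the true $\beta_1 = 1$: the mean of the estimates is about $1.25$. Indeed, with $\alpha_1=\alpha_2=1$ and unit-variance shocks we have $\mathrm{Var}(x_i)=4$ and $\mathrm{Cov}(x_i, v_{1i}+v_{2i})=\mathrm{Var}(v_{1i})=1$, so $\operatorname{plim}\hat\beta_{1,OLS}=1+\tfrac{1}{4}=1.25$. This happens because our model suffers from endogeneity: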

$$
\begin{aligned}
y_{i} &=\beta_{0}+\beta_{1} x_{i}+v_{1 i}+v_{2 i} \quad \quad(\star) \\
x_{i} &=\alpha_{1} z_{1 i}+\alpha_{2} z_{2 i}+v_{1 i}+v_{3 i}\quad\quad (\star\star)
\end{aligned}
$$

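$$\mathbb{E}[x_i\cdot (v_{1i}+v_{2i}) ] = \mathbb{E}[v_{1i}^2] = 1 \neq 0$$

because $v_{1i}$ enters both the regressor $x_i$ (through ($\star\star$)) and the error term of ($\star$).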

## (b)

In [16]:
# Distribution of the IV (z_1 only) estimates and standard errors across replications.
print(IV_results.describe())

              beta          std
count  1000.000000  1000.000000
mean      0.992424     0.119861
std       0.123841     0.026790
min       0.459081     0.071280
25%       0.918179     0.101986
50%       0.998599     0.115069
75%       1.079660     0.131824
max       1.400490     0.272595


Now we have a consistent estimator, since both requirements for a valid instrument are met in our data generating process:

(a) Exogeneity (exclusion restriction): $z_1$ is independent of the error term $v_{1i}+v_{2i}$ in ($\star$).

(b) Relevance: $z_1$ is correlated with $x$ through ($\star\star$) above, because $\alpha_1 \neq 0$.

## (c)

In [17]:
# Distribution of the 2SLS (z_1 and z_2) estimates and standard errors across replications.
print(_2SLS_results.describe())

              beta          std
count  1000.000000  1000.000000
mean      0.999085     0.082238
std       0.084742     0.011790
min       0.664661     0.054848
25%       0.946159     0.073950
50%       1.001180     0.081234
75%       1.054697     0.089092
max       1.252271     0.132057


Indeed, our estimator remains consistent since the same reasoning pointed out above also extends to $z_2$.

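However, the estimated standard errors of $\hat \beta_{1,2SLS}$ are clearly smaller than those of $\hat \beta_{1,IV}$ (a mean of about $0.082$ versus $0.120$). This can be seen by comparing the means or the quantiles of the two series of estimated standard errors. The dispersion of the point estimates themselves tells the same story ($0.085$ versus $0.124$).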

The intuition can be seen from Question 2:

$$avar(b_{IV}) = \frac{2}{\alpha_1^2}$$

$$avar(b_{2SLS}) = \frac{2}{\alpha_1^2+\alpha_2^2}$$

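In our case, $\alpha_2 = 1 \neq 0$ (i.e., the new instrument is relevant), so the additional instrument generates extra exogenous variation in $x$, which makes the parameter estimate more precise. With $\alpha_1=\alpha_2=1$ and $n=150$, these formulas imply standard errors of about $\sqrt{2/150}\approx 0.115$ for IV and $\sqrt{1/150}\approx 0.082$ for 2SLS, in line with the simulated values.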

## (d)

In [18]:
def gen_data(alpha_1, alpha_2, beta_0, beta_1, n=150):
    """Draw one simulated sample that also carries an irrelevant instrument z_3.

    The sample follows

        x = alpha_1*z_1 + alpha_2*z_2 + v_1 + v_3
        y = beta_0 + beta_1*x + v_1 + v_2

    where v_1, v_2, v_3, z_1, z_2 and z_3 are separate standard-normal draws.
    z_3 enters neither equation.

    Parameters
    ----------
    alpha_1, alpha_2 : float
        Coefficients on the instruments z_1 and z_2 in the equation for x.
    beta_0, beta_1 : float
        Intercept and slope of the equation for y.
    n : int, default 150
        Number of observations to draw.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns 'const', 'y', 'x', 'z_1', 'z_2', 'z_3' (in
        that order); 'const' is a column of ones.
    """
    # The draw order is fixed so that a given NumPy seed always yields the
    # same sample.
    v_1 = norm.rvs(size=n)
    v_2 = norm.rvs(size=n)
    v_3 = norm.rvs(size=n)
    z_1 = norm.rvs(size=n)
    z_2 = norm.rvs(size=n)
    z_3 = norm.rvs(size=n)
    # Element-wise array arithmetic instead of per-index Python loops.
    x = alpha_1 * z_1 + alpha_2 * z_2 + v_1 + v_3
    y = beta_0 + beta_1 * x + v_1 + v_2
    # Leading float column of ones named 'const', i.e. the layout that
    # add_constant(..., has_constant='add') gives for a DataFrame.
    return pd.DataFrame({'const': np.ones(n), 'y': y, 'x': x,
                         'z_1': z_1, 'z_2': z_2, 'z_3': z_3})

In [19]:
def it_regressions(print_option = False):
    """Run one replication in which 2SLS also uses the instrument z_3.

    A fresh sample is drawn with ``gen_data`` using the module-level
    ``alpha_1``, ``alpha_2``, ``beta_0`` and ``beta_1``. Three models for y
    are then fitted with ``cov_type='unadjusted'``:

    * 'ols'  -- y on const and x, with no instruments;
    * 'iv'   -- x instrumented with z_1 only;
    * '2sls' -- x instrumented with z_1, z_2 and z_3.

    Parameters
    ----------
    print_option : bool, default False
        When truthy, print the linearmodels comparison table of the three
        fits.

    Returns
    -------
    dict
        Keys 'ols', 'iv', '2sls'; each value is the two-element list
        ``[coefficient on x, standard error of that coefficient]``.
    """
    data = gen_data(alpha_1, alpha_2, beta_0, beta_1)
    const = data[['const']]
    models = {
        'ols': IV2SLS(data.y, data[['const', 'x']], None, None),
        'iv': IV2SLS(data.y, const, data.x, data.z_1),
        '2sls': IV2SLS(data.y, const, data.x, data[['z_1', 'z_2', 'z_3']]),
    }
    fits = {name: model.fit(cov_type='unadjusted')
            for name, model in models.items()}
    if print_option:
        # Imported lazily: only needed when the table is requested.
        from linearmodels.iv import compare
        print(compare({'OLS': fits['ols'], 'IV': fits['iv'], '2SLS': fits['2sls']}))
    return {name: [res.params['x'], res.std_errors['x']]
            for name, res in fits.items()}

In [20]:
def simulations(iter = 1000):
    """Repeat ``it_regressions`` and collect the estimates of every replication.

    Parameters
    ----------
    iter : int, default 1000
        Number of replications. The name shadows the ``iter`` builtin inside
        this function; it is kept because it is part of the call signature.

    Returns
    -------
    dict
        ``{'ols': {'beta': [...], 'std': [...]}, 'iv': {...}, '2sls': {...}}``
        where 'beta' holds the coefficient on x and 'std' its estimated
        standard error, one entry per replication.
    """
    keys = ['ols', 'iv', '2sls']
    acc_dict = {key: {'beta': [], 'std': []} for key in keys}
    for sim in range(iter):
        if sim % 100 == 0:
            # Scale to percent before rounding so that float noise from a
            # later multiplication cannot leak into the printed progress.
            print(str(round(100 * sim / iter, 2)) + "%")
        iter_dict = it_regressions()
        for key in keys:
            beta_hat, std_err = iter_dict[key]
            acc_dict[key]['beta'].append(beta_hat)
            acc_dict[key]['std'].append(std_err)
    print("Simulations concluded")
    return acc_dict

In [21]:
# Re-run the Monte Carlo experiment; this picks up the definitions from the cells just above.
results = simulations()

0.0%
10.0%
20.0%
30.0%
40.0%
50.0%
60.0%
70.0%
80.0%
90.0%
Simulations concluded


In [22]:
# Print, for every estimator, summary statistics of its estimates and
# standard errors across replications.
separator = "-"*40
for estimator, draws in results.items():
    print(separator)
    print(estimator)
    summary = pd.DataFrame(draws).describe()
    print(summary)

----------------------------------------
ols
              beta          std
count  1000.000000  1000.000000
mean      1.249432     0.054279
std       0.057539     0.004341
min       1.070036     0.042397
25%       1.210419     0.051352
50%       1.249372     0.053899
75%       1.287496     0.057092
max       1.421541     0.069206
----------------------------------------
iv
              beta          std
count  1000.000000  1000.000000
mean      0.988480     0.120903
std       0.122933     0.027501
min       0.471665     0.069501
25%       0.919536     0.102621
50%       0.990401     0.115975
75%       1.069891     0.131879
max       1.320289     0.278689
----------------------------------------
2sls
              beta          std
count  1000.000000  1000.000000
mean      0.999598     0.082474
std       0.083404     0.011766
min       0.696109     0.055499
25%       0.944748     0.074492
50%       1.001083     0.080657
75%       1.057203     0.089344
max       1.256785     0.149089


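The estimator is still consistent, but this additional instrument does not help us estimate the parameter more precisely: $z_3$ does not enter ($\star\star$), so it is an irrelevant instrument and adds no useful variation in $x$.

The standard errors even increase slightly in these new simulations (a mean of $0.0825$ versus $0.0822$ in (c)), although a difference this small is within simulation noise.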

## (e)

In [23]:
def gen_data(alpha_1, alpha_2, beta_0, beta_1, n=150, n_instruments=100):
    """Draw one simulated sample with many irrelevant instruments.

    The sample follows

        x = alpha_1*z_1 + alpha_2*z_2 + v_1 + v_3
        y = beta_0 + beta_1*x + v_1 + v_2

    where v_1, v_2, v_3 and every z_k are separate standard-normal draws.
    Only z_1 and z_2 enter the equation for x; z_3, ..., z_{n_instruments}
    enter neither equation.

    Parameters
    ----------
    alpha_1, alpha_2 : float
        Coefficients on the instruments z_1 and z_2 in the equation for x.
    beta_0, beta_1 : float
        Intercept and slope of the equation for y.
    n : int, default 150
        Number of observations to draw.
    n_instruments : int, default 100
        Total number of instrument columns z_1, ..., z_{n_instruments}.
        z_1 and z_2 are always present, so values below 3 add no extra
        columns.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns 'const', 'y', 'x', 'z_1', ...,
        'z_{n_instruments}' (in that order); 'const' is a column of ones.
    """
    # The draw order (shocks, z_1, z_2, then the extra instruments in index
    # order) is fixed so that a given NumPy seed always yields the same sample.
    v_1 = norm.rvs(size=n)
    v_2 = norm.rvs(size=n)
    v_3 = norm.rvs(size=n)
    z_1 = norm.rvs(size=n)
    z_2 = norm.rvs(size=n)
    other_z = {'z_' + str(k): norm.rvs(size=n)
               for k in range(3, n_instruments + 1)}
    # Element-wise array arithmetic instead of per-index Python loops.
    x = alpha_1 * z_1 + alpha_2 * z_2 + v_1 + v_3
    y = beta_0 + beta_1 * x + v_1 + v_2
    # Assemble everything in one pass; 'const' leads, as it would after
    # add_constant(..., has_constant='add').
    columns = {'const': np.ones(n), 'y': y, 'x': x, 'z_1': z_1, 'z_2': z_2}
    columns.update(other_z)
    return pd.DataFrame(columns)

In [24]:
# Draw one sample from the many-instrument design to check its layout.
test = gen_data(alpha_1, alpha_2, beta_0, beta_1)

In [25]:
# Preview the first rows, including the extra instrument columns.
print(test.head())

   const         y         x       z_1       z_2       z_3       z_4  \
0    1.0  2.070115 -0.102900 -1.332998 -0.566858  0.966553  0.041900   
1    1.0 -0.163841 -1.250282 -0.563433 -0.181638 -1.959610 -1.678629   
2    1.0  1.511346  2.154200  0.975835  2.094416 -0.184394 -0.469107   
3    1.0  1.069384 -0.655164  0.447784 -0.381732 -0.752271 -1.145978   
4    1.0  1.060326 -0.081630  0.162976  0.416039 -0.809499  0.341487   

        z_5       z_6       z_7  ...      z_91      z_92      z_93      z_94  \
0  0.478268 -0.705309  1.734401  ...  0.680112 -0.160665  0.780403 -2.293902   
1 -0.480815  0.437968  0.048819  ...  1.804439  0.633229  0.981908 -0.359420   
2  0.538738 -0.509485  0.610107  ...  0.758649 -0.900574  0.689164  2.615189   
3  0.118106 -0.711665  0.096846  ... -0.469215 -0.084037 -0.214436  0.403984   
4  1.302709 -1.713554  2.240830  ... -0.842022  1.653039 -1.106431 -0.909485   

       z_95      z_96      z_97      z_98      z_99     z_100  
0  1.272774 -1.356407 

In [26]:
def it_regressions(print_option = False):
    """Run one replication in which 2SLS uses every instrument column.

    A fresh sample is drawn with ``gen_data`` using the module-level
    ``alpha_1``, ``alpha_2``, ``beta_0`` and ``beta_1``. Three models for y
    are then fitted with ``cov_type='unadjusted'``:

    * 'ols'  -- y on const and x, with no instruments;
    * 'iv'   -- x instrumented with z_1 only;
    * '2sls' -- x instrumented with every column of the sample other than
      'const', 'x' and 'y'.

    Parameters
    ----------
    print_option : bool, default False
        When truthy, print the linearmodels comparison table of the three
        fits.

    Returns
    -------
    dict
        Keys 'ols', 'iv', '2sls'; each value is the two-element list
        ``[coefficient on x, standard error of that coefficient]``.
    """
    data = gen_data(alpha_1, alpha_2, beta_0, beta_1)
    # Everything that is not the constant, the regressor or the outcome is
    # an instrument. A list is passed rather than a set literal.
    instruments = data.drop(columns=['const', 'x', 'y'])
    const = data[['const']]
    models = {
        'ols': IV2SLS(data.y, data[['const', 'x']], None, None),
        'iv': IV2SLS(data.y, const, data.x, data.z_1),
        '2sls': IV2SLS(data.y, const, data.x, instruments),
    }
    fits = {name: model.fit(cov_type='unadjusted')
            for name, model in models.items()}
    if print_option:
        # Imported lazily: only needed when the table is requested.
        from linearmodels.iv import compare
        print(compare({'OLS': fits['ols'], 'IV': fits['iv'], '2SLS': fits['2sls']}))
    return {name: [res.params['x'], res.std_errors['x']]
            for name, res in fits.items()}

In [27]:
# Try a single replication with the comparison table suppressed.
it_regressions(print_option = False)

{'ols': [1.2902590822860855, 0.04759213265505585],
 'iv': [1.2583450843677222, 0.09742983707928406],
 '2sls': [1.2446239777465773, 0.05124662828380647]}

In [28]:
def simulations(iter = 1000):
    """Repeat ``it_regressions`` and collect the estimates of every replication.

    Parameters
    ----------
    iter : int, default 1000
        Number of replications. The name shadows the ``iter`` builtin inside
        this function; it is kept because it is part of the call signature.

    Returns
    -------
    dict
        ``{'ols': {'beta': [...], 'std': [...]}, 'iv': {...}, '2sls': {...}}``
        where 'beta' holds the coefficient on x and 'std' its estimated
        standard error, one entry per replication.
    """
    keys = ['ols', 'iv', '2sls']
    acc_dict = {key: {'beta': [], 'std': []} for key in keys}
    for sim in range(iter):
        if sim % 100 == 0:
            # Scale to percent before rounding so that float noise from a
            # later multiplication cannot leak into the printed progress.
            print(str(round(100 * sim / iter, 2)) + "%")
        iter_dict = it_regressions()
        for key in keys:
            beta_hat, std_err = iter_dict[key]
            acc_dict[key]['beta'].append(beta_hat)
            acc_dict[key]['std'].append(std_err)
    print("Simulations concluded")
    return acc_dict

In [29]:
# Re-run the Monte Carlo experiment; this picks up the definitions from the cells just above.
results = simulations()

0.0%
10.0%
20.0%
30.0%
40.0%
50.0%
60.0%
70.0%
80.0%
90.0%
Simulations concluded


In [30]:
# Print, for every estimator, summary statistics of its estimates and
# standard errors across replications.
rule = "-"*40
for estimator, draws in results.items():
    print(rule)
    print(estimator)
    table = pd.DataFrame(draws)
    print(table.describe())

----------------------------------------
ols
              beta          std
count  1000.000000  1000.000000
mean      1.245323     0.053926
std       0.054824     0.004248
min       1.069513     0.041278
25%       1.206436     0.050987
50%       1.246670     0.053863
75%       1.282856     0.056807
max       1.421047     0.068196
----------------------------------------
iv
              beta          std
count  1000.000000  1000.000000
mean      0.984702     0.119802
std       0.118765     0.025882
min       0.509409     0.068786
25%       0.911559     0.102565
50%       0.992809     0.115273
75%       1.063965     0.133618
max       1.344570     0.302378
----------------------------------------
2sls
              beta          std
count  1000.000000  1000.000000
mean      1.195578     0.059284
std       0.058678     0.005128
min       1.031438     0.044797
25%       1.155057     0.055681
50%       1.196447     0.059056
75%       1.234915     0.062622
max       1.394515     0.077079


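The $\hat \beta_{1,2SLS}$ estimates now move away from the true value, both in mean ($\approx 1.20$) and in distribution: for example, even the 25% quantile is at $1.16$. With 100 instruments and only 150 observations the first stage overfits $x$, so the fitted values inherit part of the endogenous variation and 2SLS is pulled toward the OLS estimate ($\approx 1.25$). The reported standard errors are in fact smaller than in (c) ($\approx 0.059$ versus $\approx 0.082$), but this is not a genuine gain in precision: adding irrelevant instruments does not help reduce the variance around the true $\beta_1$; it only concentrates the estimates around a biased value.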