In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

### A Microeconomic Panel: wage determination

`nls_panel` dataset

In [2]:
# Read the full NLS wage panel from the SAS data file in the working directory.
nls_panel = pd.read_sas(filepath_or_buffer='nls_panel.sas7bdat')

### Pooled Regression with non-robust OLS estimators

In [3]:
# Pooled OLS on all person-year rows with the default (non-robust) covariance.
pooled_formula = 'lwage ~ educ+exper+exper2+tenure+tenure2+black+south+union'
pooled_spec = smf.ols(formula=pooled_formula, data=nls_panel)
mdl_poolreg = pooled_spec.fit()
print(mdl_poolreg.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.324
Method:                 Least Squares   F-statistic:                     215.5
Date:                Tue, 16 Feb 2021   Prob (F-statistic):          1.17e-298
Time:                        23:25:05   Log-Likelihood:                -1629.9
No. Observations:                3580   AIC:                             3278.
Df Residuals:                    3571   BIC:                             3333.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4766      0.056      8.487      0.0

### Pooled Regression with cluster-robust standard errors

- We use `cov_type='cluster', cov_kwds={'groups': nls_panel['id']}` as parameters inside `.fit()`. It is similar to what we did with `HAC SE` in Chapter 8.
- Its SAS equivalent is `proc surveyreg` with `cluster id` specification. 

In [4]:
# Same pooled specification, but the standard errors are clustered on the
# individual identifier so repeated observations of a person are grouped.
cluster_formula = 'lwage ~ educ+exper+exper2+tenure+tenure2+black+south+union'
cluster_options = {'groups': nls_panel['id']}
mdl_poolreg_hac = smf.ols(formula=cluster_formula, data=nls_panel).fit(
    cov_type='cluster',
    cov_kwds=cluster_options,
)
print(mdl_poolreg_hac.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.324
Method:                 Least Squares   F-statistic:                     79.26
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           2.15e-93
Time:                        23:25:05   Log-Likelihood:                -1629.9
No. Observations:                3580   AIC:                             3278.
Df Residuals:                    3571   BIC:                             3333.
Df Model:                           8                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4766      0.085      5.636      0.0

### Fixed-Effects Model with Dummies

- Dummy Variable Approach: We set up a dummy variable for each individual
- Please note that we give up the intercept term if we use all dummies. Or we can keep the intercept in the OLS but get rid of one dummy variable here.

In [5]:
# Small panel used for the dummy-variable demonstration below.
nls_panel10 = pd.read_sas('nls_panel10.sas7bdat')

# Identifier, year and indicator columns are converted to plain integers
# (presumably they arrive as floats from the SAS file - verify if it matters).
integer_columns = ['id','year', 'south', 'black', 'union']
nls_panel10[integer_columns] = nls_panel10[integer_columns].astype(int)

In [6]:
# Show the first ten rows of the small panel.
print(nls_panel10.head(n=10))


   id  year     lwage  hours   age  educ  collgrad  msp  nev_mar  not_smsa  \
0   1    82  1.808289   38.0  30.0  12.0       0.0  1.0      0.0       0.0   
1   1    83  1.863417   38.0  31.0  12.0       0.0  1.0      0.0       0.0   
2   1    85  1.789367   38.0  33.0  12.0       0.0  0.0      0.0       0.0   
3   1    87  1.846530   40.0  35.0  12.0       0.0  0.0      0.0       0.0   
4   1    88  1.856449   40.0  37.0  12.0       0.0  0.0      0.0       0.0   
5   2    82  1.280933   48.0  36.0  17.0       1.0  1.0      0.0       0.0   
6   2    83  1.515855   43.0  37.0  17.0       1.0  1.0      0.0       0.0   
7   2    85  1.930170   35.0  39.0  17.0       1.0  1.0      0.0       0.0   
8   2    87  1.919034   42.0  41.0  17.0       1.0  1.0      0.0       0.0   
9   2    88  2.200974   42.0  43.0  17.0       1.0  1.0      0.0       0.0   

   c_city  south  black  union      exper     exper2    tenure    tenure2  
0     1.0      0      1      1   7.666667   58.77777  7.666667  5

**We use `pd.get_dummies()` to obtain dummy variables for each individual**

In [7]:
# One indicator column per individual, named D_<id>.
# dtype=int is passed explicitly: since pandas 2.0 get_dummies returns boolean
# columns by default, and the formula interface treats booleans as categorical
# terms rather than as 0/1 regressors. Integer dummies behave the same on every
# pandas version.
id_panel = pd.get_dummies(nls_panel10['id'], prefix='D', prefix_sep='_', dtype=int)

In [8]:
# Append the individual dummy columns to the right of the original panel.
nls_panel10_new = pd.concat(objs=[nls_panel10, id_panel], axis='columns')

In [9]:
# Show the first five rows, including the new dummy columns.
print(nls_panel10_new.head(n=5))

   id  year     lwage  hours   age  educ  collgrad  msp  nev_mar  not_smsa  \
0   1    82  1.808289   38.0  30.0  12.0       0.0  1.0      0.0       0.0   
1   1    83  1.863417   38.0  31.0  12.0       0.0  1.0      0.0       0.0   
2   1    85  1.789367   38.0  33.0  12.0       0.0  0.0      0.0       0.0   
3   1    87  1.846530   40.0  35.0  12.0       0.0  0.0      0.0       0.0   
4   1    88  1.856449   40.0  37.0  12.0       0.0  0.0      0.0       0.0   

   ...  D_1  D_2  D_3  D_4  D_5  D_6  D_7  D_8  D_9  D_10  
0  ...    1    0    0    0    0    0    0    0    0     0  
1  ...    1    0    0    0    0    0    0    0    0     0  
2  ...    1    0    0    0    0    0    0    0    0     0  
3  ...    1    0    0    0    0    0    0    0    0     0  
4  ...    1    0    0    0    0    0    0    0    0     0  

[5 rows x 28 columns]


In [10]:
# Fixed effects via dummies: all ten individual dummies are included, so the
# trailing "-1" removes the common intercept.
dummy_formula = 'lwage~D_1+D_2+D_3+D_4+D_5+D_6+D_7+D_8+D_9+D_10+exper+exper2+tenure+tenure2+union-1'
dummy_spec = smf.ols(formula=dummy_formula, data=nls_panel10_new)
mdl_dummy_panel = dummy_spec.fit()

print(mdl_dummy_panel.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.617
Model:                            OLS   Adj. R-squared:                  0.464
Method:                 Least Squares   F-statistic:                     4.031
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           0.000402
Time:                        23:25:05   Log-Likelihood:                 2.3280
No. Observations:                  50   AIC:                             25.34
Df Residuals:                      35   BIC:                             54.02
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
D_1            0.1519      1.097      0.139      0.8

In [11]:
# Pooled OLS on the small panel, without individual dummies.
pooled10_formula = 'lwage~exper+exper2+tenure+tenure2+union'
pooled10_spec = smf.ols(formula=pooled10_formula, data=nls_panel10)
mdl_PoolReg2 = pooled10_spec.fit()
print(mdl_PoolReg2.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.121
Method:                 Least Squares   F-statistic:                     2.343
Date:                Tue, 16 Feb 2021   Prob (F-statistic):             0.0570
Time:                        23:25:05   Log-Likelihood:                -15.776
No. Observations:                  50   AIC:                             43.55
Df Residuals:                      44   BIC:                             55.02
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6209      1.017      0.610      0.5

### Note:

Please note that you may have to install `linearmodels` yourself if your Anaconda does not include it.

If the following conda procedure does not work,

```python
>> conda install linearmodels
```
then you can try `pip`. If you installed Python through Anaconda or Miniconda, use `pip` only *after* you are sure that `conda install` does not work.

```python
>> pip install linearmodels
```

https://pypi.org/project/linearmodels/

### Fixed-Effects Model and Random-Effects Model

In [12]:
from linearmodels import PanelOLS
from linearmodels import RandomEffects
import statsmodels.api as sm

### Fixed-Effects Model:

- We have to use a two-level index dataframe for the fixed-effects and the random-effects models.
- `df.set_index()` does the trick. Cross-sectional index goes first, then time index.

In [13]:
# Two-level index for the panel estimators: individual id first, then year.
nls_panel10_new2 = nls_panel10.set_index(['id','year'])

# Regressors plus an explicit constant column; log wage is the dependent variable.
fe_regressors = ['exper', 'exper2','tenure','tenure2','union']
xvar = sm.add_constant(nls_panel10_new2[fe_regressors])
yvar = nls_panel10_new2['lwage']

# entity_effects=True requests one effect per individual.
fe_spec = PanelOLS(yvar, xvar, entity_effects=True)
mdl_fixed_effects = fe_spec.fit()

print(mdl_fixed_effects)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.2210
Estimator:                   PanelOLS   R-squared (Between):             -0.2694
No. Observations:                  50   R-squared (Within):               0.2210
Date:                Tue, Feb 16 2021   R-squared (Overall):             -0.0284
Time:                        23:25:06   Log-likelihood                    2.3280
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.9863
Entities:                          10   P-value                           0.1050
Avg Obs:                       5.0000   Distribution:                    F(5,35)
Min Obs:                       5.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             1.9863
                            

### Random-Effects Model:

- This one also needs a two-level index, which was already set up when preparing the fixed-effects model above.
- The result is slightly different from SAS's default Fuller and Battese variance components (`VCOMP=FB`). SAS offers four different variance-component estimators (`FB`, `NL`, `WH`, `WK`), and all of them differ from the one used by Python's `linearmodels`.
- Stata returns the same results as Python.
    - .xtreg lwage exper exper2 tenure tenure2 union, re

In [14]:
# Random-effects estimate using the same dependent variable and regressors
# that were prepared for the fixed-effects model.
re_spec = RandomEffects(yvar, xvar)
mdl_Random_effects = re_spec.fit()
print(mdl_Random_effects)

                        RandomEffects Estimation Summary                        
Dep. Variable:                  lwage   R-squared:                        0.1961
Estimator:              RandomEffects   R-squared (Between):              0.1502
No. Observations:                  50   R-squared (Within):               0.2036
Date:                Tue, Feb 16 2021   R-squared (Overall):              0.1764
Time:                        23:25:06   Log-likelihood                   -2.2449
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2.1465
Entities:                          10   P-value                           0.0775
Avg Obs:                       5.0000   Distribution:                    F(5,44)
Min Obs:                       5.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             2.1465
                            

### Explanatory Variables that don't change over time

In [15]:
# Same setup as before, with 'black' added to the regressors.
regressors_with_black = ['exper', 'exper2','tenure','tenure2','black','union']
xvar1 = sm.add_constant(nls_panel10_new2[regressors_with_black])
yvar1 = nls_panel10_new2['lwage']

In [16]:
# Fitting entity effects together with a regressor that the effects fully
# absorb makes PanelOLS raise an AbsorbingEffectError. That failure is what
# this cell demonstrates, so the exception is caught and displayed instead of
# being left to abort a "Restart Kernel -> Run All".
try:
    mdl_fixed_effects1 = PanelOLS(yvar1, xvar1, entity_effects=True).fit()
except Exception as err:
    # Show the error type and its message in the cell output.
    print('{}: {}'.format(type(err).__name__, err))
else:
    print(mdl_fixed_effects1)

AbsorbingEffectError: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.

The following variables or variable combinations have been fully absorbed
or have become perfectly collinear after effects are removed:

          const, exper, exper2, tenure, tenure2, black, union

Set drop_absorbed=True to automatically drop absorbed variables.


### Note:

- When one or more explanatory variables do not change over time for each individual, an error is raised and the model is not estimated.
- We need to add `drop_absorbed=True` in the `PanelOLS()` function.

In [17]:
# drop_absorbed=True lets PanelOLS drop regressors that the entity effects
# fully absorb, instead of raising an error.
fe_spec_drop = PanelOLS(
    yvar1,
    xvar1,
    entity_effects=True,
    drop_absorbed=True,
)
mdl_fixed_effects1 = fe_spec_drop.fit()

print(mdl_fixed_effects1)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.2210
Estimator:                   PanelOLS   R-squared (Between):             -0.2694
No. Observations:                  50   R-squared (Within):               0.2210
Date:                Tue, Feb 16 2021   R-squared (Overall):             -0.0284
Time:                        23:25:11   Log-likelihood                    2.3280
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.9863
Entities:                          10   P-value                           0.1050
Avg Obs:                       5.0000   Distribution:                    F(5,35)
Min Obs:                       5.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             1.9863
                            

Variables have been fully absorbed and have removed from the regression:

black



In [18]:
# Random-effects estimate with 'black' included among the regressors.
re_spec_with_black = RandomEffects(yvar1, xvar1)
mdl_Random_effects1 = re_spec_with_black.fit()
print(mdl_Random_effects1)

                        RandomEffects Estimation Summary                        
Dep. Variable:                  lwage   R-squared:                        0.2139
Estimator:              RandomEffects   R-squared (Between):              0.2720
No. Observations:                  50   R-squared (Within):               0.2048
Date:                Tue, Feb 16 2021   R-squared (Overall):              0.2390
Time:                        23:25:14   Log-likelihood                   -1.5287
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.9498
Entities:                          10   P-value                           0.0944
Avg Obs:                       5.0000   Distribution:                    F(6,43)
Min Obs:                       5.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             1.9498
                            