In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

* Read in the `wagepan.dta` dataset and store the names of the outcome and of the covariates as global variables.

In [2]:
# Raw panel as stored on disk (Stata format); the path is relative to the notebook's directory.
DATA = pd.read_stata(filepath_or_buffer="../data/wagepan.dta")

In [3]:
# Covariate column names used as regressors.
X_NAME = [
    "educ",
    "black",
    "hisp",
    "exper",
    "expersq",
    "married",
    "union",
]
# Outcome column name; kept as a one-element list so it can be concatenated
# with X_NAME when selecting columns.
Y_NAME = ["lwage"]

* Create a balanced panel. Start by dropping observations with missing data.
* Then determine which individuals are present throughout the panel, and drop individuals that are not present in every year.
* Put the two steps together to create the final dataset.

In [4]:
def drop_data(y_name, x_name, data, year_name='year', idv_name='nr'):
    """Keep only the model columns and the rows that are complete on them.

    Parameters
    ----------
    y_name : list of str
        Outcome column name(s).
    x_name : list of str
        Covariate column name(s).
    data : pandas.DataFrame
        Panel data containing all requested columns.
    year_name, idv_name : str
        Names of the time and individual identifier columns.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``y_name + x_name + [year_name, idv_name]``
        and without any row that has a missing value in those columns.
        The input frame is left untouched and the original index labels
        are preserved.
    """
    wanted_cols = y_name + x_name + [year_name, idv_name]
    subset = data[wanted_cols]
    # A row is kept only if none of the selected columns is null.
    is_complete = ~subset.isnull().any(axis=1)
    return subset[is_complete]


def make_balanced(y_name, x_name, data, year_name='year', idv_name='nr'):
    """Restrict a panel to the individuals observed in every period.

    An individual is kept when it has at least one row for each distinct
    value of ``year_name`` that occurs in ``data``. Periods are taken from
    the data itself, so panels whose waves are not consecutive integers
    (e.g. 1980, 1982, 1984) are handled correctly.

    Parameters
    ----------
    y_name, x_name : list of str
        Unused here; accepted so the signature matches the other
        data-preparation helpers.
    data : pandas.DataFrame
        Panel data with one row per individual-period.
    year_name, idv_name : str
        Names of the time and individual identifier columns.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows of ``data`` belonging to balanced individuals,
        with the original index labels. Empty (same columns) when ``data``
        has no non-missing period.
    """
    # Number of distinct periods present in the panel (missing values ignored).
    n_periods = data[year_name].nunique()
    if n_periods == 0:
        # Nothing to balance against: return an empty frame rather than
        # failing on the min/max of an empty column.
        return data.iloc[0:0].copy()

    # Count distinct periods per individual in one pass instead of
    # intersecting Python lists year by year.
    periods_per_idv = data.groupby(idv_name)[year_name].nunique()
    balanced_ids = periods_per_idv.index[periods_per_idv == n_periods]

    return data[data[idv_name].isin(balanced_ids)].copy()


def setup_data(y_name, x_name, data, year_name='year', idv_name='nr'):
    """Build the estimation sample: complete cases in a balanced panel.

    Covariates listed in ``x_name`` that are not columns of ``data`` are
    ignored. The remaining columns are cleaned with ``drop_data`` (rows with
    missing values removed) and ``make_balanced`` (individuals not observed
    in every period removed).

    Parameters
    ----------
    y_name : list of str
        Outcome column name(s).
    x_name : list of str
        Requested covariate column name(s).
    data : pandas.DataFrame
        Raw panel data.
    year_name, idv_name : str
        Names of the time and individual identifier columns.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``y_name``, then the available covariates in the
        order they appear in ``data``, then ``year_name`` and ``idv_name``.
    """
    # Keep only the requested covariates that actually exist in the data,
    # in the data's own column order.
    available_xs = [col for col in data.columns if col in x_name]

    # Pass the filtered covariates on, so that a requested-but-absent column
    # is skipped instead of raising a KeyError during column selection.
    cleaned = drop_data(y_name, available_xs, data,
                        year_name=year_name, idv_name=idv_name)
    cleaned = make_balanced(y_name, available_xs, cleaned,
                            year_name=year_name, idv_name=idv_name)
    return cleaned[y_name + available_xs + [year_name, idv_name]]

# Estimation sample produced by setup_data from the raw panel.
DATA_CLEAN = setup_data(y_name=Y_NAME, x_name=X_NAME, data=DATA)

* perform the within transformation

$$\dot{x}_{it} = x_{it} - \frac{1}{T}\sum_{s=1}^{T} x_{is} \qquad \dot{y}_{it} = y_{it} - \frac{1}{T}\sum_{s=1}^{T} y_{is}$$

* The OLS estimator of the transformed variables:

$$\hat{\beta} = (\dot{X}' \dot{X})^{-1}(\dot{X}' \dot{Y})$$

* This estimator consistently estimates $\beta$ in the panel data model 

$$y_{it}  = x_{it} \beta + \gamma_i + \delta_t + u_{it}$$

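* Notice that coefficients on covariates that are not time-varying are not identified: the within transformation subtracts each individual's own mean, so a covariate that is constant over time for every individual becomes identically zero.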

In [5]:
def demean(y_name, x_name, data, year_name='year', idv_name='nr'):
    """Apply the within (individual-demeaning) transformation.

    Year dummies (first year dropped) are appended to the covariates to
    capture time effects, and every column is then expressed as a deviation
    from its mean within each individual.

    Parameters
    ----------
    y_name : list of str
        Outcome column name(s).
    x_name : list of str
        Covariate column name(s). The list is not modified.
    data : pandas.DataFrame
        Panel data containing the outcome, covariates, ``year_name`` and
        ``idv_name`` columns.
    year_name, idv_name : str
        Names of the time and individual identifier columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``y_dot`` with columns ``y_name`` and ``X_dot`` with columns
        ``x_name`` followed by one column per year dummy (named by the year
        value). Both have the same rows and index as ``data``.
    """
    # Year dummies; float dtype so the means and differences below are plain
    # float arithmetic whatever pandas' default dummy dtype is.
    year_dummies = pd.get_dummies(data[year_name], drop_first=True, dtype=float)
    year_names = list(year_dummies.columns)

    # Attach the dummies by position rather than by an index-on-index merge:
    # a merge would multiply rows whenever index labels are repeated.
    X = data[x_name].copy()
    for name in year_names:
        X[name] = year_dummies[name].to_numpy()
    y = data[y_name]

    # Group by the raw identifier values so no index alignment is involved.
    groups = data[idv_name].to_numpy()

    y_dot = y - y.groupby(groups).transform('mean')
    X_dot = X - X.groupby(groups).transform('mean')

    return y_dot, X_dot



def print_results(y_name, x_name, data, year_name='year', idv_name='nr'):
    """Estimate the within regression and print its summary table.

    The outcome and covariates are first transformed with ``demean``; the
    transformed outcome is then regressed on the transformed covariates with
    ``sm.GLM`` using its default family, and the fitted model's summary is
    printed. Nothing is returned.

    Parameters
    ----------
    y_name : list of str
        Outcome column name(s).
    x_name : list of str
        Covariate column name(s).
    data : pandas.DataFrame
        Cleaned panel data.
    year_name, idv_name : str
        Names of the time and individual identifier columns.
    """
    # NOTE(review): the model is fitted on already-demeaned data, so the
    # estimator is not told about the individual means that were removed;
    # confirm whether the reported standard errors need a degrees-of-freedom
    # adjustment.
    outcome_within, regressors_within = demean(
        y_name, x_name, data, year_name=year_name, idv_name=idv_name
    )
    fitted = sm.GLM(outcome_within, regressors_within).fit()
    print(fitted.summary())
    
# Fit on the cleaned panel and print the summary.
print_results(y_name=Y_NAME, x_name=X_NAME, data=DATA_CLEAN)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  lwage   No. Observations:                 4360
Model:                            GLM   Df Residuals:                     4350
Model Family:                Gaussian   Df Model:                            9
Link Function:               identity   Scale:                         0.10776
Method:                          IRLS   Log-Likelihood:                -1324.8
Date:                Thu, 30 Mar 2023   Deviance:                       468.75
Time:                        19:10:45   Pearson chi2:                     469.
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
educ                0          0        nan        n