In [None]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import statsmodels.api as sm
#np.set_printoptions(suppress=True, formatter={'float_kind':'{:f}'.format}) #Prints in ordinary (as opposed to scientific) notation

## Poisson Regression: Labor Force Participation Dataset

We will analyze the MROZ dataset, which comes from a 1987 Econometrica paper by Mroz. It records a number of variables for married women in the year 1975. The variables in the dataset are: 

1. inlf: binary variable equaling 1 if the individual worked (i.e., they were 'in the labor force') in the year 1975 and 0 otherwise
2. hours: number of hours worked in 1975
3. kidslt6: number of kids < 6 years of age
4. kidsge6: number of kids 6-18 years of age
5. age: age in years
6. educ: years of schooling
7. wage: hourly wage in 1975
8. repwage: reported wage at interview in 1976
9. hushrs: hours worked by husband in 1975
10. husage: husband's age
11. huseduc: husband's years of schooling
12. huswage: husband's hourly wage in 1975
13. faminc: family income in 1975
14. mtr: federal marginal tax rate facing woman
15. motheduc: mother's years of schooling
16. fatheduc: father's years of schooling
17. unem: unemployment rate in county of residence
18. city: =1 if live in Standard metropolitan statistical area
19. exper: actual labor market experience
20. nwifeinc: (faminc - wage*hours)/1000
21. lwage: log(wage)
22. expersq: $\text{exper}^2$ (the square of the experience variable)

In [None]:
# Read the MROZ data into a DataFrame; the relative path "MROZ.csv" is resolved
# against the notebook's current working directory.
mroz = pd.read_csv("MROZ.csv")
# Show the first 12 rows as a quick check of the columns and values.
mroz.head(12)

In [None]:
# Several regressions can be fit on this dataset. We start with an ordinary
# least-squares regression of `hours` on eleven covariates:
# kidslt6, kidsge6, age, educ, hushrs, huseduc, huswage, motheduc, fatheduc,
# exper and expersq.
# (`sm` is statsmodels.api, imported with the other libraries in the first cell.)
full_covariates = [
    'kidslt6', 'kidsge6', 'age', 'educ',
    'hushrs', 'huseduc', 'huswage', 'motheduc',
    'fatheduc', 'exper', 'expersq',
]

# Response variable and design matrix (with an added constant/intercept column)
Y = mroz['hours']
X = sm.add_constant(mroz[full_covariates].copy())

# Fit the model and print the coefficient table
model = sm.OLS(Y, X).fit()
print(model.summary())

In [None]:
# A common next step is to read the table above and prune the covariates whose
# standard error is comparable to the estimated coefficient (equivalently, the
# ones with a large P>|t| value).
# Here that removes motheduc, fatheduc, hushrs, huseduc and kidsge6, leaving a
# regression of hours on six covariates.
Y = mroz['hours']
X = mroz.loc[:, ['kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']].copy()
# Intercept column for the reduced design matrix
X = sm.add_constant(X)
# Least-squares fit of the reduced model
linmodel = sm.OLS(Y, X).fit()
print(linmodel.summary())

In [None]:
# We can also take the Bayesian approach and fit the same linear regression
# with PyMC (`pm` is already imported in the first cell of the notebook).
mrozmod = pm.Model()
with mrozmod:
    # Flat (improper) priors on the intercept b0 and the six slopes b1..b6
    b0 = pm.Flat("b0")
    b1 = pm.Flat("b1")
    b2 = pm.Flat("b2")
    b3 = pm.Flat("b3")
    b4 = pm.Flat("b4")
    b5 = pm.Flat("b5")
    b6 = pm.Flat("b6")
    # Flat prior on log(sigma); exponentiating keeps the noise scale positive
    log_sigma = pm.Flat("log_sigma")
    sigma = pm.Deterministic("sigma", pm.math.exp(log_sigma))
    # Expected value of the outcome: linear in the six covariates
    mu = (
        b0
        + b1 * mroz['kidslt6']
        + b2 * mroz['age']
        + b3 * mroz['educ']
        + b4 * mroz['huswage']
        + b5 * mroz['exper']
        + b6 * mroz['expersq']
    )
    # Likelihood: hours are Normal around mu with standard deviation sigma
    Y_obs = pm.Normal("Y_obs", mu=mu, sigma=sigma, observed=mroz['hours'])
    # Fixed seed so that Restart & Run All reproduces the same posterior draws
    idata = pm.sample(2000, chains=2, random_seed=0, return_inferencedata=True)

In [None]:
# Posterior summary of the linear-regression coefficients, set beside the
# least-squares fit stored in `linmodel`.
# NOTE: `idata` is assigned by more than one sampling cell in this notebook;
# this cell expects the draws of the Normal-likelihood model, so run it right
# after that sampling cell.
names = ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6']
# Pool the draws of all chains into one flat array per coefficient
allsamples = [idata.posterior[name].values.flatten() for name in names]
# Keep the individual arrays available under their per-coefficient names
(b0_samples, b1_samples, b2_samples, b3_samples,
 b4_samples, b5_samples, b6_samples) = allsamples

# Five-column table: header, rule and rows all use the same column widths
header = (f"{'Parameter':<10}| {'Mean':>13} | {'Std. Dev.':>13} | "
          f"{'Least Squares':>13} | {'Std. Error':>13}")
print(header)
print("-" * len(header))
for i, (name, arr) in enumerate(zip(names, allsamples)):
    print(f"{name:<10}| {np.mean(arr):>13.6f} | {np.std(arr):>13.6f} | "
          f"{linmodel.params.values[i]:>13.6f} | {linmodel.bse.values[i]:>13.6f}")

In [None]:
# Poisson regression of hours on the same six covariates, fit as a GLM with the
# Poisson family; its coefficients are easier to interpret than the linear ones.
Y = mroz['hours']
X = mroz.loc[:, ['kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']].copy()
# Intercept column
X = sm.add_constant(X)
# Fit the GLM and print the coefficient table
poiregmodel = sm.GLM(Y, X, family=sm.families.Poisson()).fit()
print(poiregmodel.summary())

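What is the interpretation of the coefficient -0.8075 for the "kidslt6" variable? Holding the other covariates fixed, each additional child under 6 multiplies the mean hours worked by $\exp(-0.8075) \approx 0.45$, i.e., it reduces the mean hours worked by about 55%. This is a much more interpretable result compared to before. 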

In [None]:
# Percentage change in mean hours per additional child under 6,
# obtained from the Poisson coefficient as 100 * (exp(coef) - 1):
print(100 * (np.exp(poiregmodel.params['kidslt6']) - 1))

In [None]:
# We can also take the Bayesian approach to the Poisson regression with PyMC
# (`pm` is already imported in the first cell of the notebook).
mrozpoimod = pm.Model()
with mrozpoimod:
    # Flat (improper) priors on the intercept b0 and the six slopes b1..b6
    b0 = pm.Flat("b0")
    b1 = pm.Flat("b1")
    b2 = pm.Flat("b2")
    b3 = pm.Flat("b3")
    b4 = pm.Flat("b4")
    b5 = pm.Flat("b5")
    b6 = pm.Flat("b6")
    # Linear predictor on the log scale
    log_mu = (
        b0
        + b1 * mroz['kidslt6']
        + b2 * mroz['age']
        + b3 * mroz['educ']
        + b4 * mroz['huswage']
        + b5 * mroz['exper']
        + b6 * mroz['expersq']
    )
    # Likelihood: log_mu is a symbolic PyMC expression, so exponentiate it with
    # pm.math.exp (as the linear model does for sigma) rather than NumPy's exp
    Y_obs = pm.Poisson("Y_obs", mu=pm.math.exp(log_mu), observed=mroz['hours'])
    idata = pm.sample(2000, chains=2, random_seed=0, return_inferencedata=True)

In [None]:
# Posterior summary of the Poisson-regression coefficients, set beside the
# frequentist GLM fit stored in `poiregmodel`.
# NOTE: `idata` is assigned by more than one sampling cell in this notebook;
# this cell expects the draws of the Poisson model, so run it right after that
# sampling cell.
names = ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6']
# Pool the draws of all chains into one flat array per coefficient
allsamples = [idata.posterior[name].values.flatten() for name in names]
# Keep the individual arrays available under their per-coefficient names
(b0_samples, b1_samples, b2_samples, b3_samples,
 b4_samples, b5_samples, b6_samples) = allsamples

# Five-column table: header, rule and rows all use the same column widths
header = (f"{'Parameter':<10}| {'Estimate':>13} | {'Std. Dev.':>13} | "
          f"{'Frequentist':>13} | {'Std. Error':>13}")
print(header)
print("-" * len(header))
for i, (name, arr) in enumerate(zip(names, allsamples)):
    print(f"{name:<10}| {np.mean(arr):>13.6f} | {np.std(arr):>13.6f} | "
          f"{poiregmodel.params.values[i]:>13.6f} | {poiregmodel.bse.values[i]:>13.6f}")
# However, PyMC is not very reliable here. Change the random seed passed to
# pm.sample for the Poisson model from 0 to 4, re-run, and look at the results.

In [None]:
# Newton's Method for Calculating the MLE in Poisson Regression
beta_hat = poiregmodel.params.values  # reference answer computed by statsmodels
print(beta_hat)

# Data for Newton's method. `X` has to be the design matrix (with intercept)
# of the Poisson regression, i.e. the one most recently built before this cell.
Xmat = X.values
Yvec = mroz['hours'].values

# Problem dimensions, read off the design matrix instead of being hard-coded
n, p = Xmat.shape  # n observations, p coefficients (intercept included)
m = p - 1          # number of covariates (intercept excluded)

# Starting point: intercept 3 and all slopes 0, as a float array so that the
# Newton updates are plain array arithmetic
beta_initial = np.zeros(p)
beta_initial[0] = 3.0
#beta_initial = beta_hat

In [None]:
# One Newton step for the Poisson log-likelihood, starting from beta_initial.
# NOTE: this cell overwrites beta_initial, so each re-run takes one more step.
beta_initial = np.asarray(beta_initial, dtype=float)
log_muvec = Xmat @ beta_initial
muvec = np.exp(log_muvec)           # fitted means exp(X beta)
gradient = Xmat.T @ (Yvec - muvec)  # gradient of the log-likelihood: X^T (y - mu)
# Hessian = -X^T diag(mu) X. Scaling the rows of X by mu yields the same matrix
# without materialising the dense n-by-n diagonal matrix.
Hessian = -Xmat.T @ (muvec[:, None] * Xmat)
# Newton update beta - H^{-1} g, computed by solving H d = g rather than
# forming the explicit inverse of H
beta_initial = beta_initial - np.linalg.solve(Hessian, gradient)
print(beta_initial)

In [None]:
# Repeated Newton steps for the Poisson log-likelihood, continuing from the
# current value of beta_initial and printing every iterate.
num_iterations = 100  # upper bound on the number of Newton steps
tol = 1e-8            # stop once no coefficient moves by more than this
beta_initial = np.asarray(beta_initial, dtype=float)
for i in range(num_iterations):
    log_muvec = Xmat @ beta_initial
    muvec = np.exp(log_muvec)           # fitted means exp(X beta)
    gradient = Xmat.T @ (Yvec - muvec)  # X^T (y - mu)
    # Hessian = -X^T diag(mu) X, formed by row-scaling X instead of building
    # the dense n-by-n diagonal matrix
    Hessian = -Xmat.T @ (muvec[:, None] * Xmat)
    # Solve H d = g for the Newton direction instead of inverting H
    step = np.linalg.solve(Hessian, gradient)
    beta_initial = beta_initial - step
    print(beta_initial)
    # Further iterations would only reprint the same vector once the step is negligible
    if np.max(np.abs(step)) < tol:
        print(f"Converged after {i + 1} iterations")
        break

In [None]:
# Compare with the estimate given by statsmodels: beta_hat holds the reference
# coefficients saved before the Newton iterations, to be read against the last
# iterate printed by the loop.
print(beta_hat)

In [None]:
# Standard error calculation for the Poisson regression: evaluate the Hessian
# of the log-likelihood at beta_hat and invert its negative.
log_muvec = Xmat @ beta_hat
muvec = np.exp(log_muvec)  # fitted means at the estimate
# Hessian = -X^T diag(mu) X, formed by scaling the rows of X by mu so that the
# dense n-by-n diagonal matrix is never built
Hessian = -Xmat.T @ (muvec[:, None] * Xmat)
Hessian_inv = np.linalg.inv(Hessian)
CovMat = -Hessian_inv  # estimated covariance matrix of the coefficients
# Standard errors are the square roots of the diagonal of the covariance matrix
print(np.sqrt(np.diag(CovMat)))

In [None]:
# Standard errors reported by statsmodels for the Poisson fit, for comparison
# with the values computed from the Hessian.
print(poiregmodel.bse)

In [None]:
# Logistic regression: model the binary labor-force indicator `inlf` with the
# same six covariates, fit as a GLM with the Binomial family.
Y = mroz['inlf']  # binary response
X = mroz.loc[:, ['kidslt6', 'age', 'educ', 'huswage', 'exper', 'expersq']].copy()
# Intercept column
X = sm.add_constant(X)
# Fit the GLM and print the coefficient table
logimodel = sm.GLM(Y, X, family=sm.families.Binomial()).fit()
print(logimodel.summary())

What is the interpretation of the coefficient -1.4516 of the 'kidslt6' variable? Holding the other covariates fixed, each additional child under 6 lowers the log-odds of working by 1.4516. Equivalently, the odds of working are multiplied by a factor of $\exp(-1.4516) \approx 0.234$. 

In [None]:
# Effect of one more child under 6 on a woman whose probability of working is
# pold: the odds are multiplied by exp(coef), which is equivalent to
#   1/pnew - 1 = exp(-coef) * (1/pold - 1)
pold = 0.5
pnew = 1 / (1 + np.exp(-logimodel.params['kidslt6']) * (1/pold - 1))
print([pold, pnew])

In [None]:
beta_hat = logimodel.params.values  # reference answer computed by statsmodels
print(beta_hat)

# Data for Newton's method. `X` has to be the design matrix (with intercept)
# of the logistic regression, i.e. the one most recently built before this cell.
Xmat = X.values
Yvec = mroz['inlf'].values

# Problem dimensions, read off the design matrix instead of being hard-coded
n, p = Xmat.shape  # n observations, p coefficients (intercept included)
m = p - 1          # number of covariates (intercept excluded)

# Starting point: all coefficients 0, as a float array so that the Newton
# updates are plain array arithmetic
beta_initial = np.zeros(p)
#beta_initial = beta_hat

In [None]:
# One Newton step for the logistic log-likelihood, starting from beta_initial.
# NOTE: this cell overwrites beta_initial, so each re-run takes one more step.
beta_initial = np.asarray(beta_initial, dtype=float)
xbeta = Xmat @ beta_initial
# Fitted probabilities 1 / (1 + exp(-xbeta)), written via logaddexp so that a
# large |xbeta| cannot overflow into inf/inf = nan
muvec = np.exp(-np.logaddexp(0.0, -xbeta))
gradient = Xmat.T @ (Yvec - muvec)  # gradient of the log-likelihood: X^T (y - mu)
# Hessian = -X^T diag(mu (1 - mu)) X, formed by scaling the rows of X instead
# of materialising the dense n-by-n diagonal matrix
weights = muvec * (1 - muvec)
Hessian = -Xmat.T @ (weights[:, None] * Xmat)
# Newton update beta - H^{-1} g, computed by solving H d = g rather than
# forming the explicit inverse of H
beta_initial = beta_initial - np.linalg.solve(Hessian, gradient)
print(beta_initial)

In [None]:
# Repeated Newton steps for the logistic log-likelihood, continuing from the
# current value of beta_initial and printing every iterate.
num_iterations = 10  # upper bound on the number of Newton steps
tol = 1e-8           # stop once no coefficient moves by more than this
beta_initial = np.asarray(beta_initial, dtype=float)
for i in range(num_iterations):
    xbeta = Xmat @ beta_initial
    # Fitted probabilities 1 / (1 + exp(-xbeta)), written via logaddexp so that
    # a large |xbeta| cannot overflow into inf/inf = nan
    muvec = np.exp(-np.logaddexp(0.0, -xbeta))
    gradient = Xmat.T @ (Yvec - muvec)  # X^T (y - mu)
    # Hessian = -X^T diag(mu (1 - mu)) X, formed by row-scaling X instead of
    # building the dense n-by-n diagonal matrix
    weights = muvec * (1 - muvec)
    Hessian = -Xmat.T @ (weights[:, None] * Xmat)
    # Solve H d = g for the Newton direction instead of inverting H
    step = np.linalg.solve(Hessian, gradient)
    beta_initial = beta_initial - step
    print(beta_initial)
    # Further iterations would only reprint the same vector once the step is negligible
    if np.max(np.abs(step)) < tol:
        print(f"Converged after {i + 1} iterations")
        break

In [None]:
# Reference coefficients saved from the statsmodels logistic fit, to be read
# against the last Newton iterate printed by the loop.
print(beta_hat)

In [None]:
# Standard error calculation for the logistic regression: evaluate the Hessian
# of the log-likelihood at beta_hat and invert its negative.
xbeta = Xmat @ beta_hat
# Fitted probabilities 1 / (1 + exp(-xbeta)), written via logaddexp so that a
# large |xbeta| cannot overflow into inf/inf = nan
muvec = np.exp(-np.logaddexp(0.0, -xbeta))
# Hessian = -X^T diag(mu (1 - mu)) X, formed by scaling the rows of X so that
# the dense n-by-n diagonal matrix is never built
weights = muvec * (1 - muvec)
Hessian = -Xmat.T @ (weights[:, None] * Xmat)
Hessian_inv = np.linalg.inv(Hessian)
CovMat = -Hessian_inv  # estimated covariance matrix of the coefficients
# Standard errors are the square roots of the diagonal of the covariance matrix
print(np.sqrt(np.diag(CovMat)))

In [None]:
# Standard errors reported by statsmodels for the logistic fit, for comparison
# with the values computed from the Hessian.
print(logimodel.bse)