# Project II: Economic Growth 


In [11]:
import pandas as pd 
import numpy as np 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import estimation as est
from numpy import linalg as la

%load_ext autoreload
%autoreload 2
import logit_tools as lm 
import probit_tools as pm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read data 
First, we need to read the data. The dataset is stored in the `data/ppcs_cc.csv` file.

The data is then examined to understand its structure.

In [12]:
# Load the data set and append a column of ones to be used as the intercept.
dat = pd.read_csv('data/ppcs_cc.csv')
dat['const'] = 1

# The notebook treats the sample as one cross-section: exactly one distinct year.
assert dat['year'].nunique(dropna=False) == 1

# Row and column counts (the column count includes the 'const' column added above).
N = dat.shape[0]
K = dat.shape[1]

# Short summary of the frame, followed by its first five rows.
print(f'The data contains {N} rows and {K} columns (variables) and is cross-sectional \n')
print(f'Variables are \n {dat.columns.values} \n')
dat.head()

The data contains 3799 rows and 20 columns (variables) and is cross-sectional 

Variables are 
 ['sblack' 'shisp' 'swhite' 'sother' 'smale' 'sage' 'sempl' 'sincome'
 'spop' 'daytime' 'inctype_lin' 'omajblack' 'omajhisp' 'omajwhite'
 'omajother' 'osplit' 'sbehavior' 'year' 'anyuseofforce_coded' 'const'] 



Unnamed: 0,sblack,shisp,swhite,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajwhite,omajother,osplit,sbehavior,year,anyuseofforce_coded,const
0,1,0,0,0,1,18,0,1,1,1,2,0,0,1,0,0,0,2011,0,1
1,1,0,0,0,1,20,1,2,4,0,2,0,0,1,0,0,0,2011,0,1
2,1,0,0,0,1,22,1,2,3,1,2,0,0,1,0,0,0,2011,0,1
3,1,0,0,0,1,22,1,3,1,1,2,0,0,1,0,0,0,2011,0,1
4,1,0,0,0,1,22,1,1,1,1,2,0,0,1,0,0,0,2011,0,1


In [14]:
# Outcome: label first, then the matching column as a flat array of length N.
y_lab = ['anyuseofforce_coded']
y = dat[y_lab[0]].values.reshape((N,))

# Column groups used to build the regressor sets.
race_list = ['sblack', 'shisp', 'swhite', 'sother']
s_char = ['smale', 'sage', 'sempl', 'sincome', 'spop', 'sbehavior']
o_char = ['omajblack', 'omajhisp', 'omajwhite', 'omajother', 'osplit']
other_char = ['daytime', 'inctype_lin']

## Baseline logit: use of force on race indicators

In [4]:

# Raw comparison first: mean of the outcome within each value of `sblack`.
force_by_sblack = dat.groupby('sblack')['anyuseofforce_coded'].mean()
print('Average use of force for black/white (if higher then indication of police force) \n',
      force_by_sblack,
      '\n')

# Regressors: intercept plus three race indicators ('swhite' is not included).
x_lab = ['const', 'sblack', 'shisp', 'sother']
x = dat[x_lab].values.reshape((N, len(x_lab)))

# Starting values, the log-likelihood evaluated at them, then the logit estimation.
theta0 = lm.starting_values(y, x)
ll = lm.loglikelihood(theta0, y, x); #print(f"Using the starting values {theta0}, the log sum of likelihood is {ll.sum():,.0f}")
logit_results = est.estimate(lm.q, theta0, y, x, options={'disp': False})

# Build the results table and show it as the cell output.
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab[0]}', )
logit_tab

Average use of force for black/white (if higher then indication of police force) 
 sblack
0    0.004735
1    0.007143
Name: anyuseofforce_coded, dtype: float64 

Optimizer succeded after 51 iter. (265 func. evals.). Final criterion:  0.03044.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-5.7413,0.3339,-17.1963
sblack,0.8056,0.6687,1.2046
shisp,1.5934,0.5299,3.0072
sother,0.5345,1.0568,0.5058


In [5]:
def create_var_list(coef_interest, sets: list, data=None):
    """Build cumulative regressor sets and their design matrices.

    The first specification contains the constant and ``coef_interest``; each
    following specification adds the next group in ``sets`` to the previous one.

    Parameters
    ----------
    coef_interest : str
        Column name of the variable of interest; placed right after 'const'.
    sets : list of list of str
        Groups of control columns, added cumulatively in the given order.
    data : pandas.DataFrame, optional
        Frame holding 'const', ``coef_interest`` and every column named in
        ``sets``. Defaults to the notebook-level ``dat``.

    Returns
    -------
    x_values : list of numpy.ndarray
        One 2-D array per specification, with one column per label.
    x_labels : list of list of str
        Column names of each specification, aligned with ``x_values``.

    Raises
    ------
    KeyError
        If a requested column is not present in the frame.
    """
    # Fall back to the notebook-level frame so existing calls keep working.
    if data is None:
        data = dat

    # Baseline specification: the constant and the variable of interest.
    x_labels = [['const', coef_interest]]

    # Each group extends the most recent specification.
    for group in sets:
        x_labels.append(x_labels[-1] + list(group))

    # Selecting with a list of labels already yields a 2-D (rows, columns)
    # array, so no reshape against a separately tracked row count is needed.
    x_values = [data[labels].values for labels in x_labels]

    return x_values, x_labels

# Logit 

## Example of logit Estimation and average partial effects 

This illustrates how to estimate a logit model and calculate the average partial effect of the variable of interest.

$\mathcal{I} \in (\text{Income}, \text{Male}, \text{Age}, \text{Employment})$

In [6]:
coef_interest = 'sblack'

# Specification: intercept, the race indicator of interest, and subject controls.
xLabels = ['const', coef_interest, 'smale', 'sage', 'sincome', 'sempl']
xValues = dat[xLabels].values.reshape((N, len(xLabels)))

# Stop early when the regressors are linearly dependent.
if xValues.shape[1] > la.matrix_rank(xValues):
    raise ValueError('The matrix is not full rank')

# Logit estimation, started from lm.starting_values.
logit_results = est.estimate(
    lm.q,
    theta0=lm.starting_values(y, xValues),
    y=y,
    x=xValues,
    options={'disp': False},
)

# Report each coefficient with the square root of its diagonal covariance entry.
for i, name in enumerate(xLabels):
    print(f"Estimation of {name}: {logit_results['theta'][i]:10.3f} with std: {np.sqrt(logit_results['cov'][i,i]):10.3f}")
print()

# Average partial effect of the variable of interest; k=1 is its position in xLabels.
ape, ape_var, pe_sample_std = lm.average_partial_effect(
    x_i=xValues,
    betas=logit_results['theta'],
    cov_matrix=logit_results['cov'],
    k=1,
)

print(f"Average Partial Effect of {coef_interest}: {ape:4.3f} with std: {np.sqrt(ape_var[0])[0]:4.3f}")


Estimation of const:     -3.945 with std:      1.082
Estimation of sblack:      0.341 with std:      0.662
Estimation of smale:      1.298 with std:      0.673
Estimation of sage:     -0.044 with std:      0.025
Estimation of sincome:     -0.005 with std:      0.284
Estimation of sempl:     -1.117 with std:      0.522

Average Partial Effect of sblack: 0.002 with std: 0.003


### Creating a loop to estimate the model for each variable

In [94]:
# Regressor groups; they are added cumulatively to every specification below.
race_list = ['sblack', 'shisp', 'swhite', 'sother']
subject_char = ['smale','sage','sincome', 'sempl']
# The data also has 'omajhisp' and 'osplit'; they are left out of this group
# (presumably as the reference categories -- TODO confirm).
officer_char = ['omajblack', 'omajwhite','omajother']
other_char = ['daytime']

# resDict[race][j] holds the results of specification j for that race variable:
#   'xLabels'     regressor names, in coefficient order
#   'logit_theta' estimated coefficients
#   'logit_cov'   diagonal of the covariance matrix (i.e. variances)
#   'ape'         average partial effect of the race variable
#   'ape_var'     first element of the ape_var returned with the APE
# A specification skipped for rank deficiency is left as an empty dict.
resDict = {}
for coef_interest in race_list:
    print(f'Estimating the model with {coef_interest} as the variable of interest')
    resDict[coef_interest] = {}

    # Every specification starts from the constant and the race variable.
    xLabels = ['const', coef_interest]

    # The leading empty group estimates the model with only const + race.
    for j, addChar in enumerate([[], subject_char, officer_char, other_char]):
        resDict[coef_interest][j] = {}

        # Extend the previous specification with the next group of controls.
        xLabels = xLabels + addChar
        xValues = dat[xLabels].values.reshape((N, len(xLabels)))

        print(f'Estimating the model with {xLabels} as the variables')

        # Skip specifications whose regressors are linearly dependent; report
        # the rank instead of printing the whole design matrix.
        rank = la.matrix_rank(xValues)
        if rank < xValues.shape[1]:
            print('The matrix is not full rank')
            print(f'rank {rank} < {xValues.shape[1]} columns; skipping this specification')
            continue

        # Estimate the logit model.
        logit_results = est.estimate(lm.q, theta0=lm.starting_values(y, xValues), y=y, x=xValues, options={'disp': False})

        # Store coefficients and the diagonal of the covariance matrix.
        resDict[coef_interest][j] = {'xLabels': xLabels, 'logit_theta': logit_results['theta'], 'logit_cov': np.diag(logit_results['cov'])}

        # Average partial effect of the race variable; k=1 is its position in xLabels.
        ape, ape_var, pe_sample_std = lm.average_partial_effect(x_i=xValues,
                                                                betas=logit_results['theta'],
                                                                cov_matrix=logit_results['cov'],
                                                                k=1
                                                                )

        resDict[coef_interest][j]['ape'] = ape
        resDict[coef_interest][j]['ape_var'] = ape_var[0][0]
    

Estimating the model with sblack as the variable of interest
Estimating the model with ['const', 'sblack'] as the variables
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl'] as the variables
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother'] as the variables
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother', 'daytime'] as the variables
Estimating the model with shisp as the variable of interest
Estimating the model with ['const', 'shisp'] as the variables
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl'] as the variables
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother'] as the variables
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother', 'daytim

In [99]:
# Coefficient table: a ('Coeff', 'Std') row pair for every (race variable,
# specification). The 'race' column holds the coefficient on whichever race
# indicator that model used; the column order matches the order in which the
# estimation loop builds xLabels.
logit_est = pd.DataFrame(index=pd.MultiIndex.from_product([resDict.keys(), resDict['sblack'].keys(),['Coeff','Std']]),
                columns=['const','race','smale','sage','sincome','sempl','omajblack','omajwhite','omajother','daytime'])

# Visit each (race, specification) pair once rather than once per row label.
for race, mod in logit_est.index.droplevel(2).unique():
    res = resDict[race][mod]
    if 'logit_theta' not in res:
        # Specification was skipped during estimation; leave its rows as NaN.
        continue

    beta = res['logit_theta']
    std = np.sqrt(res['logit_cov'])  # 'logit_cov' stores the covariance diagonal

    # .loc is label-based: select the leading columns by label, not by a
    # positional slice (which triggers a pandas deprecation warning).
    cols = logit_est.columns[:len(beta)]
    logit_est.loc[(race, mod, 'Coeff'), cols] = beta
    logit_est.loc[(race, mod, 'Std'), cols] = std

logit_est['race'].to_clipboard()


  logit_est.loc[(race, mod, 'Coeff'),:len(beta)] = beta
  logit_est.loc[(race, mod, 'Std'),:len(beta)] = std


In [114]:
# One row per (race variable, specification), holding the stored average partial effect.
ape_index = pd.MultiIndex.from_product([resDict.keys(), resDict['sblack'].keys()])
logit_ape = pd.DataFrame(index=ape_index, columns=['APE'])

for key in ape_index:
    race, mod = key
    logit_ape.loc[key, 'APE'] = resDict[race][mod]['ape']

# Copy the table to the clipboard.
logit_ape.to_clipboard()

# Probit - NOT WORKING!!

## Example of probit Estimation and average partial effects 

This illustrates how to estimate a probit model and calculate average partial effects for the model.

$\mathcal{I} \in (\text{Income}, \text{Male}, \text{Age}, \text{Employment})$

In [93]:
coef_interest = 'sblack'

# Specification: intercept, the race indicator of interest, and subject controls.
xLabels = ['const', coef_interest, 'smale', 'sage', 'sincome', 'sempl']
xValues = dat[xLabels].values.reshape((N, len(xLabels)))

# Stop early when the regressors are linearly dependent.
if xValues.shape[1] > la.matrix_rank(xValues):
    raise ValueError('The matrix is not full rank')

# Probit estimation (cov_type='Sandwich'), started from pm.starting_values.
# NOTE(review): in the recorded output thetaInit equals 4x the printed OLS
# estimates; 4x is the scaling usually applied for logit (probit is usually
# 2.5x) -- confirm pm.starting_values while debugging the singular-matrix error.
thetaInit = pm.starting_values(y, xValues)
print(thetaInit)
probit_results = est.estimate(pm.q, theta0=thetaInit, y=y, x=xValues, cov_type='Sandwich', options={'disp': False})

# Report each coefficient with the square root of its diagonal covariance entry.
for i, name in enumerate(xLabels):
    print(f"Estimation of {name}: {probit_results['theta'][i]:10.3f} with std: {np.sqrt(probit_results['cov'][i,i]):10.3f}")
print()

# The average-partial-effect step is kept disabled; the recorded run of this
# cell stopped with "LinAlgError: Singular matrix" before reaching it.
# # calculate the average marginal effects
# ape, ape_var, pe_sample_std = pm.average_partial_effect(x_i=xValues,                      # x-values
#                                                          betas=probit_results['theta'],    # estimated coefficients
#                                                          cov_matrix=probit_results['cov'], # covariance matrix
#                                                          k=1 # location in the xLabels of the variable of interest
#                                                          )

# print(f"Average Partial Effect of {coef_interest}: {ape:4.3f} with std: {np.sqrt(ape_var[0])[0]:4.3f}")


OLS estimate of beta: [ 1.61126547e-02  1.93175205e-03  5.79265425e-03 -2.24685323e-04
 -9.40630995e-05 -7.15339040e-03]
[ 0.06445062  0.00772701  0.02317062 -0.00089874 -0.00037625 -0.02861356]


LinAlgError: Singular matrix

### Creating a loop to estimate the model for each variable

In [92]:
# Regressor groups; they are added cumulatively to every specification below.
race_list = ['sblack', 'shisp', 'swhite', 'sother']
subject_char = ['smale','sage','sincome', 'sempl']
# The data also has 'omajhisp' and 'osplit'; they are left out of this group
# (presumably as the reference categories -- TODO confirm).
officer_char = ['omajblack', 'omajwhite','omajother']
other_char = ['daytime']

# NOTE(review): this rebinds `resDict`, replacing the logit results stored
# under the same name, and keeps the 'logit_*' keys for probit estimates.
# Because the APE step below is disabled, entries have no 'ape'/'ape_var' keys.
resDict = {}
for coef_interest in race_list:
    print(f'Estimating the model with {coef_interest} as the variable of interest')
    resDict[coef_interest] = {}

    # Every specification starts from the constant and the race variable.
    xLabels = ['const', coef_interest]

    # There is no empty leading group here, so the first specification already
    # includes the subject characteristics.
    for j, addChar in enumerate([subject_char, officer_char, other_char]):
        resDict[coef_interest][j] = {}

        # Extend the previous specification with the next group of controls.
        xLabels = xLabels + addChar
        xValues = dat[xLabels].values.reshape((N, len(xLabels)))

        print(f'Estimating the model with {xLabels} as the variables')

        # Skip specifications whose regressors are linearly dependent.
        if la.matrix_rank(xValues) < xValues.shape[1]:
            print('The matrix is not full rank')
            print(xValues)
            continue

        # Estimate the probit model. Failures are reported together with the
        # underlying error instead of being swallowed by a bare `except`, so
        # the real cause is visible and KeyboardInterrupt is not intercepted.
        try:
            logit_results = est.estimate(pm.q, theta0=pm.starting_values(y, xValues), y=y, x=xValues, options={'disp': False})
        except Exception as err:
            print(f'Probit did not converge ({type(err).__name__}: {err})')
            continue

        # Store coefficients and the diagonal of the covariance matrix.
        resDict[coef_interest][j] = {'xLabels': xLabels, 'logit_theta': logit_results['theta'], 'logit_cov': np.diag(logit_results['cov'])}

        # Average partial effects are disabled for the probit model for now.
        # calculate the average marginal effects
        # ape, ape_var, pe_sample_std = pm.average_partial_effect(x_i=xValues,                      # x-values
        #                                                         betas=logit_results['theta'],    # estimated coefficients
        #                                                         cov_matrix=logit_results['cov'], # covariance matrix
        #                                                         k=1 # location in the xLabels of the variable of interest
        #                                                         )

        # # Add ape an ape_var to the dictionary
        # resDict[coef_interest][j]['ape'] = ape
        # resDict[coef_interest][j]['ape_var'] = ape_var[0][0]
    

Estimating the model with sblack as the variable of interest
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl'] as the variables
Probit did not converge
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother'] as the variables
Probit did not converge
Estimating the model with ['const', 'sblack', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother', 'daytime'] as the variables
Probit did not converge
Estimating the model with shisp as the variable of interest
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl'] as the variables
Probit did not converge
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother'] as the variables
Probit did not converge
Estimating the model with ['const', 'shisp', 'smale', 'sage', 'sincome', 'sempl', 'omajblack', 'omajwhite', 'omajother', 'daytime'] a

In [None]:
# Flatten the nested results dictionary ({race: {specification: {...}}}) with
# pandas' json_normalize. Only this frame is kept: building an intermediate
# frame that is overwritten, or stacking without storing or displaying the
# result, has no effect on `df_logit`.
df_logit = pd.json_normalize(resDict)


In [None]:
# One row per (race variable, specification): the stored average partial effect
# and the square root of its stored variance.
ape_index = pd.MultiIndex.from_product([resDict.keys(), resDict['sblack'].keys()])
logit_ape = pd.DataFrame(index=ape_index, columns=['APE', 'APE std'])

for key in ape_index:
    race, mod = key
    logit_ape.loc[key, 'APE'] = resDict[race][mod]['ape']
    logit_ape.loc[key, 'APE std'] = np.sqrt(resDict[race][mod]['ape_var'])

logit_ape


Unnamed: 0,Unnamed: 1,APE,APE std
sblack,0,0.00189,0.003304
sblack,1,0.003007,0.003665
sblack,2,0.002787,0.004135
shisp,0,0.007589,0.0036
shisp,1,0.007089,0.006065
shisp,2,0.00732,0.006019
swhite,0,-0.006002,0.003622
swhite,1,-0.006218,0.005983
swhite,2,-0.006139,0.005848
sother,0,-0.000604,0.007097
