# Project II: Economic Growth 

This notebook will help you get started with analyzing the dataset loaded below, `data/ppcs_cc.csv`.

In [1]:
import pandas as pd 
import numpy as np 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import estimation as est
from numpy import linalg as la

%load_ext autoreload
%autoreload 2
import tools as lm 

## Read data 

In [40]:
# Load the dataset and confirm that it covers exactly one year,
# so that it can be treated as a cross-section.
dat = pd.read_csv('data/ppcs_cc.csv')
years_present = dat['year'].unique()
assert len(years_present) == 1

# Report the dimensions and list the available variables.
n_rows, n_cols = dat.shape
print(f'The data contains {n_rows} rows and {n_cols} columns (variables) and is cross-sectional \n')
print(f'Variables are \n {dat.columns} \n')
dat

The data contains 3799 rows and 19 columns (variables) and is cross-sectional 

Variables are 
 Index(['sblack', 'shisp', 'swhite', 'sother', 'smale', 'sage', 'sempl',
       'sincome', 'spop', 'daytime', 'inctype_lin', 'omajblack', 'omajhisp',
       'omajwhite', 'omajother', 'osplit', 'sbehavior', 'year',
       'anyuseofforce_coded'],
      dtype='object') 



Unnamed: 0,sblack,shisp,swhite,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajwhite,omajother,osplit,sbehavior,year,anyuseofforce_coded
0,1,0,0,0,1,18,0,1,1,1,2,0,0,1,0,0,0,2011,0
1,1,0,0,0,1,20,1,2,4,0,2,0,0,1,0,0,0,2011,0
2,1,0,0,0,1,22,1,2,3,1,2,0,0,1,0,0,0,2011,0
3,1,0,0,0,1,22,1,3,1,1,2,0,0,1,0,0,0,2011,0
4,1,0,0,0,1,22,1,1,1,1,2,0,0,1,0,0,0,2011,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,0,0,1,0,0,72,1,3,1,0,1,0,0,1,0,0,1,2011,0
3795,0,0,1,0,0,71,0,2,1,1,2,0,0,1,0,0,0,2011,0
3796,0,0,1,0,0,76,0,1,1,1,2,0,0,1,0,0,0,2011,0
3797,0,0,1,0,0,79,0,3,4,1,2,0,0,1,0,0,0,2011,0


In [41]:
# Variable groups used to build the nested regression specifications.
race_list = ['sblack', 'shisp', 'swhite', 'sother']

# list of characteristics
s_char = ['smale', 'sage', 'sempl', 'sincome', 'spop', 'sbehavior']
o_char = ['omajblack', 'omajhisp', 'omajwhite', 'omajother', 'osplit']
other_char = ['daytime', 'inctype_lin']

control_variable_list = []

# Check that every data column is assigned to exactly one group.
# 'year' and the outcome are deliberately left out of the groups, and '_const'
# is ignored so that this cell can be re-run after the intercept column has
# been added to `dat` further down in the notebook.
listed_vars = race_list + s_char + o_char + other_char
not_grouped = {'year', 'anyuseofforce_coded', '_const'}
data_vars = [col for col in dat.columns if col not in not_grouped]
if sorted(listed_vars) != sorted(data_vars):
    unlisted = sorted(set(data_vars) - set(listed_vars))
    unknown = sorted(set(listed_vars) - set(data_vars))
    raise ValueError(
        f'{len(listed_vars)} listed variables != {len(data_vars)} data columns '
        f'(unlisted: {unlisted}, not in data: {unknown})'
    )

# Generate iterations
coef_interest = 'sblack'
constant = ['_const', coef_interest]
iterations = [constant]  # Start with just the constant and the coefficient of interest

# Add the control groups in stages: each specification extends the previous
# one, so the specifications are nested.
for group in (s_char, o_char, other_char):
    iterations.append(iterations[-1] + group)

In [42]:
# Display the nested specifications: each list extends the previous one with a further group of controls.
iterations

[['_const', 'sblack'],
 ['_const',
  'sblack',
  'smale',
  'sage',
  'sempl',
  'sincome',
  'spop',
  'sbehavior'],
 ['_const',
  'sblack',
  'smale',
  'sage',
  'sempl',
  'sincome',
  'spop',
  'sbehavior',
  'omajblack',
  'omajhisp',
  'omajwhite',
  'omajother',
  'osplit'],
 ['_const',
  'sblack',
  'smale',
  'sage',
  'sempl',
  'sincome',
  'spop',
  'sbehavior',
  'omajblack',
  'omajhisp',
  'omajwhite',
  'omajother',
  'osplit',
  'daytime',
  'inctype_lin']]

In [46]:
# Add an intercept column so that '_const' can be selected like any other regressor.
dat['_const'] = 1
N, K = dat.shape

# Outcome vector of length N, together with its label.
y_lab = ['anyuseofforce_coded']
y = dat[y_lab[0]].to_numpy().reshape(N)

# One regressor matrix per specification in `iterations`;
# `x_labs` refers to the same list and provides the matching column labels.
x_labs = iterations
x_iter = []
for x_list in x_labs:
    x = dat.loc[:, x_list].to_numpy().reshape(N, len(x_list))
    x_iter.append(x)

In [47]:
# Display the regressor matrices, one per specification in `iterations`.
x_iter

[array([[1, 1],
        [1, 1],
        [1, 1],
        ...,
        [1, 0],
        [1, 0],
        [1, 0]]),
 array([[1, 1, 1, ..., 1, 1, 0],
        [1, 1, 1, ..., 2, 4, 0],
        [1, 1, 1, ..., 2, 3, 0],
        ...,
        [1, 0, 0, ..., 1, 1, 0],
        [1, 0, 0, ..., 3, 4, 0],
        [1, 0, 0, ..., 2, 1, 0]]),
 array([[1, 1, 1, ..., 1, 0, 0],
        [1, 1, 1, ..., 1, 0, 0],
        [1, 1, 1, ..., 1, 0, 0],
        ...,
        [1, 0, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 1, 0, 0]]),
 array([[1, 1, 1, ..., 0, 1, 2],
        [1, 1, 1, ..., 0, 0, 2],
        [1, 1, 1, ..., 0, 1, 2],
        ...,
        [1, 0, 0, ..., 0, 1, 2],
        [1, 0, 0, ..., 0, 1, 2],
        [1, 0, 0, ..., 0, 1, 2]])]

In [None]:
# Mean of anyuseofforce_coded by sblack; a higher mean for sblack == 1
# leads us to expect a positive coefficient on sblack.
dat['anyuseofforce_coded'].groupby(dat['sblack']).mean()

sblack
0    0.004735
1    0.007143
Name: anyuseofforce_coded, dtype: float64

In [None]:
# Outcome vector and regressor matrix for the model with race dummies only.
N, K = dat.shape

y_lab = ['anyuseofforce_coded']
y = dat[y_lab[0]].to_numpy().reshape(N)

# Race dummies; 'swhite' is left out so that it acts as the reference category.
race_dummies = ['sblack', 'shisp', 'sother']
x = dat.loc[:, race_dummies].to_numpy().reshape(N, len(race_dummies))

# Prepend a column of ones for the intercept and label it accordingly.
x = np.hstack((np.ones((N, 1)), x))
x_lab = ['const'] + race_dummies


In [30]:
# Starting values for the estimation, computed by starting_values in the local `tools` module (imported as lm).
theta0 = lm.starting_values(y,x)
theta0

array([0.01282051, 0.01575092, 0.04935565, 0.00880111])

In [31]:
# Evaluate lm.loglikelihood at the starting values and sum the result.
# NOTE(review): presumably one log-likelihood contribution per observation — confirm in tools.py.
ll = lm.loglikelihood(theta0, y, x)
ll.sum()


-2670.9682180872105

In [32]:
# Estimate the model: pass the criterion lm.q, the starting values and the data to est.estimate.
# NOTE(review): lm.q is presumably the average negative log-likelihood to be minimized — confirm in tools.py.
logit_results = est.estimate(lm.q, theta0, y, x)

Optimization terminated successfully.
         Current function value: 0.030440
         Iterations: 51
         Function evaluations: 265
         Gradient evaluations: 53


In [None]:
# Tabulate the estimates; the output below reports theta, se and t for each label in x_lab.
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab}')
logit_tab

Optimizer succeded after 51 iter. (265 func. evals.). Final criterion:  0.03044.
Logit, y = ['anyuseofforce_coded']


Unnamed: 0,theta,se,t
const,-5.7413,0.3339,-17.1963
sblack,0.8056,0.6687,1.2046
shisp,1.5934,0.5299,3.0072
sother,0.5345,1.0568,0.5058
