# Project II: Economic Growth 

This notebook will help you getting started with analyzing the growth dataset, `growth.csv`.

In [1]:
import pandas as pd 
import numpy as np 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import estimation as est
from numpy import linalg as la

%load_ext autoreload
%autoreload 2
import tools as lm 

## Read data 

In [2]:
dat = pd.read_csv('data/ppcs_cc.csv')
# Add constant
dat['const'] = 1

# Check if the data is cross-sectional
assert len(dat.year.unique())==1

# Dimensions of the data
N, K = dat.shape

# Print the data
print(f'The data contains {N} rows and {K} columns (variables) and is cross-sectional \n')
print(f'Variables are \n {dat.columns.values} \n')
dat.head(5)

The data contains 3799 rows and 20 columns (variables) and is cross-sectional 

Variables are 
 ['sblack' 'shisp' 'swhite' 'sother' 'smale' 'sage' 'sempl' 'sincome'
 'spop' 'daytime' 'inctype_lin' 'omajblack' 'omajhisp' 'omajwhite'
 'omajother' 'osplit' 'sbehavior' 'year' 'anyuseofforce_coded' 'const'] 



Unnamed: 0,sblack,shisp,swhite,sother,smale,sage,sempl,sincome,spop,daytime,inctype_lin,omajblack,omajhisp,omajwhite,omajother,osplit,sbehavior,year,anyuseofforce_coded,const
0,1,0,0,0,1,18,0,1,1,1,2,0,0,1,0,0,0,2011,0,1
1,1,0,0,0,1,20,1,2,4,0,2,0,0,1,0,0,0,2011,0,1
2,1,0,0,0,1,22,1,2,3,1,2,0,0,1,0,0,0,2011,0,1
3,1,0,0,0,1,22,1,3,1,1,2,0,0,1,0,0,0,2011,0,1
4,1,0,0,0,1,22,1,1,1,1,2,0,0,1,0,0,0,2011,0,1


In [3]:
# create the array with the y-values
y = dat['anyuseofforce_coded'].values.reshape((N,))
y_lab = ['anyuseofforce_coded']

# list of characteristics 
race_list = ['sblack', 'shisp', 'swhite', 'sother']
s_char = ['smale','sage', 'sempl', 'sincome', 'spop', 'sbehavior']
s_char = ['sincome'] # removed to avoid multicollinearity
o_char = ['omajblack', 'omajhisp', 'omajwhite','omajother', 'osplit'] 
o_char = ['omajwhite'] # removed to avoid multicollinearity
other_char = ['daytime', 'inctype_lin']
other_char = ['daytime'] # removed to avoid multicollinearity

# check we specified all variables in lists. 
# if len(race_list + s_char + o_char + other_char) != (len(dat.columns)-3): # 2 since not using year and anyuseofforce_coded. 
#     raise ValueError(f'{len(race_list + s_char + o_char + other_char)} != {(len(dat.columns)-1)} ')


def create_var_list(coef_interest):
    """ List of variables to include in the regression."""
    sets = [s_char, o_char, other_char]
    x_labels = [['const', coef_interest]]  # Start with just the constant

    # Add all variables in stages
    for i, group in enumerate(sets):
        i += 1
        x_labels.append(x_labels[i-1] + group)
    
    x_values = []
    for x_list in x_labels:
        x = dat[x_list].values.reshape((N,len(x_list)))
        x_values.append(x)
    
    return x_values, x_labels

In [4]:
dat.groupby('sblack')['anyuseofforce_coded'].mean() # expect positive coef

sblack
0    0.004735
1    0.007143
Name: anyuseofforce_coded, dtype: float64

In [5]:
x_lab = ['const','sblack', 'shisp', 'sother']
x = dat[x_lab].values.reshape((N,len(x_lab)))

In [6]:
theta0 = lm.starting_values(y,x)
ll = lm.loglikelihood(theta0, y, x)
print(f"Using the starting values {theta0}, the log sum of likelihood is {ll.sum():,.0f}")

Using the starting values [0.01282051 0.01575092 0.04935565 0.00880111], the log sum of likelihood is -2,671


In [7]:
logit_results = est.estimate(lm.q, theta0, y, x, options={'disp': False})
logit_tab = est.print_table(x_lab, logit_results, title=f'Logit, y = {y_lab[0]}', )
logit_tab

Optimizer succeded after 51 iter. (265 func. evals.). Final criterion:  0.03044.
Logit, y = anyuseofforce_coded


Unnamed: 0,theta,se,t
const,-5.7413,0.3339,-17.1963
sblack,0.8056,0.6687,1.2046
shisp,1.5934,0.5299,3.0072
sother,0.5345,1.0568,0.5058


## Marginal effect

In [8]:
race_dict = {}

for i, coef_interest in enumerate(['sblack', 'shisp', 'swhite', 'sother']):
    # initialize dict
    race_dict[coef_interest] = {}
    # create the x-values and labels 
    values_list, labels_list = create_var_list(coef_interest)
    for j, (x_values, x_labels) in enumerate(zip(values_list, labels_list)):
        # initialize dict
        inner_dict = {'estimation': {}, 'ME logit': None, 'ME logit std': None}
                
        # slice the data and the labels
        x_i = x_values[:,:]; x_lab_i = x_labels[:];
        
        # test if the matrix is full rank
        if la.matrix_rank(x_i) < x_i.shape[1]:
            continue
        
        # estimate the model logit model
        logit_results = est.estimate(lm.q, theta0=lm.starting_values(y, x_values), y=y, x=x_values, options={'disp': False})
        inner_dict['estimation']['beta'] = logit_results['theta'] 
        inner_dict['estimation']['cov'] = logit_results['cov']
        inner_dict['estimation']['labels'] = x_lab_i
        
        # calculate the average marginal effects
        ape, ape_var, pe_sample_std = lm.average_partial_effect(x_i=x_i, betas=logit_results['theta'],cov_matrix=logit_results['cov'],k=1)
        inner_dict['ME logit'], inner_dict['ME logit std'] = ape, np.sqrt(ape_var[0])[0]
        inner_dict['ME sample std'] = pe_sample_std

        # store all the results
        race_dict[coef_interest][f"Mod: {str(j)}"] = inner_dict

In [None]:
# Restructure the data to handle two levels of keys and separate rows for ME and SE
data = []
for outer_key, inner_dict in race_dict.items():
    for inner_key, values in inner_dict.items():
        for label, coeff, var, in zip(values['estimation']['labels'], values['estimation']['beta'], np.diag(values['estimation']['cov'])):
            data.append({
                'sRace': outer_key,
                'Model': inner_key,
                'Regressor': label,
                'Tipo':'Coeff',
                'Value': coeff,
            })
            data.append({
                'sRace': outer_key,
                'Model': inner_key,
                'Regressor': label,
                'Tipo':'Std',
                'Value': var,
            })

# Convert to a DataFrame
df = pd.DataFrame(data)

# Pivot for a cleaner layout if needed
df_pivot = df.pivot(index=['sRace', 'Model','Tipo'], columns='Regressor', values='Value')

# Create one race column for the pivot
df_pivot['ethnicity'] = df_pivot[['sblack', 'shisp', 'sother','swhite']].fillna(0).sum(axis=1)

# Convert to latex
latex_output = lm.dataframe_to_latex_table_multirow(
    df_pivot[['const','ethnicity','sincome', 'omajwhite', 'daytime']],
    caption='Regression Results for sblack',
    label='tab:sblack_results', 
    std=True,
)
 
with open('output/sblack_results.tex', 'w') as f:
    f.write(latex_output)

# Final output
df_pivot[['const','ethnicity','sincome', 'omajwhite', 'daytime']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Regressor,const,ethnicity,sincome,omajwhite,daytime
sRace,Model,Tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sblack,Mod: 0,Coeff,-5.34808,0.413645,,,
sblack,Mod: 0,Std,0.062797,0.398529,,,
sblack,Mod: 1,Coeff,-5.052252,0.382896,-0.138556,,
sblack,Mod: 1,Std,0.39339,0.398776,0.077671,,
sblack,Mod: 2,Coeff,-5.756441,0.466344,-0.136973,0.745548,
sblack,Mod: 2,Std,1.312758,0.403429,0.077733,1.069021,
sblack,Mod: 3,Coeff,-5.301891,0.424534,-0.121417,0.701487,-0.772012
sblack,Mod: 3,Std,1.760415,0.53367,0.080965,1.200962,0.311798
shisp,Mod: 0,Coeff,-5.566549,1.418004,,,
shisp,Mod: 0,Std,0.077217,0.246515,,,


In [82]:
# Restructure the data to handle two levels of keys and separate rows for ME and SE
data = []
for outer_key, inner_dict in race_dict.items():
    for inner_key, values in inner_dict.items():
        data.append({
            'sRace': outer_key,
            'Model': inner_key,
            'Metric': 'ME logit',
            'Value': values['ME logit']
        })
        data.append({
            'sRace': outer_key,
            'Model': inner_key,
            'Metric': 'SE logit',
            'Value': values['ME logit std']
        })
        data.append({
            'sRace': outer_key,
            'Model': inner_key,
            'Metric': 'ME sample std',
            'Value': values['ME sample std']
        })

# Convert to a DataFrame
df = pd.DataFrame(data)

# Pivot for a cleaner layout if needed
df_pivot = df.pivot(index=['sRace', 'Model'], columns='Metric', values='Value')

# Final output
df_pivot = df_pivot.assign(Tipo='Coeff').set_index('Tipo', append=True)

# Convert to latex
latex_output = lm.dataframe_to_latex_table_multirow(
    df_pivot,
    caption='Average Marginal ',
    label='tab:sblack_results', 
    std=False,
    format_func=lambda x: f"${x:.3%}$".replace('%','\%') if pd.notna(x) else '',
    prime_vspace=0, sec_vspace=0
)
 
with open('output/ME_results.tex', 'w') as f:
    f.write(latex_output)

# Final output
df_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Metric,ME logit,ME sample std,SE logit
sRace,Model,Tipo,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sblack,Mod: 0,Coeff,0.002408,4.336809e-19,0.003258
sblack,Mod: 1,Coeff,0.002191,0.000259141,0.003226
sblack,Mod: 2,Coeff,0.002793,0.0005302551,0.003334
sblack,Mod: 3,Coeff,0.002481,0.001090446,0.003783
shisp,Mod: 0,Coeff,0.011733,1.734723e-18,0.004529
shisp,Mod: 1,Coeff,0.011465,0.0009886161,0.004485
shisp,Mod: 2,Coeff,0.011766,0.002129149,0.004746
shisp,Mod: 3,Coeff,0.011497,0.004922208,0.004846
sother,Mod: 0,Coeff,0.000459,0.0,0.005128
sother,Mod: 1,Coeff,0.000503,6.625791e-05,0.005322
