In [72]:
import pandas as pd
from statsmodels.formula.api import ols
import numpy as np

## Explanatory analysis

In [73]:
df = pd.read_stata("assignment_1.dta", convert_dates=False)

In [74]:
df.head(7)

Unnamed: 0,county,year,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,...,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix,trend
0,1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.69763,...,3.006608,,,,,,,,,1.0
1,1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.874252,...,3.006608,-0.039376,0.154542,0.074143,0.071048,-0.003571,-0.011364,-0.032565,0.030857,2.0
2,1,83,0.030305,0.330449,0.525703,0.479705,5.8,0.001836,2.341801,26.451443,...,3.006608,-0.235316,-0.022922,0.193987,-0.055326,0.036879,0.038413,0.061477,-0.244732,3.0
3,1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.34642,26.842348,...,3.006608,0.13618,0.092641,0.140006,0.080857,0.172213,0.02693,0.01467,-0.027331,4.0
4,1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.140337,...,3.006608,0.051825,-0.108054,-0.043918,-0.04532,-0.050606,0.020199,0.047223,0.172125,5.0
5,1,86,0.034752,0.326062,0.512324,0.439863,6.9,0.001895,2.385681,29.74098,...,3.006608,-0.051062,0.002048,-0.121867,-0.122245,0.052056,-0.015258,0.055322,0.042765,6.0
6,1,87,0.035604,0.29827,0.527596,0.43617,6.71,0.001828,2.422633,30.993681,...,3.006608,0.024198,-0.089089,0.029374,-0.008431,-0.027923,-0.036189,0.041257,-0.193899,7.0


In [41]:
df[[
    "crmrte", 
    "prbconv", 
    "prbarr", 
    "avgsen", 
    "polpc",
    "density",
    'taxpc', 
    'west', 
    'central', 
    'urban'
]]

Unnamed: 0,crmrte,prbconv,prbarr,avgsen,polpc,density,taxpc,west,central,urban
0,0.039885,0.402062,0.289696,5.61,0.001787,2.307159,25.697630,0,1,0
1,0.038345,0.433005,0.338111,5.59,0.001767,2.330254,24.874252,0,1,0
2,0.030305,0.525703,0.330449,5.80,0.001836,2.341801,26.451443,0,1,0
3,0.034726,0.604706,0.362525,6.89,0.001886,2.346420,26.842348,0,1,0
4,0.036573,0.578723,0.325395,6.55,0.001924,2.364896,28.140337,0,1,0
...,...,...,...,...,...,...,...,...,...,...
625,0.015575,0.480392,0.226667,7.77,0.001073,0.869048,18.905853,1,0,0
626,0.013662,1.410260,0.204188,10.11,0.001109,0.872024,22.704754,1,0,0
627,0.013086,0.830769,0.180556,5.96,0.001054,0.875000,24.123611,1,0,0
628,0.012874,2.250000,0.112676,7.68,0.001088,0.880952,24.981979,1,0,0


In [75]:
df = df.assign(
    log_crmrte=np.log(df["crmrte"]),
    log_prbconv = np.log(df["prbconv"]),
    log_prbarr = np.log(df["prbarr"]),
    log_avgsen = np.log(df["avgsen"]),
    log_polpc = np.log(df["polpc"]),
    log_density = np.log(df["density"])
    )

log_variables = [column for column in df.columns if column.split("_")[0] == "log" and column != "log_crmrte"]
dependent_variable = ["log_crmrte"]

In [76]:
df_final = df.dropna()

In [99]:
len(list(df_final.county.unique()))

90

## Estimations

- Pooled OLS

In [77]:
pooled_ols = ols("log_crmrte ~" + " + ".join(log_variables+["west", "central", "urban"]), data=df_final).fit()
print(pooled_ols.summary())


                            OLS Regression Results                            
Dep. Variable:             log_crmrte   R-squared:                       0.766
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     217.3
Date:                Fri, 10 Mar 2023   Prob (F-statistic):          4.93e-162
Time:                        14:24:29   Log-Likelihood:                -77.694
No. Observations:                 540   AIC:                             173.4
Df Residuals:                     531   BIC:                             212.0
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -1.8795      0.189     -9.921      

- Random Effects

In [70]:
pooled_ols = ols(
    "log_crmrte ~" + " + ".join(log_variables+["west", "central", "urban"]), 
    data=df_final).\
    fit()
print(pooled_ols.summary())

                            OLS Regression Results                            
Dep. Variable:             log_crmrte   R-squared:                       0.766
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     217.3
Date:                Thu, 09 Mar 2023   Prob (F-statistic):          4.93e-162
Time:                        19:39:53   Log-Likelihood:                -77.694
No. Observations:                 540   AIC:                             173.4
Df Residuals:                     531   BIC:                             212.0
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -1.8795      0.189     -9.921      

- Fixed Effects

(I have to make an F-test comparing the restricted model with the unrestricted one, where the unrestricted model is the model with all dummies added)

In [84]:
dummy_vars = pd.get_dummies(df_final["year"].astype(str), prefix="year")
dummy_vars

Unnamed: 0,year_82,year_83,year_84,year_85,year_86,year_87
1,1,0,0,0,0,0
2,0,1,0,0,0,0
3,0,0,1,0,0,0
4,0,0,0,1,0,0
5,0,0,0,0,1,0
...,...,...,...,...,...,...
625,0,1,0,0,0,0
626,0,0,1,0,0,0
627,0,0,0,1,0,0
628,0,0,0,0,1,0


In [93]:
df_fixed_effects = pd.concat([df_final, dummy_vars], axis=1)

year_dummy_columns = list(dummy_vars.columns)[1:] # year_82 is removed to avoid multicollinearity
geographical_columns = ["west", "central", "urban"]

fixed_effects_ols = ols(
    "log_crmrte ~" + " + ".join(log_variables+geographical_columns+year_dummy_columns), 
    data=df_fixed_effects).\
    fit(cov_type='cluster', cov_kwds={'groups': df_fixed_effects['county']})
    
print(fixed_effects_ols.summary())

                            OLS Regression Results                            
Dep. Variable:             log_crmrte   R-squared:                       0.771
Model:                            OLS   Adj. R-squared:                  0.765
Method:                 Least Squares   F-statistic:                     66.60
Date:                Fri, 10 Mar 2023   Prob (F-statistic):           4.88e-40
Time:                        14:37:22   Log-Likelihood:                -71.700
No. Observations:                 540   AIC:                             171.4
Df Residuals:                     526   BIC:                             231.5
Df Model:                          13                                         
Covariance Type:              cluster                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -1.7624      0.760     -2.319      