# ** Do Permanent School Closings in Chicago Increase Crime Rates? **
# By: Dani Valverde and Katherine Tai

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from xgboost import XGBClassifier, XGBRegressor
import sklearn
import os
from austen_plots.AustenPlot import AustenPlot
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
import doubleml as dml

# Introduction
Since 2002, Chicago Public Schools (CPS) has closed at least one school every year between 2002 and 2018. In this report, the question we are aiming to answer is: "What is the effect of school closings on crime rates in the ZIP Codes where schools are closed?"

Previous research has studied the effects of school closings on the students whose schools were closed and on the students' academic performance as a group. However, few studies have studied the short-term and long-term effects of school closings on the neighborhoods where the schools are closed. 

One study, performed by N Brazil, has studied the effects of the 2013 school closings in Chicago. However, this study focused more on the absolute distance of the school in meters and the effect on the community. Further, this study explicitly chose not to focus on the binary variable of whether schools were closed or not closed. In addition, Brazil's study focused exclusively on census-tract level data. Our analysis focuses on Chicago Community Areas, which are larger than census tracts and may reveal differences in crime patterns. Further, we are analyzing the effects of all school closures, including those in years where only 1-2 schools were closed. We hope that analyzing more schools over time will yield revealing into insights into the ways that closing schools affects a community, if at all. 

In particular, the vast majority of schools were closed in 2013. Since it has been almost 10 years since the schools were closed, this time elapsed provides a useful time period for estimating the short-term effects on neighborhoods. Further, because crime data is widely available, it is possible to measure the changes in crime rates in particular zip codes from year-to-year. And, because some community areas did not have schools closed, it is possible to determine how much of a change in crime is due to citywide trends, and how much can be attributed to the particular school closing. 

# Data 
We utilized publicly available data from the City of Chicago Data Portal about Crime Statistics since 2001 by Zip Code. There were also publicly available lists of school closings. Combining the dates and ZIP Codes of school closings and the crime rates for those ZIP codes and years, it is possible to measure the effect of these school closings within a given community area within the city of Chicago.

Community areas are defined geographic areas set by the City of Chicago. They are  static, larger than census tracts, and their boundaries do not change as often as city wards, and are more geographically precise than ZIP Codes. 

One of the challenges in answering this question is identifying confounders. Previous research has shown that overall economic indicators such as unemployment rate and economic growth, as well as the proportion of males ages 15-25 are strong predictors for crime rates. Some of the often stated reasons for schools closing are poor performance and population declines. 

Therefore, we calculated the 

# Methodology
To identify the causal estimand of interest, we utilized a Double Machine Learning approach. In the figures below, we show the results of a linear regression. 
When we adjusted for confounders, we found that 

For estimation, we utilized a Difference in Differences approach to estimating the data.

One of the challenges with this analysis was adjusting for observed and unobserved confounders. Social characteristics such as poverty rates, household income, college attainment rates are all predictors of crime. 

In addition to our identification and estimation techniques, we also conducted sensitivity analysis

In [19]:
crime_data_2002_2010 = pd.read_csv("2002-2010_crimes.csv")

In [20]:
crime_data_small_2002_2010 = crime_data_2002_2010[["Year", "Community Area"]]
grouped_2002_2010 = crime_data_small.groupby(["Year", "Community Area"]).size()
grouped_2002_2010.to_csv('2002_2010_crime_groupings.csv')

In [3]:
crime_data_2011_2020 = pd.read_csv("2011-2020_crimes.csv")

In [5]:
crime_data_small_2011_2020 = crime_data_2011_2020[["Year", "Community Area"]]
grouped_2011_2020 = crime_data_small_2011_2020.groupby(["Year", "Community Area"]).size()
grouped_2011_2020.to_csv('2011_2020_crime_groupings.csv')

In [5]:
# create nuisance functions 

# random forest model 
def create_random_forest_Q():
    return RandomForestRegressor(random_state = RANDOM_SEED, n_estimators = 500)
random_forest_Q = create_Q()

# gradient boosting model 
def create_xgb_Q():
    return XGBClassifier()

# OLS
def create_ols_Q():
    return OLS

# LASSO
def create_LASSO():
    return LASSO

X_w_treatment = confounders.copy()
X_w_treatment['treatment'] = treatment

X_train, X_trest, Y_train, Y_test = train_test_split(X_w_treatment, outcome, test_size = 0.2)
random_forest_Q.fit(X_train, Y_train)
xgb_Q_fit(X_train, Y_train)
ols_Q_fit(X_train, Y_train)
LASSO_Q_fit(X_train, Y_train)

fit_MSE = mean_squared_error(Y_pred, Y_test)
baseline_MSE = mean_squared_erro(y_train.mean()*np.ones_like(Y_test), Y_test)



NameError: name 'RANDOM_SEED' is not defined

In [None]:
# propensity scores model 

def create_g():
    return Logistic Regression(max_iter=1000)
    return RandomForestClassifier(n_estimators=100, max_depth=5)
g_model = create_g()

X_train, X_test, A_train, A_test = train_test_split(confounders, treatment)
g_model.fit(X_train, A_train)
A_pred = g_model.predict_proba(X_test)[:,1]

test_cross_entropy = log_loss(A_test, A_pred)
baseline_cross_entropy = log_loss(A_test, A_Train.mean()*np.ones_like(A_test))

In [None]:
# could do cross fitting here? Idt it will work tho 

In [None]:
# Double ML estimator for ATT

def att_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    if prob_t is None:
        prob_t = A.mean()
    tau_hat = (A*(Y-Q0) - (1-A)*(g/(1-g))*(Y-Q0) - tau_hat*A) / prob_t
    n = Y.shape[0]
    std_hat = np.std(scores) / np.sqrt(n)
    
    return tau_hat, std_hat

In [None]:
# Double ML estimator for ATE 
def ate_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    tau_hat = (Q1-Q0 + A*(Y-Q1)/g - (1-A) * (Y-Q0)/(1-g)).mean()
    
    scores = Q1 - Q0 + A*(Y-Q1)/g - (1-A) * (Y-Q0)(1-g) - tau_hat
    n = Y.shape[0]
    std_hat = np.std(scores) / np.sqrt(n)
    
    return tau_hat, std_hat

In [None]:
# Double ML Library Estimation Procedure 

In [None]:
# Differences in Differences Estimation 

# Results

# Sensitivity Analysis

In [None]:
# Austen Plots 
target_bias = 15.0 
covariates = {}
estimates_for_nuisance = {}
for group, covariates in covariates.items():
    narrowed_confounders = confounders.drop(columns = covariates)
    
g = treatment_k_fold_fit_and_predict(create_g_model, X = narrowed_confounders, A = treatment, n_splits = 5)
Q0, Q1 = outcome_k_fold_fit_and_predict()

data_nuisance_estimates = pd.DataFrame(({'g': g, 'Q0': Q0, 'Q1': Q1, 'A': treatment, 'Y': outcome}))
nuisance_estimates[group] = data_nuisance_estimates
austen_plot = AustenPlot(data_nuisance, covariate_path)
p, plot_cooredinates, variable_coordinates = austen_plot.fit(bias = target_bias)

# Additional Considerations
One of the most obvious considerations in our analysis is that the factors that lead to crime are not precisely understood. As stated earlier, there are a few indicators 