# Incarceration Trends (Maryland)
This notebook analyzes county-level incarceration trends and explores the relationship between incarceration and recidivism.

Workflow:
- load + clean
- trend + distribution EDA
- correlation analysis
- baseline model predicting recidivism


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Read the county-level dataset and preview the first rows
raw_csv_path = '../data/incarceration_md_synthetic.csv'
df = pd.read_csv(raw_csv_path)
df.head(5)


In [None]:
# Quick structural overview: (rows, columns) followed by the per-column dtypes
(df.shape, df.dtypes)


## Data Preprocessing
Standardize county labels and validate numeric ranges.


In [None]:
# Normalise county labels: trim, collapse runs of whitespace, Title Case.
# astype(str) turns a missing county into the literal text "nan" (-> "Nan"
# after .title()), which would create a phantom county and hide the gap from
# isna() checks, so originally-missing entries are masked back to NaN.
county_raw = df['county']
county_clean = (
    county_raw.astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.title()
)
df['county'] = county_clean.where(county_raw.notna())

# keep a consistent window (2010–2024)
in_window = df['year'].between(2010, 2024)

# simple sanity checks; comparisons against NaN are False, so rows with a
# missing rate are dropped here as well
valid_rates = (df['incarceration_rate_per_100k'] >= 0) & df['recidivism_rate_pct'].between(0, 100)

# a row without a county label cannot take part in the county-level analysis
has_county = df['county'].notna()

# .copy() detaches the result from the unfiltered frame so that re-running this
# cell (which assigns df['county'] again) does not raise SettingWithCopyWarning
df = df[in_window & valid_rates & has_county].copy()

df.head()


In [None]:
# Count of missing values remaining in each column after cleaning
df.isnull().sum()


## Exploratory Data Analysis
Trends over time and county comparisons.


In [None]:
# Statewide trend: average both rates over all rows of each year
rate_cols = ['incarceration_rate_per_100k', 'recidivism_rate_pct']
yearly_means = df.groupby('year')[rate_cols].mean()
trend = yearly_means.reset_index()

# One line chart per metric, drawn in the listed order
trend_titles = {
    'incarceration_rate_per_100k': 'Average incarceration rate over time',
    'recidivism_rate_pct': 'Average recidivism rate over time',
}
for metric, chart_title in trend_titles.items():
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.lineplot(data=trend, x='year', y=metric, ax=ax)
    ax.set_title(chart_title)
    plt.show()


In [None]:
# Rank counties by their mean incarceration rate and keep the ten highest
county_means = df.groupby('county')['incarceration_rate_per_100k'].mean()
ranked_means = county_means.sort_values(ascending=False)
top = ranked_means.head(10).reset_index()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=top, y='county', x='incarceration_rate_per_100k', ax=ax)
ax.set_title('Top counties by average incarceration rate')
plt.show()


## Correlation Analysis
We quantify the relationship between incarceration and recidivism, then look at broader correlations.


In [None]:
# Correlation between the two headline rates (Series.corr, default method)
incarceration = df['incarceration_rate_per_100k']
recidivism = df['recidivism_rate_pct']
corr_pair = incarceration.corr(recidivism)
corr_pair


In [None]:
# Plot a reproducible random subset so the scatter stays readable.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the frame's length.
SCATTER_SAMPLE_SIZE = 500
scatter_df = df.sample(min(SCATTER_SAMPLE_SIZE, len(df)), random_state=42)

plt.figure(figsize=(8,5))
sns.scatterplot(data=scatter_df, x='incarceration_rate_per_100k', y='recidivism_rate_pct')
plt.title('Recidivism vs Incarceration (sample)')
plt.show()


In [None]:
# Pairwise correlations of every numeric column except the year index
corr_matrix = df.drop(columns=['year']).corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix')
plt.show()


## Outlier Removal + Modeling
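Outliers: rows with an absolute z-score of 3 or more on the incarceration, recidivism, or violent-crime rate are dropped before modeling  
Target: recidivism_rate_pct  
Model: DecisionTreeRegressor + GridSearchCV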


In [None]:
# --- Outlier removal ---------------------------------------------------------
# Keep rows whose |z-score| is below the threshold on every screened column.
# nan_policy='omit' computes each column's mean/std from its non-missing values;
# under the default ('propagate') a single NaN turns that column's z-scores
# into NaN and the filter below would discard every row. Rows that are missing
# a screened value still get a NaN z-score and therefore fail the comparison.
# NOTE(review): the screen includes the target and runs before the train/test
# split, so the held-out rows are filtered with full-data statistics.
OUTLIER_COLS = ['incarceration_rate_per_100k','recidivism_rate_pct','violent_crime_rate_per_100k']
Z_THRESHOLD = 3

z = np.abs(stats.zscore(df[OUTLIER_COLS].to_numpy(dtype=float), nan_policy='omit'))
# .copy() gives an independent frame, so adding 'county_enc' below does not
# raise SettingWithCopyWarning and never touches df
model_df = df[(z < Z_THRESHOLD).all(axis=1)].copy()

# --- Features / target -------------------------------------------------------
# encode county as integer codes so it can be passed to the tree as a feature
le = LabelEncoder()
model_df['county_enc'] = le.fit_transform(model_df['county'])

X = model_df[[
    'county_enc','year','incarceration_rate_per_100k','unemployment_rate_pct','poverty_rate_pct',
    'college_education_pct','police_per_1k','violent_crime_rate_per_100k'
]]
y = model_df['recidivism_rate_pct']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Hyper-parameter search --------------------------------------------------
# random_state is a single-value grid entry: it only pins the tree's seed
params = {
    'max_depth': [3,5,7,9],
    'min_samples_split': [2,4,8],
    'min_samples_leaf': [1,2,4],
    'random_state': [42]
}

grid = GridSearchCV(DecisionTreeRegressor(), param_grid=params, cv=5, n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)

# With refit=True (the GridSearchCV default) best_estimator_ has already been
# retrained on the whole training split, so no additional fit call is needed.
best = grid.best_estimator_

# --- Hold-out evaluation -----------------------------------------------------
pred = best.predict(X_test)

print("Best params:", grid.best_params_)
print("R2:", r2_score(y_test, pred))
print("MAE:", mean_absolute_error(y_test, pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, pred)))


In [None]:
# Pair each model input with the fitted tree's importance score, largest first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best.feature_importances_,
})
feat_df = importance_table.sort_values(by='Importance', ascending=False)
feat_df


In [None]:
# Horizontal bars, one per feature, in the order given by feat_df
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=feat_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance')
plt.show()


## Export Cleaned Dataset


In [None]:
# Write the cleaned frame to the reports folder. to_csv does not create missing
# directories, so make sure the target folder exists first (a no-op when it
# already does).
export_path = Path('../reports/cleaned_incarceration_md.csv')
export_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(export_path, index=False)
df.head()


## Conclusion
This notebook demonstrates:
- county-year trend analysis
- correlation-driven insight (incarceration ↔ recidivism)
- a baseline predictive model with feature importance for explainability
