In [None]:
'''
Emanda Bisrat - Data Visualization Project
'''

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

In [137]:
# Load the NIJ Recidivism Challenge dataset from the CSV file (path is relative
# to the notebook's working directory) and display the resulting frame.
data = pd.read_csv(
    filepath_or_buffer='NIJ_s_Recidivism_Challenge_Full_Dataset_20241203.csv',
)
data

Unnamed: 0,ID,Gender,Race,Age_at_Release,Residence_PUMA,Gang_Affiliated,Supervision_Risk_Score_First,Supervision_Level_First,Education_Level,Dependents,...,DrugTests_Meth_Positive,DrugTests_Other_Positive,Percent_Days_Employed,Jobs_Per_Year,Employment_Exempt,Recidivism_Within_3years,Recidivism_Arrest_Year1,Recidivism_Arrest_Year2,Recidivism_Arrest_Year3,Training_Sample
0,1,M,BLACK,43-47,16,False,3.0,Standard,At least some college,3 or more,...,0.000000,0.0,0.488562,0.447610,False,False,False,False,False,1
1,2,M,BLACK,33-37,16,False,6.0,Specialized,Less than HS diploma,1,...,0.000000,0.0,0.425234,2.000000,False,True,False,False,True,1
2,3,M,BLACK,48 or older,24,False,7.0,High,At least some college,3 or more,...,0.166667,0.0,0.000000,0.000000,False,True,False,True,False,1
3,4,M,WHITE,38-42,16,False,7.0,High,Less than HS diploma,1,...,0.000000,0.0,1.000000,0.718996,False,False,False,False,False,1
4,5,M,WHITE,33-37,16,False,4.0,Specialized,Less than HS diploma,3 or more,...,0.058824,0.0,0.203562,0.929389,False,True,True,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25830,26756,M,BLACK,23-27,9,False,5.0,Standard,At least some college,1,...,0.000000,0.0,0.189507,0.572044,False,True,True,False,False,1
25831,26758,M,WHITE,38-42,25,False,5.0,Standard,At least some college,3 or more,...,0.000000,0.0,0.757098,0.576104,False,True,False,True,False,1
25832,26759,M,BLACK,33-37,15,False,5.0,Standard,At least some college,3 or more,...,,,0.711138,0.894125,False,True,False,True,False,1
25833,26760,F,WHITE,33-37,15,,5.0,Standard,At least some college,3 or more,...,0.000000,0.0,0.000000,0.000000,True,False,False,False,False,1


# Data Cleaning

In [139]:
# Count the missing (NaN) entries in every column before any cleaning is applied.
print(data.isnull().sum())

ID                                                      0
Gender                                                  0
Race                                                    0
Age_at_Release                                          0
Residence_PUMA                                          0
Gang_Affiliated                                      3167
Supervision_Risk_Score_First                          475
Supervision_Level_First                              1720
Education_Level                                         0
Dependents                                              0
Prison_Offense                                       3277
Prison_Years                                            0
Prior_Arrest_Episodes_Felony                            0
Prior_Arrest_Episodes_Misd                              0
Prior_Arrest_Episodes_Violent                           0
Prior_Arrest_Episodes_Property                          0
Prior_Arrest_Episodes_Drug                              0
Prior_Arrest_E

In [140]:
# Dropping unnecessary columns (drug-test detail, jobs per year, gang affiliation)
# that are not used in this project, then reporting how many rows remain.
data = data.drop(
    columns=[
        'Avg_Days_per_DrugTest',
        'Jobs_Per_Year',
        'DrugTests_Other_Positive',
        'DrugTests_Meth_Positive',
        'DrugTests_Cocaine_Positive',
        'DrugTests_THC_Positive',
        'Gang_Affiliated',
    ]
)
len(data.index)

25835

After looking at the data, I want to categorize the supervision risk scores by the supervision levels that are most commonly assigned to them. For rows where the supervision level is null but the risk score is available, I fill the level with Standard if the score is 1, 2, 3, or 4, with Specialized if the score is 5 or 6, and with High if the score is 7, 8, 9, or 10. Conversely, for rows where the level is available but the score is null, I fill in a representative score for that level (3 for Standard, 6 for Specialized, 8 for High). I think this is the best way to match the scores with the levels for rows where at least one of the two is available. For rows where neither is available, I drop those rows. I want to take this approach because if I just dropped all the rows with null values, it would decrease the data from 25k to 18k rows, which is too drastic. 

In [141]:
def fill_supervision(row):
    """Fill in whichever of the two supervision fields is missing from the other.

    Intended for use with ``DataFrame.apply(fill_supervision, axis=1)``.

    * Score present, level missing: scores 1-4 become 'Standard', 5-6 become
      'Specialized' and 7-10 become 'High'. A score outside those ranges
      leaves the level missing.
    * Level present, score missing: 'Standard' -> 3, 'Specialized' -> 6,
      'High' -> 8. Any other level leaves the score missing.
    * Both present or both missing: the row is returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        One record containing 'Supervision_Risk_Score_First' and
        'Supervision_Level_First'.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the missing field filled in where possible.
        The input row itself is never modified.
    """
    # pandas does not support functions that mutate the object handed to
    # ``apply``; work on a copy so the caller's data is left untouched.
    row = row.copy()

    score = row['Supervision_Risk_Score_First']
    level = row['Supervision_Level_First']

    if pd.notna(score) and pd.isna(level):
        if 1 <= score <= 4:
            row['Supervision_Level_First'] = 'Standard'
        elif 5 <= score <= 6:
            row['Supervision_Level_First'] = 'Specialized'
        elif 7 <= score <= 10:
            row['Supervision_Level_First'] = 'High'

    elif pd.isna(score) and pd.notna(level):
        # Representative score chosen for each supervision level.
        if level == 'Standard':
            row['Supervision_Risk_Score_First'] = 3
        elif level == 'Specialized':
            row['Supervision_Risk_Score_First'] = 6
        elif level == 'High':
            row['Supervision_Risk_Score_First'] = 8

    return row

# Fill supervision score/level from each other row by row, then discard only
# the rows where BOTH supervision fields are still missing.
data = (
    data
    .apply(fill_supervision, axis='columns')
    .dropna(
        how='all',
        subset=['Supervision_Risk_Score_First', 'Supervision_Level_First'],
    )
)

In [142]:
# Number of rows left after the supervision fill-in step.
len(data.index)

25520

In [143]:
# Remove every row that still has a missing value in any column,
# then report the remaining row count.
data = data.dropna(axis=0, how='any')
len(data.index)

21851

I am happy with this result. The data wasn't cut too much, especially considering how large the data is. I was able to fill in some values while also dropping some columns that aren't valuable for this specific project.

In [133]:
# Handling data types to make it easier for me when doing visualizations.

# NOTE: 'Prison_Years' holds text buckets ('Less than 1 year', '1-2 years',
# 'Greater than 2 to 3 years', 'More than 3 years'). It must not be passed
# through pd.to_numeric(..., errors='coerce'): none of those labels parse as a
# number, so every value would become NaN and the later groupby on
# 'Prison_Years' would have nothing to group. The column is left as text here.

# Low-cardinality columns stored as pandas categoricals.
categorical_columns = ['Gender', 'Race', 'Education_Level', 'Supervision_Level_First', 'Prison_Offense', 
                       'Dependents', 'Delinquency_Reports', 'Program_Attendances', 'Program_UnexcusedAbsences', 
                       'Residence_Changes']

for col in categorical_columns:
    data[col] = data[col].astype('category')

In [148]:
# Prior-arrest counts are stored as text with an open-ended top bucket
# ("N or more"). Each mapping turns the plain digit strings into integers and
# floors the top bucket to its lower bound N.

# Misdemeanor episodes: '0'..'5' plus '6 or more'.
misd_mapping = {str(count): count for count in range(6)}
misd_mapping['6 or more'] = 6

# Felony episodes: '0'..'9' plus '10 or more'.
felony_mapping = {str(count): count for count in range(10)}
felony_mapping['10 or more'] = 10

# Violent episodes: '0'..'2' plus '3 or more'.
violent_mapping = {str(count): count for count in range(3)}
violent_mapping['3 or more'] = 3

# Substitute the text buckets with their integer equivalents.
data['Prior_Arrest_Episodes_Misd'] = (
    data['Prior_Arrest_Episodes_Misd'].replace(misd_mapping)
)
data['Prior_Arrest_Episodes_Felony'] = (
    data['Prior_Arrest_Episodes_Felony'].replace(felony_mapping)
)
data['Prior_Arrest_Episodes_Violent'] = (
    data['Prior_Arrest_Episodes_Violent'].replace(violent_mapping)
)

arrest_columns = [
    'Prior_Arrest_Episodes_Misd',
    'Prior_Arrest_Episodes_Felony',
    'Prior_Arrest_Episodes_Violent',
]

# Make sure the three columns end up with a numeric dtype.
for col in arrest_columns:
    data[col] = pd.to_numeric(data[col])

In [149]:
# Confirm that no column has any missing values left after cleaning.
print(data.isnull().sum())

ID                                                   0
Gender                                               0
Race                                                 0
Age_at_Release                                       0
Residence_PUMA                                       0
Supervision_Risk_Score_First                         0
Supervision_Level_First                              0
Education_Level                                      0
Dependents                                           0
Prison_Offense                                       0
Prison_Years                                         0
Prior_Arrest_Episodes_Felony                         0
Prior_Arrest_Episodes_Misd                           0
Prior_Arrest_Episodes_Violent                        0
Prior_Arrest_Episodes_Property                       0
Prior_Arrest_Episodes_Drug                           0
Prior_Arrest_Episodes_PPViolationCharges             0
Prior_Arrest_Episodes_DVCharges                      0
Prior_Arre

In [152]:
# Store the 3-year recidivism flag as an integer so that its mean can be read
# as a recidivism rate in the charts below.
data['Recidivism_Within_3years'] = (
    data['Recidivism_Within_3years'].astype('int')
)

In [None]:
'''
Now that the data is cleaned, I will proceed with visualizations. 
'''

# Visualizations

In [177]:
# Looking at whether most people attend rehabilitation programs.
# The open-ended bucket '10 or more' is replaced by 10 so the column can be
# converted to numbers; anything that still fails to parse becomes NaN.
data['Program_Attendances'] = pd.to_numeric(
    data['Program_Attendances'].replace({'10 or more': 10}),
    errors='coerce',
)

fig = px.histogram(
    data,
    x='Program_Attendances',
    nbins=10,
    title='Distribution of Program Attendances',
)
fig.update_layout(
    xaxis_title='Number of Program Attendances',
    yaxis_title='Count',
)
fig.show()

I did not add this to my report, but I thought it was useful to see how many people do not attend rehabilitation programs before showing the next visualization. The figure above shows that approximately 12.4k of the convicted felons in this dataset did not attend any programs, which is a little over half of the entire dataset. This suggests a lack of emphasis on the importance of attendance. 

In [226]:
# Interactive heatmap: supervision risk score (rows) x program attendances
# (columns), where each cell is the mean of Recidivism_Within_3years for
# that combination.
heatmap_data = pd.pivot_table(
    data,
    values='Recidivism_Within_3years',
    index='Supervision_Risk_Score_First',
    columns='Program_Attendances',
    aggfunc='mean',
)

# NOTE(review): the 'RdBu' scale presumably runs from red (low) to blue (high),
# which would make high recidivism rates blue - confirm this is the intended reading.
fig = px.imshow(
    heatmap_data,
    title='Program Attendance vs Recidivism by Supervision Risk Score',
    labels={'x': 'Program Attendances', 'y': 'Supervision Risk Score'},
    color_continuous_scale='RdBu',
)

fig.update_layout(
    coloraxis_colorbar_title='Recidivism Rate<br>(Proportion of Reoffenders)',
    xaxis_title='Program Attendances',
    yaxis_title='Supervision Risk Score',
)

fig.show()

This investigates whether higher Program Attendance reduces Recidivism for individuals with different Supervision Risk Scores. It helps show whether higher-risk individuals benefit more from attending rehabilitation programs. The analysis is in the report. 

In [228]:
# Stacked bar chart of recidivism by education level, faceted by race (rows)
# and gender (columns), normalized so every stack adds up to 100%.
#
# barnorm='percent' is used rather than histnorm='percent'. histnorm
# normalizes each color trace across all education levels, so the stacked
# bars would not sum to 100% at any education level. barnorm rescales the bars
# at each x position, so a bar segment reads as "share of people with this
# education level who did / did not reoffend".
fig = px.histogram(
    data,
    x='Education_Level',
    color='Recidivism_Within_3years',
    barmode='stack',
    barnorm='percent',
    facet_row='Race',
    facet_col='Gender',
    # Order education levels from lowest to highest.
    category_orders={'Education_Level': ['Less than HS diploma', 'High School Diploma', 'At least some college']},
    title="Recidivism by Education Level, Race, and Gender (Normalized)",
    labels={'Recidivism_Within_3years': 'Recidivism (1 = Reoffended, 0 = Not Reoffended)',
            'Education_Level': 'Education Level', 'Race': 'Race', 'Gender': 'Gender'},
    color_discrete_map={0: 'lightblue', 1: 'red'},
)

fig.update_layout(
    xaxis_title='Education Level',
    yaxis_title='Percentage (%)',
    legend_title='Recidivism',
    title_x=0.5,  # center the title
)

fig.show()

Added tooltips to help show the data more clearly and to add an interactive aspect to the chart. Chose distinct colors so that the data stands out. Clearly labeled the axes and the legend. 

In [204]:
# Percentage of people in the cleaned data with a recidivism arrest recorded
# in Year 1, Year 2 and Year 3 (mean of each yearly flag column, times 100).
recidivism_yearly = {
    f'Year {year}': data[f'Recidivism_Arrest_Year{year}'].mean() * 100
    for year in (1, 2, 3)
}

# Convert to a pandas DataFrame for easy display/plotting.
# (pandas is already imported as pd at the top of the notebook.)
recidivism_df = pd.DataFrame(
    list(recidivism_yearly.items()),
    columns=['Year', 'Recidivism Rate (%)'],
)
recidivism_df

Unnamed: 0,Year,Recidivism Rate (%)
0,Year 1,30.35559
1,Year 2,18.067823
2,Year 3,10.448034


Again, this wasn't shown in the report, but it is good to see conceptually. It seems that most people who reoffend do so in their first year out of prison. 

In [213]:
# List the distinct sentence-length buckets present in Prison_Years.
pd.unique(data['Prison_Years'])

array(['More than 3 years', '1-2 years', 'Greater than 2 to 3 years',
       'Less than 1 year'], dtype=object)

In [214]:
# Grouped bar chart of recidivism rates and prison years - step 1:
# group the data by prison-years bucket and take the mean of the overall,
# Year 1 and Year 3 recidivism flags for each bucket.
recidivism_analysis = (
    data
    .groupby('Prison_Years')[
        ['Recidivism_Within_3years', 'Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year3']
    ]
    .mean()
    .reset_index()
)

# Human-readable column names used by the chart below.
recidivism_analysis.columns = [
    'Prison Years',
    'Overall Recidivism Rate',
    'Year 1 Recidivism Rate',
    'Year 3 Recidivism Rate',
]

In [235]:
# Reshape to long form: one row per (prison-years bucket, recidivism type)
# pair, with the corresponding rate in the 'Rate' column.
recidivism_melted = pd.melt(
    recidivism_analysis,
    id_vars='Prison Years',
    value_vars=['Overall Recidivism Rate', 'Year 1 Recidivism Rate', 'Year 3 Recidivism Rate'],
    var_name='Recidivism Type',
    value_name='Rate',
)

# Grouped bars: one cluster per prison-years bucket, one bar per recidivism type.
fig = px.bar(
    recidivism_melted,
    x='Prison Years',
    y='Rate',
    color='Recidivism Type',
    barmode='group',
    title='Recidivism Rates by Prison Years',
    labels={'Prison Years': 'Prison Years Category', 'Rate': 'Recidivism Rate'},
    # Show the buckets from shortest to longest sentence.
    category_orders={
        'Prison Years': ['Less than 1 year', '1-2 years', 'Greater than 2 to 3 years', 'More than 3 years']
    },
    color_discrete_map={
        'Overall Recidivism Rate': 'blue',
        'Year 1 Recidivism Rate': 'red',
        'Year 3 Recidivism Rate': 'green',
    },
)
fig.update_layout(
    title_x=0.5,  # center the title
    xaxis_title='Prison Years Category',
    yaxis_title='Recidivism Rate',
)
fig.show()

Added tooltips to help show the data more clearly and to add an interactive aspect to the chart. Chose distinct colors so that the bars stand out. Clearly labeled the axes and the legend. 