## DS4A: MinJusticia
### Reducing the recidivism risk for people released from jail between 2010 and 2019 <br> 


Laura A Goyeneche <br>

In [1]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was later removed, so use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [None]:
# Load the raw extracts
# Semicolon-delimited text file; the first 5 lines are skipped before the header.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in the
# file - confirm this codec is intended rather than e.g. 'latin-1'.
inmate = pd.read_csv(
    'data/ReincidenciaPospenadosNal26Mayo2020Rev.csv',
    sep = ';',
    skiprows = 5,
    encoding = 'unicode_escape',
)
# Excel workbook (demographic variables, judging by the file name); the first 6 rows are skipped.
inmatevars = pd.read_excel(
    'data/ReincidenciaPospenadosNal2010-31Mayo2020Rev_con variables demograficas.xlsx',
    skiprows = 6,
)

In [None]:
# Preview the first three rows of the dataset
inmatevars.head(n = 3)

In [None]:
# Convert the booking, release and capture date columns to datetimes
for _date_col in ('FECHA_INGRESO', 'FECHA_SALIDA', 'FECHA_CAPTURA'):
    inmatevars[_date_col] = pd.to_datetime(inmatevars[_date_col])

In [None]:
# Names of all columns in the dataset
inmatevars.columns.tolist()

In [None]:
# Basic information
# nunique() ignores missing values, so a blank INTERNOEN or DELITO entry is not
# counted as an extra inmate / crime type (unique() would include NaN).
# min() and max() skip missing dates by default, so no dropna() is needed.
print(
'''
Number of records (inmates, booking date): {n_records} 
Number of variables                      : {n_variables} 
Number of inmates                        : {n_inmates}
Number of crime types                    : {n_crimes}
Start and end capture date               : {capture_min} and {capture_max}
Start and end booking date               : {booking_min} and {booking_max}
Start and end release date               : {release_min} and {release_max}
'''
.format(n_records   = inmatevars.shape[0],
        n_variables = inmatevars.shape[1],
        n_inmates   = inmatevars.INTERNOEN.nunique(),
        n_crimes    = inmatevars.DELITO.nunique(),
        capture_min = inmatevars.FECHA_CAPTURA.min(),
        capture_max = inmatevars.FECHA_CAPTURA.max(),
        booking_min = inmatevars.FECHA_INGRESO.min(),
        booking_max = inmatevars.FECHA_INGRESO.max(),
        release_min = inmatevars.FECHA_SALIDA.min(),
        release_max = inmatevars.FECHA_SALIDA.max())
)

In [None]:
# Yearly number of captures, releases and bookings, one line per date column.
# Each entry: (date series, legend label, line colour), in drawing order.
_series_specs = [
    (inmatevars.FECHA_CAPTURA        , 'Capture Date', '#98c1d9'),
    (inmatevars.FECHA_SALIDA.dropna(), 'Release Date', '#118ab2'),
    (inmatevars.FECHA_INGRESO        , 'Booking Date', '#faa307'),
]

plt.figure(figsize = (13,5))
for _dates, _label, _color in _series_specs:
    # Count records per calendar year and order the counts chronologically
    _per_year = _dates.dt.year.value_counts().to_frame().sort_index()
    plt.plot(_per_year, label = _label, color = _color, alpha = 0.7)

plt.title('Frequency of captures, bookings and release\n', fontsize = 14)
plt.xlabel('\nYear')
plt.ylabel('Frequency\n')
# Optional zoom on recent years: plt.xlim(2000,2020)
plt.legend(loc = 'upper left', fontsize = 12)
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent crime types.

def _wrap_label(name):
    """Break names longer than 20 characters into two lines just past the midpoint."""
    if len(name) <= 20:
        return name
    cut = len(name)//2 + 2
    return name[:cut] + '\n' + name[cut:]

# rename_axis / reset_index(name = ...) set both column names explicitly, so the
# frame looks the same regardless of how the installed pandas version labels the
# output of value_counts() (the labels changed in pandas 2.0).
temp               = inmatevars.DELITO.str.title().value_counts().rename_axis('DELITONAME').reset_index(name = 'DELITO')
# Share of each crime type among all records with a crime type
temp['DELITO_PER'] = round(temp.DELITO*100/temp.DELITO.sum(),2)
# Keep the ten largest; ascending order puts the largest bar at the top of barh
temp               = temp.head(10).sort_values(by = 'DELITO', ascending = True)
temp.DELITONAME    = temp.DELITONAME.apply(_wrap_label)

plt.figure(figsize = (10,5))
plt.barh(temp.DELITONAME, temp.DELITO)
plt.xticks(rotation = 90)
# The title reports the share of all records covered by the plotted crime types
plt.title('Top crime types (%.2f %%)\n' % (temp.DELITO.sum()*100/inmatevars.shape[0]), fontsize = 15)

# Annotate each bar with its percentage, slightly to the right of the bar end.
# Rows are walked directly instead of being looked up by index label.
for _count, _name, _pct in zip(temp.DELITO, temp.DELITONAME, temp.DELITO_PER):
    plt.text(1.01*_count, _name, str(_pct) + '%', fontweight = 'bold')

plt.ylabel('Crime Type\n')
plt.xlabel('\nFrequency')
plt.show()

In [None]:
# Histogram of inmate ages with a vertical marker at the mean age.
plt.figure(figsize = (13,5))
# Missing ages are dropped before binning; the label now names the plotted variable
plt.hist(inmatevars.EDAD.dropna(), bins = 20, label = 'Age', color = '#98c1d9')
# axvline spans the full height of the axes, so the marker does not depend on a
# hard-coded 0-30000 range and does not stretch the y-axis
plt.axvline(inmatevars.EDAD.mean(), color = '#d00000')
plt.title('Age distribution\n', fontsize = 14)
plt.ylabel('Frequency\n')
# '\nAge' (blank line + label) matches the other charts; '\Age' is an invalid
# escape sequence and rendered a literal backslash
plt.xlabel('\nAge')
plt.show()