# Analysis on COMPAS Recidivism Data
### By: Tyler Rosselli, Kyle Mettler, Ethan Wong

## Testing...

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import ast, json

from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
sns.set(rc={'figure.figsize':(30,50)})

In [3]:
df = pd.read_csv("compas-scores-raw.csv")

In [4]:
df.head()

Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,Sex_Code_Text,Ethnic_Code_Text,DateOfBirth,...,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,DisplayText,RawScore,DecileScore,ScoreText,AssessmentType,IsCompleted,IsDeleted
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,7,Risk of Violence,-2.08,4,Low,New,1,0
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,8,Risk of Recidivism,-1.06,2,Low,New,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New,1,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,7,Risk of Violence,-2.84,2,Low,New,1,0
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,8,Risk of Recidivism,-1.5,1,Low,New,1,0


## Pandas Profiling

In [5]:
import pandas_profiling

In [6]:
pandas_profiling.ProfileReport(df)

TypeError: concat() got an unexpected keyword argument 'join_axes'

## There are no duplicates

In [7]:
df.shape

(60843, 28)

In [8]:
df = df.drop_duplicates(keep='first')

In [9]:
df.shape

(60843, 28)

## Correct Types

In [10]:
df.dtypes

Person_ID                    int64
AssessmentID                 int64
Case_ID                      int64
Agency_Text                 object
LastName                    object
FirstName                   object
MiddleName                  object
Sex_Code_Text               object
Ethnic_Code_Text            object
DateOfBirth                 object
ScaleSet_ID                  int64
ScaleSet                    object
AssessmentReason            object
Language                    object
LegalStatus                 object
CustodyStatus               object
MaritalStatus               object
Screening_Date              object
RecSupervisionLevel          int64
RecSupervisionLevelText     object
Scale_ID                     int64
DisplayText                 object
RawScore                   float64
DecileScore                  int64
ScoreText                   object
AssessmentType              object
IsCompleted                  int64
IsDeleted                    int64
dtype: object

In [11]:
df.DateOfBirth = pd.to_datetime(df['DateOfBirth'])
df.Screening_Date = pd.to_datetime(df['Screening_Date'])

In [12]:
df.dtypes

Person_ID                           int64
AssessmentID                        int64
Case_ID                             int64
Agency_Text                        object
LastName                           object
FirstName                          object
MiddleName                         object
Sex_Code_Text                      object
Ethnic_Code_Text                   object
DateOfBirth                datetime64[ns]
ScaleSet_ID                         int64
ScaleSet                           object
AssessmentReason                   object
Language                           object
LegalStatus                        object
CustodyStatus                      object
MaritalStatus                      object
Screening_Date             datetime64[ns]
RecSupervisionLevel                 int64
RecSupervisionLevelText            object
Scale_ID                            int64
DisplayText                        object
RawScore                          float64
DecileScore                       

## Columns

In [13]:
df.columns

Index(['Person_ID', 'AssessmentID', 'Case_ID', 'Agency_Text', 'LastName',
       'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text',
       'DateOfBirth', 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason',
       'Language', 'LegalStatus', 'CustodyStatus', 'MaritalStatus',
       'Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
       'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
       'AssessmentType', 'IsCompleted', 'IsDeleted'],
      dtype='object')

In [14]:
columns_to_drop = ['AssessmentID','Case_ID','ScaleSet_ID','IsCompleted','IsDeleted']
df = df.drop(columns_to_drop, axis=1)
df.head()

## Rename column names

Unnamed: 0,Person_ID,Agency_Text,LastName,FirstName,MiddleName,Sex_Code_Text,Ethnic_Code_Text,DateOfBirth,ScaleSet,AssessmentReason,...,MaritalStatus,Screening_Date,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,DisplayText,RawScore,DecileScore,ScoreText,AssessmentType
0,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,7,Risk of Violence,-2.08,4,Low,New
1,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,8,Risk of Recidivism,-1.06,2,Low,New
2,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New
3,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,7,Risk of Violence,-2.84,2,Low,New
4,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,8,Risk of Recidivism,-1.5,1,Low,New


In [15]:
df.rename(columns={'Sex_Code_Text': 'Sex', 'Ethnic_Code_Text': 'Race', 'DisplayText': 'RiskType'}, inplace=True)

In [16]:
df.head()

Unnamed: 0,Person_ID,Agency_Text,LastName,FirstName,MiddleName,Sex,Race,DateOfBirth,ScaleSet,AssessmentReason,...,MaritalStatus,Screening_Date,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,RiskType,RawScore,DecileScore,ScoreText,AssessmentType
0,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,7,Risk of Violence,-2.08,4,Low,New
1,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,8,Risk of Recidivism,-1.06,2,Low,New
2,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New
3,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,7,Risk of Violence,-2.84,2,Low,New
4,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,8,Risk of Recidivism,-1.5,1,Low,New


In [17]:
df.columns

Index(['Person_ID', 'Agency_Text', 'LastName', 'FirstName', 'MiddleName',
       'Sex', 'Race', 'DateOfBirth', 'ScaleSet', 'AssessmentReason',
       'Language', 'LegalStatus', 'CustodyStatus', 'MaritalStatus',
       'Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
       'Scale_ID', 'RiskType', 'RawScore', 'DecileScore', 'ScoreText',
       'AssessmentType'],
      dtype='object')

## Any Nulls?

In [18]:
df.isnull().sum()

Person_ID                      0
Agency_Text                    0
LastName                       0
FirstName                      0
MiddleName                 45195
Sex                            0
Race                           0
DateOfBirth                    0
ScaleSet                       0
AssessmentReason               0
Language                       0
LegalStatus                    0
CustodyStatus                  0
MaritalStatus                  0
Screening_Date                 0
RecSupervisionLevel            0
RecSupervisionLevelText        0
Scale_ID                       0
RiskType                       0
RawScore                       0
DecileScore                    0
ScoreText                     45
AssessmentType                 0
dtype: int64

## Rest of cleaning and exporting data

In [131]:
df[df.Race == 'African-Am'].head()

Unnamed: 0,Person_ID,Agency_Text,LastName,FirstName,MiddleName,Sex,Race,DateOfBirth,ScaleSet,AssessmentReason,Language,LegalStatus,CustodyStatus,MaritalStatus,Screening_Date,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,RiskType,RawScore,DecileScore,ScoreText,AssessmentType


In [20]:
df.Race = df.Race.replace({
    'African-Am':'African-American'
})

In [21]:
df.head()

Unnamed: 0,Person_ID,Agency_Text,LastName,FirstName,MiddleName,Sex,Race,DateOfBirth,ScaleSet,AssessmentReason,...,MaritalStatus,Screening_Date,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,RiskType,RawScore,DecileScore,ScoreText,AssessmentType
0,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,7,Risk of Violence,-2.08,4,Low,New
1,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,8,Risk of Recidivism,-1.06,2,Low,New
2,50844,PRETRIAL,Fisher,Kevin,,Male,Caucasian,1992-12-05,Risk and Prescreen,Intake,...,Single,2013-01-01,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New
3,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,7,Risk of Violence,-2.84,2,Low,New
4,50848,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,1984-09-16,Risk and Prescreen,Intake,...,Married,2013-01-01,1,Low,8,Risk of Recidivism,-1.5,1,Low,New


In [22]:
df.to_csv("compas-scores-clean.csv")

# Race vs Recidivism

In [23]:
#graphing data, changing non majority races to 'other'
df.Race = df.Race.replace({
    'Asian':'Other',
    'Oriental':'Other',
    'Arabic':'Other',
    'Native American':'Other'
})

# Decile Score

In [70]:
#race vs decile score
df.groupby(['Race', 'RiskType'])['DecileScore'].mean()

Race              RiskType                 
African-American  Risk of Failure to Appear    3.368392
                  Risk of Recidivism           5.323063
                  Risk of Violence             4.183863
Caucasian         Risk of Failure to Appear    3.134279
                  Risk of Recidivism           3.587660
                  Risk of Violence             2.602810
Hispanic          Risk of Failure to Appear    2.671585
                  Risk of Recidivism           3.312629
                  Risk of Violence             2.570350
Other             Risk of Failure to Appear    2.205910
                  Risk of Recidivism           3.027701
                  Risk of Violence             2.618652
Name: DecileScore, dtype: float64

In [59]:
ax = sns.catplot(x='DecileScore',y='Race', col='RiskType', data=df, kind='bar', ci=False, orient='h')
ax

<seaborn.axisgrid.FacetGrid at 0x1a292535d0>

In [60]:
#distribution of decile scores across all races
#can see that african americans have the most evenly distributed decile scores
g = sns.FacetGrid(df, col='Race', col_wrap=2)
g = g.map(sns.distplot, 'DecileScore', kde=False, bins=10)

# Hypothesis Test 1

Question: Do African-Americans receive higher scores for recidivism than caucasians?

In [27]:
african_american = df[df.Race == 'African-American']
caucasian = df[df.Race == 'Caucasian']
african_american_scores = african_american['DecileScore']
caucasian_scores = caucasian['DecileScore']

In [28]:
african_american_scores.describe()

count    27069.000000
mean         4.291773
std          2.767021
min         -1.000000
25%          2.000000
50%          4.000000
75%          6.000000
max         10.000000
Name: DecileScore, dtype: float64

In [29]:
caucasian_scores.describe()

count    21783.000000
mean         3.108250
std          2.401878
min         -1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         10.000000
Name: DecileScore, dtype: float64

In [30]:
from statsmodels.stats.weightstats import ttest_ind
from scipy import stats

In [31]:
tstat, pvalue, ddof = ttest_ind(
                                african_american_scores,
                                caucasian_scores,
                                alternative="larger",
                                usevar="pooled"
)

In [32]:
print(pvalue)

0.0


P-value of 0, so we can reject the null hypothesis and accept the statistically proven hypothesis that African Americans receive higher scores for recidivism than Caucasians

# Recommended Supervision Level

In [33]:
#race vs recommended supervision level
df.groupby('Race')['RecSupervisionLevel'].mean()

Race
African-American    1.943145
Caucasian           1.406418
Hispanic            1.334935
Other               1.314866
Name: RecSupervisionLevel, dtype: float64

In [34]:
ax = sns.catplot(x='RecSupervisionLevel',y='Race', data=df, kind='bar', ci=False, orient='h')

# Hypothesis Test 2

Question: Do African-Americans receive higher recommended supervision levels than caucasians?

In [35]:
african_american = df[df.Race == 'African-American']
caucasian = df[df.Race == 'Caucasian']
african_american_lvl = african_american['RecSupervisionLevel']
caucasian_lvl = caucasian['RecSupervisionLevel']

In [36]:
african_american_lvl.describe()

count    27069.000000
mean         1.943145
std          1.054442
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          4.000000
Name: RecSupervisionLevel, dtype: float64

In [37]:
caucasian_lvl.describe()

count    21783.000000
mean         1.406418
std          0.773997
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          4.000000
Name: RecSupervisionLevel, dtype: float64

In [38]:
tstat, pvalue, ddof = ttest_ind(
                                african_american_lvl,
                                caucasian_lvl,
                                alternative="larger",
                                usevar="pooled"
)

In [65]:

print(pvalue)

0.0


P-value of 0, so we can reject the null hypothesis and accept the statistically proven hypothesis that African Americans receive higher recommended supervision levels than Caucasians

### Positive Correlation between RawScore and Scale_ID

In [118]:
sns.lmplot(x='RawScore',y='Scale_ID',data=df, hue="Race", ci=False)

<seaborn.axisgrid.FacetGrid at 0x1a2c435dd0>

# Using two year data

In [73]:
recid = pd.read_csv("compas-scores-two-years.csv")

In [74]:
pd.set_option('max_columns', 53)

In [75]:
recid.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_offense_date,c_arrest_date,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,r_case_number,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,2013-08-13,,1.0,F,Aggravated Assault w/Firearm,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,2013-01-26,,1.0,F,Felony Battery w/Prior Convict,1,13009779CF10A,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,13009779CF10A,(F3),2013-07-05,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,2013-04-13,,1.0,F,Possession of Cocaine,1,13011511MM10A,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,,,,,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,2013-01-12,,1.0,F,Possession of Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,,2013-01-09,76.0,F,arrest case no charge,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0


In [76]:
recid.isnull().sum()

id                            0
name                          0
first                         0
last                          0
compas_screening_date         0
sex                           0
dob                           0
age                           0
age_cat                       0
race                          0
juv_fel_count                 0
decile_score                  0
juv_misd_count                0
juv_other_count               0
priors_count                  0
days_b_screening_arrest     307
c_jail_in                   307
c_jail_out                  307
c_case_number                22
c_offense_date             1159
c_arrest_date              6077
c_days_from_compas           22
c_charge_degree               0
c_charge_desc                29
is_recid                      0
r_case_number              3743
r_charge_degree            3743
r_days_from_arrest         4898
r_offense_date             3743
r_charge_desc              3801
r_jail_in                  4898
r_jail_o

In [77]:
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas']
recid = recid.drop(drop_columns,axis=1)

In [78]:
recid.shape

(7214, 39)

In [105]:
recid.drop(['violent_recid'],axis=1).head()

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,1,0,2013-08-13 06:03:42,2013-08-14 05:41:20,2013-08-13,F,Aggravated Assault w/Firearm,0,,,,,,,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,327,0,0,1.0
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,1,2,,,NaT,F,arrest case no charge,0,,,,,,,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,1102,0,0,1.0


In [103]:
recid.dtypes.head()

id        int64
name     object
first    object
last     object
sex      object
dtype: object

In [81]:
recid.dob = pd.to_datetime(recid['dob'])
recid.c_offense_date = pd.to_datetime(recid['c_offense_date'])

In [104]:
recid.dtypes.head()

id        int64
name     object
first    object
last     object
sex      object
dtype: object

In [83]:
recid.shape

(7214, 39)

In [84]:
recid.to_csv('compas-scores-two-years-clean')


### Calculate Average Decile Score

In [85]:
recid['Ave_score'] =(recid['v_decile_score']+recid['decile_score']+recid['decile_score.1'])/3

In [86]:
recid = recid.drop_duplicates(keep='first')

In [87]:
recid.shape

(7214, 40)

In [88]:
recid.head()

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,1,0,2013-08-13 06:03:42,2013-08-14 05:41:20,2013-08-13,F,Aggravated Assault w/Firearm,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,327,0,0,1.0
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,1,2,,,NaT,F,arrest case no charge,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,1102,0,0,1.0


In [89]:
recid.groupby('race')[['two_year_recid']].mean()

Unnamed: 0_level_0,two_year_recid
race,Unnamed: 1_level_1
African-American,0.51434
Asian,0.28125
Caucasian,0.393643
Hispanic,0.364207
Native American,0.555556
Other,0.352785


In [92]:
recid.head()

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,1,0,2013-08-13 06:03:42,2013-08-14 05:41:20,2013-08-13,F,Aggravated Assault w/Firearm,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,327,0,0,1.0
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,1,2,,,NaT,F,arrest case no charge,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,1102,0,0,1.0


In [127]:
recid[recid.race == 'African-American']

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
11,15,ellyaher lanza,ellyaher,lanza,Male,1992-08-18,23,Less than 25,African-American,6,3,2013-10-03 04:07:35,2013-10-07 08:17:30,2013-10-03,M,Battery,1,(M2),,2014-02-08,Driving License Suspended,,,,0,Risk of Recidivism,6,Medium,2013-10-03,Risk of Violence,4,Low,2013-10-03,2013-10-03,2013-10-07,3,128,1,1,5.333333
13,18,jarrod turbe,jarrod,turbe,Male,1974-12-02,41,25 - 45,African-American,4,0,2013-10-08 11:53:09,2013-10-09 02:16:51,2013-10-08,F,"Poss3,4 Methylenedioxymethcath",0,,,,,,,,0,Risk of Recidivism,4,Low,2013-10-09,Risk of Violence,2,Low,2013-10-09,2013-10-08,2013-10-09,0,905,0,0,3.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,10994,jarred payne,jarred,payne,Male,1985-07-31,30,25 - 45,African-American,2,0,2014-05-09 10:01:33,2014-05-10 08:28:12,2014-05-09,M,Possess Cannabis/20 Grams Or Less,1,(F3),1.0,2015-10-21,Possession of Cannabis,2015-10-22,2015-10-22,,0,Risk of Recidivism,2,Low,2014-05-10,Risk of Violence,2,Low,2014-05-10,2015-10-22,2015-10-22,0,529,1,1,2.000000
7208,10995,raheem smith,raheem,smith,Male,1995-06-28,20,Less than 25,African-American,9,0,2013-10-19 11:17:15,2013-10-20 08:13:06,2013-10-19,F,Possession of Cocaine,0,,,,,,,,0,Risk of Recidivism,9,High,2013-10-20,Risk of Violence,9,High,2013-10-20,2014-04-07,2014-04-27,0,169,0,0,9.000000
7209,10996,steven butler,steven,butler,Male,1992-07-17,23,Less than 25,African-American,7,0,2013-11-22 05:18:27,2013-11-24 02:59:20,2013-11-22,F,Deliver Cannabis,0,,,,,,,,0,Risk of Recidivism,7,Medium,2013-11-23,Risk of Violence,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,860,0,0,6.333333
7210,10997,malcolm simmons,malcolm,simmons,Male,1993-03-25,23,Less than 25,African-American,3,0,2014-01-31 07:13:54,2014-02-02 04:03:52,2014-01-31,F,Leaving the Scene of Accident,0,,,,,,,,0,Risk of Recidivism,3,Low,2014-02-01,Risk of Violence,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,790,0,0,3.666667


In [97]:
recid.groupby('age_cat')['two_year_recid'].mean()

age_cat
25 - 45            0.459723
Greater than 45    0.315990
Less than 25       0.565075
Name: two_year_recid, dtype: float64

## This group by shows average decile score with average corresponding two year recid value

In [None]:
sns.

In [100]:
recid.groupby(['Ave_score'])['two_year_recid'].mean()

Ave_score
1.000000     0.203968
1.333333     0.271676
1.666667     0.298507
2.000000     0.312303
2.333333     0.340058
2.666667     0.368217
3.000000     0.375465
3.333333     0.392453
3.666667     0.399329
4.000000     0.396364
4.333333     0.480565
4.666667     0.535714
5.000000     0.539033
5.333333     0.567901
5.666667     0.513725
6.000000     0.573991
6.333333     0.605381
6.666667     0.601990
7.000000     0.658416
7.333333     0.664804
7.666667     0.728916
8.000000     0.688312
8.333333     0.676829
8.666667     0.720930
9.000000     0.746479
9.333333     0.761905
9.666667     0.805556
10.000000    0.750000
Name: two_year_recid, dtype: float64

### Correlation of Average Score and If they recidivated

In [102]:
recid['two_year_recid'].corr(recid['Ave_score'])

0.35109046729878796

### Hypothesis Test: Caucasian Recid vs. African American Recid

Question: Do afican Amercians recitivate more than caucasians?
    

In [132]:
caucasian_recid = recid[recid.race == "Caucasian"].two_year_recid
caucasian_recid

6       1
8       0
9       1
10      0
12      0
       ..
7192    0
7194    1
7199    0
7205    1
7206    1
Name: two_year_recid, Length: 2454, dtype: int64

In [135]:
aa_recid = recid[recid.race == "African-American"].two_year_recid
aa_recid

1       1
2       1
3       0
11      1
13      0
       ..
7207    1
7208    0
7209    0
7210    0
7212    0
Name: two_year_recid, Length: 3696, dtype: int64

In [141]:
print("AA: \n" ,aa_recid.describe())
print("Caucasian: \n", caucasian_recid.describe())

AA: 
 count    3696.000000
mean        0.514340
std         0.499862
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: two_year_recid, dtype: float64
Caucasian: 
 count    2454.000000
mean        0.393643
std         0.488657
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: two_year_recid, dtype: float64


In [154]:
tstat, pvalue, ddof = ttest_ind(
                                caucasian_recid,
                                aa_recid                                                       
)

In [150]:
print(pvalue,'\n', tstat,'\n',ddof)

1.1363681614743772e-20 
 -9.355920228380974 
 6148.0


With sucha low p value, we are able to conclude that african americans recitivate more than caucasians 

Earlier, we found that African Americans recieve higher recidivation scores that caucasians. We have now found that they end up recitivating at a higher rate. 

In [166]:
mid_age = recid[recid.age_cat == '25 - 45'].two_year_recid
under_25 = recid[recid.age_cat == 'Less than 25'].two_year_recid
tstat, pvalue, ddof = ttest_ind(
                                mid_age,
                                under_25                                                      
)

In [167]:
print(pvalue)

1.7943713406031644e-12


In [159]:
recid.head()

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,1,0,2013-08-13 06:03:42,2013-08-14 05:41:20,2013-08-13,F,Aggravated Assault w/Firearm,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,327,0,0,1.0
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,1,2,,,NaT,F,arrest case no charge,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,1102,0,0,1.0


In [155]:
recid.head()

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,decile_score,priors_count,c_jail_in,c_jail_out,c_offense_date,c_charge_degree,c_charge_desc,is_recid,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,end,event,two_year_recid,Ave_score
0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,1,0,2013-08-13 06:03:42,2013-08-14 05:41:20,2013-08-13,F,Aggravated Assault w/Firearm,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,327,0,0,1.0
1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,2013-01-26,F,Felony Battery w/Prior Convict,1,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,159,1,1,2.333333
2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,2013-04-13,F,Possession of Cocaine,1,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,63,0,1,3.666667
3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,8,1,,,2013-01-12,F,Possession of Cannabis,0,,,,,,,,0,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,1174,0,0,7.333333
4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,1,2,,,NaT,F,arrest case no charge,0,,,,,,,,0,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,1102,0,0,1.0
