# <font color='#eb3483'> Import Data </font>

In [138]:
#from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Silence all Python warnings for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides pandas warnings such as
# SettingWithCopyWarning, which can mask real problems in the cells below;
# consider narrowing the filter to specific warning categories.
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [139]:
# Read compas.csv into the raw working frame and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="compas.csv",
    skip_blank_lines=True,
)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,8/14/13,Male,4/18/47,69,Greater than 45,Other,...,1,Low,8/14/13,7/7/14,7/14/14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,1/27/13,Male,1/22/82,34,25 - 45,African-American,...,1,Low,1/27/13,1/26/13,2/5/13,0,9,159,1,1
2,4,ed philo,ed,philo,4/14/13,Male,5/14/91,24,Less than 25,African-American,...,3,Low,4/14/13,6/16/13,6/16/13,4,0,63,0,1
3,5,marcu brown,marcu,brown,1/13/13,Male,1/21/93,23,Less than 25,African-American,...,6,Medium,1/13/13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,3/26/13,Male,1/22/73,43,25 - 45,Other,...,1,Low,3/26/13,,,2,0,1102,0,0


In [140]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(7214, 53)

In [141]:
# List every column label available in the raw frame.
df.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [142]:
# Flag rows that exactly repeat an earlier row, collect them, and report
# how many there are (the first element of the shape).
is_repeated_row = df.duplicated()
duplicate = df.loc[is_repeated_row]
duplicate.shape

(0, 53)

# <font color='#eb3483'> Data Analysis </font>

Question: Does the algorithm lead to biased results?

Clean the data after breaking it down by topic for convenience. When cleaning, check for:
1. missing/useless values
2. cardinality (how many repeated values)
3. outliers

Steps for analysis:
1. pick out columns, clean & prepare data, rename columns
2. run analysis
3. make graphs

## <font color='#eb3483'> 1. Are there any race differences in decile score (for the first/current arrest)? </font>

### <font color='#eb3483'> Prep </font>

In [143]:
# Columns needed for question 1: the identifier, race, decile score,
# number of priors and the degree of the current charge.
# .copy() gives df_q1 its own data, so the rename and column assignments in
# later cells modify an independent frame instead of a slice derived from df
# (which is what makes pandas emit SettingWithCopyWarning).
q1_columns = ['id', 'race', 'decile_score', 'priors_count', 'c_charge_degree']
df_q1 = df[q1_columns].copy()
df_q1.head()

Unnamed: 0,id,race,decile_score,priors_count,c_charge_degree
0,1,Other,1,0,F
1,3,African-American,3,0,F
2,4,African-American,4,4,F
3,5,African-American,8,1,F
4,6,Other,1,2,F


In [144]:
# Shorten the column names used throughout question 1.
# rename() returns a new frame that is bound back to df_q1 rather than
# mutating in place: in-place edits on a frame sliced from df can trigger
# SettingWithCopyWarning, and rebinding keeps this cell safe to re-run.
df_q1 = df_q1.rename(
    columns={
        'decile_score': 'score',
        'priors_count': 'priors',
        'c_charge_degree': 'charge_degree',
    }
)
df_q1.head()

Unnamed: 0,id,race,score,priors,charge_degree
0,1,Other,1,0,F
1,3,African-American,3,0,F
2,4,African-American,4,4,F
3,5,African-American,8,1,F
4,6,Other,1,2,F


In [145]:
# Summarize dtypes and non-null counts to spot missing values.
df_q1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             7214 non-null   int64 
 1   race           7214 non-null   object
 2   score          7214 non-null   int64 
 3   priors         7214 non-null   int64 
 4   charge_degree  7214 non-null   object
dtypes: int64(3), object(2)
memory usage: 281.9+ KB


In [146]:
# Distinct values present in the race column.
df_q1.race.unique()

array(['Other', 'African-American', 'Caucasian', 'Hispanic',
       'Native American', 'Asian'], dtype=object)

In [147]:
# Race categories outside the African-American / Caucasian comparison.
# NOTE(review): the name is misleading - these are the groups to exclude,
# not the ones to keep.
keep = ['Other', 'Hispanic',
       'Native American', 'Asian']

In [148]:
# Only keep Caucasian and African-American rows.
# NOTE: despite its name, `keep` lists the races to drop, hence the negation.
# .copy() makes the filtered frame independent, so the column assignments in
# later cells write to df_q1 itself rather than to a slice of the earlier
# frame (the situation pandas reports with SettingWithCopyWarning).
is_excluded_race = df_q1['race'].isin(keep)
df_q1 = df_q1[~is_excluded_race].copy()
df_q1.head()

Unnamed: 0,id,race,score,priors,charge_degree
1,3,African-American,3,0,F
2,4,African-American,4,4,F
3,5,African-American,8,1,F
6,8,Caucasian,6,14,F
8,10,Caucasian,1,0,M


In [149]:
# Rows and columns remaining after the race filter.
df_q1.shape

(6150, 5)

### <font color='#eb3483'> Analysis </font>

In [150]:
# Average score per race group, returned as a flat table with the
# columns `race` and `score`.
mean_by_race = df_q1.groupby("race", as_index=False)["score"].mean()
mean_by_race

Unnamed: 0,race,score
0,African-American,5.368777
1,Caucasian,3.735126


This shows that African Americans tend to receive a higher score.

We could argue this is because of their criminal records. So, how do the scores compare when both groups have the same number of priors and the same charge degree?

In [151]:
# Distinct values present in the priors column.
df_q1['priors'].unique()

array([ 0,  4,  1, 14,  3,  7,  6,  5, 13,  8,  9, 21,  2, 20, 15, 10, 12,
       28, 19, 11, 22, 23, 25, 24, 36, 18, 16, 33, 17, 30, 27, 38, 26, 37,
       29, 35, 31])

In [152]:
# Count rows for every score (rows) against each (priors, race)
# combination (columns).
pd.crosstab(
    index=df_q1['score'],
    columns=[df_q1['priors'], df_q1['race']],
    rownames=['score'],
    colnames=['priors', 'race'],
)

priors,0,0,1,1,2,2,3,3,4,4,...,29,30,30,31,33,33,35,36,37,38
race,African-American,Caucasian,African-American,Caucasian,African-American,Caucasian,African-American,Caucasian,African-American,Caucasian,...,African-American,African-American,Caucasian,African-American,African-American,Caucasian,African-American,Caucasian,African-American,African-American
score,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,179,347,111,170,33,80,30,37,20,18,...,0,0,0,0,0,0,0,0,0,0
2,130,142,99,86,47,53,45,24,26,18,...,0,0,0,0,0,0,0,0,0,0
3,123,89,66,65,46,47,25,29,22,13,...,0,0,0,0,0,0,0,0,0,0
4,114,79,82,51,47,39,25,35,23,20,...,0,1,0,0,0,0,0,0,0,0
5,88,58,73,46,44,35,34,28,30,15,...,0,0,0,0,0,0,0,0,0,0
6,50,45,68,35,41,24,44,12,21,14,...,2,0,1,1,1,0,0,1,0,0
7,71,30,52,20,45,13,30,16,24,9,...,0,0,0,0,1,0,0,0,1,1
8,55,23,35,17,25,10,22,13,31,12,...,2,0,0,0,0,0,0,0,0,1
9,35,17,41,7,42,14,23,5,19,5,...,1,0,0,0,0,1,0,0,0,0
10,27,8,35,7,22,5,22,5,15,4,...,0,0,0,0,0,0,1,0,0,0


In [153]:
# Mean score for each (race, charge degree) combination, as a flat table.
df_q1.groupby(['race', 'charge_degree'], as_index=False)['score'].mean()

Unnamed: 0,race,charge_degree,score
0,African-American,F,5.621516
1,African-American,M,4.808529
2,Caucasian,F,4.155405
3,Caucasian,M,3.096509


In [154]:
# Column dtypes before race and charge_degree are converted to category.
df_q1.dtypes

id                int64
race             object
score             int64
priors            int64
charge_degree    object
dtype: object

In [155]:
# Store race as a categorical column.
df_q1['race'] = df_q1['race'].astype("category")

In [156]:
# Store charge_degree as a categorical column.
df_q1['charge_degree'] = df_q1['charge_degree'].astype("category")

In [157]:
# Re-check the dtypes to confirm race and charge_degree are now category.
df_q1.dtypes

id                  int64
race             category
score               int64
priors              int64
charge_degree    category
dtype: object

In [158]:
# Build a combined label by joining race and charge degree with no
# separator (e.g. "CaucasianM"), so each pair can be treated as one group.
race_labels = df_q1['race']
charge_labels = df_q1['charge_degree']
df_q1['race_charge'] = race_labels.str.cat(others=charge_labels, sep="")
df_q1.head()

Unnamed: 0,id,race,score,priors,charge_degree,race_charge
1,3,African-American,3,0,F,African-AmericanF
2,4,African-American,4,4,F,African-AmericanF
3,5,African-American,8,1,F,African-AmericanF
6,8,Caucasian,6,14,F,CaucasianF
8,10,Caucasian,1,0,M,CaucasianM


In [159]:
# Average score per combined race/charge label, as a flat table with the
# columns `race_charge` and `score`.
mean_by_race_charge = df_q1.groupby("race_charge", as_index=False)["score"].mean()
mean_by_race_charge

Unnamed: 0,race_charge,score
0,African-AmericanF,5.621516
1,African-AmericanM,4.808529
2,CaucasianF,4.155405
3,CaucasianM,3.096509


### <font color='#eb3483'> Graphs </font>