# <font color='#eb3483'> COMPAS ANALYSIS </font>


Our group analyzed the COMPAS Dataset to understand criminal recidivism more clearly.

We hypothesized that a machine learning analysis of the COMPAS Two-Year Excel Spreadsheets would describe a higher likelihood of African-American surveyors 

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('compas-analysis-master/compas-scores-two-years.csv') 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

#This is so that you don't see a bunch of code 'warnings' (things that you could change but don't have to right now)
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns

#This makes all of our graphs show up in our notebook when they're made
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

sns.set(rc={'figure.figsize':(6,5)}) 
%matplotlib inline

## <font color='#eb3483'> Introduction </font>


### <font color='#eb3483'> Understanding the Initial Dataset </font>

In [2]:
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [3]:
df.tail()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0
7213,11001,florencia sanmartin,florencia,sanmartin,2014-06-30,Female,1992-12-18,23,Less than 25,Hispanic,...,4,Low,2014-06-30,2015-03-15,2015-03-15,2,0,258,0,1


In [4]:
df.shape

(7214, 53)

In [5]:
df.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [6]:
df.dtypes

id                           int64
name                        object
first                       object
last                        object
compas_screening_date       object
sex                         object
dob                         object
age                          int64
age_cat                     object
race                        object
juv_fel_count                int64
decile_score                 int64
juv_misd_count               int64
juv_other_count              int64
priors_count                 int64
days_b_screening_arrest    float64
c_jail_in                   object
c_jail_out                  object
c_case_number               object
c_offense_date              object
c_arrest_date               object
c_days_from_compas         float64
c_charge_degree             object
c_charge_desc               object
is_recid                     int64
r_case_number               object
r_charge_degree             object
r_days_from_arrest         float64
r_offense_date      

### <font color='#eb3483'> Unnecessary/Duplicate Column Removal </font>

In [7]:
del_col_list = ['id', 'name', 'first', 'last', 'c_case_number',
                'r_case_number', 'vr_case_number']

df = df.drop(del_col_list, axis=1)
df.head()

Unnamed: 0,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [8]:
print(df.shape)

# Rows containing duplicate data
duplicate_rows_df = df[df.duplicated()]

print(duplicate_rows_df.shape)

(7214, 46)
(0, 46)


## <font color='#eb3483'> Understanding The Top Offenders </font>

In [9]:
top_reoffenders = df.sort_values(by ='priors_count', ascending=False).head()
top_reoffenders

Unnamed: 0,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
6825,2014-07-06,Male,1970-12-11,45,Greater than 45,African-American,0,8,0,0,...,8,High,2014-07-06,2014-07-05,2014-11-03,38,120,462,1,1
2058,2014-09-03,Male,1957-06-10,58,Greater than 45,African-American,0,7,0,0,...,2,Low,2014-09-03,2015-04-08,2015-06-24,38,0,217,1,1
2186,2014-04-01,Male,1977-08-06,38,25 - 45,African-American,0,7,5,0,...,2,Low,2014-04-01,2014-03-31,2014-05-01,37,30,138,1,1
328,2014-12-11,Male,1963-04-02,53,Greater than 45,Caucasian,0,6,0,0,...,9,High,2014-12-11,2015-05-08,2015-05-09,36,26,148,0,1
4374,2013-08-22,Male,1976-07-22,39,25 - 45,African-American,0,10,0,0,...,8,High,2013-08-22,2013-10-08,2013-10-22,35,0,47,0,0


In [10]:
def find_min_max_in(col):
    top = df[col].idxmax()
    top_df = pd.DataFrame(df.loc[top])
    
    bottom = df[col].idxmin()
    bottom_df = pd.DataFrame(df.loc[bottom])
    
    info_df = pd.concat([top_df, bottom_df], axis=1)
    return info_df

find_min_max_in('priors_count')

Unnamed: 0,2058,0
compas_screening_date,2014-09-03,2013-08-14
sex,Male,Male
dob,1957-06-10,1947-04-18
age,58,69
age_cat,Greater than 45,Greater than 45
race,African-American,Other
juv_fel_count,0,0
decile_score,7,1
juv_misd_count,0,0
juv_other_count,0,0


### <font color='#eb3483'> Linear Regression </font>

In [19]:
two_years = pd.read_csv("compas-analysis-master/compas-scores-two-years.csv")

In [20]:
two_years

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


In [26]:
y=two_years['priors_count']
X=two_years.drop('priors_count', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [27]:
print('X train', X_train.shape)
print('y train', y_train.shape)
print('X test', X_test.shape)
print('y test', y_test.shape)

X train (5771, 52)
y train (5771,)
X test (1443, 52)
y test (1443,)


In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
LinearRegression

sklearn.linear_model._base.LinearRegression

In [32]:
model = LinearRegression()
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
model.fit(y=y_train, X=X_train)

ValueError: could not convert string to float: 'john ballew'

In [34]:
model.intercept_

AttributeError: 'LinearRegression' object has no attribute 'intercept_'

In [None]:
model.coef_

In [None]:
predictions = model.predict(X_test)

In [None]:
for y, y_pred in list(zip(y_test, predictions))[:5]:
    print("Real value: {:.3f} Estimated value: {:.5f}".format(y, y_pred))

In [None]:
X = X_test.reset_index().copy() # make a copy of indices and data
X["target"] = y_test.tolist()
X["prediction"] = predictions
X.head()

In [None]:
type(X)

In [None]:
sns.relplot(x="target", y="prediction", data=X, kind="scatter")

In [None]:
sns.scatterplot(x=X["CRIM"], y=X["target"], label = 'Target(truth)')
sns.scatterplot(x=X["CRIM"], y=X["prediction"], label = 'Predictions')
sns.mpl.pyplot.ylabel("Price");

## <font color='#eb3483'> Statistics </font>

In [None]:
import scipy.stats as stats

In [None]:
survey.to_csv("data/survey.csv", index=False)

In [None]:
survey.race.describe()

In [None]:
survey.age.describe()

In [None]:
survey.priors_count.describe()

In [None]:
survey.race.mode()

In [None]:
survey.sex.mode()

In [None]:
survey.groupby('race')["priors_count"].mean()

In [None]:
survey[['juv_fel_count', 'priors_count']].corr()

In [None]:
ax = sns.scatterplot(x="race", y="priors_count", data=df, hue = "is_recid")

## <font color='#eb3483'> Logistical Regression </font>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
compas = pd.read_csv('data/compas-scores.csv')
compas.head()

In [None]:
sns.pairplot(compas)

In [None]:
from ipywidgets import interact, fixed

#Create our plotting function
def plotRecidivismPercent(df, col):
    #Check out what average recidivism is for each potential value in a collumn
    compas.groupby(col).agg({'Two_yr_Recidivism':np.mean}).plot.barh()

#Let's look at all columns (you might want to drop numeric columns 
#that aren't binary but you can also just ignore that graph)
columns_to_plot = compas.drop('Two_yr_Recidivism',axis=1).columns
interact(plotRecidivismPercent, 
         col=columns_to_plot, df=fixed(compas));

In [None]:
#Get our data into the right format
X = compas.drop('Two_yr_Recidivism', axis=1)
Y = compas['Two_yr_Recidivism']
X_tr, X_test, Y_tr, Y_test = train_test_split(X,Y, test_size = 0.2)

In [None]:
Y_tr

In [None]:
#Instantiate our logistic regression model
logreg = LogisticRegression()

#Fit our training data
logreg.fit(X_tr, Y_tr)

#Predict on our test data
predictions = logreg.predict(X_test)

predictions[:10]

In [None]:
#Check accuracy
print("Accuracy: ", (predictions == Y_test).mean())

In [None]:
from sklearn.model_selection import cross_val_score
logreg = LogisticRegression()
cross_val_score(logreg, X, Y, scoring="accuracy", 
                cv=10).mean()

In [None]:
logreg = LogisticRegression()
cross_val_score(logreg, X, Y, scoring="roc_auc", 
                cv=10).mean()

In [None]:
logreg = LogisticRegression()

#Fit our training data
logreg.fit(X_tr, Y_tr)

#Predict on our test data
for col in ['African_American', 'Asian', 'Hispanic', 'Native_American', 'Other']:
    predictions = logreg.predict(X_test[X_test[col] == 1])
    print("Accuracy (%s): "%col, (predictions == Y_test[X_test[col] == 1]).mean())

## <font color='#eb3483'> Classification </font>


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(true_classes, predictions)

from sklearn import metrics
metrics.accuracy_score(true_classes, predictions)

In [None]:
metrics.precision_score(true_classes, predictions)

metrics.recall_score(true_classes, predictions)

In [None]:
metrics.f1_score(true_classes, predictions)

In [None]:
model.predict_proba(X_test) [:5]

In [None]:
df = pd.DataFrame({"true_class":true_classes,
                   "pred_class": predictions,
                   "probabilities_0":model.predict_proba(X_test)[:,0],
                    "probabilities_1":model.predict_proba(X_test)[:,1],
                  })

df["sum_probas"] = df.probabilities_0 + df.probabilities_1

df.sum_probas.head()

In [None]:
df.sample(10)

In [None]:
df.query("probabilities_1>0.5 & pred_class==0")

In [None]:
df.query("probabilities_0>0.5 & pred_class==1")