# Analysis on COMPAS Recidivism Data
### By: Tyler Rosselli, Kyle Mettler, Ethan Wong

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import ast, json

from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seaborn/matplotlib defaults: new figures are 30 x 50 (width x height, inches).
default_rc = {'figure.figsize': (30, 50)}
sns.set(rc=default_rc)

In [None]:
df = pd.read_csv("compas-scores-raw.csv")

In [None]:
df.head()

## Pandas Profiling

In [None]:
import pandas_profiling

In [None]:
pandas_profiling.ProfileReport(df)

## There are no duplicates

In [None]:
df.shape

In [None]:
df = df.drop_duplicates(keep='first')

In [None]:
df.shape

## Correct Types

In [None]:
df.dtypes

In [None]:
# Convert the two date columns from strings to datetime64 values.
# NOTE(review): if DateOfBirth is stored with two-digit years, to_datetime may
# place some birth years in the 2000s -- confirm against the raw file.
for date_column in ('DateOfBirth', 'Screening_Date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
df.dtypes

## Columns

In [None]:
df.columns

In [None]:
# Columns excluded from the rest of the analysis.
columns_to_drop = [
    'AssessmentID',
    'Case_ID',
    'ScaleSet_ID',
    'IsCompleted',
    'IsDeleted',
]
df = df.drop(columns=columns_to_drop)
df.head()

## Rename column names

In [None]:
# Shorter column names used throughout the rest of the notebook.
column_aliases = {
    'Sex_Code_Text': 'Sex',
    'Ethnic_Code_Text': 'Race',
    'DisplayText': 'RiskType',
}
df = df.rename(columns=column_aliases)

In [None]:
df.head()

In [None]:
df.columns

## Any Nulls?

In [None]:
df.isnull().sum()

## Rest of cleaning and exporting data

In [None]:
df[df.Race == 'African-Am'].head()

In [None]:
# Spell out the abbreviated 'African-Am' label so it matches 'African-American'.
df['Race'] = df['Race'].replace('African-Am', 'African-American')

In [None]:
df.head()

In [None]:
# Persist the cleaned frame. With pandas' default index=True the row index is
# also written, as an extra unnamed first column.
df.to_csv("compas-scores-clean.csv")

# Race vs Recidivism

In [None]:
# For plotting, fold the smaller race groups into a single 'Other' category.
minority_labels = ['Asian', 'Oriental', 'Arabic', 'Native American']
df['Race'] = df['Race'].replace(dict.fromkeys(minority_labels, 'Other'))

# Decile Score

In [None]:
# Mean decile score for every (Race, RiskType) combination.
decile_by_group = df.groupby(['Race', 'RiskType'])['DecileScore']
decile_by_group.mean()

In [None]:
# Horizontal bars of the mean decile score per race, one panel per risk type.
# Note: catplot returns a FacetGrid, even though it is bound to `ax` here.
ax = sns.catplot(
    data=df,
    x='DecileScore',
    y='Race',
    col='RiskType',
    kind='bar',
    orient='h',
    ci=False,
)
ax

In [None]:
# Histogram of decile scores for each race (10 bins, no KDE), two panels per row.
# Observation: African-Americans have the most evenly distributed decile scores.
g = sns.FacetGrid(df, col='Race', col_wrap=2).map(
    sns.distplot, 'DecileScore', kde=False, bins=10
)

# Hypothesis Test 1

Question: Do African-Americans receive higher recidivism scores than Caucasians?

In [None]:
# Split the frame by race and pull out the decile scores for the two groups.
african_american = df.loc[df['Race'] == 'African-American']
caucasian = df.loc[df['Race'] == 'Caucasian']
african_american_scores = african_american['DecileScore']
caucasian_scores = caucasian['DecileScore']

In [None]:
african_american_scores.describe()

In [None]:
caucasian_scores.describe()

In [None]:
from statsmodels.stats.weightstats import ttest_ind
from scipy import stats

In [None]:
# One-sided, pooled-variance t-test: the alternative hypothesis is that the
# African-American mean decile score is larger than the Caucasian mean.
# statsmodels returns (t statistic, p-value, degrees of freedom); the third
# value is kept under the name `ddof`.
score_test = ttest_ind(
    african_american_scores,
    caucasian_scores,
    alternative="larger",
    usevar="pooled",
)
tstat, pvalue, ddof = score_test

In [None]:
print(pvalue)

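The printed p-value is 0 (too small to be distinguished from zero), so we reject the null hypothesis in favor of the alternative: African-Americans receive higher recidivism scores than Caucasians. This shows a statistically significant difference in mean scores; it is evidence, not proof.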

# Recommended Supervision Level

In [None]:
# Mean recommended supervision level for each race.
supervision_by_race = df.groupby('Race')['RecSupervisionLevel']
supervision_by_race.mean()

In [None]:
# Horizontal bars of the mean recommended supervision level per race.
ax = sns.catplot(
    data=df,
    x='RecSupervisionLevel',
    y='Race',
    kind='bar',
    orient='h',
    ci=False,
)

# Hypothesis Test 2

Question: Do African-Americans receive higher recommended supervision levels than Caucasians?

In [None]:
# Split the frame by race and pull out the recommended supervision levels.
african_american = df.loc[df['Race'] == 'African-American']
caucasian = df.loc[df['Race'] == 'Caucasian']
african_american_lvl = african_american['RecSupervisionLevel']
caucasian_lvl = caucasian['RecSupervisionLevel']

In [None]:
african_american_lvl.describe()

In [None]:
caucasian_lvl.describe()

In [None]:
# One-sided, pooled-variance t-test: the alternative hypothesis is that the
# African-American mean supervision level is larger than the Caucasian mean.
# statsmodels returns (t statistic, p-value, degrees of freedom); the third
# value is kept under the name `ddof`.
level_test = ttest_ind(
    african_american_lvl,
    caucasian_lvl,
    alternative="larger",
    usevar="pooled",
)
tstat, pvalue, ddof = level_test

In [None]:

print(pvalue)

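The printed p-value is 0 (too small to be distinguished from zero), so we reject the null hypothesis in favor of the alternative: African-Americans receive higher recommended supervision levels than Caucasians. This shows a statistically significant difference in mean levels; it is evidence, not proof.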

# Linear Regression Model on compas-scores-raw

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the numeric columns, replace missing values with 0 and discard the
# Person_ID column before modelling.
numerical_data = (
    df.select_dtypes(include=np.number)
      .fillna(0)
      .drop(columns='Person_ID')
)
numerical_data.head()

In [None]:
# The model predicts DecileScore from every other numeric column.
target_variable = 'DecileScore'
independent_variables = numerical_data.columns.drop(target_variable)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 13}
X_train, X_test, y_train, y_test = train_test_split(
    numerical_data[independent_variables],  # X
    numerical_data[target_variable],        # y
    **split_options,
)

In [None]:
# Report the shape of each piece of the train/test split.
split_parts = (
    ('X train', X_train),
    ('y train', y_train),
    ('X test', X_test),
    ('y test', y_test),
)
for part_label, part in split_parts:
    print(part_label, part.shape)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
# Combine the test features with the true and predicted decile scores so they
# can be plotted side by side.
X = X_test.reset_index().assign(
    DecileScore=y_test.tolist(),
    prediction=predictions,
)
X.head()

In [None]:
sns.relplot(x="DecileScore", y="prediction", data=X, kind="scatter")

In [None]:
# Overlay true and predicted decile scores against the recommended
# supervision level on the same axes.
sns.scatterplot(data=X, x="RecSupervisionLevel", y="DecileScore", label='True')
sns.scatterplot(data=X, x="RecSupervisionLevel", y="prediction", label='Predictions')
plt.ylabel("DecileScore")

# Using two year data

In [None]:
recid = pd.read_csv("compas-scores-two-years.csv")

In [None]:
# Show up to 53 columns when displaying a frame. The fully qualified key is
# used because the short pattern 'max_columns' can match more than one option
# in newer pandas (e.g. 'display.max_columns' and 'styler.render.max_columns'),
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', 53)

In [None]:
recid.head()

In [None]:
recid.isnull().sum()

In [None]:
# Columns removed from the two-year data before the analysis.
drop_columns = [
    'compas_screening_date',
    'juv_fel_count',
    'juv_misd_count',
    'c_case_number',
    'vr_charge_degree',
    'vr_offense_date',
    'vr_charge_desc',
    'c_arrest_date',
    'r_case_number',
    'vr_case_number',
    'start',
    'juv_other_count',
    'days_b_screening_arrest',
    'c_days_from_compas',
    'decile_score.1',
    'priors_count.1',
]
recid = recid.drop(columns=drop_columns)

In [None]:
recid.shape

In [None]:
recid.drop(['violent_recid'],axis=1).head()

In [None]:
recid.dtypes.head()

In [None]:
# Convert the date-of-birth and offense-date columns to datetime64 values.
for date_column in ('dob', 'c_offense_date'):
    recid[date_column] = pd.to_datetime(recid[date_column])

In [None]:
recid.dtypes.head()

In [None]:
recid.shape

In [None]:
# Save the cleaned two-year data. The '.csv' extension was missing from the
# original file name; it is added so the export is recognised as CSV and
# follows the same naming as 'compas-scores-clean.csv' written earlier.
# With pandas' default index=True the row index is also written.
recid.to_csv('compas-scores-two-years-clean.csv')


### Calculate Average Decile Score

In [None]:
# Average of 'v_decile_score', 'decile_score' and 'decile_score.1'.
# 'decile_score.1' is listed in drop_columns earlier in the notebook, so
# indexing it directly raises KeyError on a top-to-bottom run; fall back to
# 'decile_score' when the '.1' column is no longer present.
# NOTE(review): this assumes 'decile_score.1' is the duplicate of
# 'decile_score' that read_csv renamed -- confirm against the raw CSV.
second_decile = (
    recid['decile_score.1']
    if 'decile_score.1' in recid.columns
    else recid['decile_score']
)
recid['Ave_score'] = (
    recid['v_decile_score'] + recid['decile_score'] + second_decile
) / 3

In [None]:
recid = recid.drop_duplicates(keep='first')

In [None]:
recid.shape

In [None]:
recid.head()

In [None]:
#values equal to the two year recidivism rate within each race
two_year_recid = recid.groupby('race')[['two_year_recid']].mean()
two_year_recid.sort_values(by = 'two_year_recid', ascending = False)

In [None]:
recid.head()

In [None]:
recid[recid.race == 'African-American']

In [None]:
recid.groupby('age_cat')['two_year_recid'].mean()

In [None]:
# Numeric columns only, with missing values replaced by 0, for the
# correlation analysis below.
numeric_only = recid.select_dtypes(include=np.number)
numerical_recid = numeric_only.fillna(0)
numerical_recid.head()

In [None]:
sns.heatmap(numerical_recid.corr(), annot = True, square = True)

### Age vs. decile score and age vs. recidivism

In [None]:
sns.catplot(data=recid, x='age', y='decile_score', kind = 'box', aspect = 3)

In [None]:
numerical_recid['age'].corr(numerical_recid['decile_score'])

In [None]:
sns.catplot(data=recid, x='age', y='is_recid', aspect = 3, kind = 'box', orient = 'h', order = [1,0])

In [None]:
numerical_recid['age'].corr(numerical_recid['is_recid'])

## Decile score and recidivism compared by race

In [None]:
# Horizontal box plots of decile score split by is_recid (1 listed first),
# one panel per race.
sns.catplot(
    data=recid,
    x="decile_score",
    y='is_recid',
    col='race',
    kind="box",
    orient='h',
    order=[1, 0],
    height=5,
)

## Mean two-year recidivism rate grouped by average decile score

In [None]:
recid.groupby(['Ave_score'])['two_year_recid'].mean()

### Correlation between average score and two-year recidivism

In [None]:
recid['two_year_recid'].corr(recid['Ave_score'])

### Hypothesis Test: Caucasian Recid vs. African American Recid

Question: Do African-Americans recidivate more than Caucasians?
    

In [None]:
# two_year_recid values for the Caucasian rows.
caucasian_recid = recid.loc[recid['race'] == "Caucasian", 'two_year_recid']
caucasian_recid

In [None]:
# two_year_recid values for the African-American rows.
aa_recid = recid.loc[recid['race'] == "African-American", 'two_year_recid']
aa_recid

In [None]:
print("AA: \n" ,aa_recid.describe())
print("Caucasian: \n", caucasian_recid.describe())

In [None]:
# The question is directional ("do African-Americans recidivate more?"), so
# run a one-sided test with the African-American sample first, matching the
# two earlier hypothesis tests. The original call was two-sided with the
# groups in the opposite order, which did not test the stated hypothesis.
# statsmodels returns (t statistic, p-value, degrees of freedom); the third
# value is kept under the name `ddof`.
tstat, pvalue, ddof = ttest_ind(
    aa_recid,
    caucasian_recid,
    alternative="larger",
    usevar="pooled",
)

In [None]:
print(pvalue,'\n', tstat,'\n',ddof)

With such a low p-value, we can conclude that African-Americans recidivate more than Caucasians.

Earlier, we found that African-Americans receive higher recidivism scores than Caucasians. We have now found that they also recidivate at a higher rate.

In [None]:
# Compare two-year recidivism between the '25 - 45' and 'Less than 25' age
# categories with statsmodels' default t-test (two-sided, pooled variance).
mid_age = recid.loc[recid['age_cat'] == '25 - 45', 'two_year_recid']
under_25 = recid.loc[recid['age_cat'] == 'Less than 25', 'two_year_recid']
tstat, pvalue, ddof = ttest_ind(mid_age, under_25)

In [None]:
print(pvalue)

In [None]:
recid.head()

In [None]:
recid.head()