In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import ast, json

from datetime import datetime
import matplotlib.pyplot as plt1
%matplotlib inline

In [2]:
sns.set(rc={'figure.figsize':(30,50)})

In [3]:
df = pd.read_csv("compas-scores-raw.csv")

In [4]:
df.head()

Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,Sex_Code_Text,Ethnic_Code_Text,DateOfBirth,...,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,DisplayText,RawScore,DecileScore,ScoreText,AssessmentType,IsCompleted,IsDeleted
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,7,Risk of Violence,-2.08,4,Low,New,1,0
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,8,Risk of Recidivism,-1.06,2,Low,New,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New,1,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,7,Risk of Violence,-2.84,2,Low,New,1,0
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,8,Risk of Recidivism,-1.5,1,Low,New,1,0


## Pandas Profiling

In [5]:
import pandas_profiling

In [6]:
pandas_profiling.ProfileReport(df)

TypeError: concat() got an unexpected keyword argument 'join_axes'

## There are no duplicates

In [None]:
df.shape

In [None]:
df = df.drop_duplicates(keep='first')

In [None]:
df.shape

## Correct Types

In [None]:
df.dtypes

In [None]:
df.DateOfBirth = pd.to_datetime(df['DateOfBirth'])
df.Screening_Date = pd.to_datetime(df['Screening_Date'])

In [None]:
df.dtypes

## Columns

In [None]:
df.columns

In [None]:
columns_to_drop = ['AssessmentID','Case_ID','ScaleSet_ID','IsCompleted','IsDeleted']
df = df.drop(columns_to_drop, axis=1)
df.head()

## Rename column names

In [None]:
df.rename(columns={'Sex_Code_Text': 'Sex', 'Ethnic_Code_Text': 'Race', 'DisplayText': 'RiskType'}, inplace=True)

In [None]:
df.head()

In [None]:
df.columns

## Any Nulls?

In [None]:
df.isnull().sum()

# Any relation between race and recidivism?

In [None]:
df[df.Race == 'African-Am']

In [None]:
df.Race = df.Race.replace({
    'African-Am':'African-American'
})

In [None]:
df.head()

In [None]:
df.to_csv("compas-scores-clean.csv")

In [None]:
#race vs decile score
df.groupby(['Race', 'RiskType'])['DecileScore'].mean()

In [None]:
ax = sns.catplot(x='DecileScore',y='Race', col='RiskType', hue='Race', data=df, kind='bar', ci=False, orient='h')

In [None]:
#race vs risk of recidivism
df[df.RiskType == 'Risk of Recidivism'].groupby('Race')["Person_ID"].count().sort_values(ascending=False)

In [None]:
#race vs recommended supervision level
df.groupby('Race')['RecSupervisionLevel'].mean()

In [None]:
ax = sns.catplot(x='RecSupervisionLevel',y='Race',hue='Race', data=df, kind='bar', ci=False, orient='h')

In [None]:
#race vs rawscore
df.groupby('Race')['RawScore'].mean()

In [None]:
ax = sns.catplot(x='RawScore',y='Race',hue='Race', data=df, kind='bar', ci=False, orient='h')

In [None]:
#male vs female
df.groupby('Sex')['Person_ID'].count()

### Positive Correlation between RawScore and Scale_ID

In [None]:
sns.lmplot(x='RawScore',y='Scale_ID',data=df, hue="Race", ci=False)