In [None]:
# Import basic libraries
import numpy as np 
import pandas as pd 

# import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the COMPAS scores CSV directly from its raw GitHub URL.
COMPAS_SCORES_URL = 'https://raw.githubusercontent.com/fenago/compas-analysis/master/compas-scores.csv'
df = pd.read_csv(COMPAS_SCORES_URL)

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.isnull().sum()

# DATA PREPROCESSING

In [None]:
# List the distinct values of the key demographic and outcome columns.
for column in ('sex', 'race', 'priors_count', 'is_violent_recid', 'is_recid'):
    print(column + ' ' + str(sorted(df[column].unique())))

In [None]:
# Keep only rows whose is_recid is not -1 (presumably a "no outcome recorded"
# sentinel - confirm against the COMPAS data dictionary).
# .copy() makes the result an independent frame, so later column assignments
# (e.g. converting `prediction` to a Categorical) do not raise SettingWithCopyWarning.
df = df.loc[df['is_recid'] != -1].copy()
print('is_recid ' + str(sorted(df['is_recid'].unique())))

In [None]:
# Give two columns friendlier names for the rest of the analysis.
column_renames = {'score_text': 'prediction', 'priors_count': 'prior_charges'}
df = df.rename(columns=column_renames)
df.head()

# Univariate Analysis

In [None]:
df.describe(include='all')

In [None]:
df.info()

### Univariate Analysis of Continuous Variables

In [None]:
# Collect the column labels for each dtype family of interest.
integer_columns, float_columns, object_columns = (
    df.select_dtypes(include=[dtype_name]).columns
    for dtype_name in ('int64', 'float64', 'object')
)

# Echo each group under its own heading.
for heading, column_index in (
    ('\nint64 columns:\n', integer_columns),
    ('\nfloat64 columns:\n', float_columns),
    ('\nobject columns:\n', object_columns),
):
    print(heading, column_index)

In [None]:
# Without .columns, select_dtypes returns the matching columns as a DataFrame.
num_features = df.select_dtypes(exclude=['object'])
cat_features = df.select_dtypes(include=['object'])
# A cell only echoes its last expression, so show both types together.
type(num_features), type(cat_features)

In [None]:
# Hand-maintained list of numeric column names; used later to compute
# column means via df[num_list].
# 'prior_charges' is the renamed 'priors_count' column.
# NOTE(review): 'decile_score.1' looks like pandas' de-duplicated label for a
# second 'decile_score' column in the CSV - confirm.
num_list = ['days_b_screening_arrest', 'c_days_from_compas', 'num_r_cases',
       'r_days_from_arrest', 'num_vr_cases','id', 'age', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'prior_charges', 'is_recid', 'is_violent_recid',
       'v_decile_score', 'decile_score.1']
num_list

In [None]:
# Hand-maintained list of text (object) column names.
# 'prediction' is the renamed 'score_text' column.
# NOTE(review): cat_list is not referenced by any other cell visible in this notebook.
cat_list = ['name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age_cat', 'race', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_charge_degree', 'c_charge_desc',
       'r_case_number', 'r_charge_degree', 'r_offense_date', 'r_charge_desc',
       'r_jail_in', 'r_jail_out', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'v_type_of_assessment',
       'v_score_text', 'v_screening_date', 'type_of_assessment', 'prediction',
       'screening_date']

In [None]:
# Without .columns, select_dtypes returns the matching columns as a DataFrame.
num_features = df.select_dtypes(exclude=['object'])
cat_features = df.select_dtypes(include=['object'])
# A cell only echoes its last expression, so show both types together.
type(num_features), type(cat_features)

In [None]:
num_features.describe(include='all')

#### Now we will plot histograms for continuous columns to see the frequency distribution of values of columns.
The histogram for the age column can be plotted using the below line of code

In [None]:
sns.histplot(df.age,kde=True)

In [None]:
sns.histplot(df.prior_charges,kde=True)

#### Count plots various variables

In [None]:
sns.countplot(x="is_recid", data=df)

In [None]:
df['is_recid'].value_counts()

In [None]:
sns.countplot(x="race", data=df)

In [None]:
df['race'].value_counts()

In [None]:
sns.countplot(x="prior_charges", data=df)

In [None]:
df['prior_charges'].value_counts()

In [None]:
sns.countplot(x="age_cat", data=df)

In [None]:
df['age_cat'].value_counts()

In [None]:
# NOTE(review): this repeats the prior_charges count plot drawn a few cells earlier;
# possibly a different column (e.g. juv_other_count) was intended - confirm.
sns.countplot(x="prior_charges", data=df)

In [None]:
# NOTE(review): this repeats the prior_charges value counts shown a few cells earlier;
# possibly a different column was intended - confirm.
df['prior_charges'].value_counts()

In [None]:
sns.countplot(x="juv_fel_count", data=df)

In [None]:
df['juv_fel_count'].value_counts()

In [None]:
sns.countplot(x="juv_misd_count", data=df)

In [None]:
df['juv_misd_count'].value_counts()

# Bivariate Analysis

#### Bivariate Analysis of Categorical Variables vs Categorical Variables:

In [None]:
# Recidivism counts split by sex, drawn on a 15x10 figure.
figure_settings = {'figure.figsize': (15, 10)}
sns.set(rc=figure_settings)
sex = sns.countplot(data=df, hue='is_recid', x='sex')
plt.show()

In [None]:
pd.crosstab(df.sex,df.is_recid,margins=True)

In [None]:
pd.crosstab(df.sex,df.is_recid,normalize='index')

In [None]:
# Recidivism counts split by race.
sns.set(rc={'figure.figsize': (15, 10)})
# The Axes was previously stored in a variable named `edu`, which has nothing
# to do with this plot; use a descriptive name and give the figure a title.
race_ax = sns.countplot(x='race', hue='is_recid', data=df)
race_ax.set_title('Recidivism counts by race')
plt.show()

In [None]:
pd.crosstab(df.race,df.is_recid,margins=True)

In [None]:
pd.crosstab(df.race,df.is_recid,normalize='index')

In [None]:
# Recidivism counts split by number of prior charges, drawn on a 15x10 figure.
figure_settings = {'figure.figsize': (15, 10)}
sns.set(rc=figure_settings)
priors = sns.countplot(data=df, hue='is_recid', x='prior_charges')
plt.show()

In [None]:
pd.crosstab(df.prior_charges,df.is_recid,margins=True)

In [None]:
pd.crosstab(df.prior_charges,df.is_recid,normalize='index',margins=True)

In [None]:
# Recidivism counts split by juvenile felony count.
sns.set(rc={'figure.figsize': (15, 10)})
# The Axes was previously stored in a variable named `edu`, which has nothing
# to do with this plot; use a descriptive name and give the figure a title.
juv_fel_ax = sns.countplot(x='juv_fel_count', hue='is_recid', data=df)
juv_fel_ax.set_title('Recidivism counts by juvenile felony count')
plt.show()

In [None]:
pd.crosstab(df.juv_fel_count,df.is_recid,margins=True)

In [None]:
pd.crosstab(df.juv_fel_count,df.is_recid,normalize='index',margins=True)

In [None]:
# Recidivism counts split by age category.
sns.set(rc={'figure.figsize': (15, 10)})
# The Axes was previously stored in a variable named `edu`, which has nothing
# to do with this plot; use a descriptive name and give the figure a title.
age_cat_ax = sns.countplot(x='age_cat', hue='is_recid', data=df)
age_cat_ax.set_title('Recidivism counts by age category')
plt.show()

In [None]:
pd.crosstab(df.age_cat,df.is_recid,margins=True)

In [None]:
pd.crosstab(df.age_cat,df.is_recid,normalize='index',margins=True)

We can also draw line plots and scatterplots to see a relation between the two continuous variables.

In [None]:
# Pass x and y by keyword: seaborn >= 0.12 no longer accepts them positionally
# (earlier 0.11 releases only warned about it).
sns.scatterplot(data=df, x='age', y='r_days_from_arrest')
# plt.ylim(0,10)

#### Bivariate Analysis of Categorical Variables vs Continuous Variables:
GroupBy: First, we will perform the GroupBy operation on the continuous variables. Groupby allows us to split our data into separate groups to perform computations for better analysis.

In [None]:
# Get a global view of all continuous variables with respect to a single categorical variable
# This can take a long time to run - you may want to limit the charts
# this is for the eyeball test.  look for obvious patterns

# sns.pairplot(data=df[num_list],hue='is_recid')

In [None]:
# by=<categorical variable>; every numeric (continuous) column is then averaged per group.
# numeric_only=True skips the text columns, which pandas >= 2.0 would otherwise
# try to average and fail on with a TypeError.
df.groupby(by='is_recid').mean(numeric_only=True)

In [None]:
# Select the columns of interest BEFORE aggregating so that only these numeric
# columns are averaged (averaging every column first fails on text columns in
# pandas >= 2.0 and does needless work in older versions).
df.groupby(by='is_recid')[['age', 'prior_charges', 'decile_score']].mean()

KDE Plots with Hue: A kernel density estimate (KDE) plot is a method for visualizing the distribution of observations in a dataset, analogous to a histogram. KDE represents the data using a continuous probability density curve in one or more dimensions.

We will plot KDE plots of continuous variables with `hue='is_recid'`.

In [None]:
# x = <NUMERIC VARIABLE>, hue = <CATEGORICAL VARIABLE>
plt.figure(figsize=(12,8))
sns.kdeplot(data=df,x='age',hue='is_recid',fill=True)

In [None]:
plt.figure(figsize=(12,8))
sns.kdeplot(data=df,x='prior_charges',hue='is_recid',fill=True)

In [None]:
plt.figure(figsize=(12,8))
sns.kdeplot(data=df,x='v_decile_score',hue='is_recid',fill=True)

In the above plots, we can see how the distribution of each variable behaves separately for the recidivist (`is_recid = 1`) and non-recidivist (`is_recid = 0`) cases.

#### Bivariate Analysis of Categorical Variables vs Categorical Variables:

In [None]:
df['is_recid'].value_counts()

In [None]:
# This is the overall (global) recidivism rate across ALL rows.
# Use it as the baseline: compare each feature's per-group rate with this value
# to see whether the group sits above or below it.
df['is_recid'].value_counts(normalize=True)

In [None]:
# Compare the values to the global mean.  if it is close to the global mean then it doesn't have a lot of predictive power
# if it is far from the global mean - then there is something special about the variable that gives it predictive power
pd.crosstab(df.race,df.is_recid,normalize='index')

In [None]:
pd.crosstab(df.age_cat,df.is_recid,normalize='index')

In [None]:
cat_features.dtypes

In [None]:
# by=<categorical variable>; the selected columns are continuous variables.
# Select the columns BEFORE aggregating so that only these numeric columns are
# averaged (averaging every column first fails on text columns in pandas >= 2.0).
df.groupby(by='is_recid')[['age', 'prior_charges']].mean()

In [None]:
# Means test: the GLOBAL average of every column in num_list.
# Compare each subgroup's averages with these values and look for large gaps.
# Important: all of this assumes an even distribution of the data.
df[num_list].mean()

In [None]:
# Averages of the num_list columns for rows whose race is 'African-American'.
is_african_american = df['race'] == 'African-American'
df.loc[is_african_american, num_list].mean()

In [None]:
# Averages of the num_list columns for rows whose sex is 'Female'.
is_female = df['sex'] == 'Female'
df.loc[is_female, num_list].mean()

In [None]:
# Use Crosstabs to validate individual hypothesis
pd.crosstab(df.sex,df.is_recid,normalize='index',margins=True)

# Correlations

In [None]:
num_features.corr()

In [None]:
sns.set(rc={'figure.figsize':(30,10)})
sns.set_context("talk", font_scale=0.7)

In [None]:
# Skip the first column, then keep only numeric columns: a correlation cannot be
# computed on text columns, and pandas >= 2.0 raises instead of silently dropping them.
heatmap_source = df.iloc[:, 1:].select_dtypes(include='number')
sns.heatmap(heatmap_source.corr(method='spearman'), cmap='rainbow_r', annot=True)

In [None]:
# Spearman correlation of every numeric column with is_recid.
# Text columns are excluded up front because Series.corr cannot process them.
numeric_predictors = df.select_dtypes(include='number').drop(columns='is_recid')
numeric_predictors.apply(lambda col: col.corr(df['is_recid'], method='spearman'))

# Multivariate Analysis

In [None]:
plt.style.use('ggplot')

In [None]:
# Single index: mean of every numeric (continuous) column for each sex.
# `values` is limited to numeric columns because the default aggfunc ('mean')
# raises TypeError on text columns in pandas >= 2.0; older versions silently
# dropped those columns, so the resulting table is the same.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
table = pd.pivot_table(data=df, index=['sex'], values=numeric_cols)
table

In [None]:
# Multiple indexes: mean of every numeric column for each (sex, race) pair.
# `values` is limited to numeric columns because the default aggfunc ('mean')
# raises TypeError on text columns in pandas >= 2.0.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
table = pd.pivot_table(df, index=['sex', 'race'], values=numeric_cols)
table

In [None]:
# Mean of every numeric column for each (sex, race, is_recid) combination.
# `values` is limited to numeric columns (the default 'mean' fails on text columns
# in pandas >= 2.0) and must not repeat a column already used as an index key.
index_cols = ['sex', 'race', 'is_recid']
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in index_cols
]
table = pd.pivot_table(df, index=index_cols, values=numeric_cols)
table

In [None]:
# Different aggregate functions per column.
# String aliases are used instead of np.mean / np.sum: pandas >= 2.1 emits a
# FutureWarning for the NumPy callables and recommends the string names.
table = pd.pivot_table(
    df,
    index=['sex', 'race'],
    aggfunc={'age': 'mean', 'is_recid': 'mean', 'is_violent_recid': 'sum'},
)
table

In [None]:
# Aggregate only specific features with the `values` parameter.
# 'mean' (string alias) replaces np.mean, which triggers a FutureWarning in pandas >= 2.1.
table = pd.pivot_table(df, index=['sex', 'race'], values=['is_recid'], aggfunc='mean')
table

In [None]:
table.plot(kind='bar');

In [None]:
# columns
# Use the `columns` parameter to spread one feature (race) across the column axis.

# 'sum' (string alias) replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
table = pd.pivot_table(df, index=['sex'], columns=['race'], values=['is_recid'], aggfunc='sum')
table

In [None]:
# Mean of is_recid for each sex (rows) and race (columns).
# 'mean' (string alias) replaces np.mean, which triggers a FutureWarning in pandas >= 2.1.
table = pd.pivot_table(df, index=['sex'], columns=['race'], values=['is_recid'], aggfunc='mean')
table

In [None]:
table.plot(kind='bar');

In [None]:
# Mean of every numeric column for each (race, is_recid) combination;
# dropna=False keeps columns whose entries are all NaN.
# `values` is limited to numeric columns (the default 'mean' fails on text columns
# in pandas >= 2.0) and must not repeat a column already used as an index key.
index_cols = ['race', 'is_recid']
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in index_cols
]
table = pd.pivot_table(df, index=index_cols, values=numeric_cols, dropna=False)
table

In [None]:
# Row counts for each (race, is_recid) group, split by COMPAS prediction;
# combinations with no rows are shown as 0.
table = pd.pivot_table(
    df,
    index=['race', 'is_recid'],
    columns='prediction',
    aggfunc='size',
    fill_value=0,
)
table

In [None]:
# Variance of every numeric column for each (race, is_recid) group, split by prediction.
# `values` is limited to numeric columns because 'var' raises TypeError on text
# columns in pandas >= 2.0, and must not repeat a column used as an index key.
index_cols = ['race', 'is_recid']
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in index_cols
]
table = df.pivot_table(
    index=index_cols,
    columns='prediction',
    values=numeric_cols,
    aggfunc='var',
    fill_value=0,
)
table

In [None]:
def mad(values):
    """Return the mean absolute deviation of `values` around their mean.

    Stands in for the 'mad' aggregation alias, which was removed in pandas 2.0.
    The function name doubles as the label of the resulting column.
    """
    return (values - values.mean()).abs().mean()


# Many summary statistics of age per prediction level; dropna=False keeps the
# group of rows whose prediction is missing.
aggfuncs = ['count', 'sum', 'sem', 'skew', 'mean', 'min', 'max', 'std', 'quantile',
            'nunique', mad, 'size', pd.Series.mode, 'var', 'unique']
df.groupby('prediction', dropna=False)['age'].agg(aggfuncs)

In [None]:
# Row counts for each (race, is_recid) group, split by prediction.
# NOTE(review): margins_name only takes effect together with margins=True, which is
# not passed here, so this table contains no 'All' totals row/column.
table = df.pivot_table(index=['race','is_recid'], 
               columns='prediction',
               aggfunc='size',
               fill_value=0,
                      margins_name='All',)
table

In [None]:
# Mean of every numeric column for each (race, is_recid) group, split by prediction.
# `values` is limited to numeric columns because 'mean' raises TypeError on text
# columns in pandas >= 2.0, and must not repeat a column used as an index key.
index_cols = ['race', 'is_recid']
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in index_cols
]
table = df.pivot_table(
    index=index_cols,
    columns='prediction',
    values=numeric_cols,
    aggfunc='mean',
)
table

In [None]:
# Number ('sum') and share ('mean') of recidivists for each (race, prediction) group.
# String aliases replace the builtin sum / np.mean, which trigger a FutureWarning
# in pandas >= 2.1; the resulting column labels ('sum', 'mean') are unchanged.
table = pd.pivot_table(df, values=['is_recid'], index=['race', 'prediction'],
                    aggfunc={'is_recid': ['sum', 'mean']})
table

In [None]:
# Order the risk labels High -> Medium -> Low for every later table and plot.
# assign() builds a new frame rather than writing into a possibly-sliced one,
# which avoids SettingWithCopyWarning.
df = df.assign(
    prediction=pd.Categorical(df["prediction"], categories=["High", "Medium", "Low"])
)

# NOTE(review): `prediction` was previously listed in BOTH index and columns, which
# cannot produce a meaningful cross-tabulation; the index now uses is_recid, matching
# the neighbouring pivot tables - confirm this is the intended breakdown.
# `values` is limited to numeric columns because the default 'mean' aggregation
# raises TypeError on text columns in pandas >= 2.0.
index_cols = ['race', 'is_recid']
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in index_cols
]
pd.pivot_table(df, index=index_cols, columns="prediction", values=numeric_cols, dropna=False)


Creating a Multi-Index Pandas Crosstab
You can add multiple indices (rows) to a crosstab as well. This can be done by passing a list of variables to the crosstab function.

Say you wanted to break items down by race and is_recid (the rows) --- and columns having prediction

In [None]:
# Use the Crosstab for multivariate
pd.crosstab([df.race, df.is_recid], df.prediction)

You can change the names of the labels. Let’s change them to better reflect what the data represents.

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,rownames=['RACE', 'Reoffend?'])

Similar to adding multiple rows, you can also add multiple columns by passing a list of variables as the second argument. The example below repeats the crosstab above and additionally labels the column axis with `colnames`.

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,rownames=['RACE', 'Reoffend?'], colnames=['COMPAS PREDICTION'])

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,rownames=['RACE', 'Reoffend?'], colnames=['COMPAS PREDICTION'],margins=True, margins_name='Total')

### Normalize a Pandas Crosstab for Row/Column Percentages 
A key benefit of the crosstab function over the Pandas Pivot Table function is that it allows you to normalize the resulting dataframe, returning values displayed as percentages.
 <br />
This goes one step further – the normalize argument accepts a number of different options: <br />
 <br />
‘all’ or True – normalizes the values across the entire dataframe (as a percentage of the total across rows and columns) <br />
‘index’ – normalizes across rows <br />
‘columns’ – normalizes down columns <br />
If the margins argument is set to True, the totals will also be normalized. <br />

It is usually best to normalize by index (rows) until you are confident you can interpret the other options.

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='index')

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='columns')

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='all')

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='all',margins=True, margins_name='Total')

In [None]:
# African American v. All other sub-groups
pd.crosstab([df.race=="African-American", df.is_recid], df.prediction,normalize='index')

In [None]:
pd.crosstab([df.race=="Caucasian", df.is_recid], df.prediction,normalize='index')

Plotting your Multivariate

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='index').plot.bar(stacked=True)

In [None]:
pd.crosstab([df.race, df.is_recid], df.prediction,normalize='index').plot.bar()

In [None]:
pd.crosstab([df.race, df.sex, df.is_recid], df.prediction,normalize='index')

# Automated EDA Tooling

In [None]:
#Installing the library
!pip install dataprep
#Importing 
from dataprep.eda import create_report
#Creating report
create_report(df)

In [None]:
!pip install skimpy

In [None]:
from skimpy import skim
skim(df)

In [None]:
#Installing the library 
!pip install sweetviz
#Importing the library 
import sweetviz as sv
report = sv.analyze(df)
report.show_html()

In [None]:
# Split the data set: a reproducible 80% random sample (fixed random_state) for
# training, and the rows that were not sampled for testing
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)
# Compare the two splits side by side in a sweetviz report
report2 = sv.compare([training_data,"TRAINING SET"], [testing_data, "TESTING SET"])
report2.show_html()

In [None]:
# Compare one sub-population against everyone else within the same frame.
# The condition previously tested a "color" column, which this dataset does not
# have (KeyError); use the race split that the crosstab analysis above examines.
report3 = sv.compare_intra(
    df,
    df["race"] == "African-American",
    ["African-American", "The rest"],
)
report3.show_html()