# Testing correlations that should be similar for statistical significance using two-year data COMPAS data and Steiger and Fisher methods for comparing correlations

In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)})

import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv('data/compas-scores-two-years-clean')
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,...,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,end,event,two_year_recid
0,0,1,miguel hernandez,miguel,hernandez,Male,1947-04-18,69,Greater than 45,Other,...,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,327,0,0
1,1,3,kevon dixon,kevon,dixon,Male,1982-01-22,34,25 - 45,African-American,...,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,159,1,1
2,2,4,ed philo,ed,philo,Male,1991-05-14,24,Less than 25,African-American,...,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,63,0,1
3,3,5,marcu brown,marcu,brown,Male,1993-01-21,23,Less than 25,African-American,...,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1174,0,0
4,4,6,bouthy pierrelouis,bouthy,pierrelouis,Male,1973-01-22,43,25 - 45,Other,...,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,1102,0,0


In [3]:
#Correlation between age and decile score
xy = df['age'].corr(df['decile_score'])
xy

-0.38773255375203747

In [4]:
#Correlation between age and two-year recidivism
xz = df['age'].corr(df['two_year_recid'])
xz

-0.1903234822204188

In [5]:
#Correlation between decile score and two-year recidivism
yz = df['decile_score'].corr(df['two_year_recid'])
yz

0.35070264269452134

In [6]:
df['age'].count()

7214

In [7]:
df['decile_score'].count()

7214

In [8]:
n = df['two_year_recid'].count()
n

7214

## Hypothesis test: COMPAS risk scores overestimate decline in recidivism with age

In [9]:
import numpy as np
from scipy.stats import t, norm
from math import atanh, pow
from numpy import tanh

In [10]:
def dependent_corr(xy, xz, yz, n, twotailed=True, conf_level=0.95, method='steiger'):
    """
    Calculates the statistical significance between two dependent correlation coefficients
    @param xy: correlation coefficient between x and y
    @param xz: correlation coefficient between x and z
    @param yz: correlation coefficient between y and z
    @param n: number of elements in x, y and z
    @param twotailed: whether to calculate a one or two tailed test, only works for 'steiger' method
    @param conf_level: confidence level, only works for 'zou' method
    @param method: defines the method uses, 'steiger' or 'zou'
    @return: t and p-val
    """
    d = xy - xz
    determin = 1 - xy * xy - xz * xz - yz * yz + 2 * xy * xz * yz
    av = (xy + xz)/2
    cube = (1 - yz) * (1 - yz) * (1 - yz)

    t2 = d * np.sqrt((n - 1) * (1 + yz)/(((2 * (n - 1)/(n - 3)) * determin + av * av * cube)))
    p = 1 - t.cdf(abs(t2), n - 3)

    if twotailed:
        p *= 2
    
    return t2, p

In [11]:
dependent_corr(xy, xz, yz, n, twotailed = False)

(-15.868951111828885, 0.0)

#### The p-value for the difference between age vs. recidivism and decile score vs. recidivism is 0, so we can conclude that the COMPAS risk scores overestimate the degree to which recidivism declines with age.

## Hypothesis: Correlation between decile score and recidivism is different for African-Americans than for Caucasians

In [12]:
xy = (df[(df.race == 'African-American')].decile_score).corr(df['two_year_recid'])
xy

0.33389497211860036

In [13]:
xz = (df[(df.race == 'Caucasian')].decile_score).corr(df['two_year_recid'])
xz

0.3358525164041781

### Nevermind!

# Hypothesis: Correlation between COMPAS risk scores and recidivism is higher for men than women

In [None]:
# Rationale for one-tailed hypothesis: 
# https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm

In [17]:
xy = (df[(df.sex == 'Male')].decile_score).corr(df['two_year_recid'])
xy

0.3524960732182592

In [18]:
ab = (df[(df.sex == 'Female')].decile_score).corr(df['two_year_recid'])
ab

0.3237400515740392