# 1 Data Preprocessing

In [2]:
import numpy as np
from numpy import mean, std
import scipy.optimize as optim
import pandas as pd
import math
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
import time
from statistics import mean

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [5]:
csv_file_path = './compas-analysis-master/compas-scores-two-years-violent.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(csv_file_path)


In [6]:
df = data[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count',
           'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out','is_violent_recid']]
df.info()
df = df.loc[df['race'].isin(('African-American', 'Caucasian'))]
unique_races = set(data['race'])
print(unique_races)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4743 entries, 0 to 4742
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               4743 non-null   int64 
 1   c_charge_degree   4743 non-null   object
 2   race              4743 non-null   object
 3   age_cat           4743 non-null   object
 4   score_text        4738 non-null   object
 5   sex               4743 non-null   object
 6   priors_count      4743 non-null   int64 
 7   decile_score      4743 non-null   int64 
 8   is_recid          4743 non-null   int64 
 9   two_year_recid    4743 non-null   int64 
 10  c_jail_in         4475 non-null   object
 11  c_jail_out        4475 non-null   object
 12  is_violent_recid  4743 non-null   int64 
dtypes: int64(6), object(7)
memory usage: 481.8+ KB
{'Other', 'Asian', 'African-American', 'Native American', 'Hispanic', 'Caucasian'}


In [7]:
df = data[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count',
           'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out','is_violent_recid']]


df = df.loc[df['race'].isin(('African-American', 'Caucasian'))]
df.loc[df["race"] == "African-American", "race"] = 0
df.loc[df["race"] == "Caucasian", "race"] = 1

df = df.replace({'sex': 'Male'}, 1)
df = df.replace({'sex': 'Female'}, 0)

df = df.loc[df['is_recid'] != -1]
df = df.loc[df['c_charge_degree'] != 'O']
df = df.loc[df['score_text'] != 'N/A']

df['length_of_stay'] = (df['c_jail_out'].apply(pd.to_datetime) - df['c_jail_in'].apply(pd.to_datetime)).dt.days
df = df.dropna(subset = ['length_of_stay'])
df['length_of_stay'] = df['length_of_stay'].apply(lambda length_of_stay: 0 if length_of_stay <= 7 else (2 if length_of_stay > 90 else 1))
df = df.drop(columns=['c_jail_in', 'c_jail_out'])

df['priors_count'] = df['priors_count'].apply(lambda priors_count: 0 if priors_count == 0 else (2 if priors_count > 3 else 1))

df = df.replace({'age_cat': 'Less than 25'}, 0)
df = df.replace({'age_cat': '25 - 45'}, 1)
df = df.replace({'age_cat': 'Greater than 45'}, 2)

df = df.replace({'c_charge_degree': 'F'}, 0)
df = df.replace({'c_charge_degree': 'M'}, 1)

df = df.replace({'score_text': 'Low'}, 0)
df = df.replace({'score_text': 'Medium'}, 1)
df = df.replace({'score_text': 'High'}, 2)

df = df.drop_duplicates()

df.tail()

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,decile_score,is_recid,two_year_recid,is_violent_recid,length_of_stay
4732,32,0,0,1,1.0,1,2,5,0,0,0,0
4733,23,0,1,0,2.0,1,0,8,0,0,0,0
4737,23,1,1,0,2.0,1,2,10,1,1,1,1
4738,20,0,0,0,2.0,1,0,9,0,0,0,0
4742,33,1,0,1,0.0,0,1,2,0,0,0,0


In [8]:
df.isna().sum()

age                 0
c_charge_degree     0
race                0
age_cat             0
score_text          4
sex                 0
priors_count        0
decile_score        0
is_recid            0
two_year_recid      0
is_violent_recid    0
length_of_stay      0
dtype: int64

In [9]:
df = df.dropna(subset=['score_text'])

In [10]:
df.isna().sum()

age                 0
c_charge_degree     0
race                0
age_cat             0
score_text          0
sex                 0
priors_count        0
decile_score        0
is_recid            0
two_year_recid      0
is_violent_recid    0
length_of_stay      0
dtype: int64

# 2 Handling Conditional Discrimination (LM and LPS)

## 2.1 Baselinemodel

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from tensorflow import keras
# from keras.layers import Dense, Input
from tensorflow.keras import Model
import scipy.stats as ss
import numpy as np
from sklearn.metrics import classification_report

In [14]:
x = df.drop(['two_year_recid'],1)
y = df['two_year_recid']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/7)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 1/6)

In [16]:
start = time.time()
model = LogisticRegression()
model.fit(x_train, y_train)
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
end = time.time()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 2.2 Local Massaging Model

In [17]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
clf.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.990521327014218

In [18]:
#Calculate Posterior probability and then rank
predicted_prob = model.predict_proba(x)
predicted_prob

array([[1.00924413e-01, 8.99075587e-01],
       [9.99455668e-01, 5.44332349e-04],
       [7.05235831e-02, 9.29476417e-01],
       ...,
       [4.00378841e-02, 9.59962116e-01],
       [9.97929070e-01, 2.07093029e-03],
       [9.99207116e-01, 7.92884165e-04]])

In [19]:
#Selecting right Col for class_1: crisis
pred_crisis = predicted_prob[:,1]
#Updating features in X
x['pred_crisis'] = pred_crisis
x['two_year_recid'] = y

In [20]:
#race: Caucasian = 1; African-American = 0
x_c = x[x['race'] == 1]
x_a = x[x['race'] == 0]

In [21]:
#Algo 4: subroutine DELTA(race)
G_c = x_c.shape[0]
G_a = x_a.shape[0]
print(G_c, G_a)

1255 1699


In [22]:
#To those Caucasian whose class is 0 (predicted_crisis > 0.5)
p_c_c = x_c[x_c['pred_crisis']>0.5].shape[0]/G_c
#To those African-American whose class is 0 (predicted_crisis > 0.5)
p_c_a = x_a[x_a['pred_crisis'] > 0.5].shape[0]/G_a
p_star_c = (p_c_c+p_c_a)/2

#To calculate DELTA(Caucasian)
delta_c = G_c*abs(p_c_c - p_star_c)
#To calculate DELTA(African-American)
delta_a = G_a*abs(p_c_a - p_star_c)
print(delta_c, delta_a)

53.314008240141234 72.1756972111554


In [23]:
delta_c = 53
delta_a = 72

In [24]:
x_c_0 = x_c[x_c['two_year_recid'] == 0]
x_c_sorted = x_c_0.sort_values(by = 'pred_crisis', ascending = False)
x_c_sorted = x_c_sorted[x_c_sorted['pred_crisis']>0.5]
x_c_sorted

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,decile_score,is_recid,is_violent_recid,length_of_stay,pred_crisis,two_year_recid
4693,25,1,1,1,2.0,1,2,8,1,1,1,0.956235,0
3254,49,1,1,2,2.0,1,2,8,1,1,0,0.944646,0
609,26,1,1,1,0.0,1,2,4,1,1,1,0.937752,0
4366,56,1,1,2,1.0,1,2,7,1,1,0,0.93483,0
1404,48,1,1,2,2.0,1,1,8,1,1,0,0.923182,0
2857,36,0,1,1,0.0,1,1,2,1,1,0,0.92043,0
3109,22,0,1,0,1.0,1,0,5,1,1,0,0.916427,0
3988,56,0,1,2,0.0,1,1,1,1,1,0,0.907159,0
2689,46,0,1,2,0.0,0,1,2,1,1,1,0.90166,0
4061,53,1,1,2,1.0,1,1,5,1,1,0,0.90061,0


In [25]:
len(x_c_sorted)

13

In [26]:
#We want to relabel the last  Caucasian by labels from - to +
x_c_sorted['two_year_recid'] = 1
x_c_sorted

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,decile_score,is_recid,is_violent_recid,length_of_stay,pred_crisis,two_year_recid
4693,25,1,1,1,2.0,1,2,8,1,1,1,0.956235,1
3254,49,1,1,2,2.0,1,2,8,1,1,0,0.944646,1
609,26,1,1,1,0.0,1,2,4,1,1,1,0.937752,1
4366,56,1,1,2,1.0,1,2,7,1,1,0,0.93483,1
1404,48,1,1,2,2.0,1,1,8,1,1,0,0.923182,1
2857,36,0,1,1,0.0,1,1,2,1,1,0,0.92043,1
3109,22,0,1,0,1.0,1,0,5,1,1,0,0.916427,1
3988,56,0,1,2,0.0,1,1,1,1,1,0,0.907159,1
2689,46,0,1,2,0.0,0,1,2,1,1,1,0.90166,1
4061,53,1,1,2,1.0,1,1,5,1,1,0,0.90061,1


In [27]:
x_a_1 = x_a[x_a['two_year_recid'] == 1]
x_a_sorted = x_a_1.sort_values(by = 'pred_crisis', ascending = False)
x_a_sorted = x_a_sorted[x_a_sorted['pred_crisis']<0.5]

In [28]:
# Updating on original X:
cond_1 = (x['two_year_recid']==0) & (x['pred_crisis']>0.5) & (x['race'] == 1)

In [29]:
x[cond_1] = x_c_sorted
new_x = x.drop(['two_year_recid','pred_crisis'],1)
new_y = x['two_year_recid']

In [30]:
x_train, x_test, y_train, y_test = train_test_split(new_x, new_y, test_size = 1/7)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 1/6)

In [31]:
model_new = LogisticRegression()
start = time.time()
model_new.fit(x_train, y_train)
#10-cross-fold-validation
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(model_new, new_x, new_y, scoring='accuracy', cv=cv, n_jobs=-1)
end = time.time()
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print( f"Testing time: {end-start}")
new_data=pd.concat([new_x,new_y],axis=1)
new_sen=new_data[new_data['race']==0]
new_nsen=new_data[new_data['race']==1]
new_sen_y=new_sen['two_year_recid']
new_sen_x=new_sen.drop(columns=['two_year_recid'])
new_nsen_y=new_nsen['two_year_recid']
new_nsen_x=new_nsen.drop(columns=['two_year_recid'])
score_sen=cross_val_score(model_new, new_sen_x, new_sen_y, scoring='accuracy', cv=cv, n_jobs=-1)
score_nsen=cross_val_score(model_new, new_nsen_x, new_nsen_y, scoring='accuracy', cv=cv, n_jobs=-1)
calib_1=abs(mean(score_sen)-mean(score_nsen))
print('Calibration: ', calib_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.992 (0.006)
Testing time: 2.476166248321533
Calibration:  0.014121127741037243


# 2.3 Local preferential Sampling

From local massaging, we know that delta_c = 53, delta_a = 72 We want to at first delete 0.5206 Caucasian - and duplicate 0.5246 Caucasian +

Also,a we want to delete 0.5139 African-American + and duplicate 0.5163 African-American -

In [32]:
x.head()

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,decile_score,is_recid,is_violent_recid,length_of_stay,pred_crisis,two_year_recid
1,34,0,0,1,0.0,1,0,3,1,1,1,0.899076,1
6,39,1,1,1,0.0,0,0,1,0,0,0,0.000544,0
7,21,0,1,0,0.0,1,1,3,1,1,0,0.929476,1
8,27,0,1,1,0.0,1,0,4,0,0,0,0.00147,0
9,37,1,1,1,0.0,0,0,1,0,0,0,0.000553,0


In [51]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.base import clone

def sort_and_filter_data(data, race):
    data_filtered = data[(data['race'] == race) & (data['pred_crisis'] > 0.5)]
    data_sorted = data_filtered.sort_values(by='pred_crisis', ascending=False)
    return data_sorted

def balance_race(data_sorted, race):
    if race == 0:  # Caucasian
        data_to_remove = data_sorted.iloc[:25]
        data_to_duplicate = data_sorted.iloc[-25:]
    else:  # African-American
        data_to_remove = data_sorted.iloc[-35:]
        data_to_duplicate = data_sorted.iloc[:35]

    balanced_data = pd.concat([data_to_remove, data_to_duplicate])
    return balanced_data

def preprocess_data(x_a, x_c):
    x_a_sorted = sort_and_filter_data(x_a, 0)
    x_c_sorted = sort_and_filter_data(x_c, 1)
    x_a_balanced = balance_race(x_a_sorted, 0)
    x_c_balanced = balance_race(x_c_sorted, 1)
    return x_a_balanced, x_c_balanced

def train_model(x_train, y_train, model):
    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()
    print(f"Training time: {end - start}")
    
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    return model

def evaluate_calibration(x_sen, y_sen, x_nsen, y_nsen,model):
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    score_sen = cross_val_score(model, x_sen, y_sen, scoring='accuracy', cv=cv, n_jobs=-1)
    score_nsen = cross_val_score(model, x_nsen, y_nsen, scoring='accuracy', cv=cv, n_jobs=-1)
    calibration = abs(np.mean(score_sen) - np.mean(score_nsen))
    print('Calibration: ', calibration)


x_a_balanced, x_c_balanced = preprocess_data(x_a, x_c)

x_copy = x.copy()
x_copy = x_copy.dropna()
new_x = x_copy.drop(['two_year_recid', 'pred_crisis'], 1)
new_y = x_copy['two_year_recid']

cond_a = (x_copy['two_year_recid'] == 0) & (x_copy['pred_crisis'] > 0.5) & (x_copy['race'] == 1)
x_copy.loc[cond_a] = x_a_balanced.values

cond_c = (x_copy['two_year_recid'] == 0) & (x_copy['pred_crisis'] < 0.5) & (x_copy['race'] == 0)
x_copy.loc[cond_c] = x_c_balanced.values

x_train, x_test, y_train, y_test = train_test_split(new_x, new_y, test_size=1/7)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=1/6)

trained_model = train_model(x_train, y_train,model_new)

evaluate_calibration(x_train[x_train['race'] == 0], y_train[x_train['race'] == 0],
                    x_train[x_train['race'] == 1], y_train[x_train['race'] == 1], trained_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training time: 0.1760087013244629
Accuracy: 0.984 (0.011)
Calibration:  0.061397849462365484
