In [None]:
from __future__ import division
import urllib.request
import os,sys
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn import feature_extraction
from sklearn import preprocessing
from random import seed, shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Download the COMPAS two-year scores CSV and parse it with pandas' default settings.
COMPAS_CSV_URL = 'https://raw.githubusercontent.com/rclarkbar/hw/main/compas-scores-two-years.csv'
data = pd.read_csv(COMPAS_CSV_URL)

In [None]:
# Seed both global generators (NumPy's and the stdlib `random` module's) so that
# anything drawing from them afterwards is reproducible from a fresh kernel.
SEED = 1234
np.random.seed(SEED)
seed(SEED)


def add_intercept(x):
    """Return a copy of the 2-D array `x` with a leading column of ones.

    The extra column plays the role of the constant term b in a linear
    classifier. `x` must have exactly two dimensions (rows, features).
    """
    num_rows, _ = x.shape  # the unpack fails unless x is exactly 2-D
    bias_column = np.ones((num_rows, 1))
    return np.hstack((bias_column, x))

def load_compas_data(data):
    """Filter the COMPAS table and encode it as a feature matrix plus labels.

    Rows are filtered with the ProPublica criteria
    (https://github.com/propublica/compas-analysis) and restricted to
    African-American and Caucasian defendants. Continuous features are kept as
    unscaled floats; categorical features are encoded with
    ``preprocessing.LabelBinarizer`` (a single 0/1 column for a two-class
    feature, one column per class otherwise). The rows are finally permuted
    with ``random.shuffle``, so their order depends on the state of the global
    ``random`` generator.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "age", "race", "sex", "priors_count",
        "c_charge_degree", "days_served", "two_year_recid",
        "days_b_screening_arrest", "is_recid" and "score_text".
        The frame itself is not modified.

    Returns
    -------
    X : numpy.ndarray
        Float array with one row per retained defendant and one column per
        entry of ``feature_names``.
    y : numpy.ndarray
        1-D array of "two_year_recid" labels, row-aligned with ``X``.
    feature_names : list of str
        Names of the columns of ``X``, in order.

    Raises
    ------
    ValueError
        If no row survives the filters.
    """
    FEATURES_CLASSIFICATION = ["age", "race", "sex", "priors_count", "c_charge_degree", "days_served"]  # features to be used for classification
    CONT_VARIABLES = ["priors_count", "age", "days_served"]  # continuous features; everything else is treated as categorical
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race"]

    # Rows without a screening/arrest offset cannot be tested against the
    # 30-day window below. dropna returns a new frame, so `data` is untouched.
    df = data.dropna(subset=["days_b_screening_arrest"])

    # ---- Filtering (same criteria as ProPublica) ----

    # If the charge date of a defendant's COMPAS-scored crime was not within
    # 30 days of the arrest, assume (for data quality reasons) that we do not
    # have the right offense.
    days = df["days_b_screening_arrest"]
    keep = (days <= 30) & (days >= -30)

    # The recidivist flag -- is_recid -- is -1 when no COMPAS case was found at all.
    keep = keep & (df["is_recid"] != -1)

    # Ordinary traffic offenses -- c_charge_degree of 'O' -- do not result in
    # jail time and are removed. Remaining codes: F = felony, M = misdemeanor.
    keep = keep & (df["c_charge_degree"] != "O")

    # Drop rows that have no COMPAS score text. pandas.read_csv parses "NA" and
    # "N/A" as missing values by default, so comparing against the literal
    # string alone never removes anything; test for missing values as well as
    # for both literal markers.
    score_text = df["score_text"]
    keep = keep & score_text.notna() & ~score_text.isin(["NA", "N/A"])

    # Only African-American and Caucasian defendants are considered in this analysis.
    keep = keep & df["race"].isin(["African-American", "Caucasian"])

    # Positional boolean mask: immune to duplicate index labels.
    df = df[keep.to_numpy()]
    if len(df) == 0:
        raise ValueError("no rows left after applying the COMPAS filters")

    def as_array(column):
        # list -> ndarray, so each column gets the dtype NumPy infers from its values
        return np.array(df[column].tolist())

    # ---- Feature encoding ----

    y = as_array(CLASS_FEATURE)

    print("\nNumber of people recidivating within two years")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # float array with zero columns; feature columns are hstacked onto it
    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        if attr in CONT_VARIABLES:
            # Continuous feature: one unscaled float column, named after the attribute.
            vals = as_array(attr).astype(float).reshape(len(y), 1)
            names = [attr]
        else:
            # For a two-class feature the label binarizer emits a single 0/1 column.
            lb = preprocessing.LabelBinarizer()
            raw = as_array(attr)
            lb.fit(raw)
            vals = lb.transform(raw)
            if vals.shape[1] == 1:
                names = [attr]
            else:
                # multi-class feature: one column (and one name) per class
                names = [attr + "_" + str(k) for k in lb.classes_]

        if attr in SENSITIVE_ATTRS:
            # the sensitive feature must be binary after encoding
            assert vals.shape[1] == 1, "sensitive attribute '%s' is not binary" % attr

        X = np.hstack((X, vals))
        feature_names.extend(names)

    # ---- Permute the data randomly (same permutation for X and y) ----
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]

    assert len(feature_names) == X.shape[1]
    print("Features we will be using for classification are:", feature_names, "\n")

    return X, y, feature_names

In [None]:
# Build the model inputs, then print the shape of the feature matrix and of the labels.
X, y, feature_names = load_compas_data(data)
print(X.shape, y.shape, sep="\n")


Number of people recidivating within two years
0    2795
1    2483
dtype: int64


Features we will be using for classification are: ['age', 'race', 'sex', 'priors_count', 'c_charge_degree', 'days_served'] 

(5278, 6)
(5278,)


In [None]:
# Assemble one table: the feature columns followed by the label column.
# np.column_stack treats the 1-D label vector as a column, so no detour through
# the np.matrix class (no longer recommended by NumPy) is needed.
df = pd.DataFrame(
    np.column_stack((X, y)),
    columns=list(feature_names) + ['two_year_recid'],
)

In [None]:
# Show the assembled table (feature columns plus the two_year_recid label column).
df

Unnamed: 0,age,race,sex,priors_count,c_charge_degree,days_served,two_year_recid
0,28.0,0.0,1.0,5.0,0.0,34.91,0.0
1,27.0,0.0,1.0,5.0,1.0,6.30,1.0
2,28.0,0.0,1.0,21.0,0.0,1.06,0.0
3,29.0,1.0,1.0,1.0,1.0,11.19,0.0
4,19.0,1.0,0.0,2.0,0.0,22.27,1.0
...,...,...,...,...,...,...,...
5273,30.0,0.0,0.0,15.0,0.0,-0.20,1.0
5274,27.0,0.0,1.0,1.0,0.0,1.29,1.0
5275,35.0,0.0,0.0,1.0,1.0,0.85,0.0
5276,19.0,0.0,1.0,2.0,0.0,14.74,1.0


In [None]:
# Hold out 1/7 of the rows as the test split. Stratifying on the label keeps the
# class proportions of both splits close to each other; random_state pins the split.
train, test = train_test_split(
    df,
    test_size=1/7,
    stratify=df["two_year_recid"],
    random_state=0,
)

In [None]:
# Fraction of training rows labelled as recidivating, followed by the split size.
train_recid = train[train["two_year_recid"] == 1]
print(len(train_recid) / len(train))
print(len(train))

0.47038019451812557
4524


In [None]:
# Fraction of test rows labelled as recidivating, followed by the split size.
test_recid = test[test["two_year_recid"] == 1]
print(len(test_recid) / len(test))
print(len(test))

0.4708222811671088
754


In [None]:
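# Colab only: uncomment the two lines below to mount Google Drive at /content/drive
# before running the CSV export, which writes into a folder under that mount point.
# from google.colab import drive
# drive.mount('/content/drive')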

In [None]:
# Write both splits as CSV files (without the index column) into the folder under /content/drive.
loc = '/content/drive/Shared drives/AM207 - CLUE Final Project/Dataset/COMPAS/'
for split, filename in ((train, 'compas_train.csv'), (test, 'compas_test.csv')):
    split.to_csv(loc + filename, index=False)