In [1]:
# Loads CSV data
import pandas as pd
data = pd.read_csv("data/training_data.csv", index_col=0)

# Views column names and values for first subject in the data
data.iloc[0,:]

degree of charge (felony/misdemeanor)                   F
risk assessment date                           2013-12-23
gender                                             Female
number of prior convictions                             1
recidivist                                              0
recidivism risk score                                   4
last name                                             Doe
full name                                     Jane A. Doe
number of juvenile felonies                             0
date of birth                                  1993-04-09
age                                                    23
number of juvenile misdemeanors                         0
number of other juvenile crimes                         0
race                                     African-American
first name                                           Jane
recidivist guess                                        0
Name: 6904, dtype: object

In [2]:
# Write some code to answer the following questions, according to the data.

In [3]:
# How old are you?
idx = data[(data["full name"] == "John F. Doe")].index[0]
print(data.loc[idx,"age"])

22


In [4]:
# Did you commit any felonies, misdemeanors, or other crimes as a juvenile?
print(data.loc[idx,["number of juvenile felonies", "number of juvenile misdemeanors", "number of other juvenile crimes"]])

number of juvenile felonies        0
number of juvenile misdemeanors    1
number of other juvenile crimes    2
Name: 8132, dtype: object


In [5]:
# How many total prior convictions have you had?
print(data.loc[idx,"number of prior convictions"])


1


In [6]:
# Were you a recidivist? (Did you commit a subsequent crime after your COMPAS assessment)?
print(["No","Yes"][data.loc[idx,"recidivist"]]) 

No


In [7]:
# What was your risk of recidivism according to the COMPAS assessment?
print(data.loc[idx,"recidivism risk score"])

5


In [8]:
# Convert everyone's "recidivism risk score" from COMPAS to a binary label:
#    1 if the score is greater than 5, 0 otherwise.
# Store the result in a new column called "recidivist compas".
# According to this scheme, did your COMPAS score predict that you were a recidivist, and was it correct?
# Did your classmates guess that you were a recidivist, and were they correct?
data["recidivist compas"] = (data["recidivism risk score"] > 5).astype(int)
print(data.loc[idx,"recidivist compas"])
print(data.loc[idx,"recidivist compas"] == data.loc[idx,"recidivist"])
print(data.loc[idx,"recidivist guess"])
print(data.loc[idx,"recidivist guess"] == data.loc[idx,"recidivist"])


0
True
1.0
False


In [9]:
# How many subjects are there in the data?
# What was the total number of false positives and false negatives for the COMPAS recidivist predictions?
# What about for your classmates' recidivist predictions?
print(data.shape[0])
print(((data["recidivist compas"] == 1) & (data["recidivist"] == 0)).sum())
print(((data["recidivist compas"] == 0) & (data["recidivist"] == 1)).sum())
print(((data["recidivist guess"] == 1) & (data["recidivist"] == 0)).sum())
print(((data["recidivist guess"] == 0) & (data["recidivist"] == 1)).sum())

52
6
7
3
9


In [10]:
# Calculate the F-scores for the COMPAS predictions and your classmate guesses.  Which score is higher?
# https://en.wikipedia.org/wiki/F1_score
tp_c = ((data["recidivist compas"] == 1) & (data["recidivist"] == 1)).sum()
tn_c = ((data["recidivist compas"] == 0) & (data["recidivist"] == 0)).sum()
fp_c = ((data["recidivist compas"] == 1) & (data["recidivist"] == 0)).sum()
fn_c = ((data["recidivist compas"] == 0) & (data["recidivist"] == 1)).sum()

tp_g = ((data["recidivist guess"] == 1) & (data["recidivist"] == 1)).sum()
tn_g = ((data["recidivist guess"] == 0) & (data["recidivist"] == 0)).sum()
fp_g = ((data["recidivist guess"] == 1) & (data["recidivist"] == 0)).sum()
fn_g = ((data["recidivist guess"] == 0) & (data["recidivist"] == 1)).sum()

p_c, r_c = float(tp_c) / (tp_c + fp_c), float(tp_c) / (tp_c + fn_c)
p_g, r_g = float(tp_g) / (tp_g + fp_g), float(tp_g) / (tp_g + fn_g)

f1_c = 2*p_c*r_c / (p_c + r_c)
f1_g = 2*p_g*r_g / (p_g + r_g)
print(f1_c, f1_g)

(0.3157894736842105, 0.14285714285714288)


In [11]:
# Extracts columns to use as training examples.

examples = data[["gender",
                 "age",
                 "race",
                 "number of prior convictions",
                 "number of juvenile felonies",
                 "number of juvenile misdemeanors",
                 "number of other juvenile crimes"]].copy()

# Extract columns to use as labels.
labels = data["recidivist"]
labels_c = data["recidivist compas"]
labels_g = data["recidivist guess"]

examples.iloc[0,:]

gender                                       Female
age                                              23
race                               African-American
number of prior convictions                       1
number of juvenile felonies                       0
number of juvenile misdemeanors                   0
number of other juvenile crimes                   0
Name: 6904, dtype: object

In [12]:
# Factorize the categorical features.
# For example, replace the "gender" column with two columns "male" and "female",
# where "male" is 1 for male subjects and 0 for female subjects, and "female" is the reverse.
# Similarly factorize the "race" column into one binary column per race category in the dataset.
# You can do this from scratch, or use sklearn's OneHotEncoder:
#   https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

examples["male"] = (examples["gender"] == "Male").astype(int)
examples["female"] = (examples["gender"] == "Female").astype(int)
examples = examples.drop(columns="gender")

examples["african-american"] = (examples["race"] == "African-American").astype(int)
examples["caucasian"] = (examples["race"] == "Caucasian").astype(int)
examples["hispanic"] = (examples["race"] == "Hispanic").astype(int)
examples["asian"] = (examples["race"] == "Asian").astype(int)
examples["native american"] = (examples["race"] == "Native American").astype(int)
examples["other"] = (examples["race"] == "Other").astype(int)
examples = examples.drop(columns="race")

examples.iloc[0,:]

age                                23
number of prior convictions         1
number of juvenile felonies         0
number of juvenile misdemeanors     0
number of other juvenile crimes     0
male                                0
female                              1
african-american                    1
caucasian                           0
hispanic                            0
asian                               0
native american                     0
other                               0
Name: 6904, dtype: int64

In [13]:
# Split the data into roughly 7/8 for training and 1/8 for testing.
# Train an sklearn decision tree to predict recidivism, based on the ground-truth recidivism label.
# What are the training and testing accuracies?
# Repeat this experiment on max tree depths ranging from 1 to the number of features.

import numpy as np
train_idx = (np.random.rand(examples.shape[0]) < 7./8)

from sklearn.tree import DecisionTreeClassifier
for max_depth in range(1, examples.shape[1]):
    clf = DecisionTreeClassifier(max_depth=max_depth).fit(examples.iloc[train_idx,:], labels.iloc[train_idx])
    pred = clf.predict(examples.iloc[~train_idx,:])
    print(max_depth, (pred == labels.iloc[~train_idx]).astype(float).sum() / (~train_idx).sum())

(1, 0.7142857142857143)
(2, 0.7142857142857143)
(3, 0.7142857142857143)
(4, 0.7142857142857143)
(5, 0.7142857142857143)
(6, 0.42857142857142855)
(7, 0.42857142857142855)
(8, 0.5714285714285714)
(9, 0.42857142857142855)
(10, 0.5714285714285714)
(11, 0.5714285714285714)
(12, 0.42857142857142855)


In [14]:
# Repeat the previous exercise using the COMPAS-based recidivism labels and then the classmate-guessed labels.
# Measure accuracy twice - once using the COMPAS/guessed labels, and once using the ground-truth labels.

print("COMPAS")
for max_depth in range(1, examples.shape[1]):
    clf = DecisionTreeClassifier(max_depth=max_depth).fit(examples.iloc[train_idx,:], labels_c.iloc[train_idx])
    pred = clf.predict(examples.iloc[~train_idx,:])
    acc_c = (pred == labels_c.iloc[~train_idx]).astype(float).sum() / (~train_idx).sum()
    acc = (pred == labels.iloc[~train_idx]).astype(float).sum() / (~train_idx).sum()
    print(max_depth, acc_c, acc)

print("guesses")
for max_depth in range(1, examples.shape[1]):
    clf = DecisionTreeClassifier(max_depth=max_depth).fit(examples.iloc[train_idx,:], labels_g.iloc[train_idx])
    pred = clf.predict(examples.iloc[~train_idx,:])
    acc_g = (pred == labels_g.iloc[~train_idx]).astype(float).sum() / (~train_idx).sum()
    acc = (pred == labels.iloc[~train_idx]).astype(float).sum() / (~train_idx).sum()
    print(max_depth, acc_g, acc)

COMPAS
(1, 1.0, 0.7142857142857143)
(2, 0.8571428571428571, 0.5714285714285714)
(3, 0.8571428571428571, 0.5714285714285714)
(4, 0.8571428571428571, 0.5714285714285714)
(5, 0.7142857142857143, 0.42857142857142855)
(6, 0.8571428571428571, 0.5714285714285714)
(7, 0.8571428571428571, 0.5714285714285714)
(8, 0.8571428571428571, 0.5714285714285714)
(9, 0.8571428571428571, 0.5714285714285714)
(10, 0.8571428571428571, 0.5714285714285714)
(11, 0.8571428571428571, 0.5714285714285714)
(12, 0.8571428571428571, 0.5714285714285714)
guesses
(1, 1.0, 0.5714285714285714)
(2, 1.0, 0.5714285714285714)
(3, 1.0, 0.5714285714285714)
(4, 1.0, 0.5714285714285714)
(5, 1.0, 0.5714285714285714)
(6, 1.0, 0.5714285714285714)
(7, 1.0, 0.5714285714285714)
(8, 1.0, 0.5714285714285714)
(9, 1.0, 0.5714285714285714)
(10, 1.0, 0.5714285714285714)
(11, 1.0, 0.5714285714285714)
(12, 1.0, 0.5714285714285714)


In [15]:
# What is the accuracy of the COMPAS-based labels themselves (not the tree that was trained on them)?
# How does that accuracy compare to the best tree that was trained on the ground truth?
print(float((labels == labels_c).sum()) / len(labels))

0.75
