<a href="https://colab.research.google.com/github/gauthamys/CS517-SRAI/blob/main/SRAI_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SRAI HW 1

*   Gautham Satyanarayana
*   Pradeep Raj



IMP : Upload 'compas-scores-two-years.csv' before running

In [17]:
import pandas as pd

In [39]:
df = pd.read_csv('compas-scores-two-years.csv')
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0




# Fairness Metrics

## Demographic Parity

In [19]:
for race in df['race'].unique():
    race_df = df[df['race'] == race]
    percentage = (race_df['two_year_recid'] == 1).sum() / len(race_df)
    print(f"{race} : {percentage:.2f}")


Other : 0.35
African-American : 0.51
Caucasian : 0.39
Hispanic : 0.36
Native American : 0.56
Asian : 0.28


## Equalised Odds

In [20]:
from sklearn.metrics import confusion_matrix

for race in df['race'].unique():
    race_df = df[df['race'] == race]

    tn, fp, fn, tp = confusion_matrix(race_df['two_year_recid'], race_df['event']).ravel()

    true_positive_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"Race: {race}")
    print(f"  True Positive Rate: {true_positive_rate:.2f}")
    print(f"  False Positive Rate: {false_positive_rate:.2f}")
    print("-" * 20)


Race: Other
  True Positive Rate: 0.83
  False Positive Rate: 0.03
--------------------
Race: African-American
  True Positive Rate: 0.80
  False Positive Rate: 0.05
--------------------
Race: Caucasian
  True Positive Rate: 0.80
  False Positive Rate: 0.03
--------------------
Race: Hispanic
  True Positive Rate: 0.84
  False Positive Rate: 0.03
--------------------
Race: Native American
  True Positive Rate: 0.60
  False Positive Rate: 0.00
--------------------
Race: Asian
  True Positive Rate: 0.67
  False Positive Rate: 0.09
--------------------


## Equalised Opportunity / Predictive Rate Parity

- The True Positive rates for all races are comparable, except "Native American" and "Asian"
- Caucasian and African-American have equal odds
- Asian and Native American are comparable, although Asian has higher odds
- Hispanic and Other have similar odds

# Bias Mitigation Strategies

## Conterfactual Data Augmentation

## Fairness Constraints

In [21]:
df.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [82]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

In [83]:
def calculate_age(d):
    """
    Calculate age from date of birth.
    """
    today = datetime.today()
    return today.year - d.year - ((today.month, today.day) < (d.month, d.day))

def custom_loss(y_true, y_pred):
    """
    Custom loss function: Mean Squared Error with penalty for large deviations.
    """
    mse = tf.reduce_mean(tf.square(y_true - y_pred))
    penalty = tf.reduce_mean(tf.cast(tf.abs(y_true - y_pred) > 5, tf.float32)) * 10  # Additional penalty for errors > 5
    return mse + penalty


In [101]:
data = df.rename(columns={
    'priors_count': 'history_violence',
    'juv_other_count': 'vocational_edu',  # Placeholder, adjust as needed
    'juv_misd_count': 'history_nc',
    'two_year_recid': 'risk_score'
})
data['c_offense_date'] = pd.to_datetime(data['c_offense_date'])
data['age_first'] = data['c_offense_date'].apply(calculate_age)
features = ['age', 'age_first', 'history_violence', 'vocational_edu', 'history_nc']
data = data.dropna(subset= features + ['risk_score'])

X = data[features]
y = data['risk_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape[0], X_test.shape[0]

(4844, 1211)

### Loss Function = Mean Squared Error

In [102]:
model_keras = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])
#model_keras.compile(optimizer='adam', loss=custom_loss)
model_keras.compile(optimizer='adam', loss='mean_squared_error')
model_keras.fit(X_train, y_train, epochs=20, batch_size=4844, verbose=1)

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 2.5459
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.9156
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 0.4064
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 0.6930
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 1.0511
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 1.0771
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.8446
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.5562
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.3682
Epoch 10/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 0.3369
Epoch 11/20
[1m1/1[0m [32m━━━

In [103]:
# accuracy
accuracy = model_keras.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy}")

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.3593
Accuracy: 0.36724603176116943


### Loss Function with Fairness Constraints

## MISC

In [11]:
df2 = pd.read_csv('compas-scores-raw.csv')
df2.columns

Index(['Person_ID', 'AssessmentID', 'Case_ID', 'Agency_Text', 'LastName',
       'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text',
       'DateOfBirth', 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason',
       'Language', 'LegalStatus', 'CustodyStatus', 'MaritalStatus',
       'Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText',
       'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText',
       'AssessmentType', 'IsCompleted', 'IsDeleted'],
      dtype='object')

In [87]:
df2[['Case_ID','RecSupervisionLevel', 'Ethnic_Code_Text', 'DateOfBirth', 'ScaleSet_ID', 'Screening_Date', 'MaritalStatus', 'RawScore']].head()

Unnamed: 0,Case_ID,RecSupervisionLevel,Ethnic_Code_Text,DateOfBirth,ScaleSet_ID,Screening_Date,MaritalStatus,RawScore
0,51950,1,Caucasian,12/05/92,22,1/1/13 0:00,Single,-2.08
1,51950,1,Caucasian,12/05/92,22,1/1/13 0:00,Single,-1.06
2,51950,1,Caucasian,12/05/92,22,1/1/13 0:00,Single,15.0
3,51956,1,Caucasian,09/16/84,22,1/1/13 0:00,Married,-2.84
4,51956,1,Caucasian,09/16/84,22,1/1/13 0:00,Married,-1.5
