In [103]:
import pandas as pd
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names; use whichever name is available.
plt.style.use(
    'seaborn-v0_8-white'
    if 'seaborn-v0_8-white' in plt.style.available
    else 'seaborn-white'
)
#from pandas_profiling import ProfileReport

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including convergence and data-conversion warnings from the models below.
warnings.filterwarnings('ignore')

In [104]:
from fairlearn.metrics import MetricFrame

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier


from category_encoders.target_encoder import TargetEncoder
from category_encoders.m_estimate import MEstimateEncoder

from tqdm.notebook import tqdm

In [105]:
#pd.read_csv('propublica_data_for_fairml.csv').head()

In [106]:
# Load the raw COMPAS scores from a CSV file in the working directory.
df = pd.read_csv("compas-scores-raw.csv")

# Binary target derived from the decile score: 1 when DecileScore > 4, else 0.
is_high = df["DecileScore"] > 4
is_low = df["DecileScore"] <= 4

df["Score"] = df["DecileScore"]

# Alternative three-level target, kept for reference:
#df.loc[df["DecileScore"] > 7, "Score"] = 2
#df.loc[(df["DecileScore"] > 4) & (df["DecileScore"] < 8), "Score"] = 1
#df.loc[df["DecileScore"] < 5, "Score"] = 0

df.loc[is_high, "Score"] = 1
df.loc[is_low, "Score"] = 0


# Columns removed before modelling. DecileScore is in this list because the
# target above is computed from it.
cols = [
    "Person_ID",
    "AssessmentID",
    "Case_ID",
    "LastName",
    "FirstName",
    "MiddleName",
    "DateOfBirth",
    "ScaleSet_ID",
    "Screening_Date",
    "RecSupervisionLevel",
    "Agency_Text",
    "AssessmentReason",
    "Language",
    "Scale_ID",
    "IsCompleted",
    "IsDeleted",
    "AssessmentType",
    "DecileScore",
]

df = df.drop(columns=cols)

# Every column that could serve as a label is excluded from the features.
possible_targets = ["RawScore", "ScoreText",'Score']

X = df.drop(columns=possible_targets)
# Select the target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), and a single-column DataFrame triggers a DataConversionWarning.
y = df['Score']

In [107]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [108]:
# Baseline model: a TargetEncoder step feeding a logistic regression.
te = TargetEncoder()
model = LogisticRegression()
pipe = Pipeline(steps=[('encoder', te), ('model', model)])

# Pipeline.fit returns the fitted pipeline, so the prediction can be chained.
preds = pipe.fit(X_tr, y_tr).predict(X_te)

In [109]:
# Accuracy of the baseline predictions, overall and per value of Sex_Code_Text.
gm = MetricFrame(
    metrics=accuracy_score,
    y_true=y_te,
    y_pred=preds,
    sensitive_features=X_te['Sex_Code_Text'],
)
print(gm.overall)

print(gm.by_group)

0.82892574331391
Sex_Code_Text
Female    0.849897
Male      0.823114
Name: accuracy_score, dtype: object


In [110]:
def fit_predict(modelo, enc, data, target, test):
    """Fit an encoder + model pipeline and return its predictions for ``test``.

    Parameters
    ----------
    modelo : estimator used as the final ``"model"`` step.
    enc : transformer used as the ``"encoder"`` step.
    data : training features passed to ``Pipeline.fit``.
    target : training labels passed to ``Pipeline.fit``.
    test : features to predict on.

    Returns
    -------
    Whatever ``Pipeline.predict`` returns for ``test``.
    """
    steps = [("encoder", enc), ("model", modelo)]
    fitted = Pipeline(steps).fit(data, target)
    return fitted.predict(test)

In [None]:
# Sweep the MEstimateEncoder `m` parameter over 11 evenly spaced values in
# [0, 100]; for each value fit a fresh gradient-boosting model and record the
# per-group accuracy on the test split.
m_grid = np.linspace(0, 100, 11)
ethnicity_te = X_te['Ethnic_Code_Text']

gms, ms = [], []
for m in tqdm(m_grid):
    encoder = MEstimateEncoder(m=m)
    model = GradientBoostingClassifier()

    preds = fit_predict(model, encoder, X_tr, y_tr, X_te)
    gm = MetricFrame(
        metrics=accuracy_score,
        y_true=y_te,
        y_pred=preds,
        sensitive_features=ethnicity_te,
    )

    gms.append(gm)
    ms.append(m)
    


  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
# Per-group accuracy as a function of the encoder's m value.
# Maps each Ethnic_Code_Text value (key into gm.by_group) to its legend label.
group_labels = {
    'Caucasian': 'Caucasian',
    'African-Am': 'AfricanAm',
    'African-American': 'AfricanAmerican',
    'Arabic': 'Arabic',
    'Hispanic': 'Hispanic',
    'Oriental': 'Oriental',
}

fig, ax = plt.subplots()
for group, label in group_labels.items():
    # Plot against the actual m values rather than the position in the list.
    ax.plot(ms, [gm.by_group[group] for gm in gms], label=label)
ax.set_xlabel('m (MEstimateEncoder)')
ax.set_ylabel('Accuracy')
ax.set_title('Test accuracy by Ethnic_Code_Text')
ax.legend(bbox_to_anchor=(1.1, 1.05))
plt.show()

# Other Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
# Fetch OpenML dataset 1590 as pandas objects.
data = fetch_openml(data_id=1590, as_frame=True)

# Sensitive attribute, taken from the raw (un-encoded) features.
sex = data.data['sex']

# One-hot encoded features and a 0/1 target (1 when the label equals '>50K').
X = pd.get_dummies(data.data)
y_true = (data.target == '>50K').mul(1)

sex.value_counts()

In [None]:
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Shallow decision tree fitted on the full dataset.
classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=4)
classifier.fit(X, y_true)
# NOTE: predictions are made on the same rows used for fitting, so the
# accuracies computed below are in-sample.
y_pred = classifier.predict(X)
# Accuracy overall and per value of `sex`.
gm = MetricFrame(metrics=accuracy_score, y_true=y_true, y_pred=y_pred, sensitive_features=sex)

In [None]:
# Display the raw 'sex' column of the fetched dataset's feature frame.
data.data['sex']