In [88]:
from fairlearn.metrics import demographic_parity_ratio

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn

df = pd.read_csv('hiring_decisions.csv')
df.head(5)

Unnamed: 0,id,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,0,26,1,2,0,3,26.783828,48,78,91,1,1
1,1,39,1,4,12,3,25.862694,35,68,80,2,1
2,2,48,0,2,3,2,9.920805,20,67,13,2,0
3,3,34,1,2,5,2,6.407751,36,27,70,3,0
4,4,30,0,1,6,1,43.105343,23,52,85,2,0


1. Quante sono le istanze contenute nel dataset? Il dataset è completo (cioè per ogni istanza sono sempre specificati tutti i valori di ogni attributo)? Il dataset è bilanciato rispetto alla classe da predire?

In [89]:
df['HiringDecision'].value_counts()

HiringDecision
0    1035
1     465
Name: count, dtype: int64

2. Caricare il dataset, eliminare eventuali attributi inutili (motivare la scelta), eliminare eventuali istanze con valori nulli,
dividere il dataset in train (75%) e test (25%), preservando le proporzioni delle classi del target.

In [90]:
y=df['HiringDecision']
x = df.drop(['HiringDecision', 'id'], axis=1)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

coltran = ColumnTransformer(transformers=[('std', StandardScaler(), ['Age', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies' ,'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore', 'RecruitmentStrategy'])], remainder='passthrough')

coltran.fit(X_train)
X_train = coltran.transform(X_train)
X_test = coltran.transform(X_test)

transformed_feautures = coltran.named_transformers_['std'].get_feature_names_out()
output_columns = list (transformed_feautures) + ['Gender']

X_train = pd.DataFrame(X_train, columns=output_columns)
X_test = pd.DataFrame(X_test, columns=output_columns)


3. Valutare le performance sia sul dataset train sia sul dataset test del modello SGDClassifier, tenendo in considerazione
F1-score e la confusion matrix.

In [92]:
model =SGDClassifier(random_state=123)
model.fit(X_train, y_train)

trainpred = model.predict(X_train)
testpred = model.predict(X_test)

print(f1_score(y_train, trainpred))
print(f1_score(y_test, testpred))
print("Confusion Matrix Training")
print(confusion_matrix(y_train, trainpred))
print("Confusion Matrix Testing")
print(confusion_matrix(y_test, testpred))

0.6490683229813664
0.6390532544378699
Confusion Matrix Training
[[765  63]
 [163 209]]
Confusion Matrix Testing
[[185  22]
 [ 39  54]]


In [93]:
model.feature_names_in_

array(['Age', 'EducationLevel', 'ExperienceYears', 'PreviousCompanies',
       'DistanceFromCompany', 'InterviewScore', 'SkillScore',
       'PersonalityScore', 'RecruitmentStrategy', 'Gender'], dtype=object)

In [94]:
model.coef_

array([[ 9.49540212e-01,  7.26618056e-01,  8.01889077e-01,
         3.75680204e-01,  4.95264998e-01,  9.40899001e-01,
         8.46557172e-01,  5.26783099e-01, -1.68135886e+00,
        -2.84783934e-15]])

4. Analisi della fairness del modello: valutare, con i dati del test set e rispetto al modello SGDClassifier, se la probabilità
di predire 0 è la stessa per uomini (0) e donne (1). Il modello ha le stesse performance sul dataset degli uomini e sul
dataset delle donne? Calcolando la metrica demographic_parity_ratio della libreria fairlearn, è possibile stabilire che il
modello rispetta la “parità demografica”? Eliminare l’attributo Gender e valutare se le performance del modello ottenute
negli uomini sono le stesse ottenute nelle donne.

In [95]:
X_test.head(5)

Unnamed: 0,Age,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,Gender
0,-1.398704,0.940188,-0.159345,-0.710514,-0.046046,0.443175,-0.534921,-0.975029,0.146795,1.0
1,-0.10444,-0.220538,1.131191,-0.000592,-0.348544,-0.253367,-0.126346,-0.225487,0.146795,0.0
2,1.621246,-0.220538,0.485923,-1.420437,-0.67807,-1.367835,1.405809,-0.021067,1.590676,0.0
3,-0.535861,-0.220538,-1.449882,1.419254,-0.841533,0.408348,0.316276,-0.804679,0.146795,1.0
4,1.51339,0.940188,-1.234792,0.709331,0.407941,0.094904,-1.318022,-0.123277,-1.297087,0.0


In [96]:
X_test['pred'] = testpred
uomo = X_test[X_test['Gender'] == 0]
donna = X_test[X_test['Gender'] == 1]

prob_uomo = uomo[uomo['pred']==0].shape[0] / uomo.shape[0]
prob_donna = donna[donna['pred']==0].shape[0] / donna.shape[0]

print('Probabilita che un uomo non venga assunto: ', prob_uomo)
print('Probabilita che una donna non venga assunta: ', prob_donna)

y_test

Probabilita che un uomo non venga assunto:  0.7597402597402597
Probabilita che una donna non venga assunta:  0.7328767123287672


1056    0
558     0
893     0
1219    0
1379    0
       ..
538     0
945     1
404     0
1406    0
1004    0
Name: HiringDecision, Length: 300, dtype: int64

In [99]:
X_test['real'] = y_test.values
X_test

Unnamed: 0,Age,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,Gender,pred,real
0,-1.398704,0.940188,-0.159345,-0.710514,-0.046046,0.443175,-0.534921,-0.975029,0.146795,1.0,0,0
1,-0.104440,-0.220538,1.131191,-0.000592,-0.348544,-0.253367,-0.126346,-0.225487,0.146795,0.0,0,0
2,1.621246,-0.220538,0.485923,-1.420437,-0.678070,-1.367835,1.405809,-0.021067,1.590676,0.0,0,0
3,-0.535861,-0.220538,-1.449882,1.419254,-0.841533,0.408348,0.316276,-0.804679,0.146795,1.0,0,0
4,1.513390,0.940188,-1.234792,0.709331,0.407941,0.094904,-1.318022,-0.123277,-1.297087,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.297680,0.940188,-1.019703,-0.000592,-0.192508,0.791446,-0.330633,0.217424,0.146795,1.0,0,0
296,-0.859427,-0.220538,-0.804614,-1.420437,0.507384,1.244199,0.895090,-0.429908,-1.297087,0.0,0,1
297,0.219126,-0.220538,-1.234792,-1.420437,-0.776257,0.234212,-0.500873,1.409877,-1.297087,0.0,0,0
298,1.297680,-0.220538,1.346280,0.709331,-0.163708,-0.253367,-1.011591,0.285564,-1.297087,0.0,1,0


In [104]:
print(f1_score(X_test[X_test['Gender'] == 0]['real'], X_test[X_test['Gender'] == 0]['pred']))
print(demographic_parity_ratio(y_test.values, testpred, sensitive_features=X_test['Gender']))


0.55
0.8994338994338994


In [102]:
if prob_uomo > prob_donna :
    print(prob_uomo/prob_donna)
else:
    print(prob_donna/prob_uomo)

1.0366549338511954


0.8994338994338994
