In [82]:
import configparser
import pickle
from urllib.parse import quote_plus

import pandas as pd
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.group import Group
from aequitas.plotting import Plot
from numpy import arange
from sqlalchemy import create_engine


In [83]:
# Load the development settings. The context manager closes the file handle
# as soon as the parser has consumed it (the bare open() call never did).
config = configparser.ConfigParser()
with open('./settings_development.ini') as settings_file:
    config.read_file(settings_file)

In [84]:
# Build the PostgreSQL connection URL from the [DATABASE] section of the settings.
# User name and password are percent-encoded so that reserved characters
# (e.g. '@', ':', '/') inside the credentials cannot corrupt the URL.
engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    user = quote_plus(config.get('DATABASE','user')),
    password = quote_plus(config.get('DATABASE','password')),
    host = config.get('DATABASE','host'),
    port = 5432,  # fixed here rather than read from the settings file
    database = config.get('DATABASE','database'),
)
engine = create_engine(engine_string)

In [85]:
# Load the pickled estimator that the later cells call predict/predict_proba on.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load "random_forest.pckl" when it comes from a trusted source.
with open("random_forest.pckl", "rb") as model_file:
    rforest = pickle.load(model_file)

In [86]:
# Read the 'centers' and 'inspections' tables of the "transformed" schema
# into tabla_3 and tabla_4 respectively.
tabla_3, tabla_4 = (
    pd.read_sql_table(nombre_tabla, engine, schema="transformed")
    for nombre_tabla in ('centers', 'inspections')
)

In [87]:
# Work on copies so the frames read from the database stay untouched.
centros = tabla_3.copy().rename(columns={"dc_id": "center_id"})
inspecciones = tabla_4.copy()

# Keep only the most recent inspection (by inspectiondate) of every center.
last_inspections = (
    inspecciones
    .sort_values(by="inspectiondate")
    .drop_duplicates(subset=["center_id"], keep="last")
)

# Center columns removed before joining with the inspections.
columnas_centro_descartadas = [
    'centername', 'legalname', 'building', 'street', 'zipcode', 'phone',
    'permitnumber', 'permitexp', 'status', 'agerange', 'childcaretype',
    'bin', 'url', 'datepermitted', 'actual', 'violationratepercent',
    'violationavgratepercent', 'publichealthhazardviolationrate',
    'averagepublichealthhazardiolationrate', 'criticalviolationrate',
    'avgcriticalviolationrate',
]
centros = centros.drop(columnas_centro_descartadas, axis=1).reset_index(drop=True)

# NOTE(review): no join keys are passed, so pandas merges on every column
# name the two frames have in common - confirm that is the intended key set.
tabla_5 = pd.merge(last_inspections, centros)
tabla_5 = tabla_5.sort_values(['inspectiondate'], ascending=[False])

# Cast the capacity / staffing columns in a single pass.
tabla_5 = tabla_5.astype({
    'maximumcapacity': int,
    'totaleducationalworkers': int,
    'averagetotaleducationalworkers': float,
})

# Inspection columns that are dropped before the frame is indexed by center.
columnas_inspeccion_descartadas = [
    'regulationsummary', 'healthcodesubsection', 'violationstatus',
    'borough', 'reason', 'inspectiondate', 'violationcategory_nan',
]
tabla_5 = (
    tabla_5
    .drop(columnas_inspeccion_descartadas, axis=1)
    .set_index(['center_id'])
    .fillna(0)
)

# Any column still typed as object is converted to float; the second
# fillna(0) is kept from the original sequence and runs after that cast.
columnas_objeto = list(tabla_5.select_dtypes(object).columns)
tabla_5 = tabla_5.astype({columna: float for columna in columnas_objeto})
tabla_5 = tabla_5.fillna(0)

In [88]:
# Model inputs: every column of tabla_5 except
# 'violationcategory_public_health_hazard' (presumably the target - confirm).
entradas_modelo = tabla_5.drop(columns=['violationcategory_public_health_hazard'])

# Predictions and per-class probabilities for the same set of rows.
prds = rforest.predict(entradas_modelo)
probas = rforest.predict_proba(entradas_modelo)

In [89]:
# One row per center: the prediction plus both class probabilities.
res = pd.DataFrame({
    "center":tabla_5.index,
    "etiqueta":prds,
    "proba_0":probas[:,0],
    "proba_1":probas[:,1]
})

# score = the larger of the two class probabilities. The row-wise maximum
# also covers exact ties (proba_0 == proba_1), which the previous pair of
# strict '>' / '<' assignments left as NaN.
res['score'] = res[['proba_0', 'proba_1']].max(axis=1)

In [90]:
# One-hot column groups describing the program type and the borough of a center.
categorias_1 = [
    "programtype_all_age_camp",
    "programtype_infant_toddler",
    "programtype_preschool",
    "programtype_preschool_camp",
    "programtype_school_age_camp",
]
categorias_2 = [
    "borough_bronx",
    "borough_brooklyn",
    "borough_manhattan",
    "borough_queens",
    "borough_staten_island",
]

# For each group, take the label of the column holding the row-wise maximum.
programtype = pd.get_dummies(centros[categorias_1]).idxmax(axis=1)
borough = pd.get_dummies(centros[categorias_2]).idxmax(axis=1)

# Put both derived columns side by side (borough first) and name them.
ambas = pd.concat([borough, programtype], axis=1).rename(
    columns={0: 'borough', 1: 'programtype'}
)

# Attach them to the center attributes, then join with the model results.
tabla_1 = pd.concat([centros, ambas], axis=1)
tabla_2 = pd.merge(res, tabla_1, left_on='center', right_on='center_id')

In [91]:
# Decode the one-hot borough flags into a readable "borough" label.
# Whole-column masks replace the per-row loop, which rebuilt a full index
# mask for every row (quadratic) and indexed .iloc with index labels.
# The columns are applied in reverse so that, should a row have several
# flags set, the first column listed wins - the same precedence as the
# original if/elif chain. Rows with no flag equal to "1" keep their value.
for columna in reversed(categorias_2):
    bandera_activa = tabla_2[columna].astype(str) == "1"
    tabla_2.loc[bandera_activa, "borough"] = columna[len("borough_"):]

In [92]:
# Remove the one-hot borough columns listed in categorias_2.
tabla_2 = tabla_2.drop(columns=categorias_2)

In [93]:
# Decode the one-hot program-type flags into a readable "programtype" label.
# Whole-column masks replace the per-row loop, which rebuilt a full index
# mask for every row (quadratic) and indexed .iloc with index labels.
# The columns are applied in reverse so that, should a row have several
# flags set, the first column listed wins - the same precedence as the
# original if/elif chain. Rows with no flag equal to "1" keep their value.
for columna in reversed(categorias_1):
    bandera_activa = tabla_2[columna].astype(str) == "1"
    tabla_2.loc[bandera_activa, "programtype"] = columna[len("programtype_"):]

In [94]:
# Remove the one-hot program-type columns listed in categorias_1.
tabla_2 = tabla_2.drop(columns=categorias_1)

In [95]:
# Columns carried over into the frame handed to the Aequitas calls below.
columnas_aequitas = ['center', 'etiqueta', 'score', 'borough', 'programtype']
tabla_6 = tabla_2.loc[:, columnas_aequitas]

In [96]:
# Expose the "etiqueta" column under the name "label_value".
# NOTE(review): "etiqueta" holds the model's predictions and "score" the
# larger of the two class probabilities, whereas Aequitas documents
# `label_value` as the ground-truth outcome and `score` as the model's
# decision - confirm this mapping is what the analysis intends.
tabla_6 = tabla_6.rename({'etiqueta': 'label_value'}, axis='columns')

In [97]:
# Number of centers per program type.
tabla_6['programtype'].value_counts()

preschool          2191
infant_toddler      440
all_age_camp        291
school_age_camp      12
preschool_camp        3
Name: programtype, dtype: int64

In [98]:
# Print the dtypes and non-null counts of tabla_6 as a quick sanity check.
tabla_6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2937 entries, 0 to 2936
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   center       2937 non-null   object 
 1   label_value  2937 non-null   float64
 2   score        2937 non-null   float64
 3   borough      2937 non-null   object 
 4   programtype  2937 non-null   object 
dtypes: float64(2), object(3)
memory usage: 137.7+ KB


In [99]:
# Use the center identifier as the row index.
tabla_6 = tabla_6.set_index('center')

In [100]:
# Cross-tabulate tabla_6 per attribute group; only the first element of
# the returned pair is used.
g = Group()
xtab, _ = g.get_crosstabs(tabla_6)

# Preview the columns of the crosstab that are not absolute metrics.
absolute_metrics = g.list_absolute_metrics(xtab)
df_group = xtab.loc[:, ~xtab.columns.isin(absolute_metrics)]
df_group.head()

model_id, score_thresholds 1 {'rank_abs': [261]}


  divide = lambda x, y: x / y if y != 0 else pd.np.nan


Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,1,binary 0/1,261,borough,bronx,11,349,11,19,330,0,19,341,360,2937
1,1,binary 0/1,261,borough,brooklyn,149,965,149,37,928,0,37,1077,1114,2937
2,1,binary 0/1,261,borough,manhattan,33,629,33,24,605,0,24,638,662,2937
3,1,binary 0/1,261,borough,queens,40,593,40,35,558,0,35,598,633,2937
4,1,binary 0/1,261,borough,staten_island,28,140,28,7,133,0,7,161,168,2937


In [101]:
b = Bias()

# Reference group per attribute for the predefined-group disparities.
grupos_referencia = {'borough':'brooklyn', 'programtype':'preschool'}

# Same call twice, differing only in mask_significance:
# bdf uses True, hbdf uses False. Each call receives its own dict copy.
bdf, hbdf = (
    b.get_disparity_predefined_groups(
        xtab,
        original_df=tabla_6,
        ref_groups_dict=dict(grupos_referencia),
        alpha=0.05,
        mask_significance=enmascarar,
    )
    for enmascarar in (True, False)
)

# Disparities computed with get_disparity_major_group instead of
# explicitly chosen reference groups.
majority_bdf = b.get_disparity_major_group(
    xtab,
    original_df=tabla_6,
    mask_significance=True,
)

get_disparity_predefined_group()
get_disparity_predefined_group()
get_disparity_major_group()


  df = df.replace(pd.np.inf, fill_divbyzero)
  df = df.replace(pd.np.inf, fill_divbyzero)


In [102]:
# Derive the per-group fairness determinations from the disparity table bdf
# (the version computed with mask_significance=True).
f = Fairness()

fdf = f.get_group_value_fairness(bdf)


  self.fair_eval = lambda tau: lambda x: pd.np.nan if pd.np.isnan(x) else \
  self.high_level_pair_eval = lambda col1, col2: lambda x: pd.np.nan if (pd.np.isnan(x[col1]) and pd.np.isnan(x[col2])) \


In [103]:
# Display the per-group fairness table.
fdf

Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,...,FNR Parity,TPR Parity,TNR Parity,NPV Parity,Precision Parity,TypeI Parity,TypeII Parity,Equalized Odds,Unsupervised Fairness,Supervised Fairness
0,1,binary 0/1,261,borough,bronx,0.0,0.967742,0.054441,1.0,0.032258,...,True,,True,True,,False,False,False,False,False
1,1,binary 0/1,261,borough,brooklyn,0.0,0.861653,0.038342,1.0,0.138347,...,True,,True,True,,True,True,False,True,True
2,1,binary 0/1,261,borough,manhattan,0.0,0.948276,0.038156,1.0,0.051724,...,True,,True,True,,False,True,False,False,False
3,1,binary 0/1,261,borough,queens,0.0,0.93311,0.059022,1.0,0.06689,...,True,,True,True,,False,False,False,False,False
4,1,binary 0/1,261,borough,staten_island,0.0,0.826087,0.05,1.0,0.173913,...,True,,True,True,,False,False,False,False,False
5,1,binary 0/1,261,programtype,all_age_camp,0.0,0.62963,0.109948,1.0,0.37037,...,True,,False,True,,False,False,False,False,False
6,1,binary 0/1,261,programtype,infant_toddler,0.0,0.946136,0.031175,1.0,0.053864,...,True,,True,True,,True,False,False,False,False
7,1,binary 0/1,261,programtype,preschool,0.0,0.936312,0.042295,1.0,0.063688,...,True,,True,True,,True,True,False,True,True
8,1,binary 0/1,261,programtype,preschool_camp,,0.666667,0.0,1.0,0.333333,...,,,False,True,,False,False,False,False,False
9,1,binary 0/1,261,programtype,school_age_camp,0.0,0.727273,0.111111,1.0,0.272727,...,True,,False,True,,False,False,False,False,False
