In [1]:
from sqlalchemy import create_engine
import configparser
import pickle
import pandas as pd
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot
from numpy import arange


  import pandas.util.testing as tm


In [49]:
config = configparser.ConfigParser()
config.read_file(open('./settings_development.ini'))

In [50]:
engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    user = config.get('DATABASE','user'),
    password = config.get('DATABASE','password'),
    host = config.get('DATABASE','host'),
    port = 5432,
    database = config.get('DATABASE','database'),
)
engine = create_engine(engine_string)

In [51]:
rforest = pickle.load(open( "random_forest.pckl", "rb"))

In [52]:
tabla_3 = pd.read_sql_table('centers', engine, schema="transformed")
tabla_4 = pd.read_sql_table('inspections', engine, schema="transformed")

In [53]:
centros = tabla_3.copy()
centros.rename(columns={"dc_id":"center_id"}, inplace=True)
inspecciones = tabla_4.copy()
last_inspections = inspecciones.sort_values(by="inspectiondate").drop_duplicates(subset=["center_id"], keep="last")
centros = centros.drop(['centername', 'legalname', 'building', 'street', 'zipcode', 'phone', 'permitnumber', 'permitexp', 'status',  'agerange', 'childcaretype', 'bin', 'url', 'datepermitted', 'actual','violationratepercent','violationavgratepercent', 'publichealthhazardviolationrate','averagepublichealthhazardiolationrate','criticalviolationrate','avgcriticalviolationrate'], axis=1)
centros = centros.reset_index(drop=True)
tabla_5 = pd.merge(last_inspections, centros)
tabla_5.sort_values(['inspectiondate'], ascending=[False], inplace=True)
tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)

tabla_5['totaleducationalworkers'] = tabla_5['totaleducationalworkers'].astype(int)

tabla_5['totaleducationalworkers'] = tabla_5['totaleducationalworkers'].astype(int)

tabla_5['averagetotaleducationalworkers'] = tabla_5['averagetotaleducationalworkers'].astype(float)

tabla_5 = tabla_5.drop(['regulationsummary', 'healthcodesubsection', 'violationstatus', 'borough', 'reason', 'inspectiondate', 'violationcategory_nan'], axis=1)

tabla_5 = tabla_5.set_index(['center_id'])
tabla_5 = tabla_5.fillna(0)

for col in tabla_5.select_dtypes(object):
    tabla_5[col] = tabla_5[col].astype(float)

tabla_5 = tabla_5.fillna(0)

In [54]:
prds = rforest.predict(tabla_5.drop(['violationcategory_public_health_hazard'],axis=1))
probas = rforest.predict_proba(tabla_5.drop(['violationcategory_public_health_hazard'],axis=1))

In [55]:
res = pd.DataFrame({
    "center":tabla_5.index,
    "etiqueta":prds,
    "proba_0":probas[:,0],
    "proba_1":probas[:,1]
})

res.loc[res['proba_0'] > res['proba_1'], 'score'] = res['proba_0']
res.loc[res['proba_0'] < res['proba_1'], 'score'] = res['proba_1']

In [56]:
categorias_1 = ["programtype_all_age_camp","programtype_infant_toddler","programtype_preschool", "programtype_preschool_camp", "programtype_school_age_camp"]
programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)
categorias_2 = ["borough_bronx","borough_brooklyn","borough_manhattan", "borough_queens", "borough_staten_island"]
borough = pd.get_dummies(centros[categorias_2]).idxmax(1)
ambas = pd.concat([borough, programtype], axis=1,)
ambas = ambas.rename(columns={0:'borough', 1:'programtype'})
tabla_1 = pd.concat([centros, ambas], axis=1)
tabla_2 = pd.merge(res, tabla_1, left_on='center', right_on='center_id')

In [57]:
for i in list(tabla_2.index):
    if str(tabla_2.iloc[i].borough_bronx) == "1":
        tabla_2.loc[tabla_2.index == i ,"borough"] = "bronx"
    elif str(tabla_2.iloc[i].borough_brooklyn) == "1":
        tabla_2.loc[tabla_2.index == i ,"borough"] = "brooklyn"
    elif str(tabla_2.iloc[i].borough_manhattan) == "1":
        tabla_2.loc[tabla_2.index == i ,"borough"] = "manhattan"
    elif str(tabla_2.iloc[i].borough_queens) == "1":
        tabla_2.loc[tabla_2.index == i ,"borough"] = "queens"
    elif str(tabla_2.iloc[i].borough_staten_island) == "1":
        tabla_2.loc[tabla_2.index == i ,"borough"] = "staten_island"

In [58]:
tabla_2.drop(categorias_2, axis=1, inplace=True)

In [59]:
for i in list(tabla_2.index):
    if str(tabla_2.iloc[i].programtype_all_age_camp) == "1":
        tabla_2.loc[tabla_2.index == i ,"programtype"] = "all_age_camp"
    elif str(tabla_2.iloc[i].programtype_infant_toddler) == "1":
        tabla_2.loc[tabla_2.index == i ,"programtype"] = "infant_toddler"
    elif str(tabla_2.iloc[i].programtype_preschool) == "1":
        tabla_2.loc[tabla_2.index == i ,"programtype"] = "preschool"
    elif str(tabla_2.iloc[i].programtype_preschool_camp) == "1":
        tabla_2.loc[tabla_2.index == i ,"programtype"] = "preschool_camp"
    elif str(tabla_2.iloc[i].programtype_school_age_camp) == "1":
        tabla_2.loc[tabla_2.index == i ,"programtype"] = "school_age_camp"

In [60]:
tabla_2.drop(categorias_1, axis=1, inplace=True)

In [61]:
tabla_6 = tabla_2.loc[:, ['center', 'etiqueta', 'score', 'borough', 'programtype']]

In [62]:
tabla_6 =  tabla_6.rename(columns = {'etiqueta':'label_value'})

In [63]:
tabla_6.programtype.value_counts()

preschool          2170
infant_toddler      431
all_age_camp        291
school_age_camp      12
preschool_camp        3
Name: programtype, dtype: int64

In [66]:
tabla_6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2907 entries, 0 to 2906
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   center       2907 non-null   object 
 1   label_value  2907 non-null   float64
 2   score        2907 non-null   float64
 3   borough      2907 non-null   object 
 4   programtype  2907 non-null   object 
dtypes: float64(2), object(3)
memory usage: 136.3+ KB


In [64]:
g = Group()

xtab, _ = g.get_crosstabs(tabla_6)

absolute_metrics = g.list_absolute_metrics(xtab)

df_group = xtab[[col for col in xtab.columns if col not in absolute_metrics]]
df_group.head()

model_id, score_thresholds 1 {'rank_abs': [402]}


  divide = lambda x, y: x / y if y != 0 else pd.np.nan


Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,1,binary 0/1,402,center,dc1000,1,0,1,0,0,0,0,1,1,2907
1,1,binary 0/1,402,center,dc1017,0,1,0,0,1,0,0,1,1,2907
2,1,binary 0/1,402,center,dc1021,0,1,0,0,1,0,0,1,1,2907
3,1,binary 0/1,402,center,dc10244,0,1,0,0,1,0,0,1,1,2907
4,1,binary 0/1,402,center,dc1025,0,1,0,0,1,0,0,1,1,2907


In [65]:
b = Bias()

bdf = b.get_disparity_predefined_groups(xtab, original_df=tabla_6, ref_groups_dict={'borough':'brooklyn', 'programtype':'preschool'}, alpha=0.05, mask_significance=True)

#hbdf = b.get_disparity_predefined_groups(xtab, original_df=tabla_6,
                                         #ref_groups_dict={'borough':'brooklyn', 'programtype':'preschool'},
                                         #alpha=0.05,
                                         #mask_significance=False)


#majority_bdf = b.get_disparity_major_group(xtab, original_df=tabla, mask_significance=True)

get_disparity_predefined_group()


Exception: Bias.get_disparity_predefined_groups(): the number of predefined group values to use as reference is less than the actual number of attributes in the input dataframe.

In [None]:
f = Fairness()

fdf = f.get_group_value_fairness(bdf)

fg = aqp.plot_fairness_group_all(fdf, ncols=5, metrics = "all")

a_tm = aqp.plot_fairness_disparity_all(fdf, attributes=['borough'], metrics='all',
                                       significance_alpha=0.05)