In [1]:
from sqlalchemy import create_engine
import configparser
import pickle
import pandas as pd
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot
from numpy import arange

  import pandas.util.testing as tm


In [2]:
config = configparser.ConfigParser()
config.read_file(open('./settings_development.ini'))

In [3]:
engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    user = config.get('DATABASE','user'),
    password = config.get('DATABASE','password'),
    host = config.get('DATABASE','host'),
    port = 5432,
    database = config.get('DATABASE','database'),
)
engine = create_engine(engine_string)

In [4]:
rforest = pickle.load(open( "random_forest.pckl", "rb"))

In [5]:
tabla_3 = pd.read_sql_table('centers', engine, schema="transformed")
tabla_4 = pd.read_sql_table('inspections', engine, schema="transformed")

In [6]:
centros = tabla_3.copy()
centros.rename(columns={"dc_id":"center_id"}, inplace=True)
inspecciones = tabla_4.copy()
last_inspections = inspecciones.sort_values(by="inspectiondate").drop_duplicates(subset=["center_id"], keep="last")
centros = centros.drop(['centername', 'legalname', 'building', 'street', 'zipcode', 'phone', 'permitnumber', 'permitexp', 'status',  'agerange', 'childcaretype', 'bin', 'url', 'datepermitted', 'actual','violationratepercent','violationavgratepercent', 'publichealthhazardviolationrate','averagepublichealthhazardiolationrate','criticalviolationrate','avgcriticalviolationrate'], axis=1)
centros = centros.reset_index(drop=True)
tabla_5 = pd.merge(last_inspections, centros)
tabla_5.sort_values(['inspectiondate'], ascending=[False], inplace=True)
tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)

tabla_5['totaleducationalworkers'] = tabla_5['totaleducationalworkers'].astype(int)

tabla_5['totaleducationalworkers'] = tabla_5['totaleducationalworkers'].astype(int)

tabla_5['averagetotaleducationalworkers'] = tabla_5['averagetotaleducationalworkers'].astype(float)

tabla_5 = tabla_5.drop(['regulationsummary', 'healthcodesubsection', 'violationstatus', 'borough', 'reason', 'inspectiondate', 'violationcategory_nan'], axis=1)

tabla_5 = tabla_5.set_index(['center_id'])
tabla_5 = tabla_5.fillna(0)

for col in tabla_5.select_dtypes(object):
    tabla_5[col] = tabla_5[col].astype(float)

tabla_5 = tabla_5.fillna(0)

In [7]:
centros.borough_manhattan.value_counts()

0    2286
1     668
Name: borough_manhattan, dtype: int64

In [8]:
prds = rforest.predict(tabla_5.drop(['violationcategory_public_health_hazard'],axis=1))

probas = rforest.predict_proba(tabla_5.drop(['violationcategory_public_health_hazard'],axis=1))

res = pd.DataFrame({
    "center":tabla_5.index,
    "etiqueta":prds,
    "proba_0":probas[:,0],
    "proba_1":probas[:,1]
})

res.loc[res['proba_0'] > res['proba_1'], 'score'] = res['proba_0']
res.loc[res['proba_0'] < res['proba_1'], 'score'] = res['proba_1']

categorias_1 = ["programtype_all_age_camp","programtype_infant_toddler","programtype_preschool", "programtype_preschool_camp", "programtype_school_age_camp"]

programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)

categorias_2 = ["borough_bronx","borough_brooklyn","borough_manhattan", "borough_queens", "borough_staten_island"]

borough = pd.get_dummies(centros[categorias_2]).idxmax(1)

ambas = pd.concat([borough, programtype], axis=1,)

ambas = ambas.rename(columns={0:'borough', 1:'programtype'})

centros = pd.concat([centros, ambas], axis=1)

tabla = pd.merge(res, centros, left_on='center', right_on='center_id')

tabla = tabla.loc[:, ['center', 'etiqueta', 'score', 'borough', 'programtype']]

tabla =  tabla.rename(columns = {'etiqueta':'label_value'})

tabla = tabla.set_index(['center'])


In [9]:
tabla.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2938 entries, dc33957 to dc35851
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   label_value  2938 non-null   float64
 1   score        2938 non-null   float64
 2   borough      2938 non-null   object 
 3   programtype  2938 non-null   object 
dtypes: float64(2), object(2)
memory usage: 114.8+ KB


In [10]:
tabla.head()

Unnamed: 0_level_0,label_value,score,borough,programtype
center,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dc33957,0.0,0.999619,borough_bronx_0,programtype_all_age_camp_0
dc37573,0.0,0.999147,borough_bronx_0,programtype_all_age_camp_0
dc14450,0.0,0.999629,borough_bronx_0,programtype_all_age_camp_0
dc3053,0.0,0.999992,borough_bronx_0,programtype_all_age_camp_0
dc25685,0.0,0.999619,borough_bronx_0,programtype_all_age_camp_0


In [11]:
tabla.borough.value_counts()

borough_bronx_0    2578
borough_bronx_1     360
Name: borough, dtype: int64

In [12]:
g = Group()

xtab, _ = g.get_crosstabs(tabla)

absolute_metrics = g.list_absolute_metrics(xtab)

df_group = xtab[[col for col in xtab.columns if col not in absolute_metrics]]
df_group.head()

model_id, score_thresholds 1 {'rank_abs': [250]}


  divide = lambda x, y: x / y if y != 0 else pd.np.nan


Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,1,binary 0/1,250,borough,borough_bronx_0,239,2339,239,83,2256,0,83,2495,2578,2938
1,1,binary 0/1,250,borough,borough_bronx_1,11,349,11,23,326,0,23,337,360,2938
2,1,binary 0/1,250,programtype,programtype_all_age_camp_0,150,2477,150,86,2391,0,86,2541,2627,2938
3,1,binary 0/1,250,programtype,programtype_all_age_camp_1,100,191,100,20,171,0,20,271,291,2938
4,1,binary 0/1,250,programtype,programtype_infant_toddler_0,0,15,0,0,15,0,0,15,15,2938


In [13]:
b = Bias()

bdf = b.get_disparity_predefined_groups(xtab, original_df=tabla, ref_groups_dict={'borough':'borough_brooklyn', 'programtype':'programtype_preschool'}, alpha=0.05, mask_significance=True)

hbdf = b.get_disparity_predefined_groups(xtab, original_df=tabla,
                                         ref_groups_dict={'borough':'borough_brooklyn', 'programtype':'programtype_preschool'},
                                         alpha=0.05,
                                         mask_significance=False)


majority_bdf = b.get_disparity_major_group(xtab, original_df=tabla, mask_significance=True)

get_disparity_predefined_group()


Exception: get_disparity_predefined_groups(): reference groups and values provided do not exist as columns/values in the input dataframe.(Note: check for syntax errors)