In [68]:
# Cargar librerías
import datetime
import os

import numpy as np
import pandas as pd
from sodapy import Socrata


In [None]:
# Load the data from the NYC Open Data Socrata endpoint.
# The app token is read from the SOCRATA_APP_TOKEN environment variable so it
# never has to be typed into (and saved with) the notebook. When the variable
# is not set, the value falls back to the empty string used originally.
MyAppToken = os.environ.get('SOCRATA_APP_TOKEN', '')

client = Socrata("data.cityofnewyork.us", MyAppToken)

# Fetch dataset "dsg6-ifza"; `limit` caps the number of records requested.
results = client.get("dsg6-ifza", limit = 100000)

# One DataFrame row per returned record.
df = pd.DataFrame.from_records(results)

In [43]:
df.shape

(51234, 34)

In [56]:
#df.head()

In [45]:
def clean_string(astr):
    '''
    Normalize a text label into a lowercase, accent-free identifier.

    The input is lowercased first. Then '.', ',', ';' and ':' are removed,
    the accented vowels á/é/í/ó/ú lose their accent, every space becomes an
    underscore and 'ñ' is spelled out as 'ni'.
    '''
    sustituciones = {
        '.': '',
        ',': '',
        ';': '',
        ':': '',
        'á': 'a',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        ' ': '_',
        'ñ': 'ni',
    }
    # No replacement text contains a character that is itself a key, so one
    # simultaneous translation gives the same result as chained replaces.
    tabla = str.maketrans(sustituciones)
    return astr.lower().translate(tabla)

def clean_columns(df):
    '''
    Normalize every column label of ``df`` in place with ``clean_string``.

    All labels are renamed in a single ``rename`` call instead of one call
    per column, so the column index is never replaced while it is being
    iterated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten in place.

    Returns
    -------
    None
    '''
    df.rename(columns=clean_string, inplace=True)
    
# Clean the dataset

print("\t-> Limpando los nombres de las columnas")
clean_columns(df)

print("\t-> Reemplazando espacios en blanco")
for col in df.select_dtypes('object'):
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning today, SyntaxWarning from Python 3.12 on).
    df[col] = df[col].replace(r'\s+', ' ', regex=True)

print("\t-> Limpiando las observaciones")
# Literal replacements applied, in this order, after trimming and lowercasing.
reemplazos = [('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u'), (' ', '_')]
for col in df.select_dtypes('object'):
    valores = df[col].str.strip().str.lower()
    for viejo, nuevo in reemplazos:
        # regex=False: these are plain characters, not patterns.
        valores = valores.str.replace(viejo, nuevo, regex=False)
    df[col] = valores

print("\t-> Cambiando NA por np.nan")
for col in df.select_dtypes('object'):
    # Values were lowercased above, so only the exact text 'na' is matched here.
    df.loc[df[col] == 'na', col] = np.nan

print("\t-> Eliminando duplicados")
# NOTE: drop_duplicates keeps the original index labels, so the index has gaps afterwards.
df = df.drop_duplicates()
df.shape

	-> Limpando los nombres de las columnas
	-> Reemplazando espacios en blanco
	-> Limpiando las observaciones
	-> Cambiando NA por np.nan
	-> Eliminando duplicados


(38862, 34)

In [46]:
#df.head()

Unnamed: 0,centername,legalname,building,street,borough,zipcode,phone,permitexp,status,agerange,...,healthcodesubsection,violationstatus,inspectionsummaryresult,permitnumber,url,datepermitted,actual,violationavgratepercent,averagepublichealthhazardiolationrate,avgcriticalviolationrate
0,yeshivat_ohr_haiim,yeshivat_ohr_haiim,86-06,135th_street,queens,11418,718-658-7066,2115-01-23t00:00:00.000,active,3_years_-_5_years,...,43.07(b)(4),corrected,initial_annual_inspection_-_reinspection_required,,,,,,,
1,yeshivat_ohr_haiim,yeshivat_ohr_haiim,86-06,135th_street,queens,11418,718-658-7066,2115-01-23t00:00:00.000,active,3_years_-_5_years,...,,,compliance_inspection_of_open_violations_-_pre...,,,,,,,
2,yeshivat_ohr_haiim,yeshivat_ohr_haiim,86-06,135th_street,queens,11418,718-658-7066,2115-01-23t00:00:00.000,active,3_years_-_5_years,...,43.17(a)(1),corrected,initial_annual_inspection_-_reinspection_required,,,,,,,
3,yeshivat_ohr_haiim,yeshivat_ohr_haiim,86-06,135th_street,queens,11418,718-658-7066,2115-01-23t00:00:00.000,active,3_years_-_5_years,...,,,compliance_inspection_of_open_violations_-_pre...,,,,,,,
4,yeshivat_ohr_haiim,yeshivat_ohr_haiim,86-06,135th_street,queens,11418,718-658-7066,2115-01-23t00:00:00.000,active,3_years_-_5_years,...,131.09(d)(2),corrected,initial_annual_inspection_-_reinspection_required,,,,,,,


In [57]:
# Checar los tipos de variables

In [40]:
df.dtypes

centername                               object
legalname                                object
building                                 object
street                                   object
borough                                  object
zipcode                                  object
phone                                    object
permitexp                                object
status                                   object
agerange                                 object
maximumcapacity                          object
dc_id                                    object
programtype                              object
facilitytype                             object
childcaretype                            object
bin                                      object
violationratepercent                     object
totaleducationalworkers                  object
averagetotaleducationalworkers           object
publichealthhazardviolationrate          object
criticalviolationrate                   

In [None]:
# Dar formato adecuado a ciertas variables

In [49]:
# Cast the three average-rate columns from text to float64.
for columna_tasa in (
    'violationavgratepercent',
    'averagepublichealthhazardiolationrate',
    'avgcriticalviolationrate',
):
    df[columna_tasa] = df[columna_tasa].astype('float64')

In [71]:
# Parse the inspection date column into pandas datetimes.
# `infer_datetime_format` is omitted: False was already its default, and the
# argument is deprecated as of pandas 2.0.
df['inspectiondate'] = pd.to_datetime(df['inspectiondate'])

In [78]:
# Calendar components derived from the parsed inspection date.
fecha_inspeccion = df['inspectiondate'].dt
df['inspection_year'] = fecha_inspeccion.year
df['inspection_month'] = fecha_inspeccion.month
df['inspection_day'] = fecha_inspeccion.day
df['inspection_day_name'] = fecha_inspeccion.day_name()

In [51]:
# Constant column of ones: summing it inside a groupby gives the number of rows per group.
df['unos'] = 1

In [None]:
# Checar missings por variable

In [60]:
df.isnull().sum()

centername                                   0
legalname                                    0
building                                    11
street                                       0
borough                                      0
zipcode                                     50
phone                                       46
permitexp                                    0
status                                       0
agerange                                  1783
maximumcapacity                              0
dc_id                                        0
programtype                                  0
facilitytype                                 0
childcaretype                                0
bin                                          0
violationratepercent                        86
totaleducationalworkers                      0
averagetotaleducationalworkers               0
publichealthhazardviolationrate             86
criticalviolationrate                       86
inspectiondat

In [59]:
## Conteos simples ##

In [75]:
# Row count per inspection year (sum of the constant `unos` column).
df.groupby(['inspection_year']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
inspection_year,Unnamed: 1_level_2
2017.0,7248
2018.0,14582
2019.0,13262
2020.0,3713


In [91]:
# Mean of the yearly row counts (one count per inspection year).
df.groupby(['inspection_year']).agg({'unos':['sum']}).mean()

unos  sum    9701.25
dtype: float64

In [76]:
# Row count per calendar month number, pooled over all years.
df.groupby(['inspection_month']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
inspection_month,Unnamed: 1_level_2
1.0,4292
2.0,3997
3.0,3506
4.0,2116
5.0,3158
6.0,3051
7.0,2481
8.0,2467
9.0,3621
10.0,4350


In [77]:
# Row count per day of the month, pooled over all months and years.
df.groupby(['inspection_day']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
inspection_day,Unnamed: 1_level_2
1.0,1095
2.0,1000
3.0,1083
4.0,1114
5.0,1372
6.0,1484
7.0,1390
8.0,1331
9.0,1167
10.0,1235


In [81]:
# Row count per weekday name.
df.groupby(['inspection_day_name']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
inspection_day_name,Unnamed: 1_level_2
Friday,4423
Monday,2860
Saturday,7
Sunday,1
Thursday,10831
Tuesday,9038
Wednesday,11645


In [62]:
# Row count per borough.
df.groupby(['borough']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
borough,Unnamed: 1_level_2
bronx,8109
brooklyn,12173
manhattan,8212
queens,8824
staten_island,1544


In [63]:
# Row count per program type.
df.groupby(['programtype']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
programtype,Unnamed: 1_level_2
all_age_camp,1161
infant_toddler,5985
preschool,31668
preschool_camp,4
school_age_camp,44


In [64]:
# Row count per facility type.
df.groupby(['facilitytype']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
facilitytype,Unnamed: 1_level_2
camp,1209
gdc,33677
sbcc,3976


In [65]:
# Row count per child care type.
df.groupby(['childcaretype']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
childcaretype,Unnamed: 1_level_2
camp,1209
child_care_-_infants/toddlers,5985
child_care_-_pre_school,27692
school_based_child_care,3976


In [66]:
# Row count per violation category (rows with a missing category are left out by groupby).
df.groupby(['violationcategory']).agg({'unos':['sum']})

Unnamed: 0_level_0,unos
Unnamed: 0_level_1,sum
violationcategory,Unnamed: 1_level_2
critical,8167
general,12003
public_health_hazard,3323


In [None]:
# Cruce de variables

In [96]:
# Row count per (inspection year, inspection month) pair.
agrupacion1 = df.groupby(['inspection_year','inspection_month']).agg({'unos':['sum']})
agrupacion1

Unnamed: 0_level_0,Unnamed: 1_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
inspection_year,inspection_month,Unnamed: 2_level_2
2017.0,5.0,548
2017.0,6.0,1031
2017.0,7.0,585
2017.0,8.0,627
2017.0,9.0,1022
2017.0,10.0,1170
2017.0,11.0,1215
2017.0,12.0,1050
2018.0,1.0,1261
2018.0,2.0,1378


In [95]:
# Mean row count over the (year, month) groups in `agrupacion1`.
agrupacion1.mean()

unos  sum    1077.916667
dtype: float64

In [97]:
# Row count per (year, month, day) triple, i.e. per calendar date present in the data.
agrupacion2 = df.groupby(['inspection_year','inspection_month','inspection_day']).agg({'unos':['sum']})
agrupacion2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum
inspection_year,inspection_month,inspection_day,Unnamed: 3_level_2
2017.0,5.0,11.0,79
2017.0,5.0,12.0,23
2017.0,5.0,13.0,1
2017.0,5.0,15.0,22
2017.0,5.0,16.0,41
...,...,...,...
2020.0,3.0,13.0,15
2020.0,3.0,16.0,7
2020.0,3.0,24.0,2
2020.0,3.0,25.0,1


In [98]:
# Mean row count over the (year, month, day) groups in `agrupacion2`;
# dates with no rows are not groups and therefore do not enter the mean.
agrupacion2.mean()

unos  sum    54.272727
dtype: float64

In [99]:
# Row count per (year, month, weekday name) triple.
agrupacion3 = df.groupby(['inspection_year','inspection_month','inspection_day_name']).agg({'unos':['sum']})
agrupacion3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum
inspection_year,inspection_month,inspection_day_name,Unnamed: 3_level_2
2017.0,5.0,Friday,72
2017.0,5.0,Monday,54
2017.0,5.0,Saturday,1
2017.0,5.0,Thursday,195
2017.0,5.0,Tuesday,92
...,...,...,...
2020.0,3.0,Monday,66
2020.0,3.0,Thursday,178
2020.0,3.0,Tuesday,123
2020.0,3.0,Wednesday,264


In [101]:
# Mean row count over the (year, month, weekday name) groups in `agrupacion3`.
# This is a single overall mean, not one average per weekday.
agrupacion3.mean()

unos  sum    214.392265
dtype: float64

In [105]:
# Row count per (year, month, violation category) triple.
agrupacion4 = df.groupby(['inspection_year','inspection_month','violationcategory']).agg({'unos':['sum']})
agrupacion4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum
inspection_year,inspection_month,violationcategory,Unnamed: 3_level_2
2017.0,5.0,critical,172
2017.0,5.0,general,152
2017.0,5.0,public_health_hazard,66
2017.0,6.0,critical,325
2017.0,6.0,general,277
...,...,...,...
2020.0,2.0,general,363
2020.0,2.0,public_health_hazard,95
2020.0,3.0,critical,152
2020.0,3.0,general,202


In [106]:
# Mean row count over the (year, month, violation category) groups in `agrupacion4`.
# This is a single overall mean, not one average per category.
agrupacion4.mean()

unos  sum    223.742857
dtype: float64

In [102]:
# Row count per (borough, facility type) pair.
df.groupby(['borough','facilitytype']).agg({'unos':['sum']})

Unnamed: 0_level_0,Unnamed: 1_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
borough,facilitytype,Unnamed: 2_level_2
bronx,camp,64
bronx,gdc,7306
bronx,sbcc,739
brooklyn,camp,441
brooklyn,gdc,9889
brooklyn,sbcc,1843
manhattan,camp,301
manhattan,gdc,7457
manhattan,sbcc,454
queens,camp,228


In [None]:
# Promedio de inspecciones por condado e instalación

In [103]:
# Mean row count over the (borough, facility type) groups.
df.groupby(['borough','facilitytype']).agg({'unos':['sum']}).mean()

unos  sum    2590.8
dtype: float64

In [104]:
# Row count per (borough, violation category) pair.
df.groupby(['borough','violationcategory']).agg({'unos':['sum']})

Unnamed: 0_level_0,Unnamed: 1_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
borough,violationcategory,Unnamed: 2_level_2
bronx,critical,2218
bronx,general,2854
bronx,public_health_hazard,753
brooklyn,critical,2215
brooklyn,general,3614
brooklyn,public_health_hazard,833
manhattan,critical,1528
manhattan,general,2545
manhattan,public_health_hazard,695
queens,critical,1989


In [107]:
# Row count per (facility type, violation category) pair.
df.groupby(['facilitytype','violationcategory']).agg({'unos':['sum']})

Unnamed: 0_level_0,Unnamed: 1_level_0,unos
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
facilitytype,violationcategory,Unnamed: 2_level_2
camp,critical,193
camp,general,207
camp,public_health_hazard,134
gdc,critical,7312
gdc,general,10264
gdc,public_health_hazard,2764
sbcc,critical,662
sbcc,general,1532
sbcc,public_health_hazard,425


In [109]:
#import matplotlib.pyplot as plt
#plt.style.use('ggplot')
#%matplotlib inline

In [115]:
# Graficas de barras
#plt.title('Inspecciones por día')
#plt.hist(agrupacion1, bins = 30)
#plt.grid(True)
#plt.show()

In [None]:
# Código de feature engineering

In [116]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns #Control figure 
import numpy as np
import os
from datetime import date
matplotlib.style.use('ggplot')
%matplotlib inline
from sodapy import Socrata


In [117]:

# Work on an explicit copy of the columns needed for feature engineering so
# the assignments below never write through to `df`.
tabla_4 = df.loc[:, ['dc_id', 'inspectiondate', 'regulationsummary', 'violationcategory', 'healthcodesubsection',
                     'violationstatus', 'inspectionsummaryresult', 'borough']].copy()

print("\t-> Reagrupar en tres variables Inspection Summary Result: reason, result_1 y result_2")

# Missing summaries become the text 'nan' here and therefore yield reason == 'nan'.
tabla_4['inspectionsummaryresult'] = tabla_4['inspectionsummaryresult'].astype('str')

# Split "<reason>_-_<result>" at the first separator only.
# The helper frame reuses tabla_4's index: tabla_4 keeps the (gappy) labels
# left by drop_duplicates, so a fresh 0..n-1 index would make the index-based
# join below attach reasons/results to the wrong rows.
df_2 = pd.DataFrame(
    tabla_4['inspectionsummaryresult'].str.split('_-_', n=1).tolist(),
    columns=['reason', 'result'],
    index=tabla_4.index,
)

# Rows without a separator have no result part; astype('str') turns it into the text 'None'.
df_2['result'] = df_2['result'].astype('str')

# Split "<result_1>;_<result_2>" at the first separator only, again keeping the index.
df_3 = pd.DataFrame(
    df_2['result'].str.split(';_', n=1).tolist(),
    columns=['result_1', 'result_2'],
    index=df_2.index,
)

# Replace the combined `result` column by its two parts.
df_2 = df_2.drop(columns=['result']).join(df_3)

# Attach reason/result_1/result_2 row by row and drop the original combined column.
tabla_4 = tabla_4.join(df_2)

tabla_4 = tabla_4.drop(['inspectionsummaryresult'], axis = 1)

print("\t-> Únicamente nos quedamos con initial_annual_inspection")

# Frequency table of `reason`; the result is neither stored nor displayed here.
tabla_4['reason'].value_counts(dropna=False)

# Keep only the rows whose reason is exactly 'initial_annual_inspection'.
es_inicial_anual = tabla_4['reason'] == 'initial_annual_inspection'
tabla_4 = tabla_4.loc[es_inicial_anual]

print("\t-> Creamos categorias para las variables result_1 y result_2")

categorias = ["result_1", "result_2"]

# One indicator column per distinct value of result_1 and of result_2.
df_4 = pd.get_dummies(tabla_4[categorias])

# Append the indicators and remove the two source columns.
tabla_4 = tabla_4.join(df_4).drop(columns=categorias)

print("\t-> Creamos variables de año, mes y día a partir de Inspection date")

# Parse directly: to_datetime accepts both text and already-parsed datetimes,
# so the former astype('str') round trip was unnecessary. The deprecated
# `infer_datetime_format=False` (the default) is dropped as well.
tabla_4['inspectiondate'] = pd.to_datetime(tabla_4['inspectiondate'])

# Year, month and weekday name of each inspection date.
# NOTE: despite the message above, no day-of-month column is created here.
tabla_4['inspection_year'] = tabla_4['inspectiondate'].dt.year

tabla_4['inspection_month'] = tabla_4['inspectiondate'].dt.month

tabla_4['inspection_day_name'] = tabla_4['inspectiondate'].dt.day_name()

print("\t-> Eliminamos los días no hábiles: sábado y domingo")

# Remove Saturday rows first, then Sunday rows, by index label.
for dia_no_habil in ('Saturday', 'Sunday'):
    filas_a_eliminar = tabla_4.index[tabla_4['inspection_day_name'] == dia_no_habil]
    tabla_4 = tabla_4.drop(filas_a_eliminar)

# Weekday name -> numeric code (as text), Monday = '1' ... Friday = '5'.
dias = {
    "Monday": '1',
    "Tuesday": '2',
    "Wednesday": '3',
    "Thursday": '4',
    "Friday": '5',
}

# Names absent from `dias` (e.g. missing dates) map to NaN; the result is stored as float.
tabla_4['inspection_day_name'] = tabla_4['inspection_day_name'].map(dias).astype('float')

print("\t-> Renombramos la variabla dc_id por center_id")

tabla_4 = tabla_4.rename(columns={'dc_id': 'center_id'})

print("\t-> Ordenamos la base por year, month y day en forma descendente")

# Most recent inspection dates first.
tabla_4 = tabla_4.sort_values(['inspectiondate'], ascending=[False])

print("\t-> One-hot encoding de la variable violationcategory")

categorias = ["violationcategory"]

# One indicator column per violation category, named violationcategory_<value>.
df_5 = pd.get_dummies(tabla_4[categorias])

# Append the indicators and remove the source column.
tabla_4 = tabla_4.join(df_5).drop(columns=categorias)

print("\t-> Se crea la variable: ultima_inspección, que son los días que han pasado desde la última inspección anual")

# "Historical" window: every row whose inspection year is not 2020.
df_6 = tabla_4.loc[tabla_4['inspection_year']!=2020.0]

# Most recent inspection date per center within that window.
df_7 = pd.DataFrame(df_6.groupby(["center_id"], sort=False)["inspectiondate"].max().reset_index())

# Read the clock once. `pd.datetime` no longer exists in pandas 2.0, and three
# separate now() calls could straddle midnight and produce a mixed-up date.
ahora = pd.Timestamp.now()
year = str(ahora.year)
month = str(ahora.month)
day = str(ahora.day)

fechas = year + "-" + month + "-" + day

# Today's date at midnight, repeated on every row of df_7.
df_7["today"] = pd.to_datetime(fechas)

# Whole days elapsed between today and each center's latest historical inspection.
# The value depends on the day the notebook is run.
df_7['dias_ultima_inspeccion'] = (df_7['today'] - df_7['inspectiondate']).dt.days

# Merge only the new feature, so tabla_4 keeps its own `inspectiondate` and no
# suffixed duplicates have to be renamed or dropped afterwards.
tabla_4 = pd.merge(tabla_4, df_7[['center_id', 'dias_ultima_inspeccion']], on='center_id', how='left')

print("\t-> Creamos la variable violaciones_hist_salud_publica: Número de violaciones de salud pública históricas (2017-2019) por centro")

# Per-center sum of the public-health-hazard indicator over the historical rows (df_6).
df_8 = (
    df_6.groupby(["center_id"], sort=False)["violationcategory_public_health_hazard"]
    .sum()
    .reset_index()
    .rename(columns={'violationcategory_public_health_hazard': 'violaciones_hist_salud_publica'})
)

tabla_4 = tabla_4.merge(df_8, on='center_id', how='left')

print("\t-> Creamos la variable violaciones_2019_salud_publica: Número de violaciones de salud pública en el 2019 por centro")

# Rows inspected in 2019, taken from tabla_4 as it stands at this point.
df_9 = tabla_4.loc[tabla_4['inspection_year']==2019.0]

# Per-center sum of the public-health-hazard indicator over the 2019 rows.
df_10 = (
    df_9.groupby(["center_id"], sort=False)["violationcategory_public_health_hazard"]
    .sum()
    .reset_index()
    .rename(columns={'violationcategory_public_health_hazard': 'violaciones_2019_salud_publica'})
)

tabla_4 = tabla_4.merge(df_10, on='center_id', how='left')

# NOTE(review): this message says 2016-2019 while the sibling messages say 2017-2019 — confirm the intended range.
print("\t-> Creamos la variable violaciones_hist_criticas: Número de violaciones críticas históricas anteriores (2016-2019) por centro")

# Per-center sum of the critical indicator over the historical rows.
df_11 = (
    df_6.groupby(["center_id"], sort=False)["violationcategory_critical"]
    .sum()
    .reset_index()
    .rename(columns={'violationcategory_critical': 'violaciones_hist_criticas'})
)

tabla_4 = tabla_4.merge(df_11, on='center_id', how='left')

print("\t-> Creamos la variable violaciones_2019_criticas: Número de violaciones críticas en el 2019 por centro")

# Per-center sum of the critical indicator over the 2019 rows.
df_12 = (
    df_9.groupby(["center_id"], sort=False)["violationcategory_critical"]
    .sum()
    .reset_index()
    .rename(columns={'violationcategory_critical': 'violaciones_2019_criticas'})
)

tabla_4 = tabla_4.merge(df_12, on='center_id', how='left')

print("\t-> Creamos la variable ratio_violaciones_hist: Número de inspecciones en total de primera vez que resultaron en violación crítica o de salud pública/ número de inspecciones de primera vez por centro")

# Historical public-health and critical counts side by side, one row per center.
df_13 = pd.merge(df_8, df_11, on='center_id')

df_13['total'] = df_13['violaciones_hist_salud_publica'].add(df_13['violaciones_hist_criticas'])

# Number of historical rows per center (count of non-null `reason`).
df_14 = df_6.groupby(["center_id"], sort=False)["reason"].count().reset_index()

df_15 = pd.merge(df_13, df_14, on='center_id')

# (public health + critical) divided by the number of historical rows of the center.
df_15['ratio_violaciones_hist'] = df_15['total'].div(df_15['reason'])

# Keep only the key and the new feature before merging back.
df_15 = df_15[['center_id', 'ratio_violaciones_hist']]

tabla_4 = tabla_4.merge(df_15, on='center_id', how='left')

print("\t-> Creamos la variable ratio_violaciones_2019: Número de inspecciones en total de primera vez que resultaron en violación crítica o de salud pública en el 2019 / número de inspecciones de primera vez por centro")

# Same construction restricted to the 2019 rows (df_9).
df_16 = pd.merge(df_10, df_12, on='center_id')

df_16['total'] = df_16['violaciones_2019_salud_publica'].add(df_16['violaciones_2019_criticas'])

df_17 = df_9.groupby(["center_id"], sort=False)["reason"].count().reset_index()

df_18 = pd.merge(df_16, df_17, on='center_id')

df_18['ratio_violaciones_2019'] = df_18['total'].div(df_18['reason'])

df_18 = df_18[['center_id', 'ratio_violaciones_2019']]

tabla_4 = tabla_4.merge(df_18, on='center_id', how='left')

print("\t-> Creamos la variable prom_violaciones_hist_borough: Promedio de violaciones históricas por distrito")

columnas_violacion = ["violationcategory_critical", "violationcategory_general", "violationcategory_public_health_hazard"]

# Per-borough totals of each violation indicator over the historical rows (df_6).
df_19 = df_6.groupby(["borough"], sort=False)[columnas_violacion].sum().reset_index()

# Row-wise mean of the three category totals, i.e. one value per borough.
df_19['prom_violaciones_hist_borough'] = df_19[columnas_violacion].mean(axis=1)

df_19 = df_19.drop(columns=columnas_violacion)

tabla_4 = tabla_4.merge(df_19, on='borough', how='left')

print("\t-> Creamos la variable prom_violaciones_2019_borough: Promedio de violaciones en el 2019 por distrito")

# Same construction restricted to the 2019 rows (df_9).
df_20 = df_9.groupby(["borough"], sort=False)[columnas_violacion].sum().reset_index()

df_20['prom_violaciones_2019_borough'] = df_20[columnas_violacion].mean(axis=1)

df_20 = df_20.drop(columns=columnas_violacion)

tabla_4 = tabla_4.merge(df_20, on='borough', how='left')

print("\t-> Creamos la variable ratio_violaciones_hist_sp: Número de violaciones de salud pública de primera vez por centro históricas (2017-2019)/ número de violaciones de primera vez de todo tipo por centro históricas (2017-2019) ")

columnas_violacion = ["violationcategory_critical", "violationcategory_general", "violationcategory_public_health_hazard"]

# Per-center totals of each violation indicator over the historical rows (df_6).
df_21 = df_6.groupby(["center_id"], sort=False)[columnas_violacion].sum().reset_index()

df_21['total'] = (
    df_21['violationcategory_critical']
    + df_21['violationcategory_general']
    + df_21['violationcategory_public_health_hazard']
)

# Share of public-health-hazard violations; a center with total == 0 gets NaN.
df_21['ratio_violaciones_hist_sp'] = df_21['violationcategory_public_health_hazard'].div(df_21['total'])

df_21 = df_21[['center_id', 'ratio_violaciones_hist_sp']]

tabla_4 = tabla_4.merge(df_21, on='center_id', how='left')

print("\t-> Creamos la variable ratio_violaciones_2019_sp: Número de violaciones de salud pública de primera vez por centro en el 2019 / número de violaciones de primera vez de todo tipo por centro en el 2019 ")

# Same share computed over the 2019 rows (df_9).
df_22 = df_9.groupby(["center_id"], sort=False)[columnas_violacion].sum().reset_index()

df_22['total'] = (
    df_22['violationcategory_critical']
    + df_22['violationcategory_general']
    + df_22['violationcategory_public_health_hazard']
)

df_22['ratio_violaciones_2019_sp'] = df_22['violationcategory_public_health_hazard'].div(df_22['total'])

df_22 = df_22[['center_id', 'ratio_violaciones_2019_sp']]

tabla_4 = tabla_4.merge(df_22, on='center_id', how='left')

print("\t-> Creamos la variable ratio_violaciones_hist_criticas: Número de violaciones críticas de primera vez por centro históricas (2017-2019)/ número de violaciones de primera vez de todo tipo por centro históricas (2017-2019)")

# Share of critical violations over the historical rows.
df_23 = df_6.groupby(["center_id"], sort=False)[columnas_violacion].sum().reset_index()

df_23['total'] = (
    df_23['violationcategory_critical']
    + df_23['violationcategory_general']
    + df_23['violationcategory_public_health_hazard']
)

df_23['ratio_violaciones_hist_criticas'] = df_23['violationcategory_critical'].div(df_23['total'])

df_23 = df_23[['center_id', 'ratio_violaciones_hist_criticas']]

tabla_4 = tabla_4.merge(df_23, on='center_id', how='left')

print("\t-> Creamos la variable ratio_violaciones_2019_criticas: Número de violaciones críticas de primera vez por centro en el 2019/ número de violaciones de primera vez de todo tipo por centro en el 2019")

# Share of critical violations over the 2019 rows.
df_24 = df_9.groupby(["center_id"], sort=False)[columnas_violacion].sum().reset_index()

df_24['total'] = (
    df_24['violationcategory_critical']
    + df_24['violationcategory_general']
    + df_24['violationcategory_public_health_hazard']
)

df_24['ratio_violaciones_2019_criticas'] = df_24['violationcategory_critical'].div(df_24['total'])

df_24 = df_24[['center_id', 'ratio_violaciones_2019_criticas']]

tabla_4 = tabla_4.merge(df_24, on='center_id', how='left')

# Summary of the resulting feature table (columns, non-null counts, dtypes).
tabla_4.info()

	-> Reagrupar en tres variables Inspection Summary Result: reason, result_1 y result_2
	-> Únicamente nos quedamos con initial_annual_inspection
	-> Creamos categorias para las variables result_1 y result_2
	-> Creamos variables de año, mes y día a partir de Inspection date
	-> Eliminamos los días no hábiles: sábado y domingo
	-> Renombramos la variabla dc_id por center_id
	-> Ordenamos la base por year, month y day en forma descendente
	-> One-hot encoding de la variable violationcategory
	-> Se crea la variable: ultima_inspección, que son los días que han pasado desde la última inspección anual
	-> Creamos la variable violaciones_hist_salud_publica: Número de violaciones de salud pública históricas (2017-2019) por centro
	-> Creamos la variable violaciones_2019_salud_publica: Número de violaciones de salud pública en el 2019 por centro
	-> Creamos la variable violaciones_hist_criticas: Número de violaciones críticas históricas anteriores (2016-2019) por centro
	-> Creamos la variable

In [None]:
def plot_totales(df):
    """
    Save a grouped bar chart of total consumption per alcaldía.

    The consumption columns of ``df`` are summed per ``alcaldia``; only the
    three ``consumo_total_*`` breakdowns (mixto, dom, no_dom) are drawn, one
    hue each. Alcaldías are ordered by the descending sum of the plotted
    values. The figure is written to
    ``docs/img/catplot_consumos_totales.png`` and the current figure is then
    cleared.

    Relies on the names ``sns``, ``plt`` and ``color_dict`` being defined
    outside this function.
    """
    save_path = 'docs/img/catplot_consumos_totales.png'

    columnas_suma = [
        'consumo_total_mixto',
        'consumo_prom_dom',
        'consumo_total_dom',
        'consumo_prom_mixto',
        'consumo_total',
        'consumo_prom',
        'consumo_prom_no_dom',
        'consumo_total_no_dom',
    ]
    dff = df.groupby(['alcaldia']).agg({columna: 'sum' for columna in columnas_suma}).reset_index()

    # Long format: one row per (alcaldia, medicion) pair.
    dff_long = dff.melt(id_vars=['alcaldia'], var_name="medicion", value_name="valor")

    # Keep only the three total-consumption measurements.
    mediciones_totales = ['consumo_total_mixto', 'consumo_total_dom', 'consumo_total_no_dom']
    dff_long = dff_long[dff_long.medicion.isin(mediciones_totales)]

    # Bar order: alcaldías by descending sum of the plotted values.
    totales_por_alcaldia = dff_long.groupby(['alcaldia']).agg({'valor':'sum'}).reset_index()
    order = totales_por_alcaldia.sort_values(['valor'], ascending=False).alcaldia.values

    p = sns.catplot(x="alcaldia", y="valor", hue='medicion', data=dff_long,
                    kind='bar', order=order, palette=color_dict)
    plt.xticks(rotation=90)
    # Plain tick labels on the y axis instead of scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    plt.ylabel('Consumos totales en métros cúbicos')
    plt.xlabel('Alcaldía')
    p.savefig(save_path, bbox_inches='tight')
    plt.clf()

In [None]:
def plot_totales_como_porcentajes(df):
    """
    Save a grouped bar chart of each alcaldía's share of total consumption.

    The four ``consumo_total*`` columns are summed per ``alcaldia`` and each
    column is divided by its own grand total, so the plotted values are
    fractions of that total (they are not multiplied by 100, although the
    y label says "porcentajes"). ``consumo_total`` only sets the bar order
    (descending) and is not drawn. The figure is written to
    ``docs/img/catplot_consumos_totales_porcentajes.png`` and the current
    figure is then cleared.

    Relies on the names ``sns``, ``plt`` and ``color_dict`` being defined
    outside this function.
    """
    save_path = 'docs/img/catplot_consumos_totales_porcentajes.png'

    columnas_totales = ['consumo_total', 'consumo_total_dom', 'consumo_total_no_dom', 'consumo_total_mixto']
    dff = df.groupby(['alcaldia']).agg({columna: 'sum' for columna in columnas_totales}).reset_index()

    # Turn every total into the alcaldía's fraction of that column's grand total.
    for columna in columnas_totales:
        dff[columna] = dff[columna] / dff[columna].sum()

    # Long format without the overall total, which is used only for ordering.
    dff_long = dff.drop(['consumo_total'], axis=1).melt(id_vars=['alcaldia'], var_name="medicion", value_name="valor")

    order = dff.sort_values(['consumo_total'], ascending=False).alcaldia.values

    p = sns.catplot(x="alcaldia", y="valor", hue='medicion', data=dff_long,
                    kind='bar', palette=color_dict, order=order)
    plt.xticks(rotation=90)
    plt.ylabel('Consumos totales en porcentajes')
    plt.xlabel('Alcaldía')
    p.savefig(save_path, bbox_inches='tight')
    plt.clf()