In [1]:
import json
import pandas as pd
import re
import string
import numpy as np
import pickle

## Load data

In [2]:
with open("../data/raw/text.json") as json_file:
    data = json.load(json_file)

df_text = pd.DataFrame(data)
# make sure there are no missing values
df_text.dropna(axis=0, how='any', inplace=True)
df_text.shape

(2685, 2)

In [3]:
df_text

Unnamed: 0,file_id,text
0,1ZEZSJXkKDSIyDULhRKfHXCLXpm-eYY0Z,"Buenos Aires, de junio de 2019. AUTOS Y V..."
1,1srRixiO6V3FMxoumtAxMfEPblqoD8-gn,"Buenos Aires, de diciembre de 2019. En ..."
2,1kpIno_4VKnxAcfmpXihrqTkpPbr15EvG4nqwfD6-xqc,Resumen: resolución interlocutoria por la que ...
3,1LHE9syGgdHcRRA7Ns9X31UPdYVy-GVJf,...
4,1N_D97m1WHrO9PqVpULeT1t_E0pI7XwO4,"Buenos Aires, 11 de enero de 2019. La in..."
...,...,...
2729,1KlV9Qcsff6ouhoSa-obG_Ci-FGgcPyp-zpYMsPYDjcQ,Resumen: resolución interlocutoria que rechaza...
2730,1tgQM3WYrdelhcozMMDB_6-hP_3CuvOlNWkcCfSxUgaw,AUDIENCIA DE PRUEBA (Art. 210 CPP) VIDEOCONFER...
2731,1DrlPAykTylCmuLOnHnCeuuJXl70gq_Q-,ACTA DE AUDIENCIA “ XX s/ Art. 67 CC 2do párr-...
2732,0B9wNhp3GjjazWHRubGpjY0VXMEU,"Buenos Aires, 27 de junio de 2017 Hora de inic..."


In [4]:
df_meta = pd.read_csv("../data/processed/metadata_test.csv", index_col=0)
# We will focus exclusively on cases of gender violence
df_meta = df_meta[df_meta['VIOLENCIA_DE_GENERO'] == 1]
df_meta

Unnamed: 0,N,NRO_REGISTRO,FECHA_RESOLUCION,FIRMA,MATERIA,ART_INFRINGIDO,CODIGO_O_LEY,CONDUCTA,CONDUCTA_DESCRIPCION,VIOLENCIA_DE_GENERO,...,N_REGISTRO_Y_TOMO_CAMARA_2,LINK_CAMARA_2,DECISION_DE_FONDO_TSJ,N_REGISTRO_Y_TOMO_TSJ,LINK_TSJ,RECURSO_EXTRAORDINARIO_Y_RECURRENTE,DECISION_CSJN,N_REGISTRO_Y_TOMO_CSJN,LINK_CSJN,file_id
1871,1655.0,1810,15_02_2019,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg
598,577.0,829,13_09_2017,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,,,,,,,,,,0B9wNhp3GjjazbTkyblEyWDVCcU0
3362,2753.0,2862,15_07_20,Pablo_Casas,penal,92,codigo_penal_de_la_nacion,lesiones,agravado,1,...,,,,,,,,,,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu
179,185.0,672,2_12_2016,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,,0B9wNhp3GjjazUlJ2aGNfV3NWWDA
3144,,2695,26_02_20,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,,,,,,,,,,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,,2368,23_09_2019,Pablo_Casas,penal,189bis,codigo_penal_de_la_nacion,tenencia_de_arma,de_fuego_uso_civil,1,...,,,,,,,,,,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii
2676,2237.0,2364,19_09_2019,juez_interinamente_a_cargo,penal,92,codigo_penal_de_la_nacion,lesiones,leves_agravadas,1,...,,,,,,,,,,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja
1189,1106.0,1302,15_6_2018,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,No,No,No,,,,,,,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8
96,99.0,,28_9_2016,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,,0B9wNhp3GjjazejRSMURGNU5BcUU


In [5]:
df = pd.merge(df_meta, df_text, 
              on='file_id', how='inner', 
              right_index=True)
df.shape

(78, 63)

In [6]:
df

Unnamed: 0,N,NRO_REGISTRO,FECHA_RESOLUCION,FIRMA,MATERIA,ART_INFRINGIDO,CODIGO_O_LEY,CONDUCTA,CONDUCTA_DESCRIPCION,VIOLENCIA_DE_GENERO,...,LINK_CAMARA_2,DECISION_DE_FONDO_TSJ,N_REGISTRO_Y_TOMO_TSJ,LINK_TSJ,RECURSO_EXTRAORDINARIO_Y_RECURRENTE,DECISION_CSJN,N_REGISTRO_Y_TOMO_CSJN,LINK_CSJN,file_id,text
1871,1655.0,1810,15_02_2019,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,"ACTA DE AUDIENCIA “XX s/ ART. 52 HOSTIGAR, MA..."
598,577.0,829,13_09_2017,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,,,,,,,,,0B9wNhp3GjjazbTkyblEyWDVCcU0,"Buenos Aires, 13 de septiembre de 2017 Causa N..."
3362,2753.0,2862,15_07_20,Pablo_Casas,penal,92,codigo_penal_de_la_nacion,lesiones,agravado,1,...,,,,,,,,,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,AUDIENCIA DE PRUEBA (Art. 210 CPP) VIDEOCONF...
179,185.0,672,2_12_2016,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,"<1> \n\n=� \n,---�?. \n§8\n\n¡·\ \n\nV \n\n�--..."
3144,,2695,26_02_20,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,,,,,,,,,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,RESOLUCION INTERLOCUTORIA Rechaza la excepción...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,,2368,23_09_2019,Pablo_Casas,penal,189bis,codigo_penal_de_la_nacion,tenencia_de_arma,de_fuego_uso_civil,1,...,,,,,,,,,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,ACTA DE AUDIENCIA “XX y otros s. 149BIS – AMEN...
2676,2237.0,2364,19_09_2019,juez_interinamente_a_cargo,penal,92,codigo_penal_de_la_nacion,lesiones,leves_agravadas,1,...,,,,,,,,,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,ACTA DE AUDIENCIA “XX SOBRE 89 - LESIONES LE...
1189,1106.0,1302,15_6_2018,Pablo_Casas,penal,149bis,codigo_penal_de_la_nacion,amenazas,simples,1,...,No,No,,,,,,,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,"Buenos Aires, 15 de junio de 2018. Solicitud d..."
96,99.0,,28_9_2016,Pablo_Casas,contravencional,52,codigo_contravencional,hostigamiento,,1,...,,,,,,,,,0B9wNhp3GjjazejRSMURGNU5BcUU,"llJ � \n·- :, \n= (IS \no> \n...J .¡; \nCl1 ..."


## Predict probabilities

In [7]:
vect_path = str("../models/gender_violence_vectorizer.sav")
with open(vect_path, 'rb') as f:
    vectorizer_gender_violence = pickle.load(f)
    print(vectorizer_gender_violence)

Pipeline(steps=[('preprocessor',
                 <text_preprocessing.TextPreprocessor object at 0x7f8fb28eb250>),
                ('vectorizer',
                 TfidfVectorizer(max_df=0.85, min_df=0.1,
                                 tokenizer=<text_preprocessing.SpacyTokenizer object at 0x7f8fb295a760>))])


In [8]:
# vectorize text
text_features = vectorizer_gender_violence.transform(df.text)

In [9]:
# load best model
model_path = str("../models/gender_violence_model.sav")
with open(model_path, 'rb') as f:
    gender_violence_model = pickle.load(f)

# get predictions
gender_violence_proba = gender_violence_model.predict_proba(text_features)
gender_violence_proba = gender_violence_proba[:,1]
len(gender_violence_proba)

78

In [10]:
# load vectorizer
vect_path = str("../models/violence_type_vectorizer.sav")
with open(vect_path, 'rb') as f:
    vectorizer_violence_type = pickle.load(f)
    print(vectorizer_violence_type)

# vectorize text
text_features = vectorizer_violence_type.transform(df.text)

Pipeline(steps=[('preprocessor',
                 <text_preprocessing.TextPreprocessor object at 0x7f8f8e6de9d0>),
                ('vectorizer',
                 TfidfVectorizer(max_df=0.85, min_df=0.1,
                                 tokenizer=<text_preprocessing.SpacyTokenizer object at 0x7f8f8e6de190>))])


In [11]:
# load best model
model_path = str("../models/violence_type_model.sav")
with open(model_path, 'rb') as f:
    violence_type_model = pickle.load(f)

target_vars = ["V_FISICA", "V_PSIC", "V_ECON", "V_SEX",
               "V_SOC", "V_AMB", "V_SIMB"]

# get predictions for each type of violence
predicted_probas = {}
for var in target_vars:
    model = violence_type_model[var]['fitted_model']
    col_name = str(var + "_proba")
    predicted_probas[col_name] = model.predict_proba(text_features)[:,1]

In [12]:
# consolidate results
df_probas = pd.DataFrame({'file_id': df.file_id, 'gender_violence_proba': gender_violence_proba})
df_probas.reset_index(drop=True, inplace=True)

In [13]:
df_probas = pd.concat([df_probas, pd.DataFrame(predicted_probas)], axis=1)
len(df_probas)

78

In [14]:
df_probas.set_index(df.index, inplace=True)
df_probas

Unnamed: 0,file_id,gender_violence_proba,V_FISICA_proba,V_PSIC_proba,V_ECON_proba,V_SEX_proba,V_SOC_proba,V_AMB_proba,V_SIMB_proba
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,0.714492,0.150000,0.988806,0.075000,0.01,0.069999,0.040000,0.939651
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,0.526438,0.280000,0.896549,0.455000,0.19,0.160987,0.040000,0.649925
3362,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,0.802236,0.950000,0.890221,0.071224,0.05,0.037253,0.060000,0.893951
179,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,0.920000,0.840000,0.694358,0.110000,0.02,0.095422,0.030000,0.959612
3144,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,0.969636,0.850000,0.988178,0.640000,0.73,0.055347,0.060000,0.999758
...,...,...,...,...,...,...,...,...,...
2682,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,0.989868,0.980000,0.995620,0.035000,0.02,0.006320,0.990000,0.994612
2676,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,0.570576,0.563889,0.790525,0.270000,0.06,0.190171,0.080000,0.854273
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,0.880000,0.850000,0.988145,0.758067,0.08,0.711254,0.050000,0.988560
96,0B9wNhp3GjjazejRSMURGNU5BcUU,0.697669,0.580000,0.981541,0.147199,0.04,0.251227,0.189192,0.932527


## Generate features

In [15]:
# get the names of features use for training
file = open("../models/column_names.txt","r+") 
col_names = file.read().split(",")
col_names.remove("")
col_names

['file_id',
 'MATERIA',
 'GENERO_ACUSADO/A',
 'NACIONALIDAD_ACUSADO/A',
 'NIVEL_INSTRUCCION_ACUSADO/A',
 'GENERO_DENUNCIANTE',
 'NACIONALIDAD_DENUNCIANTE',
 'FRECUENCIA_EPISODIOS',
 'RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE',
 'HIJOS/AS_EN_COMUN',
 'TIPO_DE_RESOLUCION']

In [16]:
df = df[col_names]
original_index = df.index
df

Unnamed: 0,file_id,MATERIA,GENERO_ACUSADO/A,NACIONALIDAD_ACUSADO/A,NIVEL_INSTRUCCION_ACUSADO/A,GENERO_DENUNCIANTE,NACIONALIDAD_DENUNCIANTE,FRECUENCIA_EPISODIOS,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE,HIJOS/AS_EN_COMUN,TIPO_DE_RESOLUCION
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,contravencional,masculino,argentino,secundario_incompleto,femenino,argentina,primera_vez,ex_pareja,1,interlocutoria
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,penal,masculino,argentino,primario_completo,femenino,argentina,eventual,ex_pareja,0,interlocutoria
3362,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,penal,masculino,argentino,universitario_completo,femenino,,,,,interlocutoria
179,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,contravencional,masculino,argentino,secundario_completo,femenino,peruana,primera_vez,ex_pareja,1,definitiva
3144,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,penal,masculino,paraguayo,primario_completo,femenino,paraguaya,habitual,ex_pareja,1,interlocutoria
...,...,...,...,...,...,...,...,...,...,...,...
2682,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,penal,masculino,argentino,secundario_incompleto,femenino,argentina,eventual,ex_pareja,1,interlocutoria
2676,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,penal,masculino,argentino,terciario_incompleto,femenino,argentina,habitual,ex_pareja,1,interlocutoria
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,penal,masculino,argentino,,femenino,venezolana,habitual,pareja,0,interlocutoria
96,0B9wNhp3GjjazejRSMURGNU5BcUU,contravencional,masculino,,,femenino,Peruana,habitual,ex_pareja,0,interlocutoria


In [17]:
# make all responses lowercase
for var in list(col_names):
    if var == "file_id":
        continue
    else:
        df[var] = df[var].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var] = df[var].str.lower()


In [18]:
# load imputer previosly used
with open("../models/metadata_imp.sav", 'rb') as f:
    meta_imputer = pickle.load(f)

In [19]:
df_imp = meta_imputer.transform(df)
# reconvert the imputed data into a pandas DataFrame
df_imp = pd.DataFrame(df_imp, index=original_index, columns=df.columns)
df_imp

Unnamed: 0,file_id,MATERIA,GENERO_ACUSADO/A,NACIONALIDAD_ACUSADO/A,NIVEL_INSTRUCCION_ACUSADO/A,GENERO_DENUNCIANTE,NACIONALIDAD_DENUNCIANTE,FRECUENCIA_EPISODIOS,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE,HIJOS/AS_EN_COMUN,TIPO_DE_RESOLUCION
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,contravencional,masculino,argentino,secundario_incompleto,femenino,argentina,primera_vez,ex_pareja,1,interlocutoria
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,penal,masculino,argentino,primario_completo,femenino,argentina,eventual,ex_pareja,0,interlocutoria
3362,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,penal,masculino,argentino,universitario_completo,femenino,argentina,habitual,ex_pareja,1,interlocutoria
179,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,contravencional,masculino,argentino,secundario_completo,femenino,peruana,primera_vez,ex_pareja,1,definitiva
3144,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,penal,masculino,paraguayo,primario_completo,femenino,paraguaya,habitual,ex_pareja,1,interlocutoria
...,...,...,...,...,...,...,...,...,...,...,...
2682,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,penal,masculino,argentino,secundario_incompleto,femenino,argentina,eventual,ex_pareja,1,interlocutoria
2676,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,penal,masculino,argentino,terciario_incompleto,femenino,argentina,habitual,ex_pareja,1,interlocutoria
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,penal,masculino,argentino,secundario_incompleto,femenino,venezolana,habitual,pareja,0,interlocutoria
96,0B9wNhp3GjjazejRSMURGNU5BcUU,contravencional,masculino,argentino,secundario_incompleto,femenino,peruana,habitual,ex_pareja,0,interlocutoria


In [20]:
cols = list(df_imp.columns)
cols.remove('file_id')
# generate dummies from categorical columns
df_features = pd.get_dummies(df_imp, columns=cols)
df_features

Unnamed: 0,file_id,MATERIA_contravencional,MATERIA_penal,GENERO_ACUSADO/A_masculino,NACIONALIDAD_ACUSADO/A_argentino,NACIONALIDAD_ACUSADO/A_boliviano,NACIONALIDAD_ACUSADO/A_español,NACIONALIDAD_ACUSADO/A_paraguayo,NACIONALIDAD_ACUSADO/A_peruano,NIVEL_INSTRUCCION_ACUSADO/A_primario_completo,...,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_ex_pareja,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_familiar,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_ninguna,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_pareja,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_profesor,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_vecino/a,HIJOS/AS_EN_COMUN_0,HIJOS/AS_EN_COMUN_1,TIPO_DE_RESOLUCION_definitiva,TIPO_DE_RESOLUCION_interlocutoria
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,0,1,1,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,1
3362,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
179,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
3144,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,0,1,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
2676,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,0,1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
96,0B9wNhp3GjjazejRSMURGNU5BcUU,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [21]:
file = open("../models/features_names.txt","r+") 
features_names = file.read().split(",")
features_names.remove("")

In [22]:
features_filter = [feature for feature in features_names if feature in list(df_features.columns)]
df_features = df_features[features_filter]
df_features

Unnamed: 0,file_id,MATERIA_contravencional,MATERIA_penal,GENERO_ACUSADO/A_masculino,NACIONALIDAD_ACUSADO/A_argentino,NACIONALIDAD_ACUSADO/A_boliviano,NACIONALIDAD_ACUSADO/A_español,NACIONALIDAD_ACUSADO/A_paraguayo,NACIONALIDAD_ACUSADO/A_peruano,NIVEL_INSTRUCCION_ACUSADO/A_primario_completo,...,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_ex_pareja,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_familiar,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_ninguna,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_pareja,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_profesor,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_vecino/a,HIJOS/AS_EN_COMUN_0,HIJOS/AS_EN_COMUN_1,TIPO_DE_RESOLUCION_definitiva,TIPO_DE_RESOLUCION_interlocutoria
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,0,1,1,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,1
3362,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
179,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
3144,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,0,1,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
2676,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,0,1,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,0,1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
96,0B9wNhp3GjjazejRSMURGNU5BcUU,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [23]:
# However, now there might be columns that were in the 
# training dataset but not in the test new
# We are going to add them with zero values for all rows

not_in_test = [f for f in features_names if f not in list(df_features.columns)]

for f in not_in_test:
    df_features = df_features.join(pd.DataFrame({f: [0]*df_features.shape[0]}))
    
#lastly, we need to give the columns the same order as they had in the training data
df_features = df_features[features_names]
df_features.drop(columns=['file_id'], inplace=True)
df_features.replace(np.nan, 0, inplace=True)
df_features

Unnamed: 0,MATERIA_contravencional,MATERIA_penal,GENERO_ACUSADO/A_masculino,NACIONALIDAD_ACUSADO/A_argentino,NACIONALIDAD_ACUSADO/A_armenio,NACIONALIDAD_ACUSADO/A_boliviano,NACIONALIDAD_ACUSADO/A_brasilero,NACIONALIDAD_ACUSADO/A_dominicano,NACIONALIDAD_ACUSADO/A_español,NACIONALIDAD_ACUSADO/A_italiano,...,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_pareja,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_profesor,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_propietario_del_inmueble_de_residencia,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_vecino/a,RELACION_Y_TIPO_ENTRE_ACUSADO/A_Y_DENUNCIANTE_zona_sur,HIJOS/AS_EN_COMUN_0,HIJOS/AS_EN_COMUN_1,HIJOS/AS_EN_COMUN_en_domicilio_particular,TIPO_DE_RESOLUCION_definitiva,TIPO_DE_RESOLUCION_interlocutoria
1871,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,0,1
598,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,1,0,0.0,0,1
3362,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,0,1
179,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,1,0
3144,0,1,1,0,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,0,1
2676,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,0,1,0.0,0,1
1189,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,0,0.0,0,0.0,1,0,0.0,0,1
96,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0,0.0,0,0.0,1,0,0.0,0,1


In [24]:
df_final = df_features.join(df_probas)
df_final

Unnamed: 0,MATERIA_contravencional,MATERIA_penal,GENERO_ACUSADO/A_masculino,NACIONALIDAD_ACUSADO/A_argentino,NACIONALIDAD_ACUSADO/A_armenio,NACIONALIDAD_ACUSADO/A_boliviano,NACIONALIDAD_ACUSADO/A_brasilero,NACIONALIDAD_ACUSADO/A_dominicano,NACIONALIDAD_ACUSADO/A_español,NACIONALIDAD_ACUSADO/A_italiano,...,TIPO_DE_RESOLUCION_interlocutoria,file_id,gender_violence_proba,V_FISICA_proba,V_PSIC_proba,V_ECON_proba,V_SEX_proba,V_SOC_proba,V_AMB_proba,V_SIMB_proba
1871,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,1,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,0.714492,0.150000,0.988806,0.075000,0.01,0.069999,0.040000,0.939651
598,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,0B9wNhp3GjjazbTkyblEyWDVCcU0,0.526438,0.280000,0.896549,0.455000,0.19,0.160987,0.040000,0.649925
3362,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,0.802236,0.950000,0.890221,0.071224,0.05,0.037253,0.060000,0.893951
179,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,0.920000,0.840000,0.694358,0.110000,0.02,0.095422,0.030000,0.959612
3144,0,1,1,0,0.0,0,0.0,0.0,0,0.0,...,1,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,0.969636,0.850000,0.988178,0.640000,0.73,0.055347,0.060000,0.999758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,0.989868,0.980000,0.995620,0.035000,0.02,0.006320,0.990000,0.994612
2676,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,0.570576,0.563889,0.790525,0.270000,0.06,0.190171,0.080000,0.854273
1189,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,1,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,0.880000,0.850000,0.988145,0.758067,0.08,0.711254,0.050000,0.988560
96,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,1,0B9wNhp3GjjazejRSMURGNU5BcUU,0.697669,0.580000,0.981541,0.147199,0.04,0.251227,0.189192,0.932527


## Generate predictions

In [25]:
X = df_final.drop(columns=["file_id"])
X

Unnamed: 0,MATERIA_contravencional,MATERIA_penal,GENERO_ACUSADO/A_masculino,NACIONALIDAD_ACUSADO/A_argentino,NACIONALIDAD_ACUSADO/A_armenio,NACIONALIDAD_ACUSADO/A_boliviano,NACIONALIDAD_ACUSADO/A_brasilero,NACIONALIDAD_ACUSADO/A_dominicano,NACIONALIDAD_ACUSADO/A_español,NACIONALIDAD_ACUSADO/A_italiano,...,TIPO_DE_RESOLUCION_definitiva,TIPO_DE_RESOLUCION_interlocutoria,gender_violence_proba,V_FISICA_proba,V_PSIC_proba,V_ECON_proba,V_SEX_proba,V_SOC_proba,V_AMB_proba,V_SIMB_proba
1871,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.714492,0.150000,0.988806,0.075000,0.01,0.069999,0.040000,0.939651
598,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.526438,0.280000,0.896549,0.455000,0.19,0.160987,0.040000,0.649925
3362,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.802236,0.950000,0.890221,0.071224,0.05,0.037253,0.060000,0.893951
179,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,1,0,0.920000,0.840000,0.694358,0.110000,0.02,0.095422,0.030000,0.959612
3144,0,1,1,0,0.0,0,0.0,0.0,0,0.0,...,0,1,0.969636,0.850000,0.988178,0.640000,0.73,0.055347,0.060000,0.999758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.989868,0.980000,0.995620,0.035000,0.02,0.006320,0.990000,0.994612
2676,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.570576,0.563889,0.790525,0.270000,0.06,0.190171,0.080000,0.854273
1189,0,1,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.880000,0.850000,0.988145,0.758067,0.08,0.711254,0.050000,0.988560
96,1,0,1,1,0.0,0,0.0,0.0,0,0.0,...,0,1,0.697669,0.580000,0.981541,0.147199,0.04,0.251227,0.189192,0.932527


In [26]:
# load best model
model_path = str("../models/complete_model.sav")
with open(model_path, 'rb') as f:
    complete_model = pickle.load(f)

In [1]:
complete_model

NameError: name 'complete_model' is not defined

In [33]:
target_vars = ["V_FISICA", "V_PSIC", "V_ECON", "V_SEX",
               "V_SOC", "V_AMB", "V_SIMB"]

# get predictions for each type of violence
predictions = {}
for var in target_vars:
    model = complete_model['gridcv'][var]['fitted_model']
    col_name = str(var + "_pred")
    predictions[col_name] = model.predict(X)

In [34]:
predictions = pd.DataFrame(predictions, index=X.index)
predictions

Unnamed: 0,V_FISICA_pred,V_PSIC_pred,V_ECON_pred,V_SEX_pred,V_SOC_pred,V_AMB_pred,V_SIMB_pred
1871,0,1,0,0,0,0,1
598,0,0,0,0,0,0,1
3362,1,1,0,0,0,0,1
179,1,1,0,0,0,0,1
3144,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...
2682,1,1,0,0,0,1,1
2676,1,1,0,0,0,0,1
1189,1,1,1,0,1,0,1
96,1,1,0,0,0,0,1


## Evaluate model

In [35]:
y = pd.read_csv("../data/processed/target_vars_test.csv", index_col=0)
y

Unnamed: 0,file_id,VIOLENCIA_DE_GENERO,V_FISICA,V_PSIC,V_ECON,V_SEX,V_SOC,V_AMB,V_SIMB
1718,1iW7U2yEvu-LS6TLo1nxP5bceWCV5ng8A,0,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde
1871,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,1,0,1,0,0,0,0,1
598,0B9wNhp3GjjazbTkyblEyWDVCcU0,1,1,1,0,0,0,0,1
3233,1AFvaihoOqtF1aNf9GNkxe1gTeRDXSp5s,0,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde
690,1fHzd-xwdemgFpHiq0glrLcwgs0jYvyOM,0,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde
...,...,...,...,...,...,...,...,...,...
2435,180ZWXDC5z5-AYEZYz0rCgmh1waJVJ-n6,0,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde,no_corresponde
1189,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,1,1,1,1,0,1,0,1
96,0B9wNhp3GjjazejRSMURGNU5BcUU,1,0,1,0,0,0,0,1
2795,1GICSuiv-b8_0eTaWNGOJX1EIhNIK2mIE,1,1,1,0,0,0,0,1


In [41]:
results = predictions.join(y)
results

Unnamed: 0,V_FISICA_pred,V_PSIC_pred,V_ECON_pred,V_SEX_pred,V_SOC_pred,V_AMB_pred,V_SIMB_pred,file_id,VIOLENCIA_DE_GENERO,V_FISICA,V_PSIC,V_ECON,V_SEX,V_SOC,V_AMB,V_SIMB
1871,0,1,0,0,0,0,1,1ui2-sTtmwtq5CTIcwGYs6__ffMEHqFyg,1,0,1,0,0,0,0,1
598,0,0,0,0,0,0,1,0B9wNhp3GjjazbTkyblEyWDVCcU0,1,1,1,0,0,0,0,1
3362,1,1,0,0,0,0,1,1Ix43U_-cc6EyYFNtekOQqVsaYV3dnRhu,1,1,1,0,0,0,0,1
179,1,1,0,0,0,0,1,0B9wNhp3GjjazUlJ2aGNfV3NWWDA,1,1,1,0,0,0,0,1
3144,1,1,1,1,0,0,1,1AygkwBWx3kTfgpqKUNS5Hz0xmipUy8vY,1,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,1,1,0,0,0,1,1,1Jm7ioiBCmgJDWx7O_5meoAatVhY_J4Ii,1,1,1,0,0,0,1,1
2676,1,1,0,0,0,0,1,1g3zKkiQ1n8KcXzOF8b7rOixFCkD4dlja,1,1,1,0,1,1,0,1
1189,1,1,1,0,1,0,1,1VBwFMBy4cuPCnO2FoIh3wY55JfjJJLfLS2am1vMW9d8,1,1,1,1,0,1,0,1
96,1,1,0,0,0,0,1,0B9wNhp3GjjazejRSMURGNU5BcUU,1,0,1,0,0,0,0,1


In [48]:
results[str(target_vars[0])].unique()

array(['0', '1'], dtype=object)

In [50]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

for var in target_vars:
    y_test = results[var].astype(int)
    y_hat = results[str(var + "_pred")]
    
    print("\n**** Results for %s variable ****\n" % var)
    
    print(" Accuracy: %s" % accuracy_score(y_test, y_hat))
    
    print(" F1: %s" % f1_score(y_test, y_hat))
    
    print(" Precision: %s" % precision_score(y_test, y_hat))
    
    print(" Recall: %s" % recall_score(y_test, y_hat))    


**** Results for V_FISICA variable ****

 Accuracy: 0.9102564102564102
 F1: 0.9292929292929293
 Precision: 0.9387755102040817
 Recall: 0.92

**** Results for V_PSIC variable ****

 Accuracy: 0.9743589743589743
 F1: 0.9861111111111112
 Precision: 0.9861111111111112
 Recall: 0.9861111111111112

**** Results for V_ECON variable ****

 Accuracy: 0.9743589743589743
 F1: 0.9411764705882353
 Precision: 1.0
 Recall: 0.8888888888888888

**** Results for V_SEX variable ****

 Accuracy: 0.9743589743589743
 F1: 0.8
 Precision: 1.0
 Recall: 0.6666666666666666

**** Results for V_SOC variable ****

 Accuracy: 0.9615384615384616
 F1: 0.9090909090909091
 Precision: 1.0
 Recall: 0.8333333333333334

**** Results for V_AMB variable ****

 Accuracy: 0.9871794871794872
 F1: 0.923076923076923
 Precision: 1.0
 Recall: 0.8571428571428571

**** Results for V_SIMB variable ****

 Accuracy: 0.9871794871794872
 F1: 0.993103448275862
 Precision: 0.9863013698630136
 Recall: 1.0
