In [1]:
import requests

import pandas as pd
import pandas_profiling

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
google_key = 'AIzaSyBja5iAzSsWaxQQnrIzABkJDH8e43oCyK0'
country = 'Colombia'

# Data Loading and Preparation - Capacity Dataset

In [3]:
ips_df = pd.read_csv( '../data/IPSs-colombia.csv', dtype = { 'nit IPS ' : 'str' } )

In [4]:
#del ips_df[ 'nit IPS ' ]
del ips_df[ 'num digito_verificion' ]

In [5]:
ips_df.rename( columns = { 'nit IPS ' : 'NIT', 
                          'nom sede IPS' : 'Nombre', 
                          'naturaleza' : 'Naturaleza', 
                          'num nivel atencion' : 'Nivel de Atención',
                         'nom grupo capacidad ' : 'Grupo',
                         'nom descripcion capacidad ' : 'Descripción',
                         'num cantidad capacidad instalada' : 'Capacidad Instalada' }, inplace = True )

In [6]:
ips_df = ips_df.loc[ ips_df[ 'Grupo' ].notnull() ]

In [7]:
ips_df.drop_duplicates( inplace = True )

In [8]:
ips_df.loc[ ips_df[ 'Grupo' ] == 'CAMAS', 'Grupo' ] = 'Camas'
ips_df.loc[ ips_df[ 'Grupo' ] == 'SALAS', 'Grupo' ] = 'Salas'

In [9]:
ips_df.dtypes

Nombre                  object
NIT                     object
Naturaleza              object
Nivel de Atención      float64
Grupo                   object
Descripción             object
Capacidad Instalada    float64
dtype: object

In [10]:
ips_df.shape

(10330, 7)

In [11]:
#pandas_profiling.ProfileReport( ips_df )

In [12]:
ips_df[ 'Descripción Capacidad' ] = ips_df[ 'Grupo' ] + ' ' + ips_df[ 'Descripción' ]

In [13]:
del ips_df[ 'Grupo' ]
del ips_df[ 'Descripción' ]

In [14]:
ips_df.head()

Unnamed: 0,Nombre,NIT,Naturaleza,Nivel de Atención,Capacidad Instalada,Descripción Capacidad
0,CLINICA VIVE,900450008,Privada,,44.0,Camas Psiquiatría
1,CAMPONUEVO CISNE 2,830100595,Privada,,15.0,Camas Psiquiatría
2,COVEN,900280825,Privada,,12.0,Camas Adultos
3,GASTROCAL SAS,900690781,Privada,,2.0,Salas Procedimientos
4,DACARE IPS,900883992,Privada,,2.0,Camas Adultos


In [15]:
ips_df[ 'Nombre' ].nunique()

3723

In [16]:
ips_df[ 'Naturaleza' ].value_counts( dropna = False )

Privada    5674
Pública    4592
Mixta        64
Name: Naturaleza, dtype: int64

In [17]:
ips_df[ 'Nivel de Atención' ].value_counts( dropna = False )

NaN     5739
 1.0    3386
 2.0     782
 3.0     423
Name: Nivel de Atención, dtype: int64

In [18]:
ips_df[ 'Capacidad Instalada' ].value_counts( dropna = False ).head()

1.0    3635
2.0    1544
3.0     769
4.0     761
6.0     470
Name: Capacidad Instalada, dtype: int64

In [19]:
ips_df[ 'Descripción Capacidad' ].value_counts( dropna = False )

Salas Procedimientos                                 1866
Camas Adultos                                        1544
Camas Obstetricia                                    1228
Camas Pediátrica                                     1226
Salas Partos                                         1176
Salas Quirófano                                      1138
Camas Cuidado Intensivo Adulto                        391
Camas Cuidado Intermedio Adulto                       379
Camas Cuidado Intermedio Neonatal                     243
Camas Cuidado Intensivo Neonatal                      237
Camas Cuidado básico neonatal                         201
Camas Psiquiatría                                     168
Camas Cuidado Intensivo Pediátrico                    133
Camas Cuidado Intermedio Pediátrico                   126
Camas Farmacodependencia                               98
Camas Institución Paciente Crónico                     58
Camas Cuidado Agudo Mental                             46
Camas Salud Me

# Splitting by Nature

In [20]:
ips_publicas_df = ips_df.loc[ ips_df[ 'Naturaleza' ] == 'Pública' ]

In [21]:
ips_publicas_df = ips_publicas_df.loc[ ips_publicas_df[ 'Nivel de Atención' ].notnull() ]

In [22]:
ips_publicas_df.shape

(4591, 6)

In [23]:
ips_no_publicas_df = ips_df.loc[ ips_df[ 'Naturaleza' ] != 'Pública' ]

In [24]:
ips_no_publicas_df.shape

(5738, 6)

# Unstacking

### Públicas

In [25]:
ips_publicas_df.sort_values( by = [ 'NIT', 'Nombre', 'Capacidad Instalada' ], inplace = True )
ips_publicas_df.drop_duplicates( subset = [ 'NIT', 'Nombre', 'Naturaleza', 'Nivel de Atención', 'Descripción Capacidad' ], keep = 'last', inplace = True )

In [26]:
ips_publicas_unstacked_df = ips_publicas_df.set_index( [ 'NIT', 'Nombre', 'Naturaleza', 'Nivel de Atención', 'Descripción Capacidad' ], inplace = False ).unstack( level = -1 )

In [27]:
ips_publicas_unstacked_df.columns = ips_publicas_unstacked_df.columns.droplevel()

In [28]:
ips_publicas_unstacked_df = ips_publicas_unstacked_df.reset_index()

In [29]:
ips_publicas_unstacked_df.shape

(1342, 26)

In [30]:
ips_publicas_unstacked_df.head()

Descripción Capacidad,NIT,Nombre,Naturaleza,Nivel de Atención,Camas Adultos,Camas Cuidado Agudo Mental,Camas Cuidado Intensivo Adulto,Camas Cuidado Intensivo Neonatal,Camas Cuidado Intensivo Pediátrico,Camas Cuidado Intermedio Adulto,...,Camas Obstetricia,Camas Pediátrica,Camas Psiquiatría,Camas Salud Mental,Camas Transplante de progenitores hematopoyeticos,Camas Unidad de Quemados Adulto,Camas Unidad de Quemados Pediátrico,Salas Partos,Salas Procedimientos,Salas Quirófano
0,800000118,EMPRESA SOCIAL DEL ESTADO HOSPITAL UNIVERSITAR...,Pública,3.0,212.0,,14.0,3.0,,4.0,...,20.0,30.0,7.0,,,,,,,6.0
1,800006850,CENTRO DE CONSULTA EXTERNA,Pública,1.0,,,,,,,...,,,,,,,,,1.0,
2,800006850,CENTRO DE SALUD DE GRANADA - (253120038003),Pública,1.0,,,,,,,...,,,,,,,,,1.0,
3,800006850,CENTRO DE SALUD DE SIBATE - (257400038002),Pública,1.0,,,,,,,...,,,,,,,,,1.0,
4,800006850,E.S.E. HOSPITAL MARIO GAITAN YANGUAS DE SOACHA...,Pública,1.0,38.0,,,,,,...,18.0,25.0,,,,,,2.0,1.0,3.0


### No Públicas

In [31]:
del ips_no_publicas_df[ 'Nivel de Atención' ]

In [32]:
ips_no_publicas_df.sort_values( by = [ 'NIT', 'Nombre', 'Capacidad Instalada' ], inplace = True )
ips_no_publicas_df.drop_duplicates( subset = [ 'NIT', 'Nombre', 'Naturaleza', 'Descripción Capacidad' ], keep = 'last', inplace = True )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
ips_no_publicas_unstacked_df = ips_no_publicas_df.set_index( [ 'NIT', 'Nombre', 'Naturaleza', 'Descripción Capacidad' ], inplace = False ).unstack( level = -1 )

In [34]:
ips_no_publicas_unstacked_df.columns = ips_no_publicas_unstacked_df.columns.droplevel()

In [35]:
ips_no_publicas_unstacked_df = ips_no_publicas_unstacked_df.reset_index()

In [36]:
ips_no_publicas_unstacked_df.shape

(2421, 25)

In [37]:
ips_no_publicas_unstacked_df.head()

Descripción Capacidad,NIT,Nombre,Naturaleza,Camas Adultos,Camas Cuidado Agudo Mental,Camas Cuidado Intensivo Adulto,Camas Cuidado Intensivo Neonatal,Camas Cuidado Intensivo Pediátrico,Camas Cuidado Intermedio Adulto,Camas Cuidado Intermedio Mental,...,Camas Obstetricia,Camas Pediátrica,Camas Psiquiatría,Camas Salud Mental,Camas Transplante de progenitores hematopoyeticos,Camas Unidad de Quemados Adulto,Camas Unidad de Quemados Pediátrico,Salas Partos,Salas Procedimientos,Salas Quirófano
0,800003765,VIRREY SOLIS I.P.S S.A SAN DIEGO,Privada,,,,,,,,...,,,,,,,,,1.0,
1,800003765,VIRREY SOLIS I.P.S S.A. AMERICAS,Privada,,,,,,,,...,,,,,,,,,1.0,
2,800003765,VIRREY SOLIS I.P.S S.A. OCCIDENTE,Privada,,,,,,,,...,,,,,,,,,1.0,
3,800003765,VIRREY SOLIS I.P.S S.A. OLAYA,Privada,,,,,,,,...,,,,,,,,,1.0,
4,800003765,VIRREY SOLIS IPS S. A MANZANARES,Privada,,,,,,,,...,,,,,,,,,1.0,


In [38]:
set( ips_publicas_unstacked_df.columns.tolist() ).symmetric_difference( set( ips_no_publicas_unstacked_df.columns.tolist() ) )

{'Nivel de Atención'}

# Training classifier

In [39]:
ips_publicas_unstacked_df = ips_publicas_unstacked_df.fillna( 0 )
ips_no_publicas_unstacked_df = ips_no_publicas_unstacked_df.fillna( 0 )

In [40]:
features = ips_publicas_unstacked_df.columns.tolist()
features.remove( 'NIT' )
features.remove( 'Nombre' )
features.remove( 'Naturaleza' )
features.remove( 'Nivel de Atención' )
len( features )

22

In [41]:
model = RandomForestClassifier( n_estimators = 20, random_state = 1, n_jobs = -1 )

In [42]:
model.fit( ips_publicas_unstacked_df[ features ], ips_publicas_unstacked_df[ 'Nivel de Atención' ] )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [43]:
preds = model.predict( ips_publicas_unstacked_df[ features ] )

In [44]:
confusion_matrix( ips_publicas_unstacked_df[ 'Nivel de Atención' ], preds )

array([[1077,    2,    0],
       [  50,  133,    0],
       [  22,    0,   58]], dtype=int64)

In [45]:
# Generating new predictions
ips_no_publicas_unstacked_df[ 'Nivel de Atención' ] = model.predict( ips_no_publicas_unstacked_df[ features ] )

In [46]:
ips_no_publicas_unstacked_df[ 'Nivel de Atención' ].value_counts( dropna = False )

1.0    1642
3.0     477
2.0     302
Name: Nivel de Atención, dtype: int64

# Concatenating dataframes

In [47]:
ips_unstacked_df = pd.concat( [ ips_publicas_unstacked_df, ips_no_publicas_unstacked_df ] )

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [48]:
ips_unstacked_df[ 'Nivel de Atención' ] = ips_unstacked_df[ 'Nivel de Atención' ].astype( int )

In [49]:
ips_unstacked_df.shape

(3763, 26)

In [50]:
ips_unstacked_df.tail()

Unnamed: 0,Camas Adultos,Camas Cuidado Agudo Mental,Camas Cuidado Intensivo Adulto,Camas Cuidado Intensivo Neonatal,Camas Cuidado Intensivo Pediátrico,Camas Cuidado Intermedio Adulto,Camas Cuidado Intermedio Mental,Camas Cuidado Intermedio Neonatal,Camas Cuidado Intermedio Pediátrico,Camas Cuidado básico neonatal,...,Camas Transplante de progenitores hematopoyeticos,Camas Unidad de Quemados Adulto,Camas Unidad de Quemados Pediátrico,NIT,Naturaleza,Nivel de Atención,Nombre,Salas Partos,Salas Procedimientos,Salas Quirófano
2416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,901090890,Privada,1,IPS CENTRO DE MEDICINA INTEGRATIVA SAS,0.0,1.0,0.0
2417,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,901090960,Privada,3,centro hospitalario de cordoba,0.0,0.0,2.0
2418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,901093053,Privada,1,IPS ENDOFERTIL S.A.S,0.0,1.0,0.0
2419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,901107556,Privada,1,NATURQUALITY PLUS TUNJA,0.0,2.0,0.0
2420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,901109443,Privada,1,CENTRO MEDICO MI DOC SAS,0.0,1.0,0.0


In [51]:
ips_unstacked_df.to_csv( '../data/IPSs-colombia-classified.csv', index = False, sep = ';', decimal = ',' )

In [52]:
ips_unstacked_df.to_csv( '../data/IPSs-colombia-classified-comma.csv', index = False, sep = ',' )

In [56]:
ips_unstacked_df[ 'Naturaleza' ].value_counts( dropna = False )

Privada    2404
Pública    1342
Mixta        17
Name: Naturaleza, dtype: int64