## CARGA Y PROCESADO DE OTRAS TABLAS (casos particulares, distinto procesado del automatizado para la mayoría de las variables)

### Librerías necesarias

In [1]:
import numpy as np
import pandas as pd
import os

#### Constante para comprobación

In [41]:
# Countries covered by the study, spelled exactly as they appear in the GEO
# column of the Eurostat tables. Used below to check country coverage.
PAISES_ESTUDIO = {
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
    'Denmark', 'Estonia', 'Finland', 'France',
    'Germany (until 1990 former territory of the FRG)',
    'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Norway', 'Poland', 'Portugal', 'Romania',
    'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Turkey', 'United Kingdom',
}


### Procesado tablas con formato EHIS o similar válido, pero con 1 sola categoría de la variable independiente.

En estos casos no es necesario pivotar en base a la columna de la variable independiente: solo hay una categoría, cuyos valores están en una columna que basta con renombrar para darle significado. Nos ahorramos, por lo tanto, bastantes de los pasos del "procesado formato EHIS" usual (con varias categorías).

In [7]:
def carga_formato_unico_valor(dict_archivos):
    """Load, trim, check and save Eurostat tables that have a single category.

    For every entry of ``dict_archivos`` the CSV file is read, reduced to the
    columns GEO, SEX, AGE and the value column (renamed to a meaningful name),
    and a summary of its dimensions is printed. The user is then asked to
    confirm; confirmed tables are written to
    ``./data/procesado/base_<variable>.csv``, the rest are reported as failures.

    Parameters
    ----------
    dict_archivos : dict
        Maps the variable name (used to build the output file name) to a
        sequence whose first item is the path of the source CSV and whose
        second item is the name to give to the ``Value`` column.

    Returns
    -------
    None
        Results are communicated through printed output and the saved files.

    Raises
    ------
    KeyError
        If a source table lacks any of the columns that are dropped or selected.
    """
    # Countries expected in every table; only used to report which ones are
    # missing from the GEO column. Kept local so the function is self-contained.
    PAISES_ESTUDIO = {
        'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
        'Denmark', 'Estonia', 'Finland', 'France',
        'Germany (until 1990 former territory of the FRG)',
        'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy',
        'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
        'Norway', 'Poland', 'Portugal', 'Romania',
        'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Turkey', 'United Kingdom',
    }

    # Built from components so the path is valid on every OS (the previous
    # literal used backslashes, which only works on Windows and contains
    # invalid escape sequences).
    dir_salida = os.path.join('.', 'data', 'procesado')

    lista_fallos = []
    total_var_indep = len(dict_archivos)

    for n_var_indep, (var_indep, archivo) in enumerate(dict_archivos.items(), start=1):

        ruta, nombre_columna = archivo[0], archivo[1]

        # --- load ---------------------------------------------------------
        print('#'*90)
        print('CARGANDO ARCHIVO {} EN DF ({} de {}) ...'.format(ruta, n_var_indep, total_var_indep))
        print('#'*90, '\n'*2)

        # ':' and ': u' are read as missing values
        df_varind = pd.read_csv(ruta, na_values=[': u', ':'])

        # DataFrame.info() prints its own report and returns None, so it must
        # not be wrapped in print() (that printed a stray "None").
        df_varind.info()
        print('\n'*2)
        print(df_varind.head(), '\n'*2)

        # --- keep only the dimensions and the value column -----------------
        print('Eliminamos columnas innecesarias ...')
        df_varind = df_varind.drop(columns=['UNIT', 'TIME', 'ISCED11', 'Flag and Footnotes'])
        print('Columnas que quedan: ', df_varind.columns.values, '\n')

        print('Reordenamos columnas, renombramos columna "Value" ...')
        df_varind = df_varind[['GEO', 'SEX', 'AGE', 'Value']]
        df_varind.columns = ['GEO', 'SEX', 'AGE', nombre_columna]
        print('Columnas quedan: ', df_varind.columns.values, '\n')

        # --- report the dimensions so the user can validate them ------------
        print('Comprobando longitud dimensiones ...', '\n'*2)
        valores_GEO = df_varind.GEO.unique()
        valores_SEX = df_varind.SEX.unique()
        valores_AGE = df_varind.AGE.unique()
        print('GEO nº valores unicos: {}'.format(len(valores_GEO)), '\n')
        print('DIFERENCIA: ', PAISES_ESTUDIO - set(valores_GEO), '\n')
        print('SEX nº valores unicos: {}'.format(len(valores_SEX)), '\n', valores_SEX, '\n')
        print('AGE nº valores unicos: {}'.format(len(valores_AGE)), '\n', valores_AGE, '\n'*2)

        print('Finalmente DF queda:')
        df_varind.info()
        print('\n'*2)

        # --- manual confirmation: save or record as failure -----------------
        respuesta = input('Pausa comprobación, pulsa "S" si correcto:\t')

        if respuesta.strip().upper() == 'S':
            nombre_guardar = 'base_' + var_indep + '.csv'
            guardar_ruta = os.path.join(dir_salida, nombre_guardar)
            # make sure the destination exists before writing
            os.makedirs(dir_salida, exist_ok=True)
            df_varind.to_csv(guardar_ruta, index=False)
            # success is reported only once the file has actually been written
            print('{} procesado con exito ...'.format(var_indep))
            print('{} guardado como {}.'.format(var_indep, guardar_ruta), '\n'*2)
        else:
            lista_fallos.append(var_indep)
            print('{} no es correcto.'.format(var_indep))

    # final summary of the variables that were rejected
    print('Procesado no dio resultado en {} de {}'.format(len(lista_fallos), total_var_indep))
    print(lista_fallos)
        

### Llamada a la función

In [5]:
# variable name -> (path of the source Eurostat CSV, name for its "Value" column)
variables = {
    'Chronic_illness': (
        './data/origen/SILC/hlth_silc_05_1_Data.csv',
        "Chron_ill_pct",
    ),
    'Prescribed_med_use': (
        './data/origen/hlth_ehis_md1e_1_Data.csv',
        "presc_med_use_pct",
    ),
    'Non_prescrib_med_use': (
        './data/origen/hlth_ehis_md2e_1_Data.csv',
        "non_presc_med_use_pct",
    ),
}



In [8]:
carga_formato_unico_valor(variables)

##########################################################################################
CARGANDO ARCHIVO ./data/origen/SILC/hlth_silc_05_1_Data.csv EN DF (1 de 3) ...
########################################################################################## 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 8 columns):
TIME                  310 non-null int64
GEO                   310 non-null object
ISCED11               310 non-null object
AGE                   310 non-null object
SEX                   310 non-null object
UNIT                  310 non-null object
Value                 310 non-null float64
Flag and Footnotes    11 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 19.5+ KB
None 


   TIME      GEO                 ISCED11                  AGE      SEX  \
0  2014  Belgium  All ISCED 2011 levels   From 16 to 24 years    Males   
1  2014  Belgium  All ISCED 2011 levels   From 16 to 24 years  Females   
2  2

La tabla de la **variable 'Chronic illness' pertenece a otra iniciativa de Eurostat (encuesta SILC). La estructura concuerda, pero toma como primer grupo de edad el de 16 a 24 años, en vez del de 15 a 24 de las restantes**. Dada la poca diferencia y el muy probable escaso impacto sobre el total de la población, **se decide aceptar esta división y corregir el nombre del grupo de edad a efectos de compatibilidad con el formato del resto, señalando esta circunstancia en nuestro estudio.**

In [85]:
# Reload the processed 'Chronic_illness' table (presumably the file saved by
# carga_formato_unico_valor above) so that its first age group can be relabelled.
df_corregir_Chronic = pd.read_csv('./data/procesado/base_Chronic_illness.csv')
df_corregir_Chronic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
GEO              310 non-null object
SEX              310 non-null object
AGE              310 non-null object
Chron_ill_pct    310 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.8+ KB


In [86]:
df_corregir_Chronic.head()

Unnamed: 0,GEO,SEX,AGE,Chron_ill_pct
0,Belgium,Males,From 16 to 24 years,9.2
1,Belgium,Females,From 16 to 24 years,12.5
2,Belgium,Males,From 25 to 34 years,12.4
3,Belgium,Females,From 25 to 34 years,12.6
4,Belgium,Males,From 35 to 44 years,18.4


**Valores de la variable 'AGE', con el primero de ellos erróneo:**

In [87]:
df_corregir_Chronic.AGE.unique()

array(['From 16 to 24 years', 'From 25 to 34 years',
       'From 35 to 44 years', 'From 45 to 64 years', '65 years or over'],
      dtype=object)

**Corrección del primer valor:**

In [88]:
# Relabel the SILC age group so it matches the EHIS label used by the other
# tables. The replacement is restricted to the AGE column (a frame-wide
# replace would also touch any other column holding that text) and is done by
# assignment rather than in place.
df_corregir_Chronic['AGE'] = df_corregir_Chronic['AGE'].replace(
    to_replace='From 16 to 24 years', value='From 15 to 24 years'
)
df_corregir_Chronic.AGE.unique()


array(['From 15 to 24 years', 'From 25 to 34 years',
       'From 35 to 44 years', 'From 45 to 64 years', '65 years or over'],
      dtype=object)

Comprobamos:

In [89]:
df_corregir_Chronic.head(20)

Unnamed: 0,GEO,SEX,AGE,Chron_ill_pct
0,Belgium,Males,From 15 to 24 years,9.2
1,Belgium,Females,From 15 to 24 years,12.5
2,Belgium,Males,From 25 to 34 years,12.4
3,Belgium,Females,From 25 to 34 years,12.6
4,Belgium,Males,From 35 to 44 years,18.4
5,Belgium,Females,From 35 to 44 years,20.1
6,Belgium,Males,From 45 to 64 years,28.0
7,Belgium,Females,From 45 to 64 years,31.2
8,Belgium,Males,65 years or over,38.2
9,Belgium,Females,65 years or over,42.5


In [90]:
df_corregir_Chronic.tail(20)

Unnamed: 0,GEO,SEX,AGE,Chron_ill_pct
290,Norway,Males,From 15 to 24 years,20.1
291,Norway,Females,From 15 to 24 years,25.8
292,Norway,Males,From 25 to 34 years,19.5
293,Norway,Females,From 25 to 34 years,24.4
294,Norway,Males,From 35 to 44 years,23.4
295,Norway,Females,From 35 to 44 years,36.1
296,Norway,Males,From 45 to 64 years,30.6
297,Norway,Females,From 45 to 64 years,41.4
298,Norway,Males,65 years or over,43.3
299,Norway,Females,65 years or over,49.6


In [91]:
df_corregir_Chronic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
GEO              310 non-null object
SEX              310 non-null object
AGE              310 non-null object
Chron_ill_pct    310 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.8+ KB


Finalmente guardamos archivo:

In [93]:
# Saved under a new name ("_formateado"); the file loaded above is left untouched.
df_corregir_Chronic.to_csv('./data/procesado/base_Chronic_illness_formateado.csv',index=False)

### Procesado particular, tabla "Health Care Expenditure"

El gasto sanitario no distingue por sexo ni grupo de edad. Debemos asignar el valor correspondiente a todas las observaciones de un mismo país en el archivo final.

**Tomamos uno de los archivos ya procesados correctamente como referencia para obtener las columnas de las dimensiones con nuestro formato:**

In [57]:
# Use an already processed table only as a template for the dimension columns.
df_referencia_dim = pd.read_csv('./data/procesado/base_BMI.csv')
# .copy() makes this an independent frame, so adding the expenditure column to
# it later is unambiguous and does not raise SettingWithCopyWarning.
df_referencia_dim = df_referencia_dim[['GEO','SEX','AGE']].copy()

**Cargamos el archivo correspondiente al gasto sanitario:**

In [43]:
# Load the health-expenditure table; ':' and ': u' are read as missing values.
df_expenditure = pd.read_csv('./data/origen/hlth_sha/hlth_sha11_hp_1_Data.csv', na_values = [': u',':'])
df_expenditure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
TIME                  31 non-null int64
GEO                   31 non-null object
UNIT                  31 non-null object
ICHA11_HP             31 non-null object
Value                 31 non-null float64
Flag and Footnotes    1 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 1.6+ KB


**OJO: nos han 'cambiado' Turquía por Liechtenstein**

In [44]:
PAISES_ESTUDIO - set(df_expenditure.GEO.unique())

{'Turkey'}

In [45]:
set(df_expenditure.GEO.unique()) - PAISES_ESTUDIO

{'Liechtenstein'}

In [46]:
df_expenditure.tail()

Unnamed: 0,TIME,GEO,UNIT,ICHA11_HP,Value,Flag and Footnotes
26,2014,Sweden,Percentage of gross domestic product (GDP),All providers of health care,10.95,
27,2014,United Kingdom,Percentage of gross domestic product (GDP),All providers of health care,9.96,
28,2014,Iceland,Percentage of gross domestic product (GDP),All providers of health care,8.28,
29,2014,Liechtenstein,Percentage of gross domestic product (GDP),All providers of health care,5.64,
30,2014,Norway,Percentage of gross domestic product (GDP),All providers of health care,9.34,


**Eliminamos el registro sobrante, Liechtenstein (no pertenece a los países del estudio):**

In [47]:
# Remove Liechtenstein by name rather than by row label: a hard-coded label
# (29) depends on the current indexing, and re-running the cell after the
# frame has been re-indexed would silently delete a different country.
# Filtering by name is idempotent and keeps the remaining row labels.
df_expenditure = df_expenditure[df_expenditure['GEO'] != 'Liechtenstein']
df_expenditure.tail()

Unnamed: 0,TIME,GEO,UNIT,ICHA11_HP,Value,Flag and Footnotes
25,2014,Finland,Percentage of gross domestic product (GDP),All providers of health care,9.49,
26,2014,Sweden,Percentage of gross domestic product (GDP),All providers of health care,10.95,
27,2014,United Kingdom,Percentage of gross domestic product (GDP),All providers of health care,9.96,
28,2014,Iceland,Percentage of gross domestic product (GDP),All providers of health care,8.28,
30,2014,Norway,Percentage of gross domestic product (GDP),All providers of health care,9.34,


**Para obtener el dato de Turquía buscamos una fuente solvente en internet. Según datos de la OMS (WHO, World Health Organization Global Health Expenditure database, apps.who.int/nha/database) citados en:**

https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS?locations=TR


Current health expenditure (% of GDP) - Turkey (2014): 4.347 

Añadimos el registro que faltaba (Turquía) con este valor, redondeado a dos decimales (4.35):


In [53]:
# Add the missing Turkey record (WHO figure 4.347, rounded to two decimals).
# - pd.concat is used because DataFrame.append was removed in pandas 2.0.
# - The flag is a real missing value (np.nan), not the string 'NaN'.
# - The guard makes the cell idempotent: re-running it does not duplicate Turkey.
if 'Turkey' not in set(df_expenditure['GEO']):
    fila_turquia = pd.DataFrame(
        [{'TIME': 2014,
          'GEO': 'Turkey',
          'UNIT': 'Percentage of gross domestic product (GDP)',
          'ICHA11_HP': 'All providers of health care',
          'Value': 4.35,
          'Flag and Footnotes': np.nan}],
        columns=df_expenditure.columns,
    # match the dtypes of the existing table so concatenation does not alter them
    ).astype(df_expenditure.dtypes.to_dict())
    df_expenditure = pd.concat([df_expenditure, fila_turquia], ignore_index=True)

df_expenditure.tail()

Unnamed: 0,TIME,GEO,UNIT,ICHA11_HP,Value,Flag and Footnotes
26,2014,Sweden,Percentage of gross domestic product (GDP),All providers of health care,10.95,
27,2014,United Kingdom,Percentage of gross domestic product (GDP),All providers of health care,9.96,
28,2014,Iceland,Percentage of gross domestic product (GDP),All providers of health care,8.28,
29,2014,Norway,Percentage of gross domestic product (GDP),All providers of health care,9.34,
30,2014,Turkey,Percentage of gross domestic product (GDP),All providers of health care,4.35,


**Comprobamos de nuevo los países:**

In [54]:
len(df_expenditure.GEO.unique())

31

In [55]:
PAISES_ESTUDIO - set(df_expenditure.GEO.unique())

set()

In [56]:
df_expenditure.GEO.unique()

array(['Belgium', 'Bulgaria', 'Czechia', 'Denmark',
       'Germany (until 1990 former territory of the FRG)', 'Estonia',
       'Ireland', 'Greece', 'Spain', 'France', 'Croatia', 'Italy',
       'Cyprus', 'Latvia', 'Lithuania', 'Luxembourg', 'Hungary', 'Malta',
       'Netherlands', 'Austria', 'Poland', 'Portugal', 'Romania',
       'Slovenia', 'Slovakia', 'Finland', 'Sweden', 'United Kingdom',
       'Iceland', 'Norway', 'Turkey'], dtype=object)

Países en el mismo orden que nuestro formato. **Para cada país necesitamos 2 (SEX) x 5 (AGE) = 10 valores repetidos del gasto sanitario. Creamos una variable auxiliar (lista) para la columna con los valores correctos, que añadiremos al dataset final:**

In [2]:
# Build the expenditure column by looking up each reference row's country,
# instead of repeating every value 10 times in the order of df_expenditure.
# This does not depend on both tables listing the countries in the same order,
# nor on every country having exactly 10 rows.
gasto_por_pais = dict(zip(df_expenditure['GEO'], df_expenditure['Value']))

# Fail loudly if a country of the reference table has no expenditure figure.
paises_sin_dato = set(df_referencia_dim['GEO']) - set(gasto_por_pais)
assert not paises_sin_dato, 'Paises sin dato de gasto sanitario: {}'.format(paises_sin_dato)

# One value per row of df_referencia_dim, in that frame's row order.
columna_health_expend = df_referencia_dim['GEO'].map(gasto_por_pais).tolist()
    


**Solo nos falta añadir la nueva columna, que sigue nuestro orden de países, al dataset con las columnas de las dimensiones.** Añadimos y comprobamos el resultado:

In [65]:
df_referencia_dim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 3 columns):
GEO    310 non-null object
SEX    310 non-null object
AGE    310 non-null object
dtypes: object(3)
memory usage: 7.4+ KB


In [66]:
# Attach the expenditure values as a new column. The list is assigned by
# position, so it must hold one value per row of df_referencia_dim, in row order.
df_referencia_dim['Health_Expend_%_GDP'] = columna_health_expend

df_referencia_dim.head(15)


Unnamed: 0,GEO,SEX,AGE,Health_Expend_%_GDP
0,Belgium,Males,From 15 to 24 years,10.44
1,Belgium,Males,From 25 to 34 years,10.44
2,Belgium,Males,From 35 to 44 years,10.44
3,Belgium,Males,From 45 to 64 years,10.44
4,Belgium,Males,65 years or over,10.44
5,Belgium,Females,From 15 to 24 years,10.44
6,Belgium,Females,From 25 to 34 years,10.44
7,Belgium,Females,From 35 to 44 years,10.44
8,Belgium,Females,From 45 to 64 years,10.44
9,Belgium,Females,65 years or over,10.44


In [67]:
df_referencia_dim.tail(15)

Unnamed: 0,GEO,SEX,AGE,Health_Expend_%_GDP
295,Norway,Females,From 15 to 24 years,9.34
296,Norway,Females,From 25 to 34 years,9.34
297,Norway,Females,From 35 to 44 years,9.34
298,Norway,Females,From 45 to 64 years,9.34
299,Norway,Females,65 years or over,9.34
300,Turkey,Males,From 15 to 24 years,4.35
301,Turkey,Males,From 25 to 34 years,4.35
302,Turkey,Males,From 35 to 44 years,4.35
303,Turkey,Males,From 45 to 64 years,4.35
304,Turkey,Males,65 years or over,4.35


In [68]:
df_referencia_dim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
GEO                    310 non-null object
SEX                    310 non-null object
AGE                    310 non-null object
Health_Expend_%_GDP    310 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.8+ KB


**Guardamos finalmente el resultado:**

In [69]:
# Write the dimensions plus the expenditure column, without the row index.
df_referencia_dim.to_csv('./data/procesado/base_Hlth_Expend.csv', index=False)

### Procesado particular, tabla variable dependiente SPH.

Cargamos la tabla:

In [70]:
# Load the table of the dependent variable (SPH); ':' and ': u' are read as missing values.
df_SPH = pd.read_csv('./data/origen/SPH_target/hlth_silc_02_1_Data.csv', na_values = [': u',':'])
df_SPH.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 9 columns):
TIME                  310 non-null int64
GEO                   310 non-null object
ISCED11               310 non-null object
AGE                   310 non-null object
SEX                   310 non-null object
LEVELS                310 non-null object
UNIT                  310 non-null object
Value                 310 non-null float64
Flag and Footnotes    0 non-null float64
dtypes: float64(2), int64(1), object(6)
memory usage: 21.9+ KB


In [71]:
df_SPH.LEVELS.unique()

array(['Very good or good'], dtype=object)

**Nos quedamos solo con las columnas de interés:**

In [72]:
# Keep only the dimensions and the value. .copy() makes the result an
# independent frame: it is renamed and modified in later cells, which on a
# plain column selection can raise SettingWithCopyWarning.
df_SPH = df_SPH[['GEO','SEX','AGE','Value']].copy()

df_SPH.columns

Index(['GEO', 'SEX', 'AGE', 'Value'], dtype='object')

In [73]:
df_SPH.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
GEO      310 non-null object
SEX      310 non-null object
AGE      310 non-null object
Value    310 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.8+ KB


In [74]:
df_SPH.Value.describe()

count    310.000000
mean      71.444516
std       24.017482
min        3.000000
25%       59.500000
50%       80.100000
75%       89.675000
max       98.800000
Name: Value, dtype: float64

**Renombramos la columna 'Value' a su significado (SPH Good or Very Good pctg.):**

In [78]:
# Names are assigned by position, so this relies on the column order
# GEO, SEX, AGE, Value selected above.
df_SPH.columns = ['GEO', 'SEX', 'AGE', 'SPH_G_or_VG_pct']
df_SPH.columns

Index(['GEO', 'SEX', 'AGE', 'SPH_G_or_VG_pct'], dtype='object')

In [79]:
df_SPH.head(15)

Unnamed: 0,GEO,SEX,AGE,SPH_G_or_VG_pct
0,Belgium,Males,From 16 to 24 years,94.8
1,Belgium,Females,From 16 to 24 years,91.7
2,Belgium,Males,From 25 to 34 years,89.4
3,Belgium,Females,From 25 to 34 years,88.8
4,Belgium,Males,From 35 to 44 years,83.8
5,Belgium,Females,From 35 to 44 years,79.8
6,Belgium,Males,From 45 to 64 years,74.5
7,Belgium,Females,From 45 to 64 years,70.1
8,Belgium,Males,65 years or over,53.3
9,Belgium,Females,65 years or over,50.4


**Damos formato al primer grupo de edad (al provenir de la encuesta SILC) por la misma razón y argumentación que con la variable 'Chronic illness' anterior en este notebook:**

In [96]:
df_SPH.AGE.unique()

array(['From 16 to 24 years', 'From 25 to 34 years',
       'From 35 to 44 years', 'From 45 to 64 years', '65 years or over'],
      dtype=object)

In [97]:
# Relabel the SILC age group to the EHIS label. The replacement is restricted
# to the AGE column (a frame-wide replace would search every column) and is
# done by assignment rather than in place.
df_SPH['AGE'] = df_SPH['AGE'].replace(
    to_replace='From 16 to 24 years', value='From 15 to 24 years'
)
df_SPH.AGE.unique()


array(['From 15 to 24 years', 'From 25 to 34 years',
       'From 35 to 44 years', 'From 45 to 64 years', '65 years or over'],
      dtype=object)

**Finalmente guardamos el resultado:**

In [98]:
# Write the processed target variable, without the row index.
df_SPH.to_csv('./data/procesado/base_SPH_TARGET.csv', index=False)

In [99]:
df_SPH.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
GEO                310 non-null object
SEX                310 non-null object
AGE                310 non-null object
SPH_G_or_VG_pct    310 non-null float64
dtypes: float64(1), object(3)
memory usage: 9.8+ KB


In [100]:
PAISES_ESTUDIO - set(df_SPH.GEO)

set()

In [101]:
len(df_SPH.GEO.unique())

31