### Main params

In [13]:
remote_path = 'http://pacha.datawheel.us/economia/nene/occupational_status/'

local_path = '../data/'

engine_path = 'postgresql://localhost:5433/datachile'

### Imports

In [8]:
from urllib import request
import zipfile
import shutil
import os.path

import json
import pandas as pd
from sqlalchemy import create_engine

### Open file function

In [9]:
def loadFile(file_name):
    remote_file = remote_path + file_name
    local_file = local_path + file_name

    if not os.path.isfile(local_file):
        with request.urlopen(remote_file) as remote_csv,open(local_file, 'wb') as local_csv:
            shutil.copyfileobj(remote_csv, local_csv)
    
    return pd.read_csv(local_file,delimiter=",")

### Load file

In [10]:
df = loadFile('occupational_status.csv')
list(df)

['ano_encuesta',
 'mes_encuesta',
 'region_id',
 'age',
 'age_range_id',
 'sex_id',
 'icse_id',
 'isco_id',
 'isced_id',
 'occupied_id',
 'general_economic_condition',
 'fact']

### Complete NaN

In [11]:
df['isco_id'] = df['isco_id'].fillna('0')
df['icse_id'] = df['icse_id'].fillna('0')
df['isced_id'] = df['isced_id'].fillna('0')

### Rename, to int & load

In [14]:
df = df.rename(columns={'ano_encuesta':'year','mes_encuesta':'month'});
df = df.astype({'year':'int','month':'int','region_id':'int','age':'int','age_range_id':'int','icse_id':'int','sex_id':'int','isced_id':'int','isco_id':'int','occupied_id':'int','icse_id':'int','general_economic_condition':'int'})

engine = create_engine(engine_path)
df.to_sql('fact_occupational_status_nene', engine, schema='economy', if_exists='replace', index=False)

### Indexes & FK

In [15]:
engine.execute("""
ALTER TABLE economy.fact_occupational_status_nene
  ADD COLUMN date_id INTEGER; 
""")

engine.execute("""
UPDATE economy.fact_occupational_status_nene
SET date_id = dim_date.id
FROM public.dim_date
WHERE dim_date.the_year = economy.fact_occupational_status_nene.year
      AND dim_date.month_of_year = economy.fact_occupational_status_nene.month
      AND dim_date.day_of_month = 1
""")

engine.execute("""
CREATE INDEX fact_occupational_status_nene_region_id 
ON economy.fact_occupational_status_nene (region_id)
""")

<sqlalchemy.engine.result.ResultProxy at 0x7f761acf9908>

In [52]:
def inline_table_xml(df, alias, id_column_name, desc_column_name):
    xml = """
<InlineTable alias="%(alias)s">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
  %(rows)s
  </Rows>
</InlineTable>
    """
    
    keys = [id_column_name, desc_column_name, desc_column_name]
    cols = ['id', 'description', 'es_description']
    
    
    rows = [("  <Row>\n%s\n    </Row>" % ("\n".join(["      <Value column=\"%s\">%s</Value>" % (cols[i], r[k]) for i, k in enumerate(keys)]))) for r in df.to_dict('records')]
    
    return xml % { 'alias': alias, 'rows': "\n".join(rows)}
    

### Related dim

In [54]:
remote_path = 'http://pacha.datawheel.us/ids_oficiales/'
d1 = loadFile('cine_isced.csv')
print (inline_table_xml(d1, 'isced', 'isced_id', 'isced'))


<InlineTable alias="isced">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">Doctorado</Value>
      <Value column="es_description">Doctorado</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Educación Preescolar</Value>
      <Value column="es_description">Educación Preescolar</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">Educación Primaria (nivel 1)</Value>
      <Value column="es_description">Educación Primaria (nivel 1)</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">Educación Primaria (nivel 2)</Value>
      <Value column="es_description">Educación Primaria (nivel 2)</Value>
    </Row>
  <Row>
      <Value column="id">5</Val

In [55]:
d2 = loadFile('cise_icse.csv')
print (inline_table_xml(d2, 'icse', 'icse_id', 'icse'))


<InlineTable alias="icse">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">Asalariado Sector Privado</Value>
      <Value column="es_description">Asalariado Sector Privado</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Asalariado Sector Público</Value>
      <Value column="es_description">Asalariado Sector Público</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">Cuenta Propia</Value>
      <Value column="es_description">Cuenta Propia</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">Empleador</Value>
      <Value column="es_description">Empleador</Value>
    </Row>
  <Row>
      <Value column="id">5</Value>
      <Value column="de

In [13]:
d3 = loadFile('ciuo_isco.csv')
d3

Unnamed: 0,isco,isco_id
0,Agricultores y trabajadores calificados agrope...,1
1,Empleados de oficina,2
2,Funcionarios públicos y personal directivo de ...,3
3,"Oficiales, operarios y artesanos de artes mecá...",4
4,Operadores de instalaciones y máquinas y monta...,5
5,Otros no identificados,6
6,Profesionales científicos e intelectuales,7
7,Técnicos y profesionales de nivel medio,8
8,Trabajadores de los servicios y vendedores de ...,9
9,Trabajadores no calificados,10


In [15]:
d4 = loadFile('age_ranges.csv')
d4

Unnamed: 0,age_range_id,age_range
0,1,15 a 19
1,2,20 a 24
2,3,25 a 29
3,4,30 a 65
4,5,65 a maxima edad de encuestados


In [16]:
d5 = loadFile('sex.csv')
d5

Unnamed: 0,sex_id,sex
0,0,Hombre
1,1,Mujer


In [18]:
d6 = loadFile('occupational_situation.csv')
d6

Unnamed: 0,occupied_id,occupied
0,1,Ocupado
1,2,Desocupado
2,3,Inactivo


In [20]:
d7 = loadFile('general_economic_condition.csv')
d7

Unnamed: 0,general_economic_condition_id,general_economic_condition
0,0,Menor de 15 años
1,1,Ocupado tradicional
2,2,Ocupado notradicional
3,3,Ocupado ausente
4,4,Cesante
5,5,Busca trabajo por primera vez
6,6,Iniciador
7,7,Inactivos que buscaron trabajo
8,8,Inactivos que estuvieron disponibles para trab...
9,9,Inactivos que no buscaron trabajo ni estuviero...
