# Import & Load Employability data

### Config

In [1]:
import sys
import configparser
# Load the project settings from the repository-level settings.ini.
# read_file() on an explicitly opened file (rather than config.read()) keeps the
# original behaviour of raising if the file is missing; the context manager makes
# sure the handle is closed as soon as the parser has consumed it.
config = configparser.ConfigParser()
with open('../../../settings.ini') as settings_file:
    config.read_file(settings_file)

# Put the shared library directory first on the import path so the local helper
# modules (postgres.py, commons.py) imported further down can be resolved.
sys.path.insert(0, config.get('PATHS','libs_path'))
# Value passed to sqlalchemy.create_engine() in the ingest step.
engine_path = config.get('DATABASE','engine_path')

### Main params

In [2]:
# Directory the downloaded files are read from / written to
local_path = '../data/'
# Base URL hosting the employability source files
remote_path = 'http://pacha.datawheel.us/educacion/empleabilidad/'

### Imports

In [3]:
import json

import pandas as pd
from sqlalchemy import create_engine, text

import postgres #from local file postgres.py
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py

### Load file

In [4]:
# Fetch the employability fact file into a DataFrame
df = download_file(remote_path, local_path, 'empleabilidad_e_ingresos_todos_los_planteles_db.csv')

# These two columns can be empty; replace the gaps with a -1 sentinel so they can
# be cast to plain integers below (the sentinel is turned back into NULL after
# the table has been loaded).
for sentinel_column in ('ingreso_promedio_4to_anio_id', 'arancel_anual_2016'):
    df[sentinel_column] = df[sentinel_column].fillna(value=-1)

# Identifier and amount columns are stored as integers
integer_columns = [
    'institucion_id',
    'tipo_institucion_id',
    'acreditacion_id',
    'carrera_id',
    'ingreso_promedio_4to_anio_id',
    'arancel_anual_2016',
]
df = df.astype({column_name: 'int' for column_name in integer_columns})

# Spanish source headers -> English column names used in the database
english_names = {
    'institucion_id': 'higher_educational_institution_id',
    'tipo_institucion_id': 'higher_educational_institution_type_id',
    'acreditacion_id': 'accreditation_id',
    'carrera_id': 'career_id',
    'ingreso_promedio_4to_anio_id': 'avg_income_4th_id',
    'arancel_anual_2016': 'annual_payment_2016',
    'pcent_subvencionados': 'pcent_subsidized',
    'retencion_1er_anio': '1st_year_retention',
    'duracion_real_semestres': 'duration_in_semester',
    'empleabilidad_1er_anio': '1st_year_employability',
}
df = df.rename(columns=english_names)

# Show the resulting column names
list(df.columns)


Already downloaded. Using: ../data/empleabilidad_e_ingresos_todos_los_planteles_db.csv
Encoding: ascii


['higher_educational_institution_id',
 'higher_educational_institution_type_id',
 'accreditation_id',
 'career_id',
 'avg_income_4th_id',
 'pcent_subsidized',
 '1st_year_retention',
 'duration_in_semester',
 '1st_year_employability',
 'annual_payment_2016']

### Ingest

In [5]:
# Open the database engine from the configured connection string and wrap it in
# the project's Postgres helper.
engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)

# Load the fact frame into education.fact_employability
# (the recorded output shows the helper dropping, recreating and COPY-loading the table).
db.to_sql(
    df,
    'education',
    'fact_employability',
)

DROP TABLE IF EXISTS education.fact_employability;
CREATE TABLE "education"."fact_employability" (
"higher_educational_institution_id" INTEGER,
  "higher_educational_institution_type_id" INTEGER,
  "accreditation_id" INTEGER,
  "career_id" INTEGER,
  "avg_income_4th_id" INTEGER,
  "pcent_subsidized" REAL,
  "1st_year_retention" REAL,
  "duration_in_semester" REAL,
  "1st_year_employability" REAL,
  "annual_payment_2016" INTEGER
)
COPY "education"."fact_employability" ("higher_educational_institution_id","higher_educational_institution_type_id","accreditation_id","career_id","avg_income_4th_id","pcent_subsidized","1st_year_retention","duration_in_semester","1st_year_employability","annual_payment_2016") FROM STDIN WITH CSV HEADER DELIMITER ',';


### Updates

In [6]:
# The load step stored missing values as -1 so the columns could be integers;
# turn those sentinels back into real NULLs now that the table exists.
#
# Engine.execute() ("connectionless" execution) is deprecated in SQLAlchemy 1.4
# and removed in 2.0, and raw SQL strings must be wrapped in text() there.
# engine.begin() works on old and new versions alike and runs both statements in
# one transaction: committed together on success, rolled back together on error.
with engine.begin() as connection:
    connection.execute(text("""
UPDATE education.fact_employability SET annual_payment_2016 = NULL where annual_payment_2016 = -1;
"""))

    connection.execute(text("""
UPDATE education.fact_employability SET avg_income_4th_id = NULL where avg_income_4th_id = -1;
"""))

<sqlalchemy.engine.result.ResultProxy at 0x11024b0f0>

### Dims

In [7]:
# Accreditation dimension: fetch the id/label lookup and switch to English column names
d2 = download_file(remote_path + 'ids/', local_path, 'anios_acreditacion.csv').rename(
    columns={
        'acreditacion_id': 'accreditation_id',
        'acreditacion': 'accreditation',
    }
)
# Print the InlineTable markup generated for this dimension
print(inline_table_xml(d2, 'accreditation', 'accreditation_id', 'accreditation'))

Already downloaded. Using: ../data/anios_acreditacion.csv
Encoding: utf-8

<InlineTable alias="accreditation">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">2 años</Value>
      <Value column="es_description">2 años</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">3 años</Value>
      <Value column="es_description">3 años</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">4 años</Value>
      <Value column="es_description">4 años</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">5 años</Value>
      <Value column="es_description">5 años</Value>
    </Row>
  <Row>
      <Value column="id">5</Value>
      <Value column="description">6 

In [8]:
# Institution-type dimension: fetch the id/label lookup and switch to English column names
# NOTE(review): the last recorded run detected this file as ISO-8859-2; if it is
# actually UTF-8 the accented descriptions get mis-decoded — confirm the encoding
# detection in download_file.
d4 = download_file(remote_path + 'ids/', local_path, 'tipo_institucion.csv').rename(
    columns={
        'tipo_institucion_id': 'higher_educational_institution_type_id',
        'tipo_institucion': 'higher_educational_institution_type',
    }
)
# Print the InlineTable markup generated for this dimension
print(
    inline_table_xml(
        d4,
        'higher_educational_institution_type',
        'higher_educational_institution_type_id',
        'higher_educational_institution_type',
    )
)


Already downloaded. Using: ../data/tipo_institucion.csv
Encoding: ISO-8859-2

<InlineTable alias="higher_educational_institution_type">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">Centro de Formación Técnica</Value>
      <Value column="es_description">Centro de Formación Técnica</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Instituto Profesional</Value>
      <Value column="es_description">Instituto Profesional</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">Universidad</Value>
      <Value column="es_description">Universidad</Value>
    </Row>
  </Rows>
</InlineTable>
    


In [9]:
# Average-income (4th year) dimension: fetch the id/label lookup and switch to English column names
d3 = download_file(remote_path + 'ids/', local_path, 'ingreso_promedio_4to_anio.csv').rename(
    columns={
        'ingreso_promedio_4to_anio_id': 'avg_income_4th_id',
        'ingreso_promedio_4to_anio': 'avg_income_4th',
    }
)
# Print the InlineTable markup generated for this dimension
# NOTE(review): the alias passed here carries an `_id` suffix, unlike the aliases
# used for the accreditation and institution-type tables — confirm this is intended.
print(inline_table_xml(d3, 'avg_income_4th_id', 'avg_income_4th_id', 'avg_income_4th'))

Already downloaded. Using: ../data/ingreso_promedio_4to_anio.csv
Encoding: utf-8

<InlineTable alias="avg_income_4th_id">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">De $1 millón 100 mil a $1 millón 200 mil</Value>
      <Value column="es_description">De $1 millón 100 mil a $1 millón 200 mil</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">De $1 millón 200 mil a $1 millón 300 mil</Value>
      <Value column="es_description">De $1 millón 200 mil a $1 millón 300 mil</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">De $1 millón 300 mil a $1 millón 400 mil</Value>
      <Value column="es_description">De $1 millón 300 mil a $1 millón 400 mil</Value>
    </Row>
  <Row>
      <Valu