# Import & Load Employability data

### Config

In [1]:
import sys
import configparser
# Read the shared project settings. The file is opened in a context manager so
# the handle is closed as soon as parsing finishes instead of being left for
# the garbage collector.
config = configparser.ConfigParser()
with open('../../../settings.ini') as settings_file:
    config.read_file(settings_file)

# Put the configured library folder first on the import path; presumably this is
# where the local postgres.py / commons.py helpers live — confirm in settings.ini.
sys.path.insert(0, config.get('PATHS','libs_path'))
# Connection string later handed to sqlalchemy.create_engine.
engine_path = config.get('DATABASE','engine_path')

### Main params

In [2]:
# Base URL of the remote folder the employability CSVs are downloaded from.
remote_path = "http://pacha.datawheel.us/datachile/education/empleabilidad/"
# Local folder passed to download_file together with the remote URL
# (presumably where the downloaded files are stored — confirm in commons.py).
local_path = "../data/"

### Imports

In [3]:
import postgres #from local file postgres.py
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py

import json
import pandas as pd
from sqlalchemy import create_engine

### Load file

In [4]:
# Download the employability / income CSV (a plain download_file call, no zip
# extraction) and normalise it in a single chain.
df = (
    download_file(remote_path, local_path, 'empleabilidad_e_ingresos_todos_los_planteles_db.csv')
    # Missing values in these two columns become -1 so the integer cast below
    # succeeds; the "Updates" cell turns the -1 back into NULL in the database.
    .fillna(value={'fourth_year_average_salary_id': -1, 'tuition_cost_2016': -1})
    .astype({
        'institution_id': 'int',
        'institution_group_id': 'int',
        'institution_subgroup_id': 'int',
        'accreditation_id': 'int',
        'career_id': 'int',
        'fourth_year_average_salary_id': 'int',
        'tuition_cost_2016': 'int',
    })
    # Rename the institution columns to the names used in the fact table.
    .rename(columns={
        'institution_id': 'higher_educational_institution_id',
        'institution_group_id': 'higher_educational_institution_type_id',
        'institution_subgroup_id': 'higher_educational_institution_subtype_id',
    })
)
# Show the resulting column names as the cell output.
df.columns.tolist()


Downloading... http://pacha.datawheel.us/datachile/education/empleabilidad/empleabilidad_e_ingresos_todos_los_planteles_db.csv


['higher_educational_institution_id',
 'career_id',
 'subsidized_percent',
 'first_year_retention',
 'first_year_employability',
 'tuition_cost_2016',
 'real_duration_in_terms',
 'higher_educational_institution_type_id',
 'higher_educational_institution_subtype_id',
 'accreditation_id',
 'fourth_year_average_salary_id']

### Ingest

In [5]:
# Open a SQLAlchemy engine on the connection string read from settings.ini.
engine = create_engine(engine_path)
# Wrap the engine in the project's Postgres helper (local postgres.py).
db = postgres.PostgresDriver(engine)
# Load the frame into education.fact_employability. Per the recorded output the
# driver drops and recreates the table, then bulk-loads the rows with COPY, so
# re-running this cell replaces the table contents.
db.to_sql(df, 'education', 'fact_employability')

DROP TABLE IF EXISTS education.fact_employability;
CREATE TABLE "education"."fact_employability" (
"higher_educational_institution_id" INTEGER,
  "career_id" INTEGER,
  "subsidized_percent" REAL,
  "first_year_retention" REAL,
  "first_year_employability" REAL,
  "tuition_cost_2016" INTEGER,
  "real_duration_in_terms" REAL,
  "higher_educational_institution_type_id" INTEGER,
  "higher_educational_institution_subtype_id" INTEGER,
  "accreditation_id" INTEGER,
  "fourth_year_average_salary_id" INTEGER
)
COPY "education"."fact_employability" ("higher_educational_institution_id","career_id","subsidized_percent","first_year_retention","first_year_employability","tuition_cost_2016","real_duration_in_terms","higher_educational_institution_type_id","higher_educational_institution_subtype_id","accreditation_id","fourth_year_average_salary_id") FROM STDIN WITH CSV HEADER DELIMITER ',';


### Updates

In [6]:
# The load cell replaced missing values with -1 so both columns could be cast to
# int; now that the rows are in the database, turn that sentinel back into NULL.
# NOTE(review): Engine.execute() with a raw string is legacy SQLAlchemy API and
# is not available in SQLAlchemy 2.0 — confirm the pinned version before upgrading.
engine.execute("""
UPDATE education.fact_employability SET tuition_cost_2016 = NULL where tuition_cost_2016 = -1;
""")

# Same sentinel clean-up for the salary-bracket id. Being the last expression,
# its result object is what the cell displays.
engine.execute("""
UPDATE education.fact_employability SET fourth_year_average_salary_id = NULL where fourth_year_average_salary_id = -1;
""")

<sqlalchemy.engine.result.ResultProxy at 0x1152b7f28>

### Dims

In [7]:
# Accreditation dimension: download the id/label CSV and translate its column
# names to English in one step.
d2 = download_file(remote_path + 'ids/', local_path, 'anios_acreditacion.csv').rename(
    columns={'acreditacion': 'accreditation', 'acreditacion_id': 'accreditation_id'}
)
# Print the dimension as an <InlineTable> XML snippet (alias, id column, label column).
print(inline_table_xml(d2, 'accreditation', 'accreditation_id', 'accreditation'))

Downloading... http://pacha.datawheel.us/datachile/education/empleabilidad/ids/anios_acreditacion.csv

<InlineTable alias="accreditation">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">0</Value>
      <Value column="description">No</Value>
      <Value column="es_description">No</Value>
    </Row>
  <Row>
      <Value column="id">1</Value>
      <Value column="description">2 años</Value>
      <Value column="es_description">2 años</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">3 años</Value>
      <Value column="es_description">3 años</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">4 años</Value>
      <Value column="es_description">4 años</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value col

In [8]:
# Fourth-year average salary dimension: download the id/label CSV as-is.
d3 = download_file(remote_path + 'ids/', local_path, 'ingreso_promedio_4to_anio.csv')
# Print the dimension as an <InlineTable> XML snippet.
# NOTE(review): the recorded output renders id 0 with the label "nan", so the
# source CSV appears to have no label for that id — confirm whether a real
# label should be supplied before the XML is used.
print(
    inline_table_xml(
        d3,
        'fourth_year_average_salary_id',
        'fourth_year_average_salary_id',
        'fourth_year_average_salary',
    )
)

Downloading... http://pacha.datawheel.us/datachile/education/empleabilidad/ids/ingreso_promedio_4to_anio.csv

<InlineTable alias="fourth_year_average_salary_id">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">0</Value>
      <Value column="description">nan</Value>
      <Value column="es_description">nan</Value>
    </Row>
  <Row>
      <Value column="id">1</Value>
      <Value column="description">De $300 mil a $400 mil</Value>
      <Value column="es_description">De $300 mil a $400 mil</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">De $400 mil a $500 mil</Value>
      <Value column="es_description">De $400 mil a $500 mil</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">De $500 mil a $600 mil</Value>
      <Value colu