# Import & Load Immigration data

### Config

In [1]:
import sys
import configparser
# Parse the shared settings file. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
config = configparser.ConfigParser()
with open('../../settings.ini') as settings_file:
    config.read_file(settings_file)

# Put the configured libs_path first on sys.path so the local helper modules
# imported further down (postgres, commons) can be found.
sys.path.insert(0, config.get('PATHS','libs_path'))
# Database URL later passed to sqlalchemy.create_engine.
engine_path = config.get('DATABASE','engine_path')

### Main params

In [2]:
# Local directory where downloaded and extracted files are stored.
local_path = '../data/'
# Base URL of the cleaned visa datasets.
remote_path = 'http://pacha.datawheel.us/inmigracion/visas/2_datasets_limpios/'

In [3]:
import postgres #from local file postgres.py
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py

import json
import pandas as pd
from sqlalchemy import create_engine

### Download & unzip file

In [4]:
# Fetch the visas archive from the remote server, then unpack it into the
# local data directory. NOTE(review): the archive is extracted as 'temp.zip',
# so download_zip_file presumably saves it under that name -- confirm in commons.py.
download_zip_file(remote_path, local_path, 'visas.zip')
extract_zip_file(local_path, 'temp.zip')

Downloading... http://pacha.datawheel.us/inmigracion/visas/2_datasets_limpios/visas.zip
Unzipping... ../data/temp.zip


True

### Load file

In [5]:
# Read the extracted CSV in one chain: keep only the fact-table columns,
# replace missing comuna ids with the -1 placeholder, then cast every
# numeric column to int (birth_date is left as read).
df = (
    pd.read_csv(local_path + 'visas.csv', delimiter=",")
    [['year', 'comuna_datachile_id', 'sex_id', 'activity_id', 'birth_date',
      'age', 'studies_id', 'country_code', 'visa_type_id']]
    .assign(comuna_datachile_id=lambda frame: frame['comuna_datachile_id'].fillna(-1))
    .astype(dict.fromkeys(
        ['year', 'comuna_datachile_id', 'sex_id', 'activity_id',
         'studies_id', 'age', 'country_code', 'visa_type_id'],
        'int',
    ))
)
# Show the resulting column names as the cell output.
list(df)

['year',
 'comuna_datachile_id',
 'sex_id',
 'activity_id',
 'birth_date',
 'age',
 'studies_id',
 'country_code',
 'visa_type_id']

### Ingest

In [6]:
# Connect to the database named in the settings file and load the frame
# through the project's PostgresDriver wrapper.
engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)
# Per the SQL log printed under this cell, to_sql drops and recreates
# public.fact_immigration_records and bulk-loads the rows with COPY, so any
# column or index added by later cells is lost whenever this cell is re-run.
db.to_sql(df, 'public', 'fact_immigration_records')

CREATE SCHEMA IF NOT EXISTS public;
DROP TABLE IF EXISTS public.fact_immigration_records;
CREATE TABLE "public"."fact_immigration_records" (
"year" INTEGER,
  "comuna_datachile_id" INTEGER,
  "sex_id" INTEGER,
  "activity_id" INTEGER,
  "birth_date" TEXT,
  "age" INTEGER,
  "studies_id" INTEGER,
  "country_code" INTEGER,
  "visa_type_id" INTEGER
)
COPY "public"."fact_immigration_records" ("year","comuna_datachile_id","sex_id","activity_id","birth_date","age","studies_id","country_code","visa_type_id") FROM STDIN WITH CSV HEADER DELIMITER ',';


### Add date relation

In [7]:
# Add the date foreign-key column. IF NOT EXISTS (PostgreSQL 9.6+) makes the
# cell safe to re-run on its own; a plain ADD COLUMN raises once date_id exists.
engine.execute("""
ALTER TABLE public.fact_immigration_records
  ADD COLUMN IF NOT EXISTS date_id INTEGER;
""")

# Point every record at the dim_date row for January 1st of its year
# (the fact table only carries a year, not a full date).
engine.execute("""
UPDATE public.fact_immigration_records
SET date_id = dim_date.id
FROM dim_date
WHERE dim_date.the_year = public.fact_immigration_records.year
      AND dim_date.month_of_year = 1
      AND dim_date.day_of_month = 1
""")

<sqlalchemy.engine.result.ResultProxy at 0x103883198>

### Add Indices to foreign keys

In [8]:
# Turn the -1 placeholder written before the int cast back into a real NULL.
engine.execute("""
UPDATE public.fact_immigration_records SET comuna_datachile_id = NULL where comuna_datachile_id = -1;
""")

# Index the foreign-key columns. IF NOT EXISTS (PostgreSQL 9.5+) lets the cell
# be re-run without failing on "relation already exists".
engine.execute("""
CREATE INDEX IF NOT EXISTS fact_immigration_records_comuna_datachile_id_index
ON public.fact_immigration_records (comuna_datachile_id)
""")

engine.execute("""
CREATE INDEX IF NOT EXISTS fact_immigration_records_date_id_index
ON public.fact_immigration_records (date_id);
""")



<sqlalchemy.engine.result.ResultProxy at 0x117fa8f98>

### Relation tables

```xml
<InlineTable alias="sex">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">0</Value>
      <Value column="description">No informa</Value>
      <Value column="es_description">No informa</Value>
    </Row>
  <Row>
      <Value column="id">1</Value>
      <Value column="description">Women</Value>
      <Value column="es_description">Mujer</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Men</Value>
      <Value column="es_description">Hombre</Value>
    </Row>
  </Rows>
</InlineTable>
```

In [9]:
# Download the activity lookup CSV and print it as an <InlineTable> XML snippet.
d2 = download_file(remote_path, local_path, 'activity_id.csv')
print(inline_table_xml(d2, 'activity', 'activity_id', 'activity'))

Downloading... http://pacha.datawheel.us/inmigracion/visas/2_datasets_limpios/activity_id.csv

<InlineTable alias="activity">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">Empleado</Value>
      <Value column="es_description">Empleado</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Estudiante</Value>
      <Value column="es_description">Estudiante</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">Dueña de Casa</Value>
      <Value column="es_description">Dueña de Casa</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">Inactivo</Value>
      <Value column="es_description">Inactivo</Value>
    </Row>
  <Row>
      <Value column="id">5

In [10]:
# Download the studies lookup CSV and print it as an <InlineTable> XML snippet.
d3 = download_file(remote_path, local_path, 'studies_id.csv')
print(inline_table_xml(d3, 'studies', 'studies_id', 'studies'))

Downloading... http://pacha.datawheel.us/inmigracion/visas/2_datasets_limpios/studies_id.csv

<InlineTable alias="studies">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">No Indica</Value>
      <Value column="es_description">No Indica</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Educación Universitaria</Value>
      <Value column="es_description">Educación Universitaria</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">No Informa</Value>
      <Value column="es_description">No Informa</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">Educación Media</Value>
      <Value column="es_description">Educación Media</Value>
    </Row>


In [11]:
# Download the visa-type lookup CSV and print it as an <InlineTable> XML snippet.
d4 = download_file(remote_path, local_path, 'visa_type.csv')
print(inline_table_xml(d4, 'visa_type', 'visa_type_id', 'visa_type'))

Downloading... http://pacha.datawheel.us/inmigracion/visas/2_datasets_limpios/visa_type.csv

<InlineTable alias="visa_type">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">1</Value>
      <Value column="description">Estudiante</Value>
      <Value column="es_description">Estudiante</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Temporaria</Value>
      <Value column="es_description">Temporaria</Value>
    </Row>
  <Row>
      <Value column="id">3</Value>
      <Value column="description">Sujeta A Contrato</Value>
      <Value column="es_description">Sujeta A Contrato</Value>
    </Row>
  <Row>
      <Value column="id">4</Value>
      <Value column="description">Sin Beneficio</Value>
      <Value column="es_description">Sin Beneficio</Value>
    </Row>
  <Row>
     