# Import & Load Immigration data

### Main params

In [1]:
# Database the cleaned records are loaded into.
engine_path = "postgresql://localhost:5432/datachile"

# Remote archive holding the cleaned visa records.
remote_url = "http://pacha.datawheel.us/inmigracion/visas/3_archivo_limpio/visas.zip"

# Working directory, plus the two files that live inside it:
# the downloaded archive and the CSV read from it afterwards.
local_path = "../data"
local_file = local_path + "/temp.zip"
data_file = local_path + "/visas.csv"

### Download & unzip file

In [2]:
from urllib import request
import zipfile
import shutil

# Stream the remote archive to disk. The local file has to be closed (and
# therefore fully flushed) before it is reopened for reading: the zip
# central directory sits at the end of the file, so any bytes still held in
# the write buffer would make the archive look truncated.
with request.urlopen(remote_url) as remote_zip, open(local_file, 'wb') as local_zip:
    shutil.copyfileobj(remote_zip, local_zip)

# Extract every member into the data directory; the context manager closes
# the archive even if extraction fails.
with zipfile.ZipFile(local_file, 'r') as zip_ref:
    zip_ref.extractall(local_path)


### Load file & connect to database

In [3]:
import json
import pandas as pd
from sqlalchemy import create_engine

# Spanish source column names -> English names used from here on.
column_translation = {
    'anio': 'year',
    'sexo_id': 'sex_id',
    'nacimiento': 'birth_date',
    'edad': 'age',
    'actividad_id': 'activity_id',
    'estudios_id': 'study_id',
}

# Database handle used later to write the table and run SQL.
engine = create_engine(engine_path)

# Read the extracted CSV and apply the column renames in one pass.
df = pd.read_csv(data_file, sep=",").rename(columns=column_translation)



### Relation tables

In [4]:
# Keep the first row seen for each distinct sex_id, then display the
# id/label pairs ordered by id.
sexo_table = df.drop_duplicates(subset='sex_id')
sexo_table.loc[:, ['sex_id', 'sexo']].sort_values(by='sex_id')

Unnamed: 0,sex_id,sexo
2,0.0,Masculino
0,1.0,Femenino


In [5]:
# Keep the first row seen for each distinct activity_id, then display the
# id/label pairs ordered by id.
actividad_table = df.drop_duplicates(subset='activity_id')
actividad_table.loc[:, ['activity_id', 'actividad']].sort_values(by='activity_id')

Unnamed: 0,activity_id,actividad
0,1,Empleado
3,2,Estudiante
8,3,Dueña de Casa
9,4,Inactivo
10,5,Empleado Doméstico
87,6,No Informa
92,7,Otras Actividades
150,8,Religioso
182,9,Trabajador por Cuenta Propia
345,10,Sin Actividad


In [6]:
# Fold study code 3 into code 1, and relabel 'No Indica' as 'No Informa'.
is_code_three = df['study_id'].eq(3)
df.loc[is_code_three, 'study_id'] = 1

is_no_indica = df['estudios'].eq('No Indica')
df.loc[is_no_indica, 'estudios'] = 'No Informa'

# Keep the first row seen for each distinct study_id, then display the
# id/label pairs ordered by id.
estudios_table = df.drop_duplicates(subset='study_id')
estudios_table.loc[:, ['study_id', 'estudios']].sort_values(by='study_id')

Unnamed: 0,study_id,estudios
0,1,No Informa
1,2,Educación Universitaria
44,4,Educación Media
122,5,Educación Básica
437,6,Educación Técnica
5514,7,Educación Pre-Básica
6087,8,Ninguno


### Insert to DB

In [7]:
# Keep only the columns that belong in the fact table.
fact_columns = [
    'year', 'sex_id', 'birth_date', 'age', 'activity_id', 'study_id',
    'comuna_datachile_id', 'region_id', 'country_code',
]
df = df[fact_columns]

# comuna_datachile_id and region_id are treated as possibly missing. Filling
# the gaps with the string 'null' would make the integer cast below raise
# ValueError, so those two columns use pandas' nullable Int64 dtype instead:
# missing values stay missing and are written to the database as NULL.
df = df.astype({
    'year': 'int',
    'sex_id': 'int',
    'age': 'int',
    'activity_id': 'int',
    'study_id': 'int',
    'country_code': 'int',
    'comuna_datachile_id': 'Int64',
    'region_id': 'Int64',
})

# Replace any existing table with the current records; the DataFrame index
# is not written.
df.to_sql('fact_immigration_records', engine, schema='public', if_exists='replace', index=False)

### Add date relation

In [8]:
# Adds an integer date_id column to public.fact_immigration_records.
add_date_column_sql = """
ALTER TABLE public.fact_immigration_records
  ADD COLUMN date_id INTEGER; 
"""

# Sets date_id to the id of the dim_date row whose the_year equals the
# record's year and whose month_of_year and day_of_month are both 1.
link_dates_sql = """
UPDATE public.fact_immigration_records
SET date_id = dim_date.id
FROM dim_date
WHERE dim_date.the_year = public.fact_immigration_records.year
      AND dim_date.month_of_year = 1
      AND dim_date.day_of_month = 1
"""

engine.execute(add_date_column_sql)

# Left as a bare expression so the notebook still displays the result object.
engine.execute(link_dates_sql)

<sqlalchemy.engine.result.ResultProxy at 0x10cf0c390>