# Census 2017 - Population (Ingestion)

In [1]:
import sys
import configparser
config = configparser.ConfigParser()
config.read_file(open("../../settings.ini"))

sys.path.insert(0, config.get('PATHS','libs_path'))
engine_path = config.get('DATABASE','engine_path')

In [2]:
local_path = "../data_final/"
remote_path = "/"

In [3]:
# from local file postgres.py
import postgres
# from local file commons.py
from commons import inline_table_xml, inline_dimension_xml, download_file, download_zip_file, extract_zip_file

import json
import pandas as pd
from sqlalchemy import create_engine

In [4]:
df1 = download_file(remote_path, local_path, "population_census.csv")

Already downloaded. Using: ../data_final/population_census.csv


In [5]:
df1.head()

Unnamed: 0,area_id,district_id,scholarship,residence_country,birth_country,habitual_residence,formal_education,birth_place,highest_level_approved,children_born_alive,...,residence_comuna,age,birth_comuna,year_of_arrival_to_chile,highest_course_approved,cant_per,comuna_datachile_id,comuna_customs_id,aboriginal_people,economic_activity
0,1,1,0,256,222,2,1,8,3,98,...,999,6,999,2011,0,1,113,1101,98,98
1,1,1,0,256,201,1,1,8,3,98,...,999,5,999,2017,0,1,113,1101,98,98
2,1,1,0,256,256,1,1,1,1,98,...,999,3,999,9998,0,1,113,1101,2,98
3,1,1,0,256,256,1,1,1,1,98,...,999,0,999,9998,0,6,113,1101,98,98
4,1,1,0,256,256,1,1,1,1,98,...,999,1,999,9998,0,2,113,1101,98,98


In [6]:
df1 = df1.rename(columns = {"comuna_datachile_id": "comuna_id"})

In [7]:
engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)
db.to_sql(df1, "census", "fact_population_census")

DROP TABLE IF EXISTS census.fact_population_census;
CREATE TABLE "census"."fact_population_census" (
"area_id" INTEGER,
  "district_id" INTEGER,
  "scholarship" INTEGER,
  "residence_country" INTEGER,
  "birth_country" INTEGER,
  "habitual_residence" INTEGER,
  "formal_education" INTEGER,
  "birth_place" INTEGER,
  "highest_level_approved" INTEGER,
  "children_born_alive" INTEGER,
  "sex" INTEGER,
  "native_people" INTEGER,
  "children_currently_alive" INTEGER,
  "residence_5_years_ago" INTEGER,
  "residence_comuna_5_years_ago" INTEGER,
  "residence_country_5_years_ago" INTEGER,
  "residence_comuna" INTEGER,
  "age" INTEGER,
  "birth_comuna" INTEGER,
  "year_of_arrival_to_chile" INTEGER,
  "highest_course_approved" INTEGER,
  "cant_per" INTEGER,
  "comuna_id" INTEGER,
  "comuna_customs_id" INTEGER,
  "aboriginal_people" INTEGER,
  "economic_activity" INTEGER
)
COPY "census"."fact_population_census" ("area_id","district_id","scholarship","residence_country","birth_country","habitual_res

In [8]:
engine.execute("""
CREATE INDEX fact_population_census_index 
ON census.fact_population_census (comuna_id)
""")

<sqlalchemy.engine.result.ResultProxy at 0x1b77b72e8>

## Inline Tables

In [10]:
# Read questionnaire file
questionnaire = pd.ExcelFile("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ4xZxDpyDY4NursNbrsWlCqjREdmBbfC1EMlz4UGQe7M8wMA7Mqw8tZbAcBdkjgBzloyQdcnhiv10C/pub?output=xlsx")
labels = pd.read_excel(questionnaire, "Labels")

#id_labels = list(df1) - []
for label in list(df1):
    q = labels[labels["label"] == label].iloc[0]["id"]
    if q[0] == "_" or q[0] == "P":
        df = pd.read_excel(questionnaire, q)
        print (inline_dimension_xml(df, label, "id", "es", label))


<Dimension name="Residence Country" foreignKey="residence_country">
  <Hierarchy hasAll="true">
            
<InlineTable alias="residence_country">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
  
  </Rows>
</InlineTable>
    
    <Level name="Residence Country" column="id" nameColumn="description" uniqueMembers="true">
      <Annotations>
        <Annotation name="es_caption">Description ES</Annotation>
      </Annotations>
      <Property name="Description ES" column="es_description" />
    </Level>
  </Hierarchy>
</Dimension>
    

<Dimension name="Birth Country" foreignKey="birth_country">
  <Hierarchy hasAll="true">
            
<InlineTable alias="birth_country">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>

IndexError: single positional indexer is out-of-bounds

In [11]:
df = pd.read_csv("datachile_census_country_id.csv", index_col = 0)
df = df[["my_country_code", "my_country_name"]]
df["my_country_code"] = df["my_country_code"].drop_duplicates()
df = df.dropna()
df["my_country_code"] = df["my_country_code"].astype("int")
df.iloc[7,1] = "País no especificado"

print(inline_dimension_xml(df, "Residence Country 5 Years Ago", "my_country_code", "my_country_name", "residence_country_5_years_ago"))


<Dimension name="Residence Country 5 Years Ago" foreignKey="residence_country_5_years_ago">
  <Hierarchy hasAll="true">
            
<InlineTable alias="Residence Country 5 Years Ago">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">219</Value>
      <Value column="description">Perú</Value>
      <Value column="es_description">Perú</Value>
    </Row>
  <Row>
      <Value column="id">202</Value>
      <Value column="description">Colombia</Value>
      <Value column="es_description">Colombia</Value>
    </Row>
  <Row>
      <Value column="id">201</Value>
      <Value column="description">Venezuela</Value>
      <Value column="es_description">Venezuela</Value>
    </Row>
  <Row>
      <Value column="id">221</Value>
      <Value column="description">Bolivia</Value>
      <Value column="es_description">Bolivia</Value>

In [12]:
df = pd.read_csv("datachile_comunas.csv")
df = df[["comuna_datachile_id", "comuna_name"]]
df = df.append({"comuna_datachile_id": 999, "comuna_name": "Comuna no especificada"}, ignore_index=True)

print(inline_dimension_xml(df, "Residence Comuna 5 Years Ago", "comuna_datachile_id", "comuna_name", "residence_comuna_5_years_ago"))


<Dimension name="Residence Comuna 5 Years Ago" foreignKey="residence_comuna_5_years_ago">
  <Hierarchy hasAll="true">
            
<InlineTable alias="Residence Comuna 5 Years Ago">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">226</Value>
      <Value column="description">Pozo Almonte</Value>
      <Value column="es_description">Pozo Almonte</Value>
    </Row>
  <Row>
      <Value column="id">217</Value>
      <Value column="description">Pica</Value>
      <Value column="es_description">Pica</Value>
    </Row>
  <Row>
      <Value column="id">113</Value>
      <Value column="description">Iquique</Value>
      <Value column="es_description">Iquique</Value>
    </Row>
  <Row>
      <Value column="id">108</Value>
      <Value column="description">Huara</Value>
      <Value column="es_description">Huara</Value>
  

In [13]:
econ = pd.read_csv("economic_activity.csv", sep = ";")
econ["num_id"] = econ.index
econ.iloc[22] = pd.Series({"id": "98", "es": "No aplica", "num_id": "98"})
econ.iloc[23] = pd.Series({"id": "99", "es": "Missing", "num_id": "99"})

print(inline_dimension_xml(econ, "Economic Activity", "num_id", "es", "economic_activity"))


<Dimension name="Economic Activity" foreignKey="economic_activity">
  <Hierarchy hasAll="true">
            
<InlineTable alias="Economic Activity">
  <ColumnDefs>
    <ColumnDef name="id" type="Numeric" />
    <ColumnDef name="description" type="String" />
    <ColumnDef name="es_description" type="String" />
  </ColumnDefs>
  <Rows>
    <Row>
      <Value column="id">0</Value>
      <Value column="description">Agricultura, ganadería, silvicultura y pesca</Value>
      <Value column="es_description">Agricultura, ganadería, silvicultura y pesca</Value>
    </Row>
  <Row>
      <Value column="id">1</Value>
      <Value column="description">Explotación de minas y canteras</Value>
      <Value column="es_description">Explotación de minas y canteras</Value>
    </Row>
  <Row>
      <Value column="id">2</Value>
      <Value column="description">Industrias manufactureras</Value>
      <Value column="es_description">Industrias manufactureras</Value>
    </Row>
  <Row>
      <Value column="id