In [1]:
# Import libraries
import pandas as pd
import json

In [2]:
def search_comuna_id(name):
    """Return the id registered for a comuna name.

    The name is upper-cased, non-breaking spaces (``\\xa0``) are turned into
    regular spaces and surrounding whitespace is trimmed. The normalized name
    is looked up in ``comuna_datachile_id`` first and, failing that, in
    ``comuna_thesauro_id``.

    Parameters
    ----------
    name : str
        Comuna name as it appears in the source spreadsheet.

    Returns
    -------
    The value stored for the normalized name in whichever table matched.

    Raises
    ------
    KeyError
        If the normalized name is in neither table. The message carries both
        the raw and the normalized name so the missing thesaurus entry can be
        identified.
    """
    normalized = name.upper().strip().replace(u"\xa0", u" ").strip()

    if normalized in comuna_datachile_id:
        return comuna_datachile_id[normalized]
    if normalized in comuna_thesauro_id:
        return comuna_thesauro_id[normalized]

    # Same exception type as the plain dict miss, but with enough context to
    # find the offending spreadsheet row.
    raise KeyError(
        "Unknown comuna {!r} (normalized to {!r}): not found in "
        "comuna_datachile_id nor comuna_thesauro_id".format(name, normalized)
    )
    

def get_values(d):
    """Return the values of mapping ``d`` as a list, in ``d.items()`` order."""
    values = []
    for _, value in d.items():
        values.append(value)
    return values

In [3]:
# Comuna catalogue: maps the upper-cased comuna name to its comuna_datachile_id
URL = "https://raw.githubusercontent.com/datachile/datachile-etl/master/official_ids/2017_06_27_comunas_datachile_fixed.csv"
geo = pd.read_csv(URL)
comuna_datachile_id = {
    comuna_name.upper(): comuna_id
    for comuna_name, comuna_id in zip(geo["comuna_name"], geo["comuna_datachile_id"])
}

# Age-range catalogue, exported from a Google spreadsheet as CSV
age_range_ = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/"
    + "152kELSLYe4oiCiH7-Wmt-FWvSYavT0LUQt1C_qXnXVM"
    + "/export?gid=0&format=csv"
)

# Keep a local copy of the catalogue next to the generated data
age_range_.to_csv("data/age_range_id.csv", index=False)

# "label" column -> age range id
age_range_ids = {
    label: range_id
    for label, range_id in zip(age_range_["label"], age_range_["id"])
}

# Thesaurus of alternative comuna spellings, also a Google spreadsheet export
thesauro_comunas = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/"
    + "1ccZd25Q5BfeL2w-KCbe1k7ZJIuskJa0gwoTZsillBGk"
    + "/export?gid=0&format=csv"
)

# Upper-cased alternative name ("nombre" column) -> id
comuna_thesauro_id = {
    alias.upper(): comuna_id
    for alias, comuna_id in zip(thesauro_comunas["nombre"], thesauro_comunas["id"])
}

sex_id = dict(Mujeres=1, Hombres=2)

In [4]:
# One record per (comuna, year, age range). The single-year loader further
# down appends to this same list, so re-running this cell resets it.
tidy = []

# Multi-year series workbooks hosted on deis.cl
URLS = [
    "http://www.deis.cl/wp-content/uploads/2016/05/5.Serie-de-mortalidad-infantil-y-sus-componentes-regi%C3%B3n-y-comuna-de-residencia.-Chile-1997-2013.xlsx",
    "http://www.deis.cl/wp-content/uploads/2016/05/6.Serie-de-mortalidad-en-la-ni%C3%B1ez-regi%C3%B3n-y-comuna-de-residencia.-Chile-1997-2013.xlsx",
    "http://www.deis.cl/wp-content/uploads/2014/01/Serie-de-mortalidad-adolescente-regi%C3%B3n-y-comuna-de-residencia.-Chile-1997-2011.xlsx"
]

for URL_BASE in URLS:
    data = pd.read_excel(URL_BASE)

    # Header rows of the sheet, as stripped strings ("nan" marks an empty cell):
    # row 4 is read as the year, row 5 as the age range, row 6 as the measure.
    title = [str(cell).strip() for cell in data.iloc[4]]
    subtitle = [str(cell).strip() for cell in data.iloc[5]]
    measures = [str(cell).strip() for cell in data.iloc[6]]

    output = []
    # Data rows start at position 7; the last 3 rows are skipped
    # (presumably footnotes -- TODO confirm against the workbooks).
    for i in range(7, len(data) - 3):
        frame = data.iloc[i]

        # Use .iloc for the first two columns: `frame[0]` on a Series indexed by
        # column labels relies on the positional fallback of Series.__getitem__,
        # which pandas has deprecated.
        first_col = str(frame.iloc[0]).strip()
        if first_col == "nan":
            # An empty first column means the row describes a comuna (second column)
            geo_level = {"comuna": str(frame.iloc[1]).strip()}
        elif first_col == "Total País":
            geo_level = {"country": first_col}
        else:
            geo_level = {"region": first_col}

        # Forward-fill the three header levels across the value columns
        # (the first two columns hold the geography names and are skipped).
        year = ""
        age_range = ""
        measure = ""
        cells = []
        for head, sub, meas, value in list(zip(title, subtitle, measures, frame))[2:]:
            if head != "nan":
                year = head
            if sub != "nan":
                age_range = sub
            if meas != "nan":
                measure = meas
            cells.append((year, age_range, measure, value))

        year = ""

        # Every odd position is taken as the rate that closes a (count, rate)
        # pair; this assumes the columns strictly alternate -- TODO confirm.
        query = []
        for position, (cell_year, cell_age, cell_measure, value) in enumerate(cells):
            if cell_measure != "Tasa*":
                year = cell_year
                count = value

            if position % 2 != 0:
                item = {
                    # Some year labels carry a "*" marker; drop it before parsing
                    "year": int(float(year.replace("*", ""))),
                    "count": count,
                    "age_range": cell_age.replace("\n", ""),
                    "rate": value
                }
                if item["age_range"] != "Total":
                    query.append(item)

        output.append({**geo_level, "children": query})

    # Process data: country and region rows update the reference rates that are
    # attached to the comuna rows that follow them.
    # NOTE(review): unlike the single-year loader further down, comunas named
    # "Ignorada" or "nan" are not skipped here, so search_comuna_id must be able
    # to resolve every name -- confirm this is intended.
    country_data = {}
    region_data = {}
    for item in output:
        if "country" in item:
            country_data = {d["age_range"]: d["rate"] for d in item["children"]}
        elif "region" in item:
            region_data = {d["age_range"]: d["rate"] for d in item["children"]}
        elif "comuna" in item:
            for d in item["children"]:
                tidy.append({
                    "comuna_id": search_comuna_id(item["comuna"]),
                    "year": d["year"],
                    "age_range_id": age_range_ids[d["age_range"]],
                    "rate_country": country_data[d["age_range"]],
                    "rate_region": region_data[d["age_range"]],
                    # "-" in the sheet is stored as 0
                    "rate_comuna": 0 if d["rate"] == "-" else d["rate"],
                    "count": 0 if d["count"] == "-" else d["count"]
                })

In [5]:
# Single-year workbooks hosted on deis.cl: (download link, year, age-group slug)
datasets = [
    {"source_link": link, "year": year, "slug": slug}
    for link, year, slug in (
        (
            "http://www.deis.cl/wp-content/uploads/2016/08/6.-Defunciones-y-mortalidad-infantil-y-sus-componentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2014.xlsx",
            2014,
            "under-one",
        ),
        (
            "http://www.deis.cl/wp-content/uploads/2016/08/7.-Defunciones-y-mortalidad-en-la-ni%C3%B1ez-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2014.xlsx",
            2014,
            "one-to-ten",
        ),
        (
            "http://www.deis.cl/wp-content/uploads/2016/08/8.-Defunciones-y-mortalidad-en-adolescentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2014.xlsx",
            2014,
            "adolescence",
        ),
        (
            "http://www.deis.cl/wp-content/uploads/2016/08/8.-Defunciones-y-mortalidad-en-adolescentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2013.xlsx",
            2013,
            "adolescence",
        ),
        (
            "http://www.deis.cl/wp-content/uploads/2015/05/8.-Defunciones-y-mortalidad-en-adolescentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2012.xlsx",
            2012,
            "adolescence",
        ),
    )
]

In [6]:
# The cells below look age ranges up by the "es" column instead of "label",
# so the lookup table is rebuilt here under the same name.
age_range_ids = {
    es_label: range_id
    for es_label, range_id in zip(age_range_["es"], age_range_["id"])
}

In [7]:
for dataset in datasets:
    data = pd.read_excel(dataset["source_link"])
    slug = dataset["slug"]

    # Header rows shared by both layouts, as stripped strings
    # ("nan" marks an empty cell).
    title = [str(cell).strip() for cell in data.iloc[2]]
    subtitle = [str(cell).strip() for cell in data.iloc[3]]

    if slug == "under-one":
        # Two header rows; data starts at row 4 and the last 2 rows are skipped
        first_row = 4
        last_row = 2
    else:
        # Three header rows (the third holds the measure label); data starts
        # at row 5 and the last row is skipped
        measures = [str(cell).strip() for cell in data.iloc[4]]
        first_row = 5
        last_row = 1

    # Preprocess excel
    output = []
    for i in range(first_row, len(data) - last_row):
        frame = data.iloc[i]

        # Use .iloc for the first two columns: `frame[0]` on a Series indexed by
        # column labels relies on the positional fallback of Series.__getitem__,
        # which pandas has deprecated.
        first_col = str(frame.iloc[0]).strip()
        if first_col == "nan":
            # An empty first column means the row describes a comuna (second column)
            geo_level = {"comuna": str(frame.iloc[1]).strip()}
        elif first_col == "Total País":
            geo_level = {"country": first_col}
        else:
            geo_level = {"region": first_col}

        if slug == "under-one":
            # Keep only the columns that have a sub-header label
            tuples = [pair for pair in zip(subtitle, frame) if pair[0] != "nan"]

            age_range = ""
            sex = "Ambos sexos"

            # Every odd position is taken as the rate that closes a
            # (count, rate) pair; assumes strict alternation -- TODO confirm.
            query = []
            for key, (label, value) in enumerate(tuples):
                if label != "Tasa*":
                    age_range = label
                    count = value

                if key % 2 != 0:
                    query.append({
                        "age_range": age_range,
                        "count": count,
                        "rate": value,
                        "sex": sex
                    })

        else:
            # Forward-fill the three header levels across the value columns
            # (the first two columns hold the geography names and are skipped).
            age_range = ""
            sex = ""
            measure = ""

            tuples = []
            for head, sub, meas, value in list(zip(title, subtitle, measures, frame))[2:]:
                if head != "nan":
                    age_range = head
                if sub != "nan":
                    sex = sub
                if meas != "nan":
                    measure = meas
                tuples.append((age_range, sex, measure, value))

            # Only the "Ambos sexos" (both sexes) columns are kept; every odd
            # position is taken as the rate that closes a (count, rate) pair.
            query = []
            for key, (cell_age, cell_sex, cell_measure, value) in enumerate(tuples):
                if cell_measure != "Tasa*":
                    count = value

                if cell_sex == "Ambos sexos" and key % 2 != 0:
                    query.append({
                        "age_range": cell_age,
                        "count": count,
                        "sex": cell_sex,
                        "rate": value
                    })

        output.append({**geo_level, "children": query})

    # Process data: country and region rows update the reference rates that are
    # attached to the comuna rows that follow them.
    country_data = {}
    region_data = {}

    for item in output:
        if "country" in item:
            country_data = {d["age_range"]: d["rate"] for d in item["children"]}
        elif "region" in item:
            region_data = {d["age_range"]: d["rate"] for d in item["children"]}
        elif "comuna" in item:
            # Rows without a usable comuna name are skipped before any lookup
            if item["comuna"] == "Ignorada" or item["comuna"] == "nan":
                continue

            for d in item["children"]:
                if d["sex"] != "Ambos sexos":
                    continue
                tidy.append({
                    "comuna_id": search_comuna_id(item["comuna"]),
                    "year": dataset["year"],
                    "age_range_id": age_range_ids[d["age_range"]],
                    "rate_country": country_data[d["age_range"]],
                    "rate_region": region_data[d["age_range"]],
                    # "-" in the sheet is stored as 0
                    "rate_comuna": 0 if d["rate"] == "-" else d["rate"],
                    "count": 0 if d["count"] == "-" else d["count"]
                })

In [8]:
# Write the accumulated tidy records to data/mortality.csv (no index column)
pd.DataFrame(data=tidy).to_csv(path_or_buf="data/mortality.csv", index=False)