# Data Aggregation

# Goal
Create aggregations on both datasets (Census and Covid) and join them.

# Methodology
- Compute multiple aggregations on the census data by city.
- Compute multiple aggregations on the covid data.
- Join the two resulting DataFrames.
- Save this df to s3.

### Aggregations
- Age ranges by city
- Gender proportion by city
- People by city
- Adequate access to public services by city
- Internet services by city
- Health quality service by city
- Life expectancy by city

## Sections
1. [**Requirements**](#Requirements)
2. [**Functions**](#Functions)
3. [**Inputs**](#Inputs)
4. [**Pipeline**](#Pipeline)


# Requirements

In [1]:
# Installing packages. Versions are pinned to the ones this notebook was last
# run with, so a fresh cluster resolves the same releases on every run.
sc.install_pypi_package("pandas==1.0.5")
sc.install_pypi_package("boto3==1.14.21")
# HDFS directory Spark uses to store checkpoints.
sc.setCheckpointDir('hdfs:///covid')

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1594851155022_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas
  Downloading https://files.pythonhosted.org/packages/af/f3/683bf2547a3eaeec15b39cef86f61e921b3b187f250fcd2b5c5fb4386369/pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1MB)
Collecting python-dateutil>=2.6.1 (from pandas)
  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-1.0.5 python-dateutil-2.8.1

Collecting boto3
  Downloading https://files.pythonhosted.org/packages/f5/d4/854d6c11936b9bc7b2341cf9547fedf16f638437592a8d75e927e1ed62b4/boto3-1.14.21-py2.py3-none-any.whl (128kB)
Collecting botocore<1.18.0,>=1.17.21 (from boto3)
  Downloading https://files.pythonhosted.org/packages/21/0c/ee102813cd358589549dd07af13f2a7ed012f06ae231612fd6decdac4be3/botocore-1.17.21-py2.py3-none-any.whl (6.3MB)
Collecting s3transfer<0.4.0,>=0.3.0 (from boto3)
  Downloading https://fil

In [2]:
import time
import os
import boto3
import gc
import sys
import numpy as np
import pandas as pd
import pickle
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (FloatType, DateType, StructType, StructField, StringType, LongType, 
    IntegerType, ArrayType, BooleanType, DoubleType)
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler, QuantileDiscretizer
# Make sure automatic garbage collection is switched on in this process.
gc.enable()

# Get (or create) the "covid" session, configured with 20 shuffle partitions.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", 20)
    .appName("covid")
    .getOrCreate()
)
# Echo the effective settings so they are recorded in the cell output.
print(spark.sparkContext.getConf().get('spark.driver.memory'))
print(spark.sparkContext.getConf().get("spark.sql.shuffle.partitions"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048M
20

# Functions

## Loading data

In [3]:
def build_schema_complete(source="vivienda"):
    """
    Build the Spark schema used to load one of the "complete" census tables.

    Both supported tables start with their own record-specific columns and end
    with the same block of shared columns (from "UVA_USO_UNIDAD" through
    "tipo_municipio"), so the shared block is declared only once.

    Parameters:
    -----------
    source : str
        Table to load. Must be "fallecidos" or "personas". The default value
        ("vivienda") is kept for backward compatibility but is not a supported
        table, so callers must always pass one of the two valid names.

    Return:
    -------
    schema : pyspark.sql.types.StructType
        Spark schema for loading source table

    Raises:
    -------
    ValueError
        If `source` is not one of the supported tables.
    """
    if source not in ("fallecidos", "personas"):
        # Fail with an explicit error; the previous version only printed the
        # message and then crashed with UnboundLocalError on the return.
        raise ValueError("Source not valid. Enter one of the following sources: fallecidos, personas")

    def _fields(names, spark_type):
        # One StructField per name, each with its own instance of the type.
        return [StructField(name, spark_type()) for name in names]

    if source == "fallecidos":
        leading = _fields(["U_DPTO", "U_MPIO", "UA_CLASE", "U_EDIFICA",
                           "COD_ENCUESTAS", "U_VIVIENDA", "F_NROHOG",
                           "FA1_NRO_FALL", "FA2_SEXO_FALL", "FA3_EDAD_FALL",
                           "FA4_CERT_DEFUN"], IntegerType)
    else:  # source == "personas"
        leading = (
            _fields(["U_DPTO", "U_MPIO", "UA_CLASE", "U_EDIFICA",
                     "COD_ENCUESTAS", "U_VIVIENDA", "P_NROHOG", "P_NRO_PER",
                     "P_SEXO", "P_EDADR"], IntegerType)
            + _fields(["P_PARENTESCOR"], DoubleType)
            + _fields(["PA_LUG_NAC"], IntegerType)
            + _fields(["PA_VIVIA_5ANOS", "PA_VIVIA_1ANO", "P_ENFERMO",
                       "P_QUEHIZO_PPAL", "PA_LO_ATENDIERON", "PA1_CALIDAD_SERV",
                       "CONDICION_FISICA", "P_ALFABETA", "PA_ASISTENCIA",
                       "P_NIVEL_ANOSR", "P_TRABAJO", "P_EST_CIVIL",
                       "PA_HNV", "PA1_THNV", "PA2_HNVH", "PA3_HNVM",
                       "PA_HNVS", "PA1_THSV", "PA2_HSVH", "PA3_HSVM",
                       "PA_HFC", "PA1_THFC", "PA2_HFCH", "PA3_HFCM"], DoubleType)
        )

    # Columns common to both tables; order matters because the schema is
    # applied to the CSV columns as declared here.
    shared = (
        _fields(["UVA_USO_UNIDAD"], IntegerType)
        + _fields(["V_TIPO_VIV", "V_CON_OCUP", "V_TOT_HOG", "V_MAT_PARED",
                   "V_MAT_PISO", "VA_EE", "VA1_ESTRATO", "VB_ACU", "VC_ALC",
                   "VD_GAS", "VE_RECBAS", "VE1_QSEM", "VF_INTERNET",
                   "V_TIPO_SERSA", "L_TIPO_INST", "L_EXISTEHOG", "L_TOT_PERL",
                   "H_NRO_CUARTOS_H", "H_NRO_DORMIT_H", "H_DONDE_PREPALIM_H",
                   "H_AGUA_COCIN_H", "HA_NRO_FALL_H", "HA_TOT_PER_H"], DoubleType)
        + _fields(["UA1_LOCALIDAD", "U_SECT_RUR", "U_SECC_RUR", "UA2_CPOB",
                   "U_SECT_URB", "U_SECC_URB", "U_MZA"], IntegerType)
        + _fields(["dpto", "nom_mpio", "tipo_municipio"], StringType)
    )
    return StructType(leading + shared)


def build_schema_covid(source="covid"):
    """
    Build schema for different covid sources

    Parameters:
    -----------
    source : str
        Table source may be: "covid", "tests"

    Return:
    -------
    schema : pyspark.sql.types.StructType
        Spark schema for loading source table

    Raises:
    -------
    ValueError
        If `source` is neither "covid" nor "tests".
    """
    if source not in ("covid", "tests"):
        # Fail with an explicit error; the previous version only printed the
        # message and then crashed with UnboundLocalError on the return.
        raise ValueError("Source not valid. Enter one of the following sources: 'covid', 'tests'")

    if source == "covid":
        return StructType([StructField("fecha_de_notificaci_n", DateType()),
                           StructField("c_digo_divipola", StringType()),
                           StructField("ciudad_de_ubicaci_n", StringType()),
                           StructField("departamento", StringType()),
                           StructField("atenci_n", StringType()),
                           StructField("edad", IntegerType()),
                           StructField("sexo", StringType()),
                           StructField("tipo", StringType()),
                           StructField("estado", StringType()),
                           StructField("pa_s_de_procedencia", StringType()),
                           StructField("fis", DateType()),
                           StructField("fecha_diagnostico", DateType()),
                           StructField("fecha_recuperado", DateType()),
                           StructField("fecha_reporte_web", DateType()),
                           StructField("tipo_recuperaci_n", StringType()),
                           StructField("codigo_departamento", StringType()),
                           StructField("codigo_pais", StringType()),
                           StructField("pertenencia_etnica", StringType()),
                           StructField("nombre_grupo_etnico", StringType()),
                           StructField("fecha_de_muerte", DateType()),
                           StructField("Asintomatico", IntegerType()),
                           StructField("divipola_dpto", IntegerType()),
                           StructField("divipola_mpio", IntegerType())
                           ])

    # source == "tests": a date column followed by DoubleType columns only.
    double_columns = ["acumuladas", "amazonas", "antioquia", "arauca", "atlantico",
                      "bogota", "bolivar", "boyaca", "caldas", "caqueta",
                      "casanare", "cauca", "cesar", "choco", "cordoba",
                      "cundinamarca", "guainia", "guajira", "guaviare", "huila",
                      "magdalena", "meta", "narino", "norte_de_santander",
                      "putumayo", "quindio", "risaralda", "san_andres",
                      "santander", "sucre", "tolima", "valle_del_cauca",
                      "vaupes", "vichada", "procedencia_desconocida",
                      "positivas_acumuladas", "negativas_acumuladas",
                      "positividad_acumulada", "indeterminadas",
                      "barranquilla", "cartagena", "santa_marta"]
    return StructType([StructField("fecha", DateType())]
                      + [StructField(name, DoubleType()) for name in double_columns])
              
def build_schema_divipola(source="divipola"):
    """
    Build schema for the DIVIPOLA source

    Parameters:
    -----------
    source : str
        Table source; only "divipola" is supported

    Return:
    -------
    schema : pyspark.sql.types.StructType
        Spark schema for loading source table

    Raises:
    -------
    ValueError
        If `source` is not "divipola".
    """
    if source != "divipola":
        # Fail with an explicit error naming the accepted value; the previous
        # version printed a message about the covid sources and then crashed
        # with UnboundLocalError on the return.
        raise ValueError("Source not valid. Enter one of the following sources: 'divipola'")

    return StructType([StructField("cod_depto", IntegerType()),
                       StructField("cod_mpio", IntegerType()),
                       StructField("dpto", StringType()),
                       StructField("nom_mpio", StringType()),
                       StructField("tipo_municipio", StringType())
                       ])
              
def get_censo_paths(bucket_s3, directory_key):
    """
    Get dictionary of census data for each department

    Only keys ending in ".CSV" are considered. The department is read from the
    third "/"-separated component of the key, which is expected to look like
    "<code>_<department>"; only the token right after the first underscore is
    used. Keys that do not follow that layout are skipped.

    Parameters:
    -----------
    bucket_s3 : s3.Bucket
        Boto3 Bucket object
    directory_key : path
        Directory key in S3

    Return:
    -------
    dict_paths_departments : dict
        Dictionary with the data path for each department, shaped as
        {department: {table_tag: "s3a://<bucket>/<key>"}} where table_tag is
        one of "MGN", "FALL", "HOG", "VIV", "PER"
    """
    # Tags are checked in this order and the first one found in the file name
    # wins, exactly as the original if/elif chain did.
    table_tags = ("MGN", "FALL", "HOG", "VIV", "PER")

    dict_paths_departments = {}
    for object_summary in bucket_s3.objects.filter(Prefix=directory_key):
        name = object_summary.key
        if not name.endswith(".CSV"):
            continue

        list_paths = name.split("/")
        if len(list_paths) < 3:
            # Key is too shallow to contain a department folder.
            continue
        folder_tokens = list_paths[2].split("_")
        if len(folder_tokens) < 2:
            # Third component is not a "<code>_<department>" folder.
            continue
        department = folder_tokens[1]

        file_name = list_paths[-1]
        table = next((tag for tag in table_tags if tag in file_name), None)
        if table is None:
            continue

        # S3 URIs always use "/" regardless of the local OS, so the URI is
        # built with string formatting rather than os.path.join.
        dict_paths_departments.setdefault(department, {})[table] = f"s3a://{bucket_s3.name}/{name}"
    return dict_paths_departments

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Preprocessing

In [41]:
def add_suffix_to_cols(df, suffix):
    """Return `df` with `suffix` appended to the name of every column."""
    renamed = df
    # The column names are read once from the incoming frame; every rename
    # returns a new frame that is carried forward.
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(original_name, original_name + suffix)
    return renamed

def isfloat(value):
    """
    Tell whether `value` can be converted with `float()`.

    Returns True when `float(value)` succeeds and False otherwise, including
    for inputs that `float()` rejects by type (e.g. None or a list) or that
    overflow, instead of letting those exceptions escape.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def add_prefix_to_cols(df, prefix, exclude_cols):
    """
    Return `df` with `prefix` prepended to every column not in `exclude_cols`.

    Column names that are float literals with an integral value (e.g. "2.0",
    as produced by pivoting a double column) are normalised to their integer
    form first, so "2.0" becomes prefix + "2". Every other name is kept
    verbatim: pure-digit names ("07"), non-numeric names, and numeric names
    that are not whole numbers ("1.5", "nan", "inf"). Keeping the latter
    untouched avoids silently truncating "1.5" to "1" (which could collide
    with the column for "1.0") and avoids the conversion error on "nan"/"inf".

    Parameters:
    -----------
    df : DataFrame
        Object exposing `columns` and `withColumnRenamed(old, new)`
    prefix : str
        Text prepended to each renamed column
    exclude_cols : list
        Column names to leave untouched

    Return:
    -------
    df : DataFrame
        Frame with the selected columns renamed
    """
    columns_to_prefix = [name for name in df.columns if name not in exclude_cols]
    for col in columns_to_prefix:
        label = col
        if not col.isdigit():
            try:
                numeric_value = float(col)
            except ValueError:
                numeric_value = None
            # is_integer() is False for nan and +/-inf, so those keep their name.
            if numeric_value is not None and numeric_value.is_integer():
                label = str(int(numeric_value))
        df = df.withColumnRenamed(col, prefix + label)
    return df

def fillna_0(df, exclude_cols):
    """Return `df` with nulls replaced by 0 in every column not listed in `exclude_cols`."""
    target_columns = []
    for name in df.columns:
        if name in exclude_cols:
            continue
        target_columns.append(name)
    return df.fillna(0, subset=target_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Inputs

In [32]:
# Columns flagged as useful for each census source table, keyed by source name.
# NOTE(review): `metadata` is not referenced in the visible part of the
# notebook — presumably used further down or kept as reference; confirm.
# NOTE(review): the HOGAR names (e.g. 'H_NRO_CUARTOS') have no "_H" suffix,
# while the schemas in build_schema_complete use 'H_NRO_CUARTOS_H' — confirm
# which naming applies where.
metadata = {"CENSO": {"VIVIENDA": {"useful_columns": ['U_DPTO', 'U_MPIO', 'UA_CLASE', 'U_EDIFICA',
                                                      'COD_ENCUESTAS', 'U_VIVIENDA', 'UVA_USO_UNIDAD',
                                                      'V_TIPO_VIV', 'V_CON_OCUP', 'V_TOT_HOG',
                                                      'V_MAT_PARED', 'V_MAT_PISO', 'VA_EE', 'VA1_ESTRATO', 'VB_ACU', 'VC_ALC',
                                                      'VD_GAS', 'VE_RECBAS', 'VE1_QSEM', 'VF_INTERNET', 'V_TIPO_SERSA',
                                                      'L_TIPO_INST', 'L_EXISTEHOG', 'L_TOT_PERL']
                                   },
                      "HOGAR": {"useful_columns": ['U_DPTO', 'U_MPIO', 'UA_CLASE', 'COD_ENCUESTAS',
                                                   'U_VIVIENDA', 'H_NROHOG', 'H_NRO_CUARTOS', 'H_NRO_DORMIT',
                                                   'H_DONDE_PREPALIM', 'H_AGUA_COCIN', 'HA_NRO_FALL', 'HA_TOT_PER']},
                      "PERSONAS": {"useful_columns": ['U_DPTO', 'U_MPIO', 'UA_CLASE', 'U_EDIFICA',
                                                      'COD_ENCUESTAS', 'U_VIVIENDA', 'P_NROHOG', 'P_NRO_PER', 'P_SEXO',
                                                      'P_EDADR', 'P_PARENTESCOR', 'PA_LUG_NAC',
                                                      'PA_VIVIA_5ANOS', 'PA_VIVIA_1ANO', 'P_ENFERMO', 'P_QUEHIZO_PPAL',
                                                      'PA_LO_ATENDIERON', 'PA1_CALIDAD_SERV', 'CONDICION_FISICA',
                                                      'P_ALFABETA', 'PA_ASISTENCIA', 'P_NIVEL_ANOSR', 'P_TRABAJO',
                                                      'P_EST_CIVIL', 'PA_HNV', 'PA1_THNV', 'PA2_HNVH', 'PA3_HNVM', 'PA_HNVS',
                                                      'PA1_THSV', 'PA2_HSVH', 'PA3_HSVM', 'PA_HFC', 'PA1_THFC', 'PA2_HFCH',
                                                      'PA3_HFCM']},
                      "FALLECIDOS": {"useful_columns": ['U_DPTO', 'U_MPIO', 'UA_CLASE', 'COD_ENCUESTAS',
                                                        'U_VIVIENDA', 'F_NROHOG', 'FA1_NRO_FALL', 'FA2_SEXO_FALL',
                                                        'FA3_EDAD_FALL', 'FA4_CERT_DEFUN']},
                      "GEOREFERENCIACION": {"useful_columns": ['U_DPTO', 'U_MPIO', 'UA_CLASE', 'UA1_LOCALIDAD', 'U_SECT_RUR',
                                                               'U_SECC_RUR', 'UA2_CPOB', 'U_SECT_URB', 'U_SECC_URB', 'U_MZA',
                                                               'U_EDIFICA', 'COD_ENCUESTAS', 'U_VIVIENDA']},
                      "DIVIPOLA": {"useful_columns": ['cod_depto', 'cod_mpio', 'dpto', 'nom_mpio', 'tipo_municipio']}
                      },
            }

# Name of the S3 bucket that holds the raw and final data.
bucket='censo-covid'
# Boto3 handles for that bucket; bucket_s3 is passed to get_censo_paths to list its objects.
s3_resource = boto3.resource('s3')
bucket_s3 = s3_resource.Bucket(bucket)
# When True, the pipeline cells display a small preview and the row count of each result.
show = True

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**Paths**

In [33]:
# S3 URIs and object keys always use "/" as separator, so they are built with
# string formatting instead of os.path.join, which follows the local OS rules.
censo_covid_bucket_s3 = f"s3a://{bucket}"

# Raw inputs.
raw_data_path = f"{censo_covid_bucket_s3}/raw-data"
censo_data_path = f"{raw_data_path}/censo"
covid_tests_path = f"{raw_data_path}/covid-tests.csv"
covid_path = f"{raw_data_path}/covid.csv"
divipola_path = f"{raw_data_path}/divipola.csv"

# Per-department census CSV locations, listed from the bucket under raw-data/censo.
dict_paths_departments = get_censo_paths(bucket_s3, directory_key="raw-data/censo")

# Processed data locations.
final_data_path = f"{censo_covid_bucket_s3}/final-data"
complete_personas_path = f"{final_data_path}/complete_personas"
complete_fallecidos_path = f"{final_data_path}/complete_fallecidos"

aggregates_personas_path = f"{final_data_path}/aggregates_personas"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Pipeline

### Load_data

In [35]:
# Read the complete "personas" CSV data (with header row) using the explicit schema.
complete_personas_data = (
    spark.read
    .option("header", "true")
    .schema(build_schema_complete(source="personas"))
    .csv(complete_personas_path)
)
if show:
    # Transposed preview of the first rows, then the total row count.
    complete_personas_data.limit(4).toPandas().T
    print("Length: ", complete_personas_data.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                           0             1             2             3
U_DPTO                    11            11            11            11
U_MPIO                     1             1             1             1
UA_CLASE                   1             1             1             1
U_EDIFICA                  1             1             1             1
COD_ENCUESTAS          63612         63612         63612         64570
...                      ...           ...           ...           ...
U_SECC_URB                 3             3             3             3
U_MZA                     13            13            13            21
nom_mpio        BOGOTÁ. D.C.  BOGOTÁ. D.C.  BOGOTÁ. D.C.  BOGOTÁ. D.C.
tipo_municipio     Municipio     Municipio     Municipio     Municipio
dpto            BOGOTÁ. D.C.  BOGOTÁ. D.C.  BOGOTÁ. D.C.  BOGOTÁ. D.C.

[70 rows x 4 columns]
Length:  20386281

### Processing

**Personas**

In [48]:
# People per (dpto, nom_mpio), one column per distinct P_NIVEL_ANOSR value.
number_of_people_by_education_level = (
    complete_personas_data
    .groupby("dpto", "nom_mpio")
    .pivot("P_NIVEL_ANOSR")
    .agg(F.count("P_NIVEL_ANOSR"))
    .drop("null")  # discard the pivot column named "null", if present
)
# Prefix the pivoted columns, then turn missing combinations into 0.
number_of_people_by_education_level = fillna_0(
    add_prefix_to_cols(number_of_people_by_education_level,
                       prefix="P_NIVEL_ANOSR_",
                       exclude_cols=["dpto", "nom_mpio"]),
    exclude_cols=["dpto", "nom_mpio"],
)
if show:
    number_of_people_by_education_level.limit(5).toPandas().T
    print("Length: ", number_of_people_by_education_level.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                          0          1  ...          3             4
dpto              ANTIOQUIA  ANTIOQUIA  ...  ATLÁNTICO     ATLÁNTICO
nom_mpio              ANDES    BRICEÑO  ...    BARANOA  SABANAGRANDE
P_NIVEL_ANOSR_1         595        213  ...       1300           693
P_NIVEL_ANOSR_2       16003       2725  ...      15153          7865
P_NIVEL_ANOSR_3        6255        875  ...      10200          5590
P_NIVEL_ANOSR_4        6223        584  ...      17674          8292
P_NIVEL_ANOSR_5         430         30  ...        776           796
P_NIVEL_ANOSR_6          76         12  ...        364           258
P_NIVEL_ANOSR_7        1300        232  ...       4521          2045
P_NIVEL_ANOSR_8        1280        135  ...       3610          1731
P_NIVEL_ANOSR_9         273         19  ...        502           263
P_NIVEL_ANOSR_10       2873        646  ...       2124          1501
P_NIVEL_ANOSR_99        629         31  ...        601           322

[13 rows x 5 columns]
Length:  37

In [49]:
# People per (dpto, nom_mpio), one column per distinct P_EDADR value.
number_of_people_by_age = (
    complete_personas_data
    .groupby("dpto", "nom_mpio")
    .pivot("P_EDADR")
    .agg(F.count("P_EDADR"))
    .drop("null")  # discard the pivot column named "null", if present
)
# Prefix the pivoted columns, then turn missing combinations into 0.
number_of_people_by_age = fillna_0(
    add_prefix_to_cols(number_of_people_by_age,
                       prefix="P_EDADR_",
                       exclude_cols=["dpto", "nom_mpio"]),
    exclude_cols=["dpto", "nom_mpio"],
)
if show:
    number_of_people_by_age.limit(5).toPandas().T
    print("Length: ", number_of_people_by_age.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                    0          1          2          3                 4
dpto        ANTIOQUIA  ANTIOQUIA  ANTIOQUIA  ANTIOQUIA         ANTIOQUIA
nom_mpio        ANDES    BRICEÑO  CONCORDIA      PEQUE  VIGÍA DEL FUERTE
P_EDADR_1        2207        444        907        577               897
P_EDADR_2        2613        593       1044        630              1054
P_EDADR_3        3006        728       1397        735              1081
P_EDADR_4        3457        599       1556        770               988
P_EDADR_5        3321        511       1242        606               672
P_EDADR_6        2873        445       1155        515               587
P_EDADR_7        2630        438        981        467               474
P_EDADR_8        2689        416       1074        471               408
P_EDADR_9        2508        327       1025        410               395
P_EDADR_10       2396        301       1057        405               362
P_EDADR_11       2345        303       1068        

In [50]:
# People per (dpto, nom_mpio), one column per distinct VA1_ESTRATO value.
number_of_people_per_estrato = (
    complete_personas_data
    .groupby("dpto", "nom_mpio")
    .pivot("VA1_ESTRATO")
    .agg(F.count("VA1_ESTRATO"))
    .drop("null")  # discard the pivot column named "null", if present
)
# Prefix the pivoted columns, then turn missing combinations into 0.
number_of_people_per_estrato = fillna_0(
    add_prefix_to_cols(number_of_people_per_estrato,
                       prefix="VA1_ESTRATO_",
                       exclude_cols=["dpto", "nom_mpio"]),
    exclude_cols=["dpto", "nom_mpio"],
)
if show:
    number_of_people_per_estrato.limit(5).toPandas().T
    print("Length: ", number_of_people_per_estrato.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                       0                       1       2          3        4
dpto           ANTIOQUIA               ANTIOQUIA  BOYACÁ  ATLÁNTICO   CALDAS
nom_mpio       CONCORDIA  SAN JOSÉ DE LA MONTAÑA   CHITA    BARANOA  MARMATO
VA1_ESTRATO_0         47                      21      12       1481       42
VA1_ESTRATO_1       4044                     708    4727      37572     5081
VA1_ESTRATO_2       9631                    2071    1924      17164     3051
VA1_ESTRATO_3       2090                     122      18       4254       63
VA1_ESTRATO_4        125                      15       0         61       22
VA1_ESTRATO_5         15                       5       4         33        1
VA1_ESTRATO_6          9                       0       0          4        0
VA1_ESTRATO_9          7                       5       4         18        3
Length:  374

In [51]:
# Per-(dpto, nom_mpio) aggregates over the personas table, largest first.
# Conditional counts use F.count(F.when(cond, 1)): `when` without `otherwise`
# yields NULL for non-matching rows and `count` ignores NULLs, so a group with
# no matching row gets 0. The former F.sum(F.when(cond, 1)) returned NULL in
# that case, unlike the zero-filled pivot tables built from the same data.
aggregates_by_city_personas = (
    complete_personas_data
    .groupby("dpto", "nom_mpio")
    .agg(
        F.count(F.col("U_MPIO")).alias("Number_of_people"),
        F.avg(F.col("HA_TOT_PER_H")).alias("Avg_Number_people_per_home"),
        F.count(F.when(F.col("P_SEXO") == 1, 1)).alias("Number_of_males"),
        F.count(F.when(F.col("P_SEXO") == 2, 1)).alias("Number_of_females"),
        F.count(F.when(F.col("VA_EE") == 1, 1)).alias("Number_of_people_with_electricity"),
        F.count(F.when(F.col("VA_EE") == 2, 1)).alias("Number_of_people_without_electricity"),
        F.count(F.when(F.col("VB_ACU") == 1, 1)).alias("Number_of_people_with_water_access"),
        F.count(F.when(F.col("VB_ACU") == 2, 1)).alias("Number_of_people_without_water_access"),
        F.count(F.when(F.col("VF_INTERNET") == 1, 1)).alias("Number_of_people_with_internet_access"),
        F.count(F.when(F.col("VF_INTERNET") == 2, 1)).alias("Number_of_people_without_internet_access"),
        F.count(F.when(F.col("P_ALFABETA") == 1, 1)).alias("Number_of_literate_people"),
        F.count(F.when(F.col("P_ALFABETA") == 2, 1)).alias("Number_of_non_literate_people"),
        F.count(F.when(F.col("PA1_CALIDAD_SERV") == 1, 1)).alias("Really_Good_health_service"),
        F.count(F.when(F.col("PA1_CALIDAD_SERV") == 2, 1)).alias("Good_health_service"),
        F.count(F.when(F.col("PA1_CALIDAD_SERV") == 3, 1)).alias("Bad_health_service"),
        F.count(F.when(F.col("PA1_CALIDAD_SERV") == 4, 1)).alias("Really_Bad_health_service"),
        # Plain sum of a nullable measure: stays NULL when every value in the group is NULL.
        F.sum(F.col("PA1_THFC")).alias("Number_of_sons_out_of_country"),
    )
    .orderBy(F.col("Number_of_people").desc())
)
if show:
    aggregates_by_city_personas.limit(5).toPandas().T
    print("Length: ", aggregates_by_city_personas.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                                     0  ...          4
dpto                                      BOGOTÁ. D.C.  ...  ATLÁNTICO
nom_mpio                                  BOGOTÁ. D.C.  ...    SOLEDAD
Number_of_people                               7181469  ...     535984
Avg_Number_people_per_home                     3.62483  ...    4.56214
Number_of_males                                3433586  ...     260921
Number_of_females                              3747883  ...     275063
Number_of_people_with_electricity              7132834  ...     533101
Number_of_people_without_electricity             16706  ...       2538
Number_of_people_with_water_access             7109875  ...     527848
Number_of_people_without_water_access            39665  ...       7791
Number_of_people_with_internet_access          5498193  ...     270974
Number_of_people_without_internet_access       1593575  ...     262953
Number_of_literate_people                      6484469  ...     471469
Number

In [52]:
# Enrich the per-city summary with the three pivoted tables (stratum,
# education level, age range). Every join is a left join on the city key
# (dpto, nom_mpio), applied in the same order as before, so each city row of
# the summary is kept even if a pivot table has no entry for it.
aggregates_by_city_personas = (
    aggregates_by_city_personas
    .join(number_of_people_per_estrato, on=["dpto", "nom_mpio"], how="left")
    .join(number_of_people_by_education_level, on=["dpto", "nom_mpio"], how="left")
    .join(number_of_people_by_age, on=["dpto", "nom_mpio"], how="left")
)

# Optional sanity preview, controlled by the notebook-level `show` flag.
if show:
    aggregates_by_city_personas.limit(5).toPandas().T
    print("Length: ", aggregates_by_city_personas.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                                  0  ...                       4
dpto                                      ANTIOQUIA  ...               ANTIOQUIA
nom_mpio                                      ANDES  ...  SAN JOSÉ DE LA MONTAÑA
Number_of_people                              38144  ...                    2952
Avg_Number_people_per_home                  3.88865  ...                 4.05759
Number_of_males                               19777  ...                    1424
Number_of_females                             18367  ...                    1528
Number_of_people_with_electricity             36580  ...                    2947
Number_of_people_without_electricity            447  ...                       5
Number_of_people_with_water_access            24586  ...                    2265
Number_of_people_without_water_access         12441  ...                     687
Number_of_people_with_internet_access          7550  ...                     729
Number_of_people_without_int

In [54]:
# Persist the joined per-city aggregates as CSV under `aggregates_personas_path`,
# laid out in one sub-directory per department (dpto).
# The frame is first collapsed to a single Spark partition, so each dpto
# directory should receive a single part file.
aggregates_single_partition = aggregates_by_city_personas.repartition(1)
aggregates_csv_writer = (
    aggregates_single_partition.write
    .partitionBy('dpto')
    .mode('overwrite')           # replace whatever a previous run left at the path
    .option("header","true")     # first line of every file carries the column names
)
aggregates_csv_writer.csv(aggregates_personas_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…