# Extracción de matriz OD 

__Librerías__

In [1]:
!pip install pyathena h3 awswrangler

Collecting pyathena
  Downloading PyAthena-2.2.0-py3-none-any.whl (37 kB)
Collecting h3
  Downloading h3-3.7.2-cp36-cp36m-manylinux2010_x86_64.whl (795 kB)
[K     |████████████████████████████████| 795 kB 11.7 MB/s eta 0:00:01
[?25hCollecting awswrangler
  Downloading awswrangler-2.6.0-py3-none-any.whl (174 kB)
[K     |████████████████████████████████| 174 kB 77.5 MB/s eta 0:00:01
[?25hCollecting pymysql<1.1.0,>=0.9.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.7 MB/s  eta 0:00:01
Collecting redshift-connector~=2.0.0
  Downloading redshift_connector-2.0.876-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 14.0 MB/s eta 0:00:01
[?25hCollecting pg8000<1.19.0,>=1.16.0
  Downloading pg8000-1.18.0-py3-none-any.whl (34 kB)
Collecting scramp==1.2.2
  Downloading scramp-1.2.2-py3-none-any.whl (7.8 kB)
Collecting requests<2.24.0,>=2.23.0
  Downloading requests-2.23.0-py2.py3-none-any.whl (58 kB)
[K     |█

In [2]:
import pandas as pd
import boto3
from pyathena.pandas.cursor import PandasCursor
from pyathena import connect
import h3
import awswrangler as wr

__Parámetros de área de interés__

In [3]:
# Catalogue of predefined areas of interest.
# Each entry holds the ISO country code used to filter the pings and a bounding box in
# lat/long coordinates (xmin/xmax are longitudes, ymin/ymax are latitudes).
AREAS_OF_INTEREST = {
    "peru_departamento_lima": {
        "country_code": 'PE',
        "xmin": -77.88659,
        "xmax": -75.5075,
        "ymin": -13.32351,
        "ymax": -10.27419,
    },
    "argentina_provincia_caba": {
        "country_code": 'AR',
        "xmin": -58.5314494,
        "xmax": -58.3351423,
        "ymin": -34.705637,
        "ymax": -34.5265535,
    },
    "argentina_aglomerado_AMBA": {
        "country_code": 'AR',
        "xmin": -58.9925,
        "xmax": -58.1506,
        "ymin": -34.929,
        "ymax": -34.3174,
    },
    "colombia_departamento_cundinamarca": {
        "country_code": 'CO',
        "xmin": -74.8907,
        "xmax": -73.0508,
        "ymin": 3.7243,
        "ymax": 5.8367,
    },
}

# Select the area to process by its key; edit only this line to switch areas.
place_name =  "colombia_departamento_cundinamarca"

if place_name not in AREAS_OF_INTEREST:
    raise KeyError(
        f"Unknown place_name {place_name!r}; available: {sorted(AREAS_OF_INTEREST)}"
    )

_area = AREAS_OF_INTEREST[place_name]
country_code = _area["country_code"]

# bounding box in lat/long coordinates
xmin = _area["xmin"]
xmax = _area["xmax"]
ymin = _area["ymin"]
ymax = _area["ymax"]

# A swapped or degenerate box would silently select no pings, so fail early instead.
if not (xmin < xmax and ymin < ymax):
    raise ValueError(
        f"Invalid bounding box for {place_name!r}: "
        f"xmin={xmin}, xmax={xmax}, ymin={ymin}, ymax={ymax}"
    )


__Parámetros de grilla H3__

In [4]:
# H3 grid resolution used to assign every ping to a hexagonal cell.
h3_resolution = 9

# H3 defines resolutions 0 (coarsest) to 15 (finest). Validate here so that a typo is
# caught before any long-running query is launched.
if not (isinstance(h3_resolution, int) and 0 <= h3_resolution <= 15):
    raise ValueError(f"h3_resolution must be an integer between 0 and 15, got {h3_resolution!r}")


__Parámetros de la conexión a recursos en AWS__

In [5]:
# boto3 handle to S3 (NOTE(review): not referenced by the other cells visible in this notebook)
s3 = boto3.resource('s3')
region = 'us-east-1'
schema = 'graphdata'
pings_table = 'historico_pings'
# the bucket where the new tables will be stored
s3_bucket = 's3://iadbprod-csd-hub-analyticaldata'
# staging directory handed to the Athena connection; derived from the bucket so that the
# bucket name is written only once
s3_staging = f'{s3_bucket}/graphdata-mobility-temporal/athena-results/'
# S3 prefix under which every table generated for `place_name` is written
extracted_table_location = f'{s3_bucket}/graphdata-mobility-OD/{place_name}'


__Conexión a la base de datos__

In [6]:
# Open the Athena connection with the parameters defined above and obtain a cursor of
# class PandasCursor from it.
cursor = connect(
    s3_staging_dir=s3_staging,
    region_name=region,
    schema_name=schema,
    cursor_class=PandasCursor,
).cursor()

__Función de extracción de matriz Origen-Destino__


Genera las tablas:

* __{place_name}_pings:__           pings identificados dentro del área de interés (tabla intermedia, puede ser descartada)
* __{place_name}_coords_to_h3idx:__ tabla de equivalencias que asigna su celda H3 a cada par lat-lon único presente en los pings (tabla intermedia, puede ser descartada)
* __{place_name}_pings_h3idx:__     pings identificados dentro del área de interés, junto a su celda H3 correspondiente
* __{place_name}_user_locations:__  identifica la celda "hogar" (residencia) y la celda de "ocupación" (destino diurno) de cada usuario
* __{place_name}_OD_matrix:__       conteo de visitantes en cada celda, para cada hora, desagregado por celda hogar de origen

In [7]:
def extract_OD_matrix(cursor, place_name, country_code, h3_resolution, extracted_table_location,
                      bbox=None, source_table=None, database=None):
    """Build the Origin-Destination matrix tables for one area of interest.

    Runs four ``CREATE TABLE ... AS`` queries through ``cursor`` and uploads one parquet
    dataset, all stored under ``extracted_table_location``:

    * ``{place_name}_pings``: pings inside the bounding box and country, with coordinates
      rounded to 4 decimals.
    * ``{place_name}_coords_to_h3idx``: H3 cell of every distinct (latitude, longitude) pair.
    * ``{place_name}_pings_h3idx``: the pings joined with their H3 cell.
    * ``{place_name}_user_locations``: per user, the "home" cell (cell seen on the most
      distinct days during hours 20-7) and the "occupation" cell (same, hours 9-17). A cell
      only qualifies if seen on more than one day, and users whose home and occupation
      cells coincide are dropped.
    * ``{place_name}_OD_matrix``: for every hour, the number of users per
      (cell where the user had the most pings that hour, home cell) pair.

    NOTE(review): the queries use plain ``CREATE TABLE``, so re-running for the same
    ``place_name`` presumably requires removing the previous tables (and their S3 data)
    first - confirm against the Athena CTAS documentation.

    Parameters
    ----------
    cursor : cursor whose ``execute(sql)`` result exposes ``as_pandas()``.
    place_name : str
        Prefix of every generated table name and S3 folder.
    country_code : str
        Value matched against the ``iso_country_code`` column of the source table.
    h3_resolution : int
        H3 resolution used to index the coordinates.
    extracted_table_location : str
        S3 prefix under which the generated tables are written.
    bbox : tuple, optional
        ``(lon_min, lon_max, lat_min, lat_max)``. Defaults to the notebook-level
        ``xmin, xmax, ymin, ymax``.
    source_table : str, optional
        Table holding the raw pings. Defaults to the notebook-level ``pings_table``.
    database : str, optional
        Catalog database in which the coordinates table is registered; it must be the
        database queried by ``cursor``. Defaults to the notebook-level ``schema``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no ping falls inside the requested area, so there is nothing to index.
    """

    # Fall back to the notebook-level configuration when the optional arguments are omitted,
    # which keeps the original five-argument call working unchanged.
    if bbox is None:
        bbox = (xmin, xmax, ymin, ymax)
    lon_min, lon_max, lat_min, lat_max = bbox
    if source_table is None:
        source_table = pings_table
    if database is None:
        database = schema

    # extract the pings inside the rectangle of the area of interest

    query_recortar_por_coordenadas = f'''CREATE TABLE {place_name}_pings
                    WITH (external_location = '{extracted_table_location}/{place_name}_pings', 
                        format = 'PARQUET', 
                        parquet_compression = 'SNAPPY') AS
                    SELECT caid,
                           id_type,
                           ROUND(latitude, 4) AS latitude,
                           ROUND(longitude, 4) AS longitude,
                           year,
                           month,
                           day,
                           hour_of_day
                    FROM {source_table}
                    WHERE ((longitude > {lon_min}) AND (longitude < {lon_max}) AND 
                           (latitude > {lat_min}) AND (latitude < {lat_max}) AND
                           iso_country_code = '{country_code}')'''

    cursor.execute(query_recortar_por_coordenadas)

    # Identify the H3 cell each ping comes from. Only the distinct coordinate pairs are
    # pulled into pandas, so the H3 index is computed once per pair instead of once per ping.

    query_unique_coords = f'''SELECT DISTINCT latitude, longitude FROM {place_name}_pings'''

    unique_coords = cursor.execute(query_unique_coords).as_pandas()

    if unique_coords.empty:
        raise ValueError(
            f"No pings found for {place_name!r} (country {country_code!r}) inside bbox {bbox}"
        )

    # Plain iteration over the two columns avoids the per-row Series construction of
    # DataFrame.apply(axis=1) while producing the same values.
    unique_coords['h3idx'] = [
        h3.geo_to_h3(lat, lon, h3_resolution)
        for lat, lon in zip(unique_coords['latitude'], unique_coords['longitude'])
    ]

    # Store the lat-lon -> H3 cell lookup table in S3 as parquet and register it as
    # table `{place_name}_coords_to_h3idx` so the next query can join against it.

    wr.s3.to_parquet(
        df = unique_coords,
        dataset = True,
        mode = "overwrite",
        database = database,
        table = f"{place_name}_coords_to_h3idx",
        path = f'{extracted_table_location}/{place_name}_coords_to_h3idx'
    )

    # Use the lookup table to assign its cell to every ping.
    # Each coordinate is compared on its own: concatenating both text values without a
    # separator (as was done before) lets different pairs collide, e.g.
    # '-74.1' || '23.4' and '-74.12' || '3.4', which would duplicate pings in the join.

    query_agregar_h3idx = f'''CREATE TABLE {place_name}_pings_h3idx
                          WITH (external_location = '{extracted_table_location}/{place_name}_pings_h3idx', 
                                format = 'PARQUET', 
                                parquet_compression = 'SNAPPY') AS
                          SELECT p.*, c.h3idx 
                          FROM {place_name}_pings AS p LEFT JOIN {place_name}_coords_to_h3idx AS c
                          ON CAST(p.longitude AS VARCHAR) = CAST(c.longitude AS VARCHAR) AND
                             CAST(p.latitude AS VARCHAR) = CAST(c.latitude AS VARCHAR)'''

    cursor.execute(query_agregar_h3idx)

    # Detection of the _home cell_ and the _occupation cell_ of every user.
    # Left side: night-time pings (hours 20-7) grouped per user/day/cell, then per user/cell;
    # cells seen on more than one day are ranked by number of days and the top one is kept.
    # Right side: same procedure for day-time pings (hours 9-17).
    # Rows where both cells are equal are discarded; since a NULL occupation never satisfies
    # `!=`, users without an occupation cell are dropped as well.

    query_identificar_sitios = f'''CREATE TABLE {place_name}_user_locations
                          WITH (external_location = '{extracted_table_location}/{place_name}_user_locations', 
                                format = 'PARQUET', 
                                parquet_compression = 'SNAPPY') AS
                            SELECT 
                              caid, 
                              h3idx_home, 
                              distinct_days_home, 
                              total_pings_home, 
                              h3idx_occupation, 
                              distinct_days_occupation, 
                              total_pings_occupation 
                            FROM 
                              (
                                SELECT 
                                  caid, 
                                  h3idx_home, 
                                  distinct_days_home, 
                                  total_pings_home, 
                                  h3idx_occupation, 
                                  distinct_days_occupation, 
                                  total_pings_occupation, 
                                  ROW_NUMBER() OVER (
                                    PARTITION BY caid 
                                    ORDER BY 
                                      distinct_days_occupation DESC
                                  ) AS q01 
                                FROM 
                                  (
                                    SELECT 
                                      LHS.caid AS caid, 
                                      h3idx_home, 
                                      distinct_days_home, 
                                      total_pings_home, 
                                      h3idx_occupation, 
                                      distinct_days_occupation, 
                                      total_pings_occupation 
                                    FROM 
                                      (
                                        SELECT 
                                          caid, 
                                          h3idx_home, 
                                          distinct_days_home, 
                                          total_pings_home 
                                        FROM 
                                          (
                                            SELECT 
                                              caid, 
                                              h3idx_home, 
                                              distinct_days_home, 
                                              total_pings_home, 
                                              ROW_NUMBER() OVER (
                                                PARTITION BY caid 
                                                ORDER BY 
                                                  distinct_days_home DESC
                                              ) AS q01 
                                            FROM 
                                              (
                                                SELECT 
                                                  caid, 
                                                  h3idx_home, 
                                                  COUNT(*) AS distinct_days_home, 
                                                  SUM(freq_home) AS total_pings_home 
                                                FROM 
                                                  (
                                                    SELECT 
                                                      caid, 
                                                      year, 
                                                      month, 
                                                      day, 
                                                      h3idx_home, 
                                                      COUNT(*) AS freq_home 
                                                    FROM 
                                                      (
                                                        SELECT 
                                                          caid, 
                                                          year, 
                                                          month, 
                                                          day, 
                                                          h3idx AS h3idx_home 
                                                        FROM 
                                                          {place_name}_pings_h3idx 
                                                        WHERE 
                                                          (
                                                            (
                                                              hour_of_day IN (20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7)
                                                            ) 
                                                            AND (
                                                              NOT(
                                                                (
                                                                  (h3idx) IS NULL
                                                                )
                                                              )
                                                            )
                                                          )
                                                      ) 
                                                    GROUP BY 
                                                      caid, 
                                                      year, 
                                                      month, 
                                                      day, 
                                                      h3idx_home
                                                  ) 
                                                GROUP BY 
                                                  caid, 
                                                  h3idx_home
                                              ) 
                                            WHERE 
                                              (distinct_days_home > 1.0)
                                          ) 
                                        WHERE 
                                          (q01 <= 1)
                                      ) AS LHS 
                                      LEFT JOIN (
                                        SELECT 
                                          caid, 
                                          h3idx_occupation, 
                                          distinct_days_occupation, 
                                          total_pings_occupation 
                                        FROM 
                                          (
                                            SELECT 
                                              caid, 
                                              h3idx_occupation, 
                                              distinct_days_occupation, 
                                              total_pings_occupation, 
                                              ROW_NUMBER() OVER (
                                                PARTITION BY caid 
                                                ORDER BY 
                                                  distinct_days_occupation DESC
                                              ) AS q01 
                                            FROM 
                                              (
                                                SELECT 
                                                  caid, 
                                                  h3idx_occupation, 
                                                  COUNT(*) AS distinct_days_occupation, 
                                                  SUM(freq_occupation) AS total_pings_occupation 
                                                FROM 
                                                  (
                                                    SELECT 
                                                      caid, 
                                                      year, 
                                                      month, 
                                                      day, 
                                                      h3idx_occupation, 
                                                      COUNT(*) AS freq_occupation 
                                                    FROM 
                                                      (
                                                        SELECT 
                                                          caid, 
                                                          year, 
                                                          month, 
                                                          day, 
                                                          h3idx AS h3idx_occupation 
                                                        FROM 
                                                          {place_name}_pings_h3idx 
                                                        WHERE 
                                                          (
                                                            hour_of_day IN (9, 10, 11, 12, 13, 14, 15, 16, 17)
                                                          )
                                                      ) 
                                                    GROUP BY 
                                                      caid, 
                                                      year, 
                                                      month, 
                                                      day, 
                                                      h3idx_occupation
                                                  ) 
                                                GROUP BY 
                                                  caid, 
                                                  h3idx_occupation
                                              ) 
                                            WHERE 
                                              (distinct_days_occupation > 1.0)
                                          ) 
                                        WHERE 
                                          (q01 <= 1)
                                      ) AS RHS ON (LHS.caid = RHS.caid)
                                  ) 
                                WHERE 
                                  (h3idx_home != h3idx_occupation)
                              ) 
                            WHERE 
                              (q01 <= 1)'''  

    cursor.execute(query_identificar_sitios)

    # Build the origin-destination matrix.
    # Innermost query: pings per user, hour and cell. For every user-hour only the cell(s)
    # with the highest ping count are kept (ties keep every tied cell), the user's home cell
    # is attached, and rows are counted per (hour, visited cell, home cell).
    # Users absent from the user_locations table contribute rows with a NULL h3idx_home.

    query_matriz_OD = f'''CREATE TABLE {place_name}_OD_matrix
                          WITH (external_location = '{extracted_table_location}/{place_name}_OD_matrix', 
                                format = 'PARQUET', 
                                parquet_compression = 'SNAPPY') AS
                          SELECT year, month, day, hour_of_day, h3idx, h3idx_home, COUNT(*) AS n
                            FROM (SELECT year, month, day, hour_of_day, LHS.caid AS caid, h3idx, n, h3idx_home
                                    FROM (SELECT year, month, day, hour_of_day, caid, h3idx, n
                                            FROM (SELECT year, month, day, hour_of_day, caid, h3idx, n, MAX(n) OVER (PARTITION BY year, month, day, hour_of_day, caid) AS q01
                                                FROM (SELECT year, month, day, hour_of_day, caid, h3idx, COUNT(*) AS n
                                                        FROM {place_name}_pings_h3idx
                                                        GROUP BY year, month, day, hour_of_day, caid, h3idx)
                                                  )
                                            WHERE (n = q01)) AS LHS
                                            LEFT JOIN (SELECT caid, h3idx_home
                                                        FROM {place_name}_user_locations) AS RHS
                                            ON (LHS.caid = RHS.caid)
                                  )
                        GROUP BY year, month, day, hour_of_day, h3idx, h3idx_home
                        ORDER BY year ASC, month ASC, day ASC, hour_of_day ASC, n DESC'''   

    cursor.execute(query_matriz_OD)
    


__Ejecutar el proceso__

In [8]:
# Launch the full extraction for the area configured in the parameter cells above.
extract_OD_matrix(
    cursor=cursor,
    place_name=place_name,
    country_code=country_code,
    h3_resolution=h3_resolution,
    extracted_table_location=extracted_table_location,
)

_c'est fini_