Combine daily OD matrices (geohash-5 aggregation) into one 2019 table per country
Worked so far:  
od_in_2019_agg5  


In [1]:
import os
import time
import logging
import datetime
import pandas as pd
from trino.dbapi import connect 
from sqlalchemy import create_engine
from datetime import timedelta
# Root logger setup: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [2]:
# Load (or reload) the `sql` extension that provides the %sql magic.
%reload_ext sql
# SqlMagic options below; meanings as described in the ipython-sql / JupySQL docs.
# autocommit=False: do not switch the connection into autocommit mode.
%config SqlMagic.autocommit=False
# autolimit=0: no automatic cap on the number of rows fetched.
%config SqlMagic.autolimit=0
# autopandas=True: return query results as pandas DataFrames.
%config SqlMagic.autopandas=True
# displaylimit=200: render at most 200 rows when a result is displayed.
%config SqlMagic.displaylimit=200

In [3]:
# Open the %sql connection to the Trino endpoint on localhost:9090, using "cuebiq" as the catalog.
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [4]:
class TrinoEngine():
    """
    Thin wrapper around a Trino connection.

    Holds two handles that target the same server and catalog:

    * ``self.cur``    - DB-API cursor, used by ``execute_statement``.
    * ``self.engine`` - SQLAlchemy engine, used by ``read_sql`` through pandas.

    Parameters
    ----------
    host : str
        Trino coordinator host. Defaults to ``"localhost"``.
    port : int
        Trino coordinator port. Defaults to ``9090``.
    catalog : str
        Default catalog for both handles. Defaults to ``"cuebiq"``.

    Calling ``TrinoEngine()`` with no arguments connects exactly as the
    previously hard-coded version did.
    """
    def __init__(self, host:str="localhost", port:int=9090, catalog:str="cuebiq"):
        # Keep the connection on the instance so it stays reachable
        # (e.g. to close it) instead of living only behind the cursor.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Build the URL from the same settings as the DB-API connection so
        # the two handles can never point at different servers/catalogs.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")
    
    def execute_statement(self, query:str) -> list:
        """
        Run a statement on the DB-API cursor and return all fetched rows.

        Intended for create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Run a query through pandas on the SQLAlchemy engine.

        Intended for select and insert-into operations; returns whatever
        ``pd.read_sql`` produces for the query as a DataFrame.
        """
        return pd.read_sql(query, self.engine)

# Module-level engine instance; the per-date insert loop later in the notebook calls sql_engine.read_sql().
sql_engine = TrinoEngine()

In [6]:
# Two-letter country code, lower-cased because it is embedded in table names
country_abbreviation = "MX".lower()  

# First and last calendar day to combine (both inclusive)
start_date = datetime.date(2019, 1, 1)
end_date = datetime.date(2019, 12, 31)

# One 'YYYYMMDD' string per day, produced by walking the date ordinals
date_list = [
    datetime.date.fromordinal(day_ordinal).strftime('%Y%m%d')
    for day_ordinal in range(start_date.toordinal(), end_date.toordinal() + 1)
]

# Number of days covered
len(date_list)

365

In [7]:
# Create master table
output_catalog = 'dedicated'
output_schema_name = 'od_matrix_10'
# output_schema_name = 'presence_data' # for test
input_schema_name = 'od_matrix_10'

master_table = f"od_{country_abbreviation}_2019_agg5"
# Derive the catalog from output_catalog instead of repeating the literal, so
# changing the setting above moves both the CREATE here and the later
# fully-qualified INSERT to the same place.
con = create_engine(f"trino://localhost:9090/{output_catalog}/{output_schema_name}")

# Create the master table in the output schema. The table name is unqualified,
# so it relies on the catalog/schema given in the engine URL above.
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {master_table}(
    start_geohash5 varchar,
    end_geohash5 varchar,
    trip_count bigint,
    m_duration_min double,
    mdn_duration_min double,
    sd_duration_min double,
    m_length_m double,
    mdn_length_m double,
    sd_length_m double,
    m_points_no double,
    mdn_points_no double,
    sd_points_no double,
    local_date varchar(8)
)
WITH (
    format = 'TEXTFILE'
)
"""

with con.connect() as connection:
    logging.info(f"Creating master table: {master_table}")
    # SQLAlchemy 2.x rejects a plain string passed to Connection.execute().
    # exec_driver_sql (available since 1.4) sends the raw SQL to the driver;
    # fall back to execute() on older versions that lack it.
    run_raw_sql = getattr(connection, "exec_driver_sql", connection.execute)
    run_raw_sql(create_table_query)

2024-06-25 22:46:10,681 - INFO - Creating master table: od_mx_2019_agg5


In [8]:
# Loop through each date and insert that day's rows into the master table.
# NOTE: this is append-only - nothing here removes rows inserted by an earlier
# run, so re-running the cell for an already-loaded date duplicates its rows.
failed_dates = []  # dates whose INSERT raised, reported in the final summary
for date in date_list:
    logging.info(f"Processing data for date: {date}")
    query = f"""
    INSERT INTO {output_catalog}.{output_schema_name}.{master_table} (
    start_geohash5, end_geohash5, trip_count,
    m_duration_min, mdn_duration_min, sd_duration_min,
    m_length_m, mdn_length_m, sd_length_m,
    m_points_no, mdn_points_no, sd_points_no,
    local_date)
        SELECT DISTINCT
            start_geohash5,
            end_geohash5,
            trip_count,
            ROUND(m_duration_min, 6) AS rounded_m_duration_min,
            ROUND(mdn_duration_min, 6) AS rounded_mdn_duration_min,
            ROUND(sd_duration_min, 6) AS rounded_sd_duration_min,
            ROUND(m_length_m, 6) AS rounded_m_length_m,
            ROUND(mdn_length_m, 6) AS rounded_mdn_length_m,
            ROUND(sd_length_m, 6) AS rounded_sd_length_m,
            ROUND(m_points_no, 6) AS rounded_m_points_no,
            ROUND(mdn_points_no, 6) AS rounded_mdn_points_no,
            ROUND(sd_points_no, 6) AS rounded_sd_points_no, 
            '{date}' AS local_date
        FROM {output_catalog}.{input_schema_name}.od_{country_abbreviation}_{date}_agg5_10
    """
    
    try:
        # read_sql is used only to submit the INSERT; the returned frame is discarded.
        sql_engine.read_sql(query)
        logging.info(f"Successfully inserted data for date: {date}")
    except Exception as e:
        # Best effort: one bad day must not stop the rest of the year, but
        # remember it so the summary below does not claim full success.
        logging.error(f"Error inserting data for date: {date} - {e}")
        failed_dates.append(date)

if failed_dates:
    logging.warning(
        f"Data combined with failures: {len(failed_dates)} of {len(date_list)} "
        f"dates were not inserted: {failed_dates}"
    )
else:
    logging.info("Data combined and inserted successfully.")

2024-06-25 22:46:16,413 - INFO - Processing data for date: 20190101
2024-06-25 22:46:25,339 - INFO - Successfully inserted data for date: 20190101
2024-06-25 22:46:25,339 - INFO - Processing data for date: 20190102
2024-06-25 22:46:28,392 - INFO - Successfully inserted data for date: 20190102
2024-06-25 22:46:28,393 - INFO - Processing data for date: 20190103
2024-06-25 22:46:31,184 - INFO - Successfully inserted data for date: 20190103
2024-06-25 22:46:31,185 - INFO - Processing data for date: 20190104
2024-06-25 22:46:33,889 - INFO - Successfully inserted data for date: 20190104
2024-06-25 22:46:33,890 - INFO - Processing data for date: 20190105
2024-06-25 22:46:36,506 - INFO - Successfully inserted data for date: 20190105
2024-06-25 22:46:36,507 - INFO - Processing data for date: 20190106
2024-06-25 22:46:39,205 - INFO - Successfully inserted data for date: 20190106
2024-06-25 22:46:39,206 - INFO - Processing data for date: 20190107
2024-06-25 22:46:43,773 - INFO - Successfully inse