In [13]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [14]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import time
import geohash
import logging
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [6]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# Export to local

In [9]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [10]:
# Function to process data for a specific date
def process_data_for_date(event_date, country_code, export_path):
    errors = []  # List to keep track of errors
    
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through 24 hours in increments of 3 hours
    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {start_hour}-{end_hour}")

        try:
            # Fetch the data for the specified event date and country code
            pe_tj_df3 = sql_engine.read_sql(
                f"""
                SELECT 
                    cuebiq_id,
                    duration_minutes,
                    length_meters,
                    number_of_points,
                    TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                    EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE 
                    event_date = {event_date}
                    AND end_country = '{country_code}' 
                    AND start_country = '{country_code}' 
                """
            )

            # Filter the DataFrame for the current 3-hour interval
            logging.info(f"Processing data for date: {event_date}, hours: {start_hour}-{end_hour}")
            filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= start_hour) & (pe_tj_df3['event_hour'] < end_hour)]

            # Add user numbers to the aggregated data for geohash3
            aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

            # Append the results to the CSV file for geohash3
            logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv3_file_path):
                aggregated_df3.to_csv(csv3_file_path, index=False)
            else:
                aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

            # Add user numbers to the aggregated data for geohash5
            aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

            # Append the results to the CSV file for geohash5
            logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv5_file_path):
                aggregated_df5.to_csv(csv5_file_path, index=False)
            else:
                aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)
        
        except Exception as e:
            error_message = f"Error processing date: {event_date}, hours: {start_hour}-{end_hour} - {str(e)}"
            logging.error(error_message)
            errors.append(error_message)
    
    return errors

# Function to process data for a range of dates
def process_data_for_date_range(start_date, end_date, country_code, export_path):
    current_date = start_date
    all_errors = []
    while current_date <= end_date:
        event_date = current_date.strftime("%Y%m%d")
        logging.info(f"Processing data for date: {event_date}")
        errors = process_data_for_date(event_date, country_code, export_path)
        all_errors.extend(errors)
        current_date += timedelta(days=1)
    
    # Log any errors that occurred
    if all_errors:
        logging.info("Errors occurred during processing:")
        for error in all_errors:
            logging.info(error)

In [11]:
# Specify the date range
start_date = datetime.strptime("20190525", "%Y%m%d")
end_date = datetime.strptime("20190630", "%Y%m%d")
country_code = 'MX'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date range
process_data_for_date_range(start_date, end_date, country_code, export_path)


2024-07-03 06:28:08,894 - INFO - Processing data for date: 20190525
2024-07-03 06:28:08,895 - INFO - Querying data for date: 20190525, hours: 0-3
2024-07-03 06:28:44,187 - INFO - Processing data for date: 20190525, hours: 0-3
2024-07-03 06:28:44,219 - INFO - Exporting aggregated data (geohash3) for date: 20190525, hours: 0-3
2024-07-03 06:28:44,256 - INFO - Exporting aggregated data (geohash5) for date: 20190525, hours: 0-3
2024-07-03 06:28:44,272 - INFO - Querying data for date: 20190525, hours: 3-6
2024-07-03 06:29:23,288 - INFO - Processing data for date: 20190525, hours: 3-6
2024-07-03 06:29:23,325 - INFO - Exporting aggregated data (geohash3) for date: 20190525, hours: 3-6
2024-07-03 06:29:23,361 - INFO - Exporting aggregated data (geohash5) for date: 20190525, hours: 3-6
2024-07-03 06:29:23,374 - INFO - Querying data for date: 20190525, hours: 6-9
2024-07-03 06:29:49,689 - INFO - Processing data for date: 20190525, hours: 6-9
2024-07-03 06:29:49,751 - INFO - Exporting aggregated 

# Fillin Gaps

In [7]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a specific date and time intervals
def process_data_for_date(event_date, country_code, export_path, start_hour, end_hour):
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through the specified hours in increments of 3 hours
    for hour in range(start_hour, end_hour, 3):
        next_hour = hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {hour}-{next_hour}")

        # Fetch the data for the specified event date and country code
        pe_tj_df3 = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                duration_minutes,
                length_meters,
                number_of_points,
                TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Filter the DataFrame for the current 3-hour interval
        logging.info(f"Processing data for date: {event_date}, hours: {hour}-{next_hour}")
        filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= hour) & (pe_tj_df3['event_hour'] < next_hour)]

        # Add user numbers to the aggregated data for geohash3
        aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

        # Append the results to the CSV file for geohash3
        logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv3_file_path):
            aggregated_df3.to_csv(csv3_file_path, index=False)
        else:
            aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

        # Add user numbers to the aggregated data for geohash5
        aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

        # Append the results to the CSV file for geohash5
        logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv5_file_path):
            aggregated_df5.to_csv(csv5_file_path, index=False)
        else:
            aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)


In [8]:
# Specify the date and initial time interval
event_date = "20190524"
start_hour = 3
end_hour = 24  # Process until the end of the day
country_code = 'MX'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)

2024-07-03 06:24:51,898 - INFO - Querying data for date: 20190524, hours: 3-6
2024-07-03 06:25:26,529 - INFO - Processing data for date: 20190524, hours: 3-6
2024-07-03 06:25:26,559 - INFO - Exporting aggregated data (geohash3) for date: 20190524, hours: 3-6
2024-07-03 06:25:26,602 - INFO - Exporting aggregated data (geohash5) for date: 20190524, hours: 3-6
2024-07-03 06:25:26,618 - INFO - Querying data for date: 20190524, hours: 6-9
2024-07-03 06:25:52,302 - INFO - Processing data for date: 20190524, hours: 6-9
2024-07-03 06:25:52,381 - INFO - Exporting aggregated data (geohash3) for date: 20190524, hours: 6-9
2024-07-03 06:25:52,456 - INFO - Exporting aggregated data (geohash5) for date: 20190524, hours: 6-9
2024-07-03 06:25:52,485 - INFO - Querying data for date: 20190524, hours: 9-12
2024-07-03 06:26:16,348 - INFO - Processing data for date: 20190524, hours: 9-12
2024-07-03 06:26:16,462 - INFO - Exporting aggregated data (geohash3) for date: 20190524, hours: 9-12
2024-07-03 06:26:1