In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import time
import geohash
import logging
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [6]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# Export to local

In [7]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [8]:
# Function to process data for a specific date
def process_data_for_date(event_date, country_code, export_path):
    errors = []  # List to keep track of errors
    
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through 24 hours in increments of 3 hours
    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {start_hour}-{end_hour}")

        try:
            # Fetch the data for the specified event date and country code
            pe_tj_df3 = sql_engine.read_sql(
                f"""
                SELECT 
                    cuebiq_id,
                    duration_minutes,
                    length_meters,
                    number_of_points,
                    TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                    EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE 
                    event_date = {event_date}
                    AND end_country = '{country_code}' 
                    AND start_country = '{country_code}' 
                """
            )

            # Filter the DataFrame for the current 3-hour interval
            logging.info(f"Processing data for date: {event_date}, hours: {start_hour}-{end_hour}")
            filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= start_hour) & (pe_tj_df3['event_hour'] < end_hour)]

            # Add user numbers to the aggregated data for geohash3
            aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

            # Append the results to the CSV file for geohash3
            logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv3_file_path):
                aggregated_df3.to_csv(csv3_file_path, index=False)
            else:
                aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

            # Add user numbers to the aggregated data for geohash5
            aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

            # Append the results to the CSV file for geohash5
            logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv5_file_path):
                aggregated_df5.to_csv(csv5_file_path, index=False)
            else:
                aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)
        
        except Exception as e:
            error_message = f"Error processing date: {event_date}, hours: {start_hour}-{end_hour} - {str(e)}"
            logging.error(error_message)
            errors.append(error_message)
    
    return errors

# Function to process data for a range of dates
def process_data_for_date_range(start_date, end_date, country_code, export_path):
    current_date = start_date
    all_errors = []
    while current_date <= end_date:
        event_date = current_date.strftime("%Y%m%d")
        logging.info(f"Processing data for date: {event_date}")
        errors = process_data_for_date(event_date, country_code, export_path)
        all_errors.extend(errors)
        current_date += timedelta(days=1)
    
    # Log any errors that occurred
    if all_errors:
        logging.info("Errors occurred during processing:")
        for error in all_errors:
            logging.info(error)

In [9]:
# Specify the date range
start_date = datetime.strptime("20190228", "%Y%m%d")
end_date = datetime.strptime("20190630", "%Y%m%d")
country_code = 'IN'
export_path = '/home/jovyan/Data/TJ/3h/'

# Measure total execution time
start_time = time.time()

# Process data for the specified date range
process_data_for_date_range(start_date, end_date, country_code, export_path)


2024-07-02 05:12:43,142 - INFO - Processing data for date: 20190228
2024-07-02 05:12:43,143 - INFO - Querying data for date: 20190228, hours: 0-3
2024-07-02 05:13:43,386 - INFO - Processing data for date: 20190228, hours: 0-3
2024-07-02 05:13:43,407 - INFO - Exporting aggregated data (geohash3) for date: 20190228, hours: 0-3
2024-07-02 05:13:43,445 - INFO - Exporting aggregated data (geohash5) for date: 20190228, hours: 0-3
2024-07-02 05:13:43,460 - INFO - Querying data for date: 20190228, hours: 3-6
2024-07-02 05:14:11,651 - INFO - Processing data for date: 20190228, hours: 3-6
2024-07-02 05:14:11,676 - INFO - Exporting aggregated data (geohash3) for date: 20190228, hours: 3-6
2024-07-02 05:14:11,711 - INFO - Exporting aggregated data (geohash5) for date: 20190228, hours: 3-6
2024-07-02 05:14:11,727 - INFO - Querying data for date: 20190228, hours: 6-9
2024-07-02 05:14:28,385 - INFO - Processing data for date: 20190228, hours: 6-9
2024-07-02 05:14:28,429 - INFO - Exporting aggregated 

# Check missing dates/times

In [9]:
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'
# Define the export file paths
csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
csv3_file_path = f"{export_path}{csv3_file}"
csv5_file_path = f"{export_path}{csv5_file}"

final_df = pd.read_csv(csv3_file_path)
final_df['local_date'] = final_df['local_date'].astype(int)
final_df

Unnamed: 0,start_geohash3,end_geohash3,local_date,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_time
0,qpz,qpz,20190101,11,319.692424,18.150000,369.759705,10622.753732,4472.684420,13992.289067,6.272727,2.0,11.018992,20190101 00:00:00 - 03:00:00
1,qqg,qqg,20190101,211,218.170853,53.916667,313.656801,8737.583672,1612.809195,24155.813606,5.000000,2.0,8.441959,20190101 00:00:00 - 03:00:00
2,qqg,qqu,20190101,19,231.121930,68.200000,368.066313,36176.830101,15373.618545,43671.576650,4.263158,2.0,4.531888,20190101 00:00:00 - 03:00:00
3,qqt,qqt,20190101,12,326.195833,217.708333,352.397390,4881.131911,3290.121849,6251.524776,4.000000,2.5,3.247377,20190101 00:00:00 - 03:00:00
4,qqu,qqu,20190101,156,223.032158,64.300000,308.184984,12784.349973,1608.970669,56852.781262,4.570513,2.0,8.883557,20190101 00:00:00 - 03:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143486,w8j,w8j,20190819,10,22.165000,26.116667,11.921284,3156.114443,634.385410,6463.030898,4.300000,4.0,1.828782,20190819 21:00:00 - 24:00:00
143487,w8p,w8p,20190819,77,28.174242,21.483333,24.874522,4529.579396,2129.916561,6830.045733,5.038961,4.0,3.258356,20190819 21:00:00 - 24:00:00
143488,wb0,wb0,20190819,26,31.460256,16.658333,29.282130,5706.383253,585.489566,18198.006945,4.423077,3.0,4.419711,20190819 21:00:00 - 24:00:00
143489,wb2,wb2,20190819,97,30.447766,21.316667,27.862832,4446.269779,1572.560922,7185.324481,5.463918,4.0,4.349478,20190819 21:00:00 - 24:00:00


## Check missing dates 

In [13]:
unique_values_count = final_df["local_date"].unique()
# unique_values_count = final_df["local_date"].nunique()
# unique_values_count.sort()
# unique_values_count
# len(unique_values_count)

all_dates = pd.date_range(start='2019-01-01', end='2019-08-19').strftime('%Y%m%d').astype(int)

# Convert to numpy array
all_dates = np.array(all_dates)

# Find missing dates
missing_dates = np.setdiff1d(all_dates, unique_values_count)

missing_dates

array([], dtype=int64)

## Check missing times 

In [14]:
# Convert local_date to datetime objects
final_df['local_date'] = pd.to_datetime(final_df['local_date'], format='%Y%m%d')

# Function to parse the local_time column and extract the start hour
def parse_start_hour(time_str):
    try:
        start_time_str, _ = time_str.split(' - ')
        start_time = datetime.strptime(start_time_str, '%Y%m%d %H:%M:%S')
        return start_time.hour
    except ValueError:
        return None

final_df['start_hour'] = final_df['local_time'].map(parse_start_hour)

# Required hours to check
required_hours = set([0, 3, 6, 9, 12, 15, 18, 21])

# Group by local_date and check missing hours
missing_hours = {}

for date, group in final_df.groupby('local_date'):
    present_hours = set(group['start_hour'])
    missing = required_hours - present_hours
    if missing:
        missing_hours[date] = sorted(list(missing))

# Convert missing hours dictionary to DataFrame for better readability
missing_hours_df = pd.DataFrame(list(missing_hours.items()), columns=['Date', 'Missing Hours'])

# Display the missing hours
print("Missing hours for each date:")
print(missing_hours_df)

Missing hours for each date:
        Date Missing Hours
0 2019-01-02          [21]
1 2019-04-12           [3]
2 2019-05-26          [21]
3 2019-06-06          [12]
4 2019-08-01      [18, 21]


# Fillin Gaps

In [10]:
import time
import pandas as pd
import os
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a specific date and time intervals
def process_data_for_date(event_date, country_code, export_path, start_hour, end_hour):
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through the specified hours in increments of 3 hours
    for hour in range(start_hour, end_hour, 3):
        next_hour = hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {hour}-{next_hour}")

        # Fetch the data for the specified event date and country code
        pe_tj_df3 = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                duration_minutes,
                length_meters,
                number_of_points,
                TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Filter the DataFrame for the current 3-hour interval
        logging.info(f"Processing data for date: {event_date}, hours: {hour}-{next_hour}")
        filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= hour) & (pe_tj_df3['event_hour'] < next_hour)]

        # Add user numbers to the aggregated data for geohash3
        aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

        # Append the results to the CSV file for geohash3
        logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv3_file_path):
            aggregated_df3.to_csv(csv3_file_path, index=False)
        else:
            aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

        # Add user numbers to the aggregated data for geohash5
        aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

        # Append the results to the CSV file for geohash5
        logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv5_file_path):
            aggregated_df5.to_csv(csv5_file_path, index=False)
        else:
            aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)


In [11]:
# Specify the date and initial time interval
event_date = "20190429"
start_hour = 18
end_hour = 21  # Process until the end of the day
country_code = 'IN'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 19:01:12,005 - INFO - Querying data for date: 20190429, hours: 18-21
2024-07-02 19:01:41,484 - INFO - Processing data for date: 20190429, hours: 18-21
2024-07-02 19:01:41,543 - INFO - Exporting aggregated data (geohash3) for date: 20190429, hours: 18-21
2024-07-02 19:01:41,632 - INFO - Exporting aggregated data (geohash5) for date: 20190429, hours: 18-21


# For Explanation

In [15]:
import time
import pandas as pd
import os

start_time = time.time()

event_date = 20190103

country_code = 'IN'

# Define the export file paths
csv3_file = f"od_{country_code.lower()}_2019_agg3_3h.csv"
csv5_file = f"od_{country_code.lower()}_2019_agg5_3h.csv"
export_path = '/home/jovyan/Data/TJ/3h/'
csv3_file_path = f"{export_path}{csv3_file}"
csv5_file_path = f"{export_path}{csv5_file}"

# Loop through 24 hours in increments of 3 hours
for start_hour in range(0, 24, 3):
    end_hour = start_hour + 3  # end hour for the 3-hour interval

    # Fetch the data for the specified event date and country code
    pe_tj_df3 = sql_engine.read_sql(
        f"""
        SELECT 
            cuebiq_id,
            duration_minutes,
            length_meters,
            number_of_points,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date = {event_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}' 
        """
    )

    # Filter the DataFrame for the current 3-hour interval
    filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= start_hour) & (pe_tj_df3['event_hour'] < end_hour)]

    # Add user numbers to the aggregated data for geohash3
    aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
        'cuebiq_id': 'count',
        'duration_minutes': ['mean', 'median', 'std'],
        'length_meters': ['mean', 'median', 'std'],
        'number_of_points': ['mean', 'median', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                              'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                              'm_length_m', 'mdn_length_m', 'sd_length_m',
                              'm_points_no', 'mdn_points_no', 'sd_points_no']

    # Add the local_time column
    aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                   f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

    # Filter out rows where trip_count is less than or equal to 9
    aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

    # # Append the results to the CSV file for geohash3
    # if not os.path.isfile(csv3_file_path):
    #     aggregated_df3.to_csv(csv3_file_path, index=False)
    # else:
    #     aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

    # Add user numbers to the aggregated data for geohash5
    aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
        'cuebiq_id': 'count',
        'duration_minutes': ['mean', 'median', 'std'],
        'length_meters': ['mean', 'median', 'std'],
        'number_of_points': ['mean', 'median', 'std']
    }).reset_index()

    # Flatten the MultiIndex columns
    aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                              'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                              'm_length_m', 'mdn_length_m', 'sd_length_m',
                              'm_points_no', 'mdn_points_no', 'sd_points_no']

    # Add the local_time column
    aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                   f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

    # Filter out rows where trip_count is less than or equal to 9
    aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

    # # Append the results to the CSV file for geohash5
    # if not os.path.isfile(csv5_file_path):
    #     aggregated_df5.to_csv(csv5_file_path, index=False)
    # else:
    #     aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)

pandas_time = time.time() - start_time
print(f"Pandas operations execution time: {pandas_time} seconds")


Pandas operations execution time: 149.8554482460022 seconds


In [17]:
aggregated_df3

Unnamed: 0,start_geohash3,end_geohash3,local_date,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_time
3,t9v,t9v,20190103,12,28.291667,11.866667,36.693957,1556.880126,1002.937187,2043.622132,5.916667,4.5,5.195423,20190103 21:00:00 - 24:00:00
6,t9w,t9w,20190103,79,36.799789,24.000000,33.789140,4250.759102,675.382234,7595.562787,4.063291,3.0,2.538634,20190103 21:00:00 - 24:00:00
8,t9x,t9x,20190103,28,29.816667,18.075000,26.762632,3916.483534,640.739134,12866.515040,3.571429,3.0,2.588129,20190103 21:00:00 - 24:00:00
9,t9y,t9y,20190103,245,36.955646,26.316667,32.344040,4263.265409,1019.369438,8182.492890,5.236735,4.0,8.018490,20190103 21:00:00 - 24:00:00
12,t9z,t9z,20190103,62,40.524194,27.575000,35.611544,3380.254967,1038.797423,9386.756001,4.741935,4.0,3.552582,20190103 21:00:00 - 24:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,wh1,wh1,20190103,60,36.983333,30.641667,28.802591,12088.731471,1022.960649,33209.130475,5.483333,5.0,2.971570,20190103 21:00:00 - 24:00:00
237,wh6,wh6,20190103,20,31.115000,20.183333,28.292291,1606.360298,530.420567,3159.900184,4.000000,3.5,2.635786,20190103 21:00:00 - 24:00:00
239,wh9,wh9,20190103,87,33.046360,24.000000,31.685464,9950.059312,818.173118,42120.831986,4.712644,4.0,2.765687,20190103 21:00:00 - 24:00:00
241,whd,whd,20190103,19,35.208772,16.800000,30.987919,2620.389418,230.103694,5793.475337,3.736842,2.0,2.490919,20190103 21:00:00 - 24:00:00


In [16]:
aggregated_df5

Unnamed: 0,start_geohash5,end_geohash5,local_date,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_time
133,t9y30,t9y30,20190103,16,25.743750,18.100000,23.049263,914.540138,193.398505,1682.530966,7.125000,4.0,13.652228,20190103 21:00:00 - 24:00:00
195,t9ywx,t9ywx,20190103,18,27.483333,14.108333,28.211835,2067.711361,655.826934,3242.225684,4.611111,4.0,2.831314,20190103 21:00:00 - 24:00:00
204,t9yy8,t9yy8,20190103,10,42.798333,28.233333,38.503018,1398.170835,512.301681,2152.095069,5.800000,5.0,3.392803,20190103 21:00:00 - 24:00:00
235,t9z83,t9z83,20190103,10,32.678333,25.216667,25.136182,1810.188211,623.218678,3426.718113,4.700000,3.0,3.917199,20190103 21:00:00 - 24:00:00
255,t9zvw,t9zvw,20190103,10,35.218333,22.100000,34.476755,1080.602465,960.067620,875.369104,4.100000,4.0,1.449138,20190103 21:00:00 - 24:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4032,wh1yv,wh1yv,20190103,17,36.124510,30.116667,30.635309,4969.236797,570.681507,17027.435031,5.352941,4.0,3.823303,20190103 21:00:00 - 24:00:00
4037,wh1zj,wh1zj,20190103,21,22.729365,21.166667,13.335075,11931.911392,663.349774,34994.700861,4.285714,5.0,1.616875,20190103 21:00:00 - 24:00:00
4055,wh6v1,wh6v1,20190103,13,29.178205,19.666667,27.840942,901.354044,457.448653,1237.177019,3.538462,4.0,1.560736,20190103 21:00:00 - 24:00:00
4069,wh9hz,wh9hz,20190103,12,35.159722,22.600000,36.646429,28525.792259,516.490435,96317.718967,5.833333,5.0,4.041452,20190103 21:00:00 - 24:00:00


# Export to Schema

In [11]:
# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# # Database connection setup
# output_schema_name = 'od_matrix_10'
# con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

In [None]:
# 这个还没改
# # Function to process data for a single day
# def process_day(event_date, country_code, sql_engine):
#     try:
#         # Read data from the SQL table
#         pe_tj_df = sql_engine.read_sql(
#             f"""
#             SELECT 
#                 cuebiq_id,
#                 start_lat,
#                 start_lng,
#                 end_lat,
#                 end_lng,
#                 duration_minutes,
#                 length_meters,
#                 number_of_points
#             FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
#             WHERE 
#                 event_date = {event_date}
#                 AND end_country = '{country_code}' 
#                 AND start_country = '{country_code}' 
#             """
#         )
#         logging.info(f"Executing SQL query for date {event_date}")

#         # Encode geohashes
#         pe_tj_df['start_geohash3'] = pe_tj_df.apply(
#             lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=3), axis=1)
#         pe_tj_df['end_geohash3'] = pe_tj_df.apply(
#             lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=3), axis=1)

#         # Aggregate data
#         aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3']).agg({
#             'cuebiq_id': 'count',
#             'duration_minutes': ['mean', 'median', 'std'],
#             'length_meters': ['mean', 'median', 'std'],
#             'number_of_points': ['mean', 'median', 'std']
#         }).reset_index()
#         aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'trip_count', 
#                                   'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
#                                   'm_length_m', 'mdn_length_m', 'sd_length_m',
#                                   'm_points_no', 'mdn_points_no', 'sd_points_no']

#         # Filter aggregated data and reorder columns
#         filtered_df3= aggregated_df3.loc[aggregated_df3['trip_count'] > 9]
#         filtered_df3 = filtered_df3[['start_geohash3', 'end_geohash3', 'trip_count', 
#                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
#                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
#                                      'm_points_no', 'mdn_points_no', 'sd_points_no']]
#         return filtered_df3

#     except Exception as e:
#         logging.error(f"Error processing data for date {event_date}: {e}")
#         return pd.DataFrame()

# # Function to insert data with retry mechanism
# def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
#     for attempt in range(retries):
#         try:
#             df.to_sql(
#                 table_name, 
#                 con, 
#                 index=False, 
#                 if_exists="append", 
#                 method="multi"
#             )
#             logging.info(f"Inserted data into table {table_name}")
#             break
#         except Exception as e:
#             logging.error(f"Attempt {attempt+1} failed with error: {e}")
#             if attempt < retries - 1:
#                 time.sleep(delay)
#             else:
#                 logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")

# # Main processing loop
# def process_date_range(start_date, end_date, country_code, sql_engine):
#     start_time = time.time()  # Record start time before processing loop
        
#     current_date = start_date
#     while current_date <= end_date:
#         event_date = current_date.strftime('%Y%m%d')
        
#         filtered_df3 = process_day(event_date, country_code, sql_engine)

#         # Insert data into the database
#         if not filtered_df3.empty:
#             # table_name = f"{output_schema_name}.{country_code}_agg3"
#             table_name = f"od_{country_code.lower()}_{event_date}_agg3_10"
#             insert_data_with_retry(filtered_df3, table_name, con)
#         else:
#             logging.info(f"No data to insert for date {event_date}")

#         # Move to the next day
#         current_date += timedelta(days=1)
#     end_time = time.time()  # Record end time after processing loop
#     total_time = end_time - start_time
#     logging.info(f"Total processing time: {total_time:.2f} seconds")

In [None]:
# # Example usage:
# process_date_range(datetime(2019, 12, 31), datetime(2019, 12, 31), 'ID', sql_engine)