In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [6]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [6]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    try:
        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Encode geohashes
        pe_tj_df['start_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
        pe_tj_df['end_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)

        pe_tj_df['start_geohash3'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=3), axis=1)
        pe_tj_df['end_geohash3'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=3), axis=1)

        # Load cell lists from SQL
        try:
            celllist5 = sql_engine.read_sql(f"SELECT geohash5 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg5")
            geohash_dict5 = celllist5.set_index('geohash')['no_of_unique_users'].to_dict()
        except Exception as e:
            logging.warning(f"Failed to load geohash5 data for date {event_date}: {e}")
            geohash_dict5 = {}

        try:
            celllist3 = sql_engine.read_sql(f"SELECT geohash3 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg3")
            geohash_dict3 = celllist3.set_index('geohash')['no_of_unique_users'].to_dict()
        except Exception as e:
            logging.warning(f"Failed to load geohash3 data for date {event_date}: {e}")
            geohash_dict3 = {}

        # Add user numbers to the aggregated data
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std']
        }).reset_index()
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']
        aggregated_df5['start_geohash_user'] = aggregated_df5['start_geohash5'].map(geohash_dict5)
        aggregated_df5['end_geohash_user'] = aggregated_df5['end_geohash5'].map(geohash_dict5)

        aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std']
        }).reset_index()
        aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']
        aggregated_df3['start_geohash_user'] = aggregated_df3['start_geohash3'].map(geohash_dict3)
        aggregated_df3['end_geohash_user'] = aggregated_df3['end_geohash3'].map(geohash_dict3)

        # Filter aggregated data
        filtered_df5 = aggregated_df5.dropna(subset=['start_geohash_user', 'end_geohash_user'])
        filtered_df3 = aggregated_df3.dropna(subset=['start_geohash_user', 'end_geohash_user'])

        # Reorder columns
        filtered_df5 = filtered_df5[['start_geohash5', 'start_geohash_user', 'end_geohash5', 'end_geohash_user', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']]
        filtered_df3 = filtered_df3[['start_geohash3', 'start_geohash_user', 'end_geohash3', 'end_geohash_user', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']]

        return filtered_df5, filtered_df3

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame(), pd.DataFrame()

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    start_time = time.time()  # Record start time before processing loop
        
    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        
        filtered_df5, filtered_df3 = process_day(event_date, country_code, sql_engine)

        # Define file paths
        file_path_5 = f'/home/jovyan/Data/Agg_TJ/{country_code}5/{event_date}_{country_code}_pe_tj_agg5.csv'
        file_path_3 = f'/home/jovyan/Data/Agg_TJ/{country_code}3/{event_date}_{country_code}_pe_tj_agg3.csv'

        # Save results if data is not empty and file does not already exist
        if not filtered_df5.empty and not os.path.exists(file_path_5):
            os.makedirs(os.path.dirname(file_path_5), exist_ok=True)
            filtered_df5.to_csv(file_path_5, index=False)
            logging.info(f"File saved to {file_path_5}")
        else:
            logging.info(f"File already exists or no data for {file_path_5}")

        if not filtered_df3.empty and not os.path.exists(file_path_3):
            os.makedirs(os.path.dirname(file_path_3), exist_ok=True)
            filtered_df3.to_csv(file_path_3, index=False)
            logging.info(f"File saved to {file_path_3}")
        else:
            logging.info(f"File already exists or no data for {file_path_3}")

        # Move to the next day
        current_date += timedelta(days=1)
    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

# Example usage:
process_date_range(datetime(2019, 1, 1), datetime(2019, 1, 5), 'ID', sql_engine)


2024-06-05 15:28:19,324 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID5/20190101_ID_pe_tj_agg5.csv
2024-06-05 15:28:19,327 - INFO - File already exists or no data for /home/jovyan/Data/Agg_TJ/ID3/20190101_ID_pe_tj_agg3.csv
2024-06-05 15:28:31,358 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID5/20190102_ID_pe_tj_agg5.csv
2024-06-05 15:28:31,361 - INFO - File already exists or no data for /home/jovyan/Data/Agg_TJ/ID3/20190102_ID_pe_tj_agg3.csv
2024-06-05 15:28:44,325 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID5/20190103_ID_pe_tj_agg5.csv
2024-06-05 15:28:44,327 - INFO - File already exists or no data for /home/jovyan/Data/Agg_TJ/ID3/20190103_ID_pe_tj_agg3.csv
2024-06-05 15:28:57,655 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID5/20190104_ID_pe_tj_agg5.csv
2024-06-05 15:28:57,657 - INFO - File already exists or no data for /home/jovyan/Data/Agg_TJ/ID3/20190104_ID_pe_tj_agg3.csv
2024-06-05 15:29:09,791 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID5/20190105_ID_pe_t

In [10]:
import pandas as pd
df = pd.read_csv('/home/jovyan/Data/Agg_TJ/ID5/20190105_ID_pe_tj_agg5.csv')
df

Unnamed: 0,start_geohash5,start_geohash_user,end_geohash5,end_geohash_user,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m
0,qpz6e,215.0,qpz6e,215.0,64,62.748958,21.808333,106.319006,2809.440426,864.920319,4551.587443
1,qpz6e,215.0,qpz6g,152.0,16,65.984375,48.000000,51.030957,19473.521774,5115.413591,46925.373056
2,qpz6e,215.0,qpz6k,67.0,5,26.430000,25.433333,10.579195,5665.807804,6349.779386,1209.147733
3,qpz6e,215.0,qpz6s,199.0,14,41.248810,29.466667,38.708804,6747.501256,4458.383775,7528.676818
4,qpz6e,215.0,qpz6t,79.0,3,55.611111,32.100000,43.096553,8195.709085,8274.073697,1216.842496
...,...,...,...,...,...,...,...,...,...,...,...
13545,wb4s6,53.0,wb4sd,48.0,8,152.789583,31.841667,238.769228,5332.915015,4077.688602,3744.640126
13546,wb4sd,48.0,wb4s6,53.0,8,71.468750,29.866667,77.038761,4674.883895,4961.838307,1801.024438
13547,wb4sd,48.0,wb4sd,48.0,10,38.253333,25.416667,38.764266,8943.166572,2513.405653,11543.462684
13548,wb71b,16.0,wb71b,16.0,3,228.194444,166.716667,110.855577,639.779805,294.787540,787.798345


In [11]:
df.columns

Index(['start_geohash5', 'start_geohash_user', 'end_geohash5',
       'end_geohash_user', 'trip_count', 'm_duration_min', 'mdn_duration_min',
       'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m'],
      dtype='object')

In [13]:
from sqlalchemy.engine import create_engine
output_schema_name = 'presence_data'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

In [23]:
import pandas as pd
from sqlalchemy.engine import create_engine

# Replace with your actual schema name
output_schema_name = "presence_data"
bucketized_table = f"dedicated.{output_schema_name}.bucketiz1_table"

# Partition size
partition_size = 5000

# Data preparation
df['partition_key'] = (df.index // partition_size) + 1
df = df.astype({
    'start_geohash_user': 'int',
    'end_geohash_user': 'int',
    'trip_count': 'int',
    'partition_key': 'int'
})

# Create the SQL engine
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# Define the SQL query to create the bucketized and partitioned table
create_table_query = f"""
CREATE TABLE {bucketized_table} (
    start_geohash5 varchar,
    start_geohash_user bigint,
    end_geohash5 varchar,
    end_geohash_user bigint,
    trip_count bigint,
    m_duration_min double,
    mdn_duration_min double,
    sd_duration_min double,
    m_length_m double,
    mdn_length_m double,
    sd_length_m double,
    partition_key bigint
)
WITH (
  partitioned_by = ARRAY['partition_key'],
  bucketed_by = ARRAY['end_geohash5'],
  bucket_count = 5
)
"""

# Execute the create table query
with con.connect() as connection:
    connection.execute(create_table_query)

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size=5000):
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

# Insert data into the bucketized table
insert_data_in_chunks(df, bucketized_table, con, 5000)

country_code = 'id'
date_str = "20190101"
final_bucketized_table = f"od_{country_code.lower()}_{date_str}_agg3"

# Rename the table to the final name based on the date range and country code
rename_table_query = f"""
ALTER TABLE "bucketized_table" RENAME TO "{final_bucketized_table}"
"""

# Execute the rename table query
with con.connect() as connection:
    connection.execute(rename_table_query)

OperationalError: (trino.exceptions.TrinoExternalError) TrinoExternalError(type=EXTERNAL, name=HIVE_METASTORE_ERROR, message="dedicated.presence_data.bucketiz1_table is not a valid object name", query_id=20240605_185713_00642_mejye)
[SQL: 
CREATE TABLE "dedicated.presence_data.bucketiz1_table" (
	start_geohash5 VARCHAR, 
	start_geohash_user BIGINT, 
	end_geohash5 VARCHAR, 
	end_geohash_user BIGINT, 
	trip_count BIGINT, 
	m_duration_min DOUBLE, 
	mdn_duration_min DOUBLE, 
	sd_duration_min DOUBLE, 
	m_length_m DOUBLE, 
	mdn_length_m DOUBLE, 
	sd_length_m DOUBLE, 
	partition_key BIGINT
)

]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [8]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging
from sqlalchemy import create_engine

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    try:
        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Encode geohashes
        pe_tj_df['start_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
        pe_tj_df['end_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)

        # Load cell lists from SQL
        try:
            celllist5 = sql_engine.read_sql(f"SELECT geohash5 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg5")
            geohash_dict5 = celllist5.set_index('geohash')['no_of_unique_users'].to_dict()
        except Exception as e:
            logging.warning(f"Failed to load geohash5 data for date {event_date}: {e}")
            geohash_dict5 = {}

        # Add user numbers to the aggregated data
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std']
        }).reset_index()
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']

        # Define the columns before mapping
        aggregated_df5['start_geohash_user'] = aggregated_df5['start_geohash5'].map(geohash_dict5)
        aggregated_df5['end_geohash_user'] = aggregated_df5['end_geohash5'].map(geohash_dict5)

        # Filter aggregated data
        filtered_df5 = aggregated_df5.dropna(subset=['start_geohash_user', 'end_geohash_user'])
        
        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size=5000):
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    start_time = time.time()  # Record start time before processing loop
        
    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        
        filtered_df5 = process_day(event_date, country_code, sql_engine)

        # Create the SQL engine
        output_schema_name = "od_matrix"
        bucketized_table2 = f"dedicated.{output_schema_name}.tttest_table"
        con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

        # Create the SQL table if not exists for 5-level geohash
        create_table_query_5 = f"""
        CREATE TABLE IF NOT EXISTS {bucketized_table2} (
            start_geohash5 varchar,
            start_geohash_user bigint,
            end_geohash5 varchar,
            end_geohash_user bigint,
            trip_count bigint,
            m_duration_min double,
            mdn_duration_min double,
            sd_duration_min double,
            m_length_m double,
            mdn_length_m double,
            sd_length_m double,
            partition_key bigint
        )
        WITH (
          partitioned_by = ARRAY['partition_key'],
          bucketed_by = ARRAY['end_geohash5'],
          bucket_count = 5
        )
        """

        with con.connect() as connection:
            connection.execute(create_table_query_5)

        # Add partition key and ensure correct data types for 5-level geohash
        filtered_df5['partition_key'] = (filtered_df5.index // 5000) + 1
        filtered_df5 = filtered_df5.astype({
            'start_geohash_user': 'int',
            'end_geohash_user': 'int',
            'trip_count': 'int',
            'partition_key': 'int'
        })

        # Insert data into the bucketized tables and rename
        if not filtered_df5.empty:
            insert_data_in_chunks(filtered_df5, "bucketized_table2", con, 5000)
            logging.info(f"Data inserted into {bucketized_table2}")
            
            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5"
            
            rename_table_query_5 = f"""
            ALTER TABLE "bucketized_table2" RENAME TO "{final_table_5}"
            """
            with con.connect() as connection:
                connection.execute(rename_table_query_5)
        else:
            logging.info(f"No data to insert for {bucketized_table2} for 5-level geohash")

        # Move to the next day
        current_date += timedelta(days=1)
    
    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

# Example usage:
process_date_range(datetime(2019, 1, 1), datetime(2019, 1, 5), 'ID', sql_engine)


2024-06-05 23:18:46,169 - INFO - Data inserted into dedicated.presence_data.tttest_table
2024-06-05 23:20:13,449 - INFO - Data inserted into dedicated.presence_data.tttest_table
2024-06-05 23:21:41,171 - INFO - Data inserted into dedicated.presence_data.tttest_table
2024-06-05 23:23:07,675 - INFO - Data inserted into dedicated.presence_data.tttest_table
2024-06-05 23:24:37,472 - INFO - Data inserted into dedicated.presence_data.tttest_table
2024-06-05 23:24:38,580 - INFO - Total processing time: 442.27 seconds


In [19]:
# break above down:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging
from sqlalchemy import create_engine

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    try:
        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        print("Data read from SQL:", pe_tj_df.head())

        # Encode geohashes
        pe_tj_df['start_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
        pe_tj_df['end_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)

        print("Geohashes encoded:", pe_tj_df[['start_geohash5', 'end_geohash5']].head())

        # Load cell lists from SQL
        try:
            celllist5 = sql_engine.read_sql(f"SELECT geohash5 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg5")
            geohash_dict5 = celllist5.set_index('geohash')['no_of_unique_users'].to_dict()
        except Exception as e:
            logging.warning(f"Failed to load geohash5 data for date {event_date}: {e}")
            geohash_dict5 = {}

        print("Geohash dictionary:", geohash_dict5)

        # Add user numbers to the aggregated data
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std']
        }).reset_index()
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']

        # Define the columns before mapping
        aggregated_df5['start_geohash_user'] = aggregated_df5['start_geohash5'].map(geohash_dict5)
        aggregated_df5['end_geohash_user'] = aggregated_df5['end_geohash5'].map(geohash_dict5)

        print("Aggregated data with user numbers:", aggregated_df5.head())

        # Filter aggregated data
        filtered_df5 = aggregated_df5.dropna(subset=['start_geohash_user', 'end_geohash_user'])

        print("Filtered data:", filtered_df5.head())

        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size=5000):
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    start_time = time.time()  # Record start time before processing loop
        
    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        
        filtered_df5 = process_day(event_date, country_code, sql_engine)

        # Create the SQL engine
        output_schema_name = "presence_data"
        bucketized_table = f"dedicated.{output_schema_name}.test_table"
        con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

        # Create the SQL table if not exists for 5-level geohash
        create_table_query_5 = f"""
        CREATE TABLE IF NOT EXISTS {bucketized_table} (
            start_geohash5 varchar,
            start_geohash_user bigint,
            end_geohash5 varchar,
            end_geohash_user bigint,
            trip_count bigint,
            m_duration_min double,
            mdn_duration_min double,
            sd_duration_min double,
            m_length_m double,
            mdn_length_m double,
            sd_length_m double,
            partition_key bigint
        )
        WITH (
          partitioned_by = ARRAY['partition_key'],
          bucketed_by = ARRAY['end_geohash5'],
          bucket_count = 5
        )
        """

        with con.connect() as connection:
            connection.execute(create_table_query_5)

        # Add partition key
        filtered_df5['partition_key'] = (filtered_df5.index // 5000) + 1

        # Ensure all necessary columns exist in the DataFrame
        for column in ['start_geohash_user', 'end_geohash_user', 'trip_count', 'partition_key']:
            if column not in filtered_df5.columns:
                filtered_df5[column] = 0

        # Ensure correct data types for 5-level geohash
        filtered_df5 = filtered_df5.astype({
            'start_geohash_user': 'int',
            'end_geohash_user': 'int',
            'trip_count': 'int',
            'partition_key': 'int'
        })

        # Insert data into the bucketized tables and rename
        if not filtered_df5.empty:
            insert_data_in_chunks(filtered_df5, "bucketized_table", con, 5000)
            logging.info(f"Data inserted into {bucketized_table}")
            
            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5"
            
            rename_table_query_5 = f"""
            ALTER TABLE "bucketized_table" RENAME TO "{final_table_5}"
            """
            with con.connect() as connection:
                connection.execute(rename_table_query_5)
        else:
            logging.info(f"No data to insert for {bucketized_table} for 5-level geohash")

        # Move to the next day
        current_date += timedelta(days=1)
    
    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

# Example usage:
process_date_range(datetime(2019, 1, 1), datetime(2019, 1, 1), 'ID', sql_engine)

Data read from SQL:     cuebiq_id  start_lat   start_lng   end_lat     end_lng  duration_minutes  \
0  1698762720  -5.423305  105.248176 -5.423459  105.247534         26.933333   
1  1698762720  -5.423459  105.247534 -5.423236  105.248163          4.566667   
2  1698762720  -5.423236  105.248163 -5.423589  105.248379         57.116667   
3  1698435164  -7.775437  110.319752 -8.110177  110.468590        232.250000   
4  1698435164  -8.110177  110.468590 -8.106027  110.462213         47.650000   

   length_meters  
0      73.166107  
1      74.001094  
2     104.751414  
3   51305.047139  
4    1116.158415  
Geohashes encoded:   start_geohash5 end_geohash5
0          qr4c4        qr4c4
1          qr4c4        qr4c4
2          qr4c4        qr4c4
3          qqw7y        qqw9c
4          qqw9c        qqw9c
Geohash dictionary: {'qpz3u': 12, 'qpz6e': 159, 'qpz6g': 138, 'qpz6h': 17, 'qpz6k': 51, 'qpz6s': 166, 'qpz6t': 71, 'qpz6u': 63, 'qpz6v': 16, 'qpz6w': 24, 'qpz75': 55, 'qpz76': 37, 'qpz77

OperationalError: (trino.exceptions.TrinoExternalError) TrinoExternalError(type=EXTERNAL, name=HIVE_PATH_ALREADY_EXISTS, message="Target directory for table 'presence_data.bucketized_table' already exists: s3a://cuebiq-paas-org-1872-kr7pteibwb/metastore/presence_data.db/bucketized_table", query_id=20240605_213647_00985_mejye)
[SQL: 
CREATE TABLE bucketized_table (
	start_geohash5 VARCHAR, 
	end_geohash5 VARCHAR, 
	trip_count BIGINT, 
	m_duration_min DOUBLE, 
	mdn_duration_min DOUBLE, 
	sd_duration_min DOUBLE, 
	m_length_m DOUBLE, 
	mdn_length_m DOUBLE, 
	sd_length_m DOUBLE, 
	start_geohash_user BIGINT, 
	end_geohash_user BIGINT, 
	partition_key BIGINT
)

]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [7]:
import pandas as pd
import geohash
from datetime import datetime, timedelta
import logging
from sqlalchemy import create_engine

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [10]:
current_date = datetime.strptime('20191231', '%Y%m%d')
country_code = 'IN'

In [12]:
event_date = current_date.strftime('%Y%m%d')
try:
    pe_tj_df = sql_engine.read_sql(
        f"""
        SELECT 
            cuebiq_id,
            start_lat,
            start_lng,
            end_lat,
            end_lng,
            duration_minutes,
            length_meters
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date = {event_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}' 
        """
    )
except Exception as e:
    logging.error(f"Error reading data for date {event_date}: {e}")
    
pe_tj_df

Unnamed: 0,cuebiq_id,start_lat,start_lng,end_lat,end_lng,duration_minutes,length_meters
0,2072448547,13.200317,77.70988,13.200019,77.708898,8.1,111.497342
1,2072448547,13.200019,77.708898,13.201249,77.710446,6.133333,216.473069
2,2573855929,22.641818,88.439534,22.642586,88.438834,3.616667,111.781136
3,2573855929,22.642586,88.438834,22.500821,88.353353,45.933333,20682.938257
4,2568416615,19.098772,72.874246,19.098587,72.873595,12.05,244.820889
5,2568416615,19.098587,72.873595,19.098606,72.87346,13.016667,245.590944
6,2134531595,28.555631,77.083816,28.555225,77.085207,37.533333,211.626826
7,2134531595,28.555225,77.085207,28.554646,77.0855,8.416667,70.557999
8,2134531595,28.554646,77.0855,28.552414,77.08734,22.616667,399.60043
9,2134531595,28.552414,77.08734,28.5533,77.08486,11.033333,261.781491


In [15]:
try:
    pe_tj_df['start_geohash5'] = pe_tj_df.apply(
        lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
    pe_tj_df['end_geohash5'] = pe_tj_df.apply(
        lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)
    print("Geohashes encoded:", pe_tj_df[['start_geohash5', 'end_geohash5']].head())
except Exception as e:
    logging.error(f"Error encoding geohashes for date {event_date}: {e}")


Geohashes encoded:   start_geohash5 end_geohash5
0          qw8hk        qw8hk
1          qw8hk        qw8hk
2          qw8hk        qw8hk
3          qw8hk        qw8hh
4          qw8fr        qw8fn


In [18]:
try:
    celllist5 = sql_engine.read_sql(f"SELECT geohash5 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg5")
    geohash_dict5 = celllist5.set_index('geohash')['no_of_unique_users'].to_dict()
    print("Geohash dictionary:", geohash_dict5)
except Exception as e:
    logging.warning(f"Failed to load geohash5 data for date {event_date}: {e}")
    geohash_dict5 = {}


Geohash dictionary: {'qpz3u': 12, 'qpz6e': 159, 'qpz6g': 138, 'qpz6h': 17, 'qpz6k': 51, 'qpz6s': 166, 'qpz6t': 71, 'qpz6u': 63, 'qpz6v': 16, 'qpz6w': 24, 'qpz75': 55, 'qpz76': 37, 'qpz77': 23, 'qpz7d': 34, 'qpz7f': 18, 'qpz7h': 21, 'qpz9v': 12, 'qpzdf': 12, 'qpze4': 11, 'qpze6': 31, 'qpze9': 16, 'qpzec': 14, 'qpzed': 39, 'qpzhq': 19, 'qpzk1': 11, 'qpzk2': 12, 'qpzk3': 15, 'qpzk4': 18, 'qpzkc': 11, 'qpzkf': 12, 'qpzkg': 13, 'qpzm5': 12, 'qpzm7': 11, 'qpzmk': 39, 'qpzmm': 12, 'qpzms': 18, 'qpzmu': 25, 'qpzq4': 14, 'qpzq5': 32, 'qpzq6': 12, 'qpzq7': 63, 'qpzqh': 87, 'qpzqj': 12, 'qpzqk': 82, 'qpzqm': 29, 'qpzqq': 23, 'qpzqr': 18, 'qpzqs': 22, 'qpzqt': 13, 'qpzs1': 13, 'qpzsh': 11, 'qpzsj': 12, 'qpzt2': 20, 'qpzt3': 17, 'qpzt8': 11, 'qpzw2': 30, 'qpzw3': 13, 'qpzw8': 12, 'qpzw9': 54, 'qpzwc': 26, 'qpzwd': 25, 'qpzwf': 21, 'qpzx0': 11, 'qpzx4': 14, 'qqezb': 11, 'qqg5p': 11, 'qqg78': 26, 'qqg79': 20, 'qqg7b': 16, 'qqg7d': 17, 'qqg7e': 22, 'qqg7s': 20, 'qqg7u': 19, 'qqg7v': 43, 'qqg7y': 33, '

In [26]:
try:
    aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
        'cuebiq_id': 'count',
        'duration_minutes': ['mean', 'median', 'std'],
        'length_meters': ['mean', 'median', 'std']
    }).reset_index()
    aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']
    aggregated_df5['start_geohash_user'] = aggregated_df5['start_geohash5'].map(geohash_dict5)
    aggregated_df5['end_geohash_user'] = aggregated_df5['end_geohash5'].map(geohash_dict5)
    print("Aggregated data with user numbers:", aggregated_df5.head())
except Exception as e:
    logging.error(f"Error aggregating data for date {event_date}: {e}")


Aggregated data with user numbers:   start_geohash5 end_geohash5  trip_count  m_duration_min  mdn_duration_min  \
0          qpvzg        qpvzg           2       18.325000         18.325000   
1          qpvzu        qpvzg           1       26.950000         26.950000   
2          qpvzu        qpvzu           1      773.750000        773.750000   
3          qpxct        qpxgj           1      229.650000        229.650000   
4          qpxtp        qpxtp           2       59.091667         59.091667   

   sd_duration_min    m_length_m  mdn_length_m  sd_length_m  \
0         5.456507    245.488191    245.488191     3.050029   
1              NaN   2240.592518   2240.592518          NaN   
2              NaN    178.918013    178.918013          NaN   
3              NaN  27612.457875  27612.457875          NaN   
4        67.988317    289.338483    289.338483   245.511844   

   start_geohash_user  end_geohash_user  
0                 NaN               NaN  
1                 NaN      