In [1]:
# Load (or reload) the `sql` IPython extension, then configure its SqlMagic
# options: autocommit off, autolimit 0, autopandas on, display limit 200.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
# Connect the %sql magic to the Trino endpoint at localhost:9090 (`cuebiq`).
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time

class TrinoEngine():
    """
    Small convenience wrapper around one Trino connection.

    Holds a DB-API connection and cursor (used by ``execute_statement``) and a
    SQLAlchemy engine pointing at the same host, port and catalog (used by
    ``read_sql``).

    Parameters
    ----------
    host : str
        Trino host name. Defaults to ``"localhost"``.
    port : int
        Trino port. Defaults to ``9090``.
    catalog : str
        Catalog to connect to. Defaults to ``"cuebiq"``.

    The defaults reproduce the previously hard-coded connection, so
    ``TrinoEngine()`` behaves exactly as before. The instance can also be used
    as a context manager; leaving the ``with`` block calls ``close()``.
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep a reference to the connection so that it can be closed later;
        # previously only the cursor was retained.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Execute ``query`` on the DB-API cursor and return all fetched rows.

        Intended for statements such as CREATE and DROP.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run ``query`` through ``pandas.read_sql`` on the SQLAlchemy engine and
        return the result as a DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """Close the DB-API connection and dispose of the SQLAlchemy engine."""
        self.conn.close()
        self.engine.dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        self.close()
        # Never suppress exceptions raised inside the ``with`` block.
        return False

# Shared engine instance used by the processing functions and query cells below.
sql_engine = TrinoEngine()

In [6]:
# Schema lookup by dataset key, and the fully qualified trajectory table
# derived from the 'cda' entry.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_tj_table = schema_name['cda'] + ".trajectory_uplevelled"

In [7]:
# Configure the root logger: INFO and above, rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Build the geohash-5 origin/destination (OD) aggregate for one day.

    Reads the day's trajectories whose start and end country both equal
    ``country_code``, encodes the start and end coordinates as precision-5
    geohashes, aggregates per (start, end) geohash pair and keeps only the
    pairs with more than 9 trips.

    Parameters
    ----------
    event_date :
        Value interpolated verbatim into ``event_date = ...`` in the SQL text
        (callers in this notebook pass ``YYYYMMDD``).
    country_code : str
        Country code matched against both ``start_country`` and ``end_country``.
    sql_engine :
        Object exposing ``read_sql(query) -> pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_geohash5, end_geohash5, trip_count`` followed by the
        mean / median / standard deviation of duration, length and number of
        points. The frame is empty, but still carries these columns, when the
        query returns no rows or when any step fails (the failure is logged).
    """
    # Schema of the returned frame. Also used for empty results, so callers
    # can always rely on these columns (e.g. ``trip_count``) being present.
    output_columns = [
        'start_geohash5', 'end_geohash5', 'trip_count',
        'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
        'm_length_m', 'mdn_length_m', 'sd_length_m',
        'm_points_no', 'mdn_points_no', 'sd_points_no',
    ]

    try:
        # Log before the query runs rather than after it has returned.
        logging.info(f"Executing SQL query for date {event_date}")

        # NOTE: event_date and country_code are interpolated straight into the
        # SQL text, so only trusted values must be passed in.
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE
                event_date = {event_date}
                AND end_country = '{country_code}'
                AND start_country = '{country_code}'
            """
        )

        # Nothing to aggregate: return an empty frame with the full schema
        # instead of relying on the encoding step to raise.
        if pe_tj_df.empty:
            logging.info(f"No trajectories found for date {event_date}")
            return pd.DataFrame(columns=output_columns)

        # Encode geohashes. Zipping the two coordinate columns avoids building
        # a Series per row, which is what DataFrame.apply(axis=1) does.
        pe_tj_df['start_geohash5'] = [
            geohash.encode(lat, lng, precision=5)
            for lat, lng in zip(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
        ]
        pe_tj_df['end_geohash5'] = [
            geohash.encode(lat, lng, precision=5)
            for lat, lng in zip(pe_tj_df['end_lat'], pe_tj_df['end_lng'])
        ]

        # Per OD pair: number of trips (rows with a cuebiq_id, not distinct
        # users) plus mean / median / std of the three trip metrics. Named
        # aggregation ties every output column to its source explicitly.
        aggregated_df5 = (
            pe_tj_df
            .groupby(['start_geohash5', 'end_geohash5'])
            .agg(
                trip_count=('cuebiq_id', 'count'),
                m_duration_min=('duration_minutes', 'mean'),
                mdn_duration_min=('duration_minutes', 'median'),
                sd_duration_min=('duration_minutes', 'std'),
                m_length_m=('length_meters', 'mean'),
                mdn_length_m=('length_meters', 'median'),
                sd_length_m=('length_meters', 'std'),
                m_points_no=('number_of_points', 'mean'),
                mdn_points_no=('number_of_points', 'median'),
                sd_points_no=('number_of_points', 'std'),
            )
            .reset_index()
        )

        # Keep only OD pairs with more than 9 trips.
        return aggregated_df5.loc[aggregated_df5['trip_count'] > 9]

    except Exception as e:
        # Best effort per day: log the failure (with traceback) and hand back
        # an empty, schema-complete frame so the caller can carry on.
        logging.exception(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame(columns=output_columns)

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size):
    """
    Append ``df`` to ``table_name`` in consecutive slices of ``chunk_size`` rows.

    Every slice is written through ``to_sql`` without its index, in append
    mode, using multi-row inserts (``method='multi'``).
    """
    slice_starts = range(0, len(df), chunk_size)
    batches = (df.iloc[lower:lower + chunk_size] for lower in slice_starts)
    for batch in batches:
        batch.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine,
                       output_schema_name="od_matrix_10", chunk_size=500):
    """
    Build and store the daily geohash-5 OD aggregate for every day in a range.

    For each day from ``start_date`` to ``end_date`` (inclusive) this calls
    ``process_day``, creates the per-day output table if it does not exist and
    appends the aggregate to it in chunks. A failure on one day is logged and
    the loop continues with the next day.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        First and last day to process; the loop advances one day at a time.
    country_code : str
        Passed to ``process_day`` and lower-cased into the table name.
    sql_engine :
        Engine passed through to ``process_day`` for reading the source data.
    output_schema_name : str
        Schema in the ``dedicated`` catalog that receives the output tables.
        Defaults to the previously hard-coded ``"od_matrix_10"``.
    chunk_size : int
        Rows per insert batch. Defaults to the previously hard-coded ``500``.

    Returns
    -------
    None
        Results are written to ``od_<country>_<YYYYMMDD>_agg5_10`` tables and
        progress is reported through ``logging``.
    """
    start_time = time.time()  # Record start time before processing loop

    # The output engine does not depend on the day, so build it once instead
    # of once per iteration, and dispose of it when the run is over.
    con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

    try:
        current_date = start_date
        while current_date <= end_date:
            event_date = current_date.strftime('%Y%m%d')
            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5_10"
            try:
                filtered_df5 = process_day(event_date, country_code, sql_engine)

                # Create the SQL table with the correct name for 5-level geohash
                create_table_query_5 = f"""
                CREATE TABLE IF NOT EXISTS {final_table_5} (
                    start_geohash5 varchar,
                    end_geohash5 varchar,
                    trip_count bigint,
                    m_duration_min double,
                    mdn_duration_min double,
                    sd_duration_min double,
                    m_length_m double,
                    mdn_length_m double,
                    sd_length_m double,
                    m_points_no double,
                    mdn_points_no double,
                    sd_points_no double
                )
                WITH (
                  bucketed_by = ARRAY['end_geohash5'],
                  bucket_count = 30
                )
                """

                with con.connect() as connection:
                    connection.execute(create_table_query_5)

                if not filtered_df5.empty:
                    # Cast only when there is data: an empty frame may carry
                    # no columns at all, and astype() would raise KeyError.
                    filtered_df5 = filtered_df5.astype({
                        'trip_count': 'int'
                    })
                    insert_data_in_chunks(filtered_df5, final_table_5, con, chunk_size)
                    logging.info(f"Data inserted into {final_table_5}")
                else:
                    logging.info(f"No data to insert for {final_table_5} for 5-level geohash")
            except Exception as e:
                # Best effort per day: record the failure (with traceback) and
                # move on to the next date.
                logging.exception(f"Failed to process data for date {event_date}: {e}")

            # Move to the next day
            current_date += timedelta(days=1)
    finally:
        con.dispose()

    total_time = time.time() - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")


In [8]:
# Run the pipeline for a single day (start == end == 2019-11-15) and country code 'CO'.
process_date_range(datetime(2019, 11, 15), datetime(2019, 11, 15), 'CO', sql_engine)

2024-06-25 14:47:40,561 - INFO - Executing SQL query for date 20191115
2024-06-25 14:47:57,328 - INFO - failed after 3 attempts
2024-06-25 14:47:57,802 - INFO - failed after 3 attempts
2024-06-25 14:47:57,803 - ERROR - Failed to process data for date 20191115: error 502: bad gateway
2024-06-25 14:47:57,804 - INFO - Total processing time: 28.28 seconds


# Single-day run: 2019-11-15 only

In [17]:
# Ad-hoc re-run of the per-day query for one date; the SELECT matches the one
# built inside process_day(). event_date is an int here, which renders the same
# way in the f-string as the 'YYYYMMDD' string produced in process_date_range().
event_date = 20191115
country_code = 'CO'

pe_tj_df = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        start_lat,
        start_lng,
        end_lat,
        end_lng,
        duration_minutes,
        length_meters,
        number_of_points
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)
# Display the raw trajectory rows returned for that day.
pe_tj_df

Unnamed: 0,cuebiq_id,start_lat,start_lng,end_lat,end_lng,duration_minutes,length_meters,number_of_points
0,2722516982,0.522623,-76.492504,0.505091,-76.498759,30.683333,2072.135498,2
1,2722516982,0.505091,-76.498759,0.504976,-76.499645,8.316667,99.707509,3
2,2722516982,0.504976,-76.499645,0.522623,-76.492504,18.450000,2119.208651,2
3,2722451045,4.851903,-75.667530,4.842302,-75.673926,7.116667,1282.848801,2
4,2722302343,7.117956,-73.119118,7.117951,-73.119131,91.366667,11660.396311,6
...,...,...,...,...,...,...,...,...
143039,2488440673,4.649701,-74.077352,4.649371,-74.077270,80.400000,16617.282246,8
143040,2488550871,6.206855,-75.601084,6.230893,-75.596562,65.383333,2740.153606,4
143041,2488550871,6.230893,-75.596562,6.206868,-75.601088,90.000000,4070.415207,6
143042,2109826857,4.657037,-74.064390,4.654763,-74.059496,76.683333,1288.436169,7


In [18]:
# Encode trip start/end coordinates as precision-5 geohashes. Zipping the two
# coordinate columns avoids building a Series per row, which is what
# DataFrame.apply(axis=1) does, and also works when the frame is empty.
pe_tj_df['start_geohash5'] = [
    geohash.encode(lat, lng, precision=5)
    for lat, lng in zip(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
]
pe_tj_df['end_geohash5'] = [
    geohash.encode(lat, lng, precision=5)
    for lat, lng in zip(pe_tj_df['end_lat'], pe_tj_df['end_lng'])
]

# Per origin/destination pair: number of trips (rows with a cuebiq_id, not
# distinct users) plus mean / median / std of duration, length and point count.
# Named aggregation ties every output column to its source explicitly.
aggregated_df5 = (
    pe_tj_df
    .groupby(['start_geohash5', 'end_geohash5'])
    .agg(
        trip_count=('cuebiq_id', 'count'),
        m_duration_min=('duration_minutes', 'mean'),
        mdn_duration_min=('duration_minutes', 'median'),
        sd_duration_min=('duration_minutes', 'std'),
        m_length_m=('length_meters', 'mean'),
        mdn_length_m=('length_meters', 'median'),
        sd_length_m=('length_meters', 'std'),
        m_points_no=('number_of_points', 'mean'),
        mdn_points_no=('number_of_points', 'median'),
        sd_points_no=('number_of_points', 'std'),
    )
    .reset_index()
)

# Keep only OD pairs with more than 9 trips.
filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9]
filtered_df5 = filtered_df5.astype({'trip_count': 'int'})

filtered_df5

Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
0,6rfyf,6rfyf,14,104.854762,97.841667,99.433246,1681.407646,1204.161108,1487.531304,7.214286,6.5,3.555680
10,d0rfr,d0rfr,107,63.075078,27.066667,127.706689,2481.494874,934.202857,4320.263611,5.626168,5.0,4.050445
12,d0rfr,d2242,18,77.666667,28.983333,96.881798,5762.866506,4628.030627,3826.727084,6.166667,4.5,4.395854
21,d20tj,d20tj,14,46.619048,37.016667,37.092256,453.111603,238.751282,335.485876,3.357143,3.0,1.549548
27,d20u8,d20u8,20,106.395000,64.391667,126.913197,1708.923403,1000.657483,2021.473064,5.750000,4.5,4.327087
...,...,...,...,...,...,...,...,...,...,...,...,...
10082,d6h8u,d6h8s,10,148.550000,69.941667,248.297725,7851.332034,3222.111552,12110.073522,7.400000,5.0,6.058969
10083,d6h8u,d6h8u,17,33.251961,13.233333,49.418228,601.713452,428.277732,604.747049,4.058824,4.0,1.519481
10086,d6hde,d6hde,35,69.114762,33.483333,75.835003,1637.413382,1386.325207,1538.001116,5.457143,5.0,3.118662
10088,d6hvd,d6hvd,26,87.869231,44.233333,137.320151,1190.806451,590.469198,1393.994855,3.923077,3.0,2.512737


In [22]:
# Write the filtered single-day aggregate to <country_code>/<agg_file_name>.csv
# under the TJ data directory, without the index column.
agg_file_name = f"od_{country_code.lower()}_{event_date}_agg5_10"
filtered_df5.to_csv(
    ''.join(['/home/jovyan/Data/TJ/', country_code, '/', agg_file_name, '.csv']),
    index=False,
)

In [23]:
# Fetch the distinct rows of the od_co_2019_agg5 table and display them.
# The query has no placeholders, so a plain string literal is sufficient.
query = """
SELECT DISTINCT *
FROM dedicated.od_matrix_10.od_co_2019_agg5
"""

od_co_2019_agg5 = sql_engine.read_sql(query)
od_co_2019_agg5

Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_date
0,d2dze,d2dze,15,72.702222,13.166667,147.522869,465.250256,411.457412,273.092307,5.000000,4.0,4.720775,20191222
1,d2g1y,d2g1y,18,31.488889,17.741667,39.651386,531.318426,430.168812,520.099579,4.722222,4.0,2.420636,20191222
2,d2ew9,d2ew9,36,41.857407,16.808333,55.766308,1937.566732,1230.304123,3324.738919,4.694444,5.0,1.954035,20191222
3,d3fvf,d3fvf,19,58.286842,18.400000,91.218731,2866.957088,196.988061,9726.453078,5.631579,3.0,9.889447,20191222
4,d3h14,d3h11,11,76.539394,38.783333,106.437290,3534.544383,1913.910158,3448.340835,7.363636,5.0,4.842989,20191222
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62926,d2g6d,d2g6f,229,77.157496,51.816667,88.151735,6793.891477,5741.285878,6029.354308,10.122271,8.0,9.153318,20191113
62927,d2g3c,d2g64,58,149.777586,79.400000,196.039219,8443.225088,6975.940410,6573.903307,9.965517,8.0,8.520286,20191113
62928,d2g4p,d2g4r,28,101.950595,45.125000,164.996014,3828.557579,3077.125455,2406.556515,6.607143,5.0,6.081784,20191113
62929,d3ggn,d3ggn,15,71.481111,14.316667,136.616289,686.895778,450.675047,1013.010547,3.333333,3.0,1.914854,20191113


In [24]:
# Persist the od_co_2019_agg5 frame as CSV, without the index column.
od_co_2019_agg5.to_csv(
    path_or_buf='/home/jovyan/Data/TJ/CO/od_co_2019_agg5.csv',
    index=False,
)
