In [2]:
# Load (or reload) the `sql` IPython extension and set its SqlMagic options
# for this session. The options only affect cells run through the %sql magic.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [3]:
# Connect the %sql magic to the local Trino endpoint (localhost:9090, `cuebiq`).
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [4]:
import os
import time
import logging
import pandas as pd
from pathlib import Path
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """
    Thin wrapper holding two handles to the same Trino endpoint:

    * ``self.cur``    -- a DB-API cursor, used by ``execute_statement``.
    * ``self.engine`` -- a SQLAlchemy engine, used by ``read_sql`` to load
      query results into pandas.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection and build the SQLAlchemy engine.

        The defaults reproduce the endpoint that used to be hard-coded, so
        ``TrinoEngine()`` connects exactly as before.

        Args:
            host: Trino coordinator host name.
            port: Trino coordinator port.
            catalog: Catalog used for both the cursor and the engine URL.
        """
        # Keep the connection on the instance; as a local it could never be
        # closed explicitly.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Run a statement on the DB-API cursor (e.g. CREATE / DROP).

        Args:
            query: SQL text to execute.

        Returns:
            All rows returned by the cursor's ``fetchall()``.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run a query through the SQLAlchemy engine and return it as a DataFrame.

        Args:
            query: SQL text to execute (e.g. SELECT / INSERT INTO).

        Returns:
            The result set as produced by ``pandas.read_sql``.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """Release the cursor, the DB-API connection and the engine's pool."""
        self.cur.close()
        self.conn.close()
        self.engine.dispose()

# Shared engine instance; passed explicitly to the processing functions below.
sql_engine = TrinoEngine()

In [6]:
# Source schema(s) keyed by short alias, and the fully qualified trajectory table.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_tj_table = "{}.trajectory_uplevelled".format(schema_name['cda'])

In [7]:
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# SQLAlchemy engine pointing at the output schema on the same Trino host.
output_schema_name = 'od_matrix_10'
output_url = f"trino://localhost:9090/dedicated/{output_schema_name}"
con = create_engine(output_url)

In [26]:
# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Load one day of trajectories and aggregate them per origin/destination pair.

    Only trajectories that start and end in ``country_code`` and whose start
    and end coordinates are all non-zero are read. Start and end points are
    encoded as precision-3 geohashes in the query.

    Parameters
    ----------
    event_date : int or str
        Day to load in ``YYYYMMDD`` form (e.g. ``20190101`` or ``'20190101'``).
    country_code : str
        Value matched against both ``start_country`` and ``end_country``.
    sql_engine : object
        Any object exposing ``read_sql(query) -> pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(start_geohash3, end_geohash3)`` with the trip count,
        the day (``local_date``, taken from ``event_date``) and the mean,
        median and standard deviation of duration, length and point count.

    Raises
    ------
    ValueError, TypeError
        If ``event_date`` cannot be converted to an integer.
    """
    # Only ever interpolate a plain integer into the SQL text.
    event_date = int(event_date)

    query = f"""
        SELECT 
            cuebiq_id,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3, 
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3, 
            duration_minutes,
            length_meters,
            number_of_points, 
            event_date
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date = {event_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}' 
            AND start_lat <> 0
            AND start_lng <> 0
            AND end_lat <> 0
            AND end_lng <> 0
        """

    # Log before the (slow) query starts so the timestamp marks its beginning.
    logging.info(f"Executing SQL query for date {event_date}")
    pe_tj_df = sql_engine.read_sql(query)

    # Count trips (rows, not distinct users) and summarise each metric per pair.
    aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3']).agg({
        'cuebiq_id': 'count',
        'event_date': 'first',
        'duration_minutes': ['mean', 'median', 'std'],
        'length_meters': ['mean', 'median', 'std'],
        'number_of_points': ['mean', 'median', 'std']
    }).reset_index()

    # Flatten the two-level column index; order follows the agg dict above.
    aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'trip_count', 'local_date',
                              'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
                              'm_length_m', 'mdn_length_m', 'sd_length_m',
                              'm_points_no', 'mdn_points_no', 'sd_points_no']
    return aggregated_df3

In [27]:
def process_date_range(start_date, end_date, country_code, sql_engine,
                       output_dir="/home/jovyan/Data/daily_od3/"):
    """
    Aggregate every day in ``[start_date, end_date]`` and append it to one CSV.

    Each day is processed with ``process_day`` and written to
    ``<output_dir>/<country_code>_daily_agg3.csv``. The header is written only
    when the file does not exist yet; later days are appended.

    Note: rows are always appended, so running the same range twice writes
    the same days twice. Delete the file first for a clean re-run.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        Inclusive bounds of the range, stepped one day at a time.
    country_code : str
        Country filter forwarded to ``process_day``; also the file-name prefix.
    sql_engine : object
        Engine forwarded to ``process_day``.
    output_dir : str, optional
        Folder for the CSV. Defaults to the previously hard-coded location.
    """
    # The path does not change between days: build it (and the folder) once.
    os.makedirs(output_dir, exist_ok=True)
    fileName = os.path.join(output_dir, f"{country_code}_daily_agg3.csv")

    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        aggregated_df3 = process_day(event_date, country_code, sql_engine)
        aggregated_df3 = aggregated_df3.drop_duplicates()  # Remove duplicates

        # Append mode creates the file on first use; emit the header only then.
        write_header = not os.path.isfile(fileName)
        aggregated_df3.to_csv(fileName, sep=',', mode='a', header=write_header, index=False)

        # Move to the next day
        current_date += timedelta(days=1)


In [None]:
# Run the daily aggregation for Mexico over all of 2019.
process_date_range(
    start_date=datetime(2019, 1, 1),
    end_date=datetime(2019, 12, 31),
    country_code='MX',
    sql_engine=sql_engine,
)

2024-08-08 05:19:14,170 - INFO - Executing SQL query for date 20190101
2024-08-08 05:19:33,113 - INFO - Executing SQL query for date 20190102
2024-08-08 05:19:52,281 - INFO - Executing SQL query for date 20190103
2024-08-08 05:20:11,087 - INFO - Executing SQL query for date 20190104
2024-08-08 05:20:28,893 - INFO - Executing SQL query for date 20190105
2024-08-08 05:20:45,996 - INFO - Executing SQL query for date 20190106
2024-08-08 05:21:04,899 - INFO - Executing SQL query for date 20190107


In [None]:
# Run the daily aggregation for India over all of 2019.
process_date_range(
    start_date=datetime(2019, 1, 1),
    end_date=datetime(2019, 12, 31),
    country_code='IN',
    sql_engine=sql_engine,
)

In [None]:
# Run the daily aggregation for Indonesia over all of 2019.
process_date_range(
    start_date=datetime(2019, 1, 1),
    end_date=datetime(2019, 12, 31),
    country_code='ID',
    sql_engine=sql_engine,
)

In [None]:
# Run the daily aggregation for Colombia, November-December 2019 only.
process_date_range(
    start_date=datetime(2019, 11, 1),
    end_date=datetime(2019, 12, 31),
    country_code='CO',
    sql_engine=sql_engine,
)

# Check Data

Duplicates need to be removed; they should be on the first day of each month.

In [None]:
# Single query over the whole Nov-Dec 2019 range, grouped by local date.
# Original note: this takes too long to run.
country_code = 'CO'
pathFolder = "/home/jovyan/Data/daily_od3/"
Path(pathFolder).mkdir(parents=True, exist_ok=True)

pe_tj_df = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3, 
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3, 
        duration_minutes,
        length_meters,
        number_of_points,
        DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN 20191101 AND 20191231
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
        AND start_lat <> 0
        AND start_lng <> 0
        AND end_lat <> 0
        AND end_lng <> 0
    """
)

# TRY(...) yields NULL when the timestamp cannot be parsed. Drop those rows
# before the integer cast (which would raise on missing values); groupby
# would exclude rows with a missing key anyway.
pe_tj_df = (
    pe_tj_df
    .dropna(subset=['local_date'])
    .assign(local_date=lambda d: pd.to_datetime(d['local_date']).dt.strftime('%Y%m%d').astype(int))
)

# Aggregation: per origin/destination pair and day, the trip count plus
# mean / median / std of duration, length and number of points.
aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
    'cuebiq_id': 'count',
    'duration_minutes': ['mean', 'median', 'std'],
    'length_meters': ['mean', 'median', 'std'],
    'number_of_points': ['mean', 'median', 'std']
}).reset_index()
# 3 group keys + 1 count + 3x3 statistics = 13 flat column names.
aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                          'm_length_m', 'mdn_length_m', 'sd_length_m', 'm_points_no', 'mdn_points_no', 'sd_points_no']
aggregated_df3 = aggregated_df3.drop_duplicates() # Remove duplicates

fileName = "{}{}_daily_agg3.csv".format(pathFolder,country_code)

# Test the file that is actually written (the previous check used an
# undefined name). Write the header only when creating the file.
if not os.path.isfile(fileName):
    aggregated_df3.to_csv(fileName,  sep=',', index=False)
else:
    aggregated_df3.to_csv(fileName,  sep=',', mode='a', header=False, index=False)


In [7]:
# Spot check: load the raw trajectories for a single day (2019-11-02) that
# start and end in CO, with the same non-zero coordinate filter as above.
# NOTE(review): this query derives local_date from `event_zoned_datetime`,
# whereas the Nov-Dec range query uses `start_zoned_datetime` -- confirm which
# column is intended. The saved output below has no `local_date` column, so
# it appears to come from an earlier version of this cell.
country_code = 'CO'
event_date = 20191102
pe_tj_df = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3, 
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3, 
        duration_minutes,
        length_meters,
        number_of_points,
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
        AND start_lat <> 0
        AND start_lng <> 0
        AND end_lat <> 0
        AND end_lng <> 0
    """
)
# Display the loaded frame as the cell output.
pe_tj_df

Unnamed: 0,cuebiq_id,start_geohash3,end_geohash3,duration_minutes,length_meters,number_of_points
0,2546703192,d3f,d3f,33.466667,6438.326392,5
1,2542446016,d2f,d2f,606.883333,368.617473,5
2,2542392691,d2n,d2n,753.850000,5633.960978,9
3,2542311521,d3k,d3k,2.016667,73.012999,2
4,2542311521,d3k,d3k,4.783333,68.321394,3
...,...,...,...,...,...,...
146947,2002399950,d3g,d3g,9.466667,342.007893,3
146948,2002399950,d3g,d3g,24.116667,508.553762,3
146949,2002399950,d3g,d3g,145.883333,1593.347778,7
146950,2002399950,d3g,d3g,102.300000,1119.091033,4


In [9]:
# Aggregation
# Per origin/destination pair: trip count plus mean / median / std of
# duration, length and number of points.
aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3']).agg({
    'cuebiq_id': 'count',
    'duration_minutes': ['mean', 'median', 'std'],
    'length_meters': ['mean', 'median', 'std'],
    'number_of_points': ['mean', 'median', 'std']
}).reset_index()
# 2 group keys + 1 count + 3x3 statistics = 12 columns. 'local_date' is not a
# group key in this cell, so listing it would raise a length-mismatch error.
aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                          'm_length_m', 'mdn_length_m', 'sd_length_m', 'm_points_no', 'mdn_points_no', 'sd_points_no']

aggregated_df3

Unnamed: 0,start_geohash3,end_geohash3,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
0,6rf,6rf,31,90.456989,56.633333,115.063642,2531.824351,1157.408885,7215.818258,5.258065,4.0,3.245178
1,6rf,d2g,1,613.083333,613.083333,,623699.578136,623699.578136,,45.000000,45.0,
2,6rs,6rs,2,19.616667,19.616667,21.048212,218.588355,218.588355,67.939015,3.000000,3.0,1.414214
3,6ru,6ru,1,30.866667,30.866667,,119.170019,119.170019,,4.000000,4.0,
4,6ru,d2g,1,880.416667,880.416667,,711427.706304,711427.706304,,25.000000,25.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
252,d6h,d3g,4,199.279167,216.991667,70.370692,146430.276812,151069.162140,65277.582959,9.250000,10.0,4.645787
253,d6h,d3u,8,182.500000,132.266667,122.936129,68974.586174,63677.669187,24201.408047,9.750000,6.0,12.103718
254,d6h,d6h,430,93.657907,35.058333,161.529026,3968.332717,1387.863918,12691.083426,5.148837,4.0,3.858353
255,d90,d3p,1,10.200000,10.200000,,1259.098507,1259.098507,,6.000000,6.0,


In [10]:
# Keep only origin/destination pairs with more than 9 trips, then select the
# output columns in their final order.
output_columns = [
    'start_geohash3', 'end_geohash3', 'trip_count',
    'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
    'm_length_m', 'mdn_length_m', 'sd_length_m',
    'm_points_no', 'mdn_points_no', 'sd_points_no',
]
has_enough_trips = aggregated_df3['trip_count'] > 9
filtered_df3 = aggregated_df3.loc[has_enough_trips]
filtered_df3 = filtered_df3[output_columns]
filtered_df3

Unnamed: 0,start_geohash3,end_geohash3,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
0,6rf,6rf,31,90.456989,56.633333,115.063642,2531.824351,1157.408885,7215.818258,5.258065,4.0,3.245178
5,d0r,d0r,139,74.018465,29.916667,114.856710,2585.935751,1168.774099,3885.699347,4.949640,4.0,4.045616
6,d0r,d22,10,157.086667,114.275000,154.373230,16873.704985,5225.802331,26723.447390,7.400000,7.0,5.146736
8,d20,d20,584,83.750171,36.066667,134.546193,4256.717355,1120.652689,13218.687745,5.523973,4.0,4.560563
9,d20,d21,18,309.378704,158.466667,316.106661,49365.442513,51769.416094,24000.099867,13.388889,12.0,7.397447
...,...,...,...,...,...,...,...,...,...,...,...,...
246,d4q,d4q,24,105.195139,37.058333,143.629018,4844.441099,3227.574050,6344.642027,5.875000,3.0,7.029828
248,d65,d3g,35,102.348571,50.800000,132.162022,15903.957995,3249.602622,35034.898987,7.571429,5.0,7.030546
249,d65,d65,35,81.671429,39.433333,120.039210,1068.362064,449.403343,1399.633072,5.114286,4.0,3.990956
254,d6h,d6h,430,93.657907,35.058333,161.529026,3968.332717,1387.863918,12691.083426,5.148837,4.0,3.858353


In [None]:
# Store 'local_date' as a yyyymmdd integer, drop exact duplicate rows and
# export the result.
# NOTE(review): `pe_dl` is not defined in the visible part of this notebook;
# `pathFolder` and `country_code` are presumably those set in the earlier
# CO cells -- confirm before running on a fresh kernel.
local_date_as_int = pd.to_datetime(pe_dl['local_date']).dt.strftime('%Y%m%d').astype(int)
pe_dl['local_date'] = local_date_as_int
pe_dl = pe_dl.drop_duplicates()

fileName = f"{pathFolder}{country_code}_daily_agg5_8.csv"
pe_dl.to_csv(fileName, sep=',', index=False)