In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [7]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [22]:
# test with only bucket no partition

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    try:
        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )
        logging.info(f"Executing SQL query for date {event_date}")
        
        # Encode geohashes
        pe_tj_df['start_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
        pe_tj_df['end_geohash5'] = pe_tj_df.apply(
            lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)

        # Add user numbers to the aggregated data
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Filter aggregated data
        filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9]
        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size):
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    start_time = time.time()  # Record start time before processing loop
        
    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        try:
            filtered_df5 = process_day(event_date, country_code, sql_engine)

            # Create the SQL engine
            output_schema_name = "od_matrix_10"
            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5_10"
            con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

            # Create the SQL table with the correct name for 5-level geohash
            create_table_query_5 = f"""
            CREATE TABLE IF NOT EXISTS {final_table_5} (
                start_geohash5 varchar,
                end_geohash5 varchar,
                trip_count bigint,
                m_duration_min double,
                mdn_duration_min double,
                sd_duration_min double,
                m_length_m double,
                mdn_length_m double,
                sd_length_m double,
                m_points_no double,
                mdn_points_no double,
                sd_points_no double
            )
            WITH (
              bucketed_by = ARRAY['end_geohash5'],
              bucket_count = 30
            )
            """

            with con.connect() as connection:
                connection.execute(create_table_query_5)
                
            filtered_df5 = filtered_df5.astype({
                'trip_count': 'int'
            })
            
            # Insert data into the table with the correct name
            if not filtered_df5.empty:
                
                insert_data_in_chunks(filtered_df5, final_table_5, con, 500)
                logging.info(f"Data inserted into {final_table_5}")
            else:
                logging.info(f"No data to insert for {final_table_5} for 5-level geohash")
        except Exception as e:
            logging.error(f"Failed to process data for date {event_date}: {e}")

        # Move to the next day
        current_date += timedelta(days=1)
    
    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")
    
process_date_range(datetime(2019, 1, 1), datetime(2019, 12, 31), 'ID', sql_engine)

2024-06-17 22:48:14,158 - INFO - Executing SQL query for date 20190315
2024-06-17 22:48:57,903 - INFO - Data inserted into od_in_20190315_agg5_10
2024-06-17 22:49:13,831 - INFO - Executing SQL query for date 20190316
2024-06-17 22:49:57,069 - INFO - Data inserted into od_in_20190316_agg5_10
2024-06-17 22:49:57,070 - INFO - Total processing time: 119.61 seconds


In [9]:
# Test query single
country_code = 'ID'
event_date = 20190106

pe_tj_df = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        start_lat,
        start_lng,
        end_lat,
        end_lng,
        duration_minutes,
        length_meters,
        number_of_points
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)

# pe_tj_df

# Encode geohashes
pe_tj_df['start_geohash5'] = pe_tj_df.apply(
    lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5), axis=1)
pe_tj_df['end_geohash5'] = pe_tj_df.apply(
    lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5), axis=1)

# Add user numbers to the aggregated data
aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
    'cuebiq_id': 'count',
    'duration_minutes': ['mean', 'median', 'std'],
    'length_meters': ['mean', 'median', 'std'],
    'number_of_points': ['mean', 'median', 'std']
}).reset_index()
aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 
                          'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                          'm_length_m', 'mdn_length_m', 'sd_length_m',
                          'm_points_no', 'mdn_points_no', 'sd_points_no']

# Filter aggregated data
filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9]
filtered_df5['local_date'] = event_date
filtered_df5 = filtered_df5.astype({'trip_count': 'int'})
filtered_df5

agg_file_name = f"od_{country_code.lower()}_{event_date}_agg5_10"
filtered_df5.to_csv('/home/jovyan/Data/TJ/' + country_code + '/' + agg_file_name + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df5['local_date'] = event_date


Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_date
25,qpz6e,qpz6e,42,87.852381,32.208333,168.030579,4676.194708,1523.026116,13531.575806,10.119048,5.0,15.381653,20190106
29,qpz6e,qpz6s,10,43.406667,24.833333,36.721081,3931.408664,3428.999614,2486.709348,4.900000,4.5,1.852926,20190106
35,qpz6g,qpz6e,10,57.320000,28.300000,58.491738,7226.994729,5089.530602,5321.204434,7.200000,8.0,3.190263,20190106
36,qpz6g,qpz6g,21,88.769048,42.916667,111.668216,4799.423202,1268.281055,7019.272456,15.904762,5.0,37.158989,20190106
58,qpz6s,qpz6s,38,80.742982,32.708333,96.794736,2479.739199,1489.384521,3569.710189,4.500000,4.0,2.214815,20190106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16059,wb2b2,wb2b3,12,103.481944,50.033333,131.194427,4475.603842,3961.053100,3302.766768,5.750000,5.5,3.570714,20190106
16068,wb2b3,wb2b3,26,132.267949,38.800000,184.054177,2847.508584,1115.643962,5363.755756,4.807692,5.0,2.154423,20190106
16094,wb2b9,wb2b9,11,62.518182,64.883333,51.428689,1559.609883,378.245321,3335.194155,4.090909,4.0,2.119177,20190106
16102,wb2bd,wb2bd,14,56.386905,24.066667,73.205417,6450.858098,1978.203449,8006.088833,4.500000,4.5,2.312175,20190106


In [None]:
# Fill missing dates

In [17]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    try:
        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT DISTINCT
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )
        logging.info(f"Executing SQL query for date {event_date}")
        
        # Encode geohashes
        try:
            pe_tj_df['start_geohash5'] = pe_tj_df.apply(
                lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=5) if pd.notnull(x['start_lat']) and pd.notnull(x['start_lng']) else None, axis=1)
            pe_tj_df['end_geohash5'] = pe_tj_df.apply(
                lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=5) if pd.notnull(x['end_lat']) and pd.notnull(x['end_lng']) else None, axis=1)
        except Exception as e:
            logging.error(f"Error encoding geohashes for date {event_date}: {e}")
            return pd.DataFrame()

        # Add user numbers to the aggregated data
        try:
            if not pe_tj_df.empty:
                aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
                    'cuebiq_id': 'count',
                    'duration_minutes': ['mean', 'median', 'std'],
                    'length_meters': ['mean', 'median', 'std'],
                    'number_of_points': ['mean', 'median', 'std']
                }).reset_index()
                aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 
                                          'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                          'm_length_m', 'mdn_length_m', 'sd_length_m',
                                          'm_points_no', 'mdn_points_no', 'sd_points_no']
            else:
                logging.info(f"No data to aggregate for date {event_date}")
                return pd.DataFrame()
        except Exception as e:
            logging.error(f"Error aggregating data for date {event_date}: {e}")
            return pd.DataFrame()

        # Filter aggregated data
        try:
            filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9].copy()
            filtered_df5['local_date'] = event_date
            filtered_df5 = filtered_df5.astype({'trip_count': 'int'})
        except Exception as e:
            logging.error(f"Error filtering or assigning data for date {event_date}: {e}")
            return pd.DataFrame()
        
        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()


In [18]:
# List of specific dates to process
date_list = [
    '20190106', '20190210', '20190214', '20190304', '20190306', 
    '20190316', '20190320', '20190327', '20190724', '20190804', 
    '20190808', '20190810', '20190812', '20190913', '20190925', 
    '20191023', '20191104', '20191126', '20191203', '20191204', 
    '20191205', '20191210', '20191218'
]

country_code = 'ID'
all_days_df = pd.DataFrame()

# Process each day and concatenate the results
for event_date in date_list:
    filtered_df5 = process_day(event_date, country_code, sql_engine)
    if not filtered_df5.empty:
        all_days_df = pd.concat([all_days_df, filtered_df5])


2024-06-25 19:33:22,425 - INFO - Executing SQL query for date 20190106
2024-06-25 19:33:33,833 - INFO - Executing SQL query for date 20190210
2024-06-25 19:33:48,799 - INFO - Executing SQL query for date 20190214
2024-06-25 19:34:06,726 - INFO - Executing SQL query for date 20190304
2024-06-25 19:34:26,340 - INFO - Executing SQL query for date 20190306
2024-06-25 19:34:44,650 - INFO - Executing SQL query for date 20190316
2024-06-25 19:35:03,558 - INFO - Executing SQL query for date 20190320
2024-06-25 19:35:22,995 - INFO - Executing SQL query for date 20190327
2024-06-25 19:35:42,503 - INFO - Executing SQL query for date 20190724
2024-06-25 19:35:59,197 - INFO - Executing SQL query for date 20190804
2024-06-25 19:36:17,703 - INFO - Executing SQL query for date 20190808
2024-06-25 19:36:36,429 - INFO - Executing SQL query for date 20190810
2024-06-25 19:36:54,882 - INFO - Executing SQL query for date 20190812
2024-06-25 19:37:16,645 - INFO - Executing SQL query for date 20190913
2024-0

In [20]:
all_days_df

Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_date
25,qpz6e,qpz6e,42,87.852381,32.208333,168.030579,4676.194708,1523.026116,13531.575806,10.119048,5.0,15.381653,20190106
29,qpz6e,qpz6s,10,43.406667,24.833333,36.721081,3931.408664,3428.999614,2486.709348,4.900000,4.5,1.852926,20190106
35,qpz6g,qpz6e,10,57.320000,28.300000,58.491738,7226.994729,5089.530602,5321.204434,7.200000,8.0,3.190263,20190106
36,qpz6g,qpz6g,21,88.769048,42.916667,111.668216,4799.423202,1268.281055,7019.272456,15.904762,5.0,37.158989,20190106
58,qpz6s,qpz6s,38,80.742982,32.708333,96.794736,2479.739199,1489.384521,3569.710189,4.500000,4.0,2.214815,20190106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15210,wb2b7,wb2b7,11,24.584848,16.450000,25.057702,826.650948,362.350495,770.842073,3.363636,3.0,1.433369,20191218
15215,wb2b8,wb2b8,12,23.944444,13.816667,23.122735,5091.193739,251.602016,13634.021100,3.416667,3.0,1.378954,20191218
15229,wb2bd,wb2bd,15,49.720000,28.083333,62.051096,2538.492526,966.270617,4906.471853,3.600000,3.0,1.352247,20191218
15272,wb4s6,wb4s6,24,42.283333,14.275000,62.885850,2453.871182,760.669901,5072.246530,4.166667,3.5,2.729336,20191218


In [21]:
all_days_df.to_csv('/home/jovyan/Data/TJ/' + 'ID_MissingDates.csv', index=False)
