In [9]:
# (Re)load the `sql` magic extension, then set SqlMagic options for this session.
# NOTE(review): the meanings below are read off the option names — confirm against the
# extension's documentation: autocommit off, autolimit 0 (presumably "no automatic limit"),
# results handed back as pandas DataFrames, display limit of 200.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [10]:
# Point the `%sql` magic at the Trino endpoint on localhost:9090 (URL path `cuebiq`).
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [11]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [13]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """
    Bundle two handles onto the same Trino endpoint.

    A DB-API cursor is used for statements whose rows are fetched with
    ``fetchall`` and a SQLAlchemy engine is used for ``pandas.read_sql``.
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection and build the SQLAlchemy engine.

        The defaults reproduce the endpoint that used to be hard-coded, so
        ``TrinoEngine()`` behaves as before.

        Parameters
        ----------
        host : str
            Host name passed to ``connect`` and placed in the engine URL.
        port : int
            Port passed to ``connect`` and placed in the engine URL.
        catalog : str
            Catalog passed to ``connect`` and used as the engine URL path.
        """
        # Keep the connection on the instance: previously it was a local
        # variable, so callers had no handle left to close it.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        # Cursor used by execute_statement.
        self.cur = self.conn.cursor()
        # SQLAlchemy engine used by read_sql; same endpoint as the connection.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Create and drop statements.

        Runs ``query`` on the DB-API cursor and returns whatever
        ``fetchall()`` yields for it.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs ``query`` through ``pandas.read_sql`` on the SQLAlchemy engine
        and returns the resulting DataFrame.
        """
        return pd.read_sql(query, self.engine)

# Notebook-wide engine instance; constructing it opens the connection right away.
# Later cells call sql_engine.read_sql directly or pass it on as the `sql_engine` argument.
sql_engine = TrinoEngine()

In [14]:
# Schema lookup keyed by a short alias; only 'cda' is defined here.
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
# Fully qualified trajectory table: cuebiq.paas_cda_pe_v3.trajectory_uplevelled.
# NOTE(review): the queries visible in this notebook spell that table name out literally
# instead of reading this variable.
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [8]:
import pandas as pd
import geohash
from datetime import datetime, timedelta
import logging
from sqlalchemy import create_engine
import time

# Set up logging: INFO and above, each record formatted as "<time> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup: SQLAlchemy engine for the output tables, addressed as
# dedicated/<output_schema_name> on the local Trino endpoint.
# process_date_range reads `con` as a notebook-level global when inserting.
output_schema_name = 'od_matrix'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Build the geohash-3 origin/destination aggregate for one day and one country.

    Trajectories whose start and end country both equal ``country_code`` on
    ``event_date`` are read through ``sql_engine``. Start and end coordinates are
    encoded as precision-3 geohashes and the trips are grouped by
    (start cell, end cell). Each pair gets its trip count plus mean, median and
    standard deviation of duration, length and number of points. Only pairs whose
    two cells both appear in that day's
    ``dedicated.pop_density.pd_<country_code>_<event_date>_agg3`` table are kept;
    the table's ``no_of_unique_users`` values are attached to them.

    Parameters
    ----------
    event_date : str
        Day to process. It is interpolated unquoted into the SQL filter and into
        the population-density table name (the caller in this notebook passes
        ``'%Y%m%d'`` strings).
    country_code : str
        Value compared with both ``start_country`` and ``end_country``.
    sql_engine : object
        Anything exposing ``read_sql(query)`` that returns a pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per retained cell pair. The frame is empty (with the output
        columns) when the day has no trips, and empty when no cell pair survives
        the population filter. Any exception is logged and answered with a bare
        empty DataFrame rather than re-raised.
    """
    output_columns = ['start_geohash3', 'start_geohash_user', 'end_geohash3', 'end_geohash_user', 'trip_count',
                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                      'm_points_no', 'mdn_points_no', 'sd_points_no']
    try:
        # Read data from the SQL table.
        # The values are pasted into the SQL text as-is, so only pass trusted values.
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # A day without trips has nothing to encode or aggregate; answer directly
        # instead of running the empty frame through the steps below.
        if pe_tj_df.empty:
            return pd.DataFrame(columns=output_columns)

        # Encode geohashes column-wise; this avoids building a Series per row,
        # which is what apply(axis=1) does.
        pe_tj_df['start_geohash3'] = [
            geohash.encode(lat, lng, precision=3)
            for lat, lng in zip(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
        ]
        pe_tj_df['end_geohash3'] = [
            geohash.encode(lat, lng, precision=3)
            for lat, lng in zip(pe_tj_df['end_lat'], pe_tj_df['end_lng'])
        ]

        # Load cell lists from SQL. A failure here is only a warning: with an
        # empty lookup every pair is dropped by the filter further down.
        try:
            celllist3 = sql_engine.read_sql(f"SELECT geohash3 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg3")
            geohash_dict3 = celllist3.set_index('geohash')['no_of_unique_users'].to_dict()
        except Exception as e:
            logging.warning(f"Failed to load geohash3 data for date {event_date}: {e}")
            geohash_dict3 = {}

        # Aggregate per cell pair. Named aggregation ties every output column to
        # its source column and statistic instead of renaming by position.
        aggregated_df3 = (
            pe_tj_df
            .groupby(['start_geohash3', 'end_geohash3'])
            .agg(
                trip_count=('cuebiq_id', 'count'),
                m_duration_min=('duration_minutes', 'mean'),
                mdn_duration_min=('duration_minutes', 'median'),
                sd_duration_min=('duration_minutes', 'std'),
                m_length_m=('length_meters', 'mean'),
                mdn_length_m=('length_meters', 'median'),
                sd_length_m=('length_meters', 'std'),
                m_points_no=('number_of_points', 'mean'),
                mdn_points_no=('number_of_points', 'median'),
                sd_points_no=('number_of_points', 'std'),
            )
            .reset_index()
        )

        # Add user numbers to the aggregated data; cells missing from the lookup become NaN
        aggregated_df3['start_geohash_user'] = aggregated_df3['start_geohash3'].map(geohash_dict3)
        aggregated_df3['end_geohash_user'] = aggregated_df3['end_geohash3'].map(geohash_dict3)

        # Filter aggregated data (both cells must have a user count) and reorder columns
        filtered_df3 = aggregated_df3.dropna(subset=['start_geohash_user', 'end_geohash_user'])
        return filtered_df3[output_columns]

    except Exception as e:
        # Best effort by design: one bad day must not stop a multi-day run.
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5, chunksize=None):
    """
    Append ``df`` to ``table_name`` with ``DataFrame.to_sql``, retrying on failure.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to write (the index is not written).
    table_name : str
        Target table; rows are appended when it already exists.
    con : object
        Connection or engine handed to ``to_sql``.
    retries : int
        Maximum number of attempts.
    delay : float
        Seconds to sleep between a failed attempt and the next one.
    chunksize : int or None
        Forwarded to ``to_sql``; ``None`` (the default) keeps the previous
        behaviour of writing all rows in one go.

    Returns
    -------
    bool
        True as soon as an attempt succeeds, False when every attempt failed
        (or when ``retries <= 0`` and nothing was attempted). Exceptions from
        ``to_sql`` are logged, never propagated.
    """
    # NOTE(review): the write appends; if a failed attempt had already stored
    # some rows, a retry would add them again — confirm how the target engine
    # treats partially executed inserts.
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi",
                chunksize=chunksize
            )
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            # Only wait when another attempt is still to come
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    # Reached only when no attempt succeeded; report it to the log and the caller
    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine, output_con=None):
    """
    Aggregate and store every day from ``start_date`` through ``end_date`` inclusive.

    For each day the result of ``process_day`` is appended, when it is not empty,
    to the table ``od_<country_code lower-cased>_<YYYYMMDD>_agg3`` through
    ``insert_data_with_retry``. Days without data are only logged. The total
    wall-clock time is logged at the end.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        First and last day to process; both ends are included.
    country_code : str
        Passed to ``process_day`` and lower-cased for the output table name.
    sql_engine : object
        Read-side engine forwarded to ``process_day``.
    output_con : object, optional
        Connection or engine used for the inserts. When omitted, the
        notebook-level ``con`` is used, which is what this function always did
        before the parameter existed.
    """
    start_time = time.time()  # Record start time before processing loop

    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')

        filtered_df3 = process_day(event_date, country_code, sql_engine)

        # Insert data into the database
        if not filtered_df3.empty:
            table_name = f"od_{country_code.lower()}_{event_date}_agg3"
            # The global `con` is looked up only here and only when no explicit
            # connection was supplied, so a run that inserts nothing never needs it.
            insert_data_with_retry(
                filtered_df3,
                table_name,
                con if output_con is None else output_con
            )
        else:
            logging.info(f"No data to insert for date {event_date}")

        # Move to the next day
        current_date += timedelta(days=1)

    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

# Example usage: processes every day from 2019-10-01 through 2019-12-31 (inclusive) for 'CO';
# each non-empty daily result is appended to its own od_co_<YYYYMMDD>_agg3 table.
# NOTE(review): the log lines saved under this cell name od_id_2019010x tables, so they
# were produced by a different invocation than the one written here.
process_date_range(datetime(2019, 10, 1), datetime(2019, 12, 31), 'CO', sql_engine)


2024-06-05 19:33:20,454 - INFO - Inserted data into table od_id_20190103_agg3
2024-06-05 19:33:38,937 - INFO - Inserted data into table od_id_20190104_agg3
2024-06-05 19:33:56,798 - INFO - Inserted data into table od_id_20190105_agg3
2024-06-05 19:33:56,799 - INFO - Total processing time: 50.62 seconds


In [15]:
# Ad-hoc inspection of the raw trajectories for one day and one country.
# These assignments create notebook-level `event_date` / `country_code` variables.
event_date = 20191113
country_code = 'CO'

# Same source table and filters as the query in process_day, but without the
# number_of_points column.
pe_tj_df = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        start_lat,
        start_lng,
        end_lat,
        end_lng,
        duration_minutes,
        length_meters
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)

# Show the fetched frame as the cell output (no geohash encoding happens in this cell)
pe_tj_df

Unnamed: 0,cuebiq_id,start_lat,start_lng,end_lat,end_lng,duration_minutes,length_meters
0,2717518640,3.436203,-76.540658,3.367047,-76.529736,178.216667,9072.208487
1,2717518640,3.367047,-76.529736,3.436122,-76.540637,80.083333,8189.523241
2,2717483025,10.963962,-74.840365,10.964186,-74.841364,173.116667,534.865058
3,2717377552,7.876402,-76.627072,7.878519,-76.626878,219.016667,2326.635493
4,2717377552,7.878519,-76.626878,7.876418,-76.627134,22.100000,235.592936
...,...,...,...,...,...,...,...
134755,2267573244,11.019228,-74.817414,11.005213,-74.826734,14.183333,2609.690248
134756,2267573244,11.005213,-74.826734,11.005883,-74.818957,14.816667,918.411412
134757,1540926349,9.578892,-73.471045,9.578242,-73.466186,51.500000,739.410094
134758,1540926349,9.578242,-73.466186,9.578323,-73.466123,297.933333,2283.671544


In [9]:
# # Disabled variant: save each day's aggregate as a .csv file on disk (geohash precision 3 only)
# import pandas as pd
# import os
# import geohash
# from datetime import datetime, timedelta
# import logging

# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# # Function to process data for a single day
# def process_day(event_date, country_code, sql_engine):
#     try:
#         # Read data from the SQL table
#         pe_tj_df = sql_engine.read_sql(
#             f"""
#             SELECT 
#                 cuebiq_id,
#                 start_lat,
#                 start_lng,
#                 end_lat,
#                 end_lng,
#                 duration_minutes,
#                 length_meters
#             FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
#             WHERE 
#                 event_date = {event_date}
#                 AND end_country = '{country_code}' 
#                 AND start_country = '{country_code}' 
#             """
#         )

#         # Encode geohashes
#         pe_tj_df['start_geohash3'] = pe_tj_df.apply(
#             lambda x: geohash.encode(x['start_lat'], x['start_lng'], precision=3), axis=1)
#         pe_tj_df['end_geohash3'] = pe_tj_df.apply(
#             lambda x: geohash.encode(x['end_lat'], x['end_lng'], precision=3), axis=1)

#         # Load cell lists from SQL
#         try:
#             celllist3 = sql_engine.read_sql(f"SELECT geohash3 AS geohash, no_of_unique_users FROM dedicated.pop_density.pd_{country_code}_{event_date}_agg3")
#             geohash_dict3 = celllist3.set_index('geohash')['no_of_unique_users'].to_dict()
#         except Exception as e:
#             logging.warning(f"Failed to load geohash3 data for date {event_date}: {e}")
#             geohash_dict3 = {}

#         # Add user numbers to the aggregated data
#         aggregated_df3 = pe_tj_df.groupby(['start_geohash3', 'end_geohash3']).agg({
#             'cuebiq_id': 'count',
#             'duration_minutes': ['mean', 'median', 'std'],
#             'length_meters': ['mean', 'median', 'std']
#         }).reset_index()
#         aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']
#         aggregated_df3['start_geohash_user'] = aggregated_df3['start_geohash3'].map(geohash_dict3)
#         aggregated_df3['end_geohash_user'] = aggregated_df3['end_geohash3'].map(geohash_dict3)

#         # Filter aggregated data
#         filtered_df3 = aggregated_df3.dropna(subset=['start_geohash_user', 'end_geohash_user'])

#         # Reorder columns
#         filtered_df3 = filtered_df3[['start_geohash3', 'start_geohash_user', 'end_geohash3', 'end_geohash_user', 'trip_count', 'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 'm_length_m', 'mdn_length_m', 'sd_length_m']]

#         return filtered_df3

#     except Exception as e:
#         logging.error(f"Error processing data for date {event_date}: {e}")
#         return pd.DataFrame()

# # Main processing loop
# def process_date_range(start_date, end_date, country_code, sql_engine):
#     start_time = time.time()  # Record start time before processing loop
        
#     current_date = start_date
#     while current_date <= end_date:
#         event_date = current_date.strftime('%Y%m%d')
        
#         filtered_df3 = process_day(event_date, country_code, sql_engine)

#         # Define file path
#         file_path_3 = f'/home/jovyan/Data/Agg_TJ/{country_code}3/{event_date}_{country_code}_pe_tj_agg3.csv'

#         # Save results if data is not empty and file does not already exist
#         if not filtered_df3.empty and not os.path.exists(file_path_3):
#             os.makedirs(os.path.dirname(file_path_3), exist_ok=True)
#             filtered_df3.to_csv(file_path_3, index=False)
#             logging.info(f"File saved to {file_path_3}")
#         else:
#             logging.info(f"File already exists or no data for {file_path_3}")

#         # Move to the next day
#         current_date += timedelta(days=1)
#     end_time = time.time()  # Record end time after processing loop
#     total_time = end_time - start_time
#     logging.info(f"Total processing time: {total_time:.2f} seconds")

# # Example usage:
# process_date_range(datetime(2019, 1, 1), datetime(2019, 1, 5), 'ID', sql_engine)

# # Verified working: saves each day's aggregate as a .csv file on disk (geohash precision 3 only)

2024-06-05 19:12:47,973 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID3/20190101_ID_pe_tj_agg3.csv
2024-06-05 19:12:58,782 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID3/20190102_ID_pe_tj_agg3.csv
2024-06-05 19:13:13,316 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID3/20190103_ID_pe_tj_agg3.csv
2024-06-05 19:13:29,895 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID3/20190104_ID_pe_tj_agg3.csv
2024-06-05 19:13:44,626 - INFO - File saved to /home/jovyan/Data/Agg_TJ/ID3/20190105_ID_pe_tj_agg3.csv
2024-06-05 19:13:44,626 - INFO - Total processing time: 68.74 seconds
