In [1]:
# Load (or reload) the `sql` IPython extension that provides the %sql magic.
%reload_ext sql
# SqlMagic options (names as exposed by the extension's config):
# autocommit=False: turn off the extension's automatic commit.
%config SqlMagic.autocommit=False
# autolimit=0: presumably "no automatic row limit" on results; confirm against the ipython-sql docs.
%config SqlMagic.autolimit=0
# autopandas=True: have %sql hand results back as pandas DataFrames.
%config SqlMagic.autopandas=True
# displaylimit=200: upper bound on rows rendered when a result set is displayed.
%config SqlMagic.displaylimit=200

In [2]:
# Point the %sql magic at the Trino endpoint on localhost:9090, catalog "cuebiq".
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Collecting python-geohash
  Using cached python_geohash-0.8.5-cp39-cp39-linux_x86_64.whl
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time

class TrinoEngine():
    """
    Two handles onto the same Trino endpoint: a DBAPI cursor for statements
    executed directly, and a SQLAlchemy engine for pandas.

    Parameters
    ----------
    host : str
        Trino coordinator host (default "localhost").
    port : int
        Trino coordinator port (default 9090).
    catalog : str
        Catalog used for both the DBAPI connection and the engine URL
        (default "cuebiq").
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection itself (not only its cursor) so close() can release it.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Built from the same host/port/catalog so the two handles cannot drift apart.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Executes `query` on the DBAPI cursor and returns every row it yields.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Run `query` through the SQLAlchemy engine and return the result as a
        pandas DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """
        Release the DBAPI connection and dispose of the SQLAlchemy engine.
        """
        self.conn.close()
        self.engine.dispose()

# Single engine instance for the notebook; it is passed to the processing functions.
sql_engine = TrinoEngine()

In [6]:
# Fully qualified schema names, keyed by a short alias.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
# Fully qualified name of the trajectory table: "<cda schema>.trajectory_uplevelled".
pe_tj_table = schema_name['cda'] + ".trajectory_uplevelled"

In [7]:
# Configure the root logger: INFO and above, one "<time> - <level> - <message>" line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Build the geohash-5 origin/destination aggregate for one day.

    Reads the day's distinct trajectories whose start and end country both equal
    `country_code`, encodes start and end coordinates as precision-5 geohashes,
    aggregates per (start, end) geohash pair and keeps pairs whose trip count is
    greater than 9.

    Parameters
    ----------
    event_date : str
        Day to read; interpolated unquoted into the query's `event_date = ...`
        predicate.
    country_code : str
        Value matched against both `start_country` and `end_country`.
    sql_engine : object
        Anything exposing `read_sql(query) -> pandas.DataFrame`.

    Returns
    -------
    pandas.DataFrame
        Columns start_geohash5, end_geohash5, trip_count, then mean / median /
        std of duration_minutes, length_meters and number_of_points. It has the
        same columns but no rows when the query returns nothing. A bare
        `pd.DataFrame()` (no columns) is returned when any step raises; the
        error is logged, not re-raised.
    """
    output_columns = ['start_geohash5', 'end_geohash5', 'trip_count',
                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                      'm_points_no', 'mdn_points_no', 'sd_points_no']

    def encode_geohash5(lats, lngs):
        # One geohash per coordinate pair; None where either coordinate is missing,
        # so a single incomplete row cannot fail the whole day.
        return [
            geohash.encode(lat, lng, precision=5)
            if pd.notnull(lat) and pd.notnull(lng) else None
            for lat, lng in zip(lats, lngs)
        ]

    try:
        # Logged before the query runs so the timestamp marks the start of the read.
        logging.info(f"Executing SQL query for date {event_date}")

        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT DISTINCT
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Nothing to encode or aggregate: return a well-formed, zero-row result.
        if pe_tj_df.empty:
            logging.info(f"No data to aggregate for date {event_date}")
            return pd.DataFrame(columns=output_columns)

        # Encode geohashes
        pe_tj_df['start_geohash5'] = encode_geohash5(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
        pe_tj_df['end_geohash5'] = encode_geohash5(pe_tj_df['end_lat'], pe_tj_df['end_lng'])

        # Aggregate per origin/destination pair; groupby drops rows whose key is
        # missing, which removes the rows that could not be encoded.
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()
        # Flatten the two-level column index produced by the multi-function agg.
        aggregated_df5.columns = output_columns

        # Filter aggregated data
        filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9]
        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# # Function to insert data in chunks
# def insert_data_in_chunks(df, table_name, engine, chunk_size):
#     for start in range(0, len(df), chunk_size):
#         chunk = df.iloc[start:start + chunk_size]
#         chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

def insert_data_in_chunks(df, table_name, engine, chunk_size):
    """
    Append `df` to `table_name` through `engine`, `chunk_size` rows per batch.

    Delegates to `DataFrame.to_sql` using multi-row INSERT statements
    (`method='multi'`). The DataFrame index is not written, and rows are
    appended when the table already exists.
    """
    to_sql_options = dict(
        chunksize=chunk_size,
        method='multi',
        if_exists='append',
        index=False,
    )
    df.to_sql(table_name, engine, **to_sql_options)


In [8]:
# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    """
    Build and store one geohash-5 OD table per day from `start_date` to `end_date`
    (both inclusive).

    For each day the aggregate from `process_day` is written to a table named
    `od_<country>_<YYYYMMDD>_agg5_10` in the `od_matrix_10` schema of the
    `dedicated` catalog. A failure on one day is logged and recorded, and the
    loop continues with the next day.

    Parameters
    ----------
    start_date, end_date : datetime.datetime
        First and last day to process.
    country_code : str
        Passed to `process_day`; its lower-cased form is part of the table name.
    sql_engine : object
        Source engine handed to `process_day`.

    Returns
    -------
    list of str
        The 'YYYYMMDD' dates that could not be processed (empty when all succeeded).
    """
    start_time = time.time()  # Record start time before processing loop
    errored_dates = []  # List to store dates that encounter errors

    # The output engine does not depend on the date, so build it once rather
    # than once per loop iteration.
    output_schema_name = "od_matrix_10"
    con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        try:
            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5_10"
            filtered_df5 = process_day(event_date, country_code, sql_engine)

            # process_day reports a failed day as a DataFrame without columns.
            # Detect that explicitly, before any table is created, instead of
            # relying on a KeyError from the dtype cast further down.
            if 'trip_count' not in filtered_df5.columns:
                raise ValueError("process_day returned no aggregated columns")

            # Create the SQL table with the correct name for 5-level geohash
            create_table_query_5 = f"""
            CREATE TABLE IF NOT EXISTS {final_table_5} (
                start_geohash5 varchar,
                end_geohash5 varchar,
                trip_count bigint,
                m_duration_min double,
                mdn_duration_min double,
                sd_duration_min double,
                m_length_m double,
                mdn_length_m double,
                sd_length_m double,
                m_points_no double,
                mdn_points_no double,
                sd_points_no double
            )
            WITH (
              bucketed_by = ARRAY['end_geohash5'],
              bucket_count = 30
            )
            """

            with con.connect() as connection:
                connection.execute(create_table_query_5)

            # Insert data into the table with the correct name
            if not filtered_df5.empty:
                filtered_df5 = filtered_df5.astype({
                    'trip_count': 'int'
                })
                insert_data_in_chunks(filtered_df5, final_table_5, con, 1000)
                logging.info(f"Data inserted into {final_table_5}")
            else:
                logging.info(f"No data to insert for {final_table_5} for 5-level geohash")
        except Exception as e:
            logging.error(f"Failed to process data for date {event_date}: {e}")
            errored_dates.append(event_date)  # Record the errored date

        # Move to the next day
        current_date += timedelta(days=1)

    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

    # Log the errored dates
    if errored_dates:
        logging.info(f"Errored dates: {', '.join(errored_dates)}")
    else:
        logging.info("No errors encountered during processing.")

    # Returned as well as logged so a follow-up run can be driven from the list.
    return errored_dates


In [9]:
# Build the daily OD tables for 'MX' from 2019-03-01 through 2019-12-31 (end date included).
# NOTE(review): the log captured under this cell starts at 20190201, so it appears to
# come from a run with an earlier start date than the one written here.
process_date_range(
    start_date=datetime(2019, 3, 1),
    end_date=datetime(2019, 12, 31),
    country_code='MX',
    sql_engine=sql_engine,
)

2024-06-18 01:50:45,036 - INFO - Executing SQL query for date 20190201
2024-06-18 01:51:58,690 - INFO - Data inserted into od_mx_20190201_agg5_10
2024-06-18 01:52:18,508 - INFO - Executing SQL query for date 20190202
2024-06-18 01:53:29,075 - INFO - Data inserted into od_mx_20190202_agg5_10
2024-06-18 01:53:50,609 - INFO - Executing SQL query for date 20190203
2024-06-18 01:54:54,513 - INFO - Data inserted into od_mx_20190203_agg5_10
2024-06-18 01:55:12,764 - INFO - Executing SQL query for date 20190204
2024-06-18 01:56:12,505 - INFO - Data inserted into od_mx_20190204_agg5_10
2024-06-18 01:56:34,532 - INFO - Executing SQL query for date 20190205
2024-06-18 01:57:49,980 - INFO - Data inserted into od_mx_20190205_agg5_10
2024-06-18 01:58:12,215 - INFO - Executing SQL query for date 20190206
2024-06-18 01:59:27,151 - INFO - Data inserted into od_mx_20190206_agg5_10
2024-06-18 01:59:51,128 - INFO - Executing SQL query for date 20190207
2024-06-18 02:01:05,499 - INFO - Data inserted into o

# Dealing with missing dates

In [8]:
# Configure the root logger: INFO and above, one "<time> - <level> - <message>" line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Build the geohash-5 origin/destination aggregate for one day, tagged with its date.

    Reads the day's distinct trajectories whose start and end country both equal
    `country_code`, encodes start and end coordinates as precision-5 geohashes
    (None where a coordinate is missing), aggregates per (start, end) geohash
    pair, keeps pairs whose trip count is greater than 9, and adds a
    `local_date` column holding `event_date`.

    Parameters
    ----------
    event_date : str
        Day to read; interpolated unquoted into the query's `event_date = ...`
        predicate and copied into the `local_date` column.
    country_code : str
        Value matched against both `start_country` and `end_country`.
    sql_engine : object
        Anything exposing `read_sql(query) -> pandas.DataFrame`.

    Returns
    -------
    pandas.DataFrame
        The filtered aggregate, or a bare `pd.DataFrame()` when the query returns
        no rows or when any step raises. Errors are logged with the name of the
        step that failed and are not re-raised.
    """
    def encode_geohash5(lats, lngs):
        # One geohash per coordinate pair; None where either coordinate is missing.
        return [
            geohash.encode(lat, lng, precision=5)
            if pd.notnull(lat) and pd.notnull(lng) else None
            for lat, lng in zip(lats, lngs)
        ]

    # Name of the step in progress; used to build the error message on failure.
    stage = "processing data"
    try:
        # Logged before the query runs so the timestamp marks the start of the read.
        logging.info(f"Executing SQL query for date {event_date}")

        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT DISTINCT
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Checked before encoding: there is nothing to encode or aggregate.
        if pe_tj_df.empty:
            logging.info(f"No data to aggregate for date {event_date}")
            return pd.DataFrame()

        # Encode geohashes
        stage = "encoding geohashes"
        pe_tj_df['start_geohash5'] = encode_geohash5(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
        pe_tj_df['end_geohash5'] = encode_geohash5(pe_tj_df['end_lat'], pe_tj_df['end_lng'])

        # Aggregate per origin/destination pair; groupby drops rows whose key is
        # missing, which removes the rows that could not be encoded.
        stage = "aggregating data"
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()
        # Flatten the two-level column index produced by the multi-function agg.
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Filter aggregated data
        stage = "filtering or assigning data"
        # .copy() so adding local_date does not write into a slice of aggregated_df5.
        filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9].copy()
        filtered_df5['local_date'] = event_date
        filtered_df5 = filtered_df5.astype({'trip_count': 'int'})
        return filtered_df5

    except Exception as e:
        logging.error(f"Error {stage} for date {event_date}: {e}")
        return pd.DataFrame()


In [9]:
# Specific dates to (re)process, as 'YYYYMMDD' strings
date_list = [
    '20190107', '20190131', '20190225', '20190301', '20190303', '20190312', 
    '20190313', '20190315', '20190317', '20190331', '20190401', '20190410', 
    '20190423', '20190430', '20190504', '20190518', '20190601', '20190611', 
    '20190616', '20190624', '20190625', '20190630', '20190701', '20190720', 
    '20190806', '20190808', '20190824', '20190902', '20190912', '20191010', 
    '20191013', '20191015', '20191103', '20191106', '20191110', '20191117', 
    '20191125', '20191208', '20191221', '20191227', '20191231'
]

country_code = 'MX'

# Collect the non-empty per-day results and concatenate once at the end;
# concatenating inside the loop re-copies every earlier day on each iteration.
daily_frames = []
for event_date in date_list:
    filtered_df5 = process_day(event_date, country_code, sql_engine)
    if not filtered_df5.empty:
        daily_frames.append(filtered_df5)

# pd.concat raises on an empty list, so fall back to an empty frame when no day
# produced data. Each day's own row index is kept, as before.
all_days_df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()


2024-06-25 22:18:16,921 - INFO - Executing SQL query for date 20190107
2024-06-25 22:18:56,303 - INFO - Executing SQL query for date 20190131
2024-06-25 22:19:35,295 - INFO - Executing SQL query for date 20190225
2024-06-25 22:20:16,920 - INFO - Executing SQL query for date 20190301
2024-06-25 22:20:54,602 - INFO - Executing SQL query for date 20190303
2024-06-25 22:21:33,736 - INFO - Executing SQL query for date 20190312
2024-06-25 22:22:19,809 - INFO - Executing SQL query for date 20190313
2024-06-25 22:23:02,038 - INFO - Executing SQL query for date 20190315
2024-06-25 22:23:39,676 - INFO - Executing SQL query for date 20190317
2024-06-25 22:24:25,250 - INFO - Executing SQL query for date 20190331
2024-06-25 22:24:59,839 - INFO - Executing SQL query for date 20190401
2024-06-25 22:25:40,625 - INFO - Executing SQL query for date 20190410
2024-06-25 22:26:21,936 - INFO - Executing SQL query for date 20190423
2024-06-25 22:27:03,894 - INFO - Executing SQL query for date 20190430
2024-0

In [10]:
# Show the combined per-day aggregates (the rendered output elides the middle rows).
all_days_df

Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no,local_date
6,9emmx,9emmx,13,57.775641,11.300000,91.727256,548.178805,427.810485,626.120861,3.230769,3.0,1.423250,20190107
29,9emt1,9emt1,11,53.127273,12.383333,69.829574,2519.050390,327.971030,6568.315633,4.454545,3.0,2.769969,20190107
39,9emt3,9emt3,105,64.662222,28.100000,91.496411,2268.257680,956.767012,4366.371742,6.180952,4.0,5.967510,20190107
53,9emtk,9emtk,58,40.962356,20.583333,60.703146,1211.532874,586.470368,1773.177908,4.431034,3.0,2.878304,20190107
62,9emu4,9emu4,14,43.578571,28.000000,42.458430,1458.265822,437.202134,2764.783262,5.785714,5.0,3.724732,20190107
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33371,d5f2b,d5f2b,166,43.662149,18.058333,62.739475,478.750023,277.756082,1142.851284,5.662651,4.0,11.174947,20191231
33376,d5f2c,d5f2f,10,39.635000,18.700000,47.332935,2997.426362,480.980488,7921.177226,4.300000,4.0,1.636392,20191231
33389,d5f2f,d5f2c,11,31.774242,12.083333,60.396571,1518.747992,528.967291,3384.258606,4.454545,3.0,2.423371,20191231
33391,d5f2f,d5f2f,13,38.432051,13.383333,55.416803,759.649485,195.201728,1278.907148,4.384615,4.0,2.959123,20191231


In [11]:
# Write the combined per-day aggregates to CSV, without the DataFrame index column.
all_days_df.to_csv(
    os.path.join('/home/jovyan/Data/TJ/', 'MX_MissingDates.csv'),
    index=False,
)
