In [1]:
# Reload the `sql` IPython extension (provides the %sql magic used in the next cell).
%reload_ext sql
# SqlMagic options. NOTE(review): meanings below follow the ipython-sql docs;
# confirm against the installed version.
#   autocommit=False  -> statements are not auto-committed
#   autolimit=0       -> no cap on the number of rows returned
#   autopandas=True   -> results come back as pandas DataFrames
#   displaylimit=200  -> at most 200 rows are rendered in cell output
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
# Point the %sql magic at the Trino endpoint on localhost:9090 (`cuebiq` in the URL path).
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [None]:
pip install python-geohash

Collecting python-geohash
  Using cached python_geohash-0.8.5-cp39-cp39-linux_x86_64.whl
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import os
import geohash
from datetime import datetime, timedelta
import logging

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time

class TrinoEngine():
    """
    Small convenience wrapper bundling two handles to the same Trino endpoint:
    a DB-API cursor (for statements whose rows are fetched directly) and a
    SQLAlchemy engine (for pandas-based reads).

    Parameters
    ----------
    host : str
        Trino coordinator host. Defaults to "localhost".
    port : int
        Trino coordinator port. Defaults to 9090.
    catalog : str
        Catalog used for both the DB-API connection and the engine URL.
        Defaults to "cuebiq".

    Attributes
    ----------
    conn : DB-API connection returned by `trino.dbapi.connect`; kept so that
        callers can close it explicitly when they are done.
    cur : cursor opened on `conn`.
    engine : SQLAlchemy engine built from "trino://{host}:{port}/{catalog}/".
    """
    def __init__(self, host:str="localhost", port:int=9090, catalog:str="cuebiq"):
        # The endpoint used to be hard-coded in two places; both handles are
        # now derived from the same arguments so they cannot drift apart.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")
    
    def execute_statement(self, query:str) -> list:
        """
        Execute `query` on the DB-API cursor and return every fetched row.

        Intended for statements such as CREATE / DROP.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Run `query` through the SQLAlchemy engine and return the full result
        as a single DataFrame (via `pd.read_sql`).
        """
        return pd.read_sql(query, self.engine)
    
    def read_sql_chunked(self, query:str, chunksize:int=1000):
        """
        Run `query` through the SQLAlchemy engine with `chunksize` forwarded
        to `pd.read_sql`, so the result is consumed as successive DataFrames
        of up to `chunksize` rows instead of one large frame.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

# Shared Trino client passed to the processing functions further down
sql_engine = TrinoEngine()

# Timestamped INFO-level logging for progress and error reporting
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Source schema(s), keyed by a short alias
schema_name = {
    'cda': 'cuebiq.paas_cda_pe_v3',
}
# Fully qualified name of the trajectory table inside the 'cda' schema
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# Main

In [7]:
# Function to process data for a single day
def process_day(event_date, country_code, sql_engine):
    """
    Build the geohash-5 origin-destination summary for one day and country.

    Reads the trajectories whose start and end country both equal
    `country_code` on `event_date`, encodes start and end points as
    precision-5 geohashes, and aggregates per (start, end) pair.

    Parameters
    ----------
    event_date : value inserted unquoted into the `event_date = ...` filter
        (e.g. 20190304 or '20190304').
    country_code : str
        Inserted as a quoted literal into the country filters.
    sql_engine : object exposing `read_sql(query) -> pd.DataFrame`.

    Both values are interpolated straight into the SQL text, so pass only
    trusted values.

    Returns
    -------
    pd.DataFrame
        One row per origin-destination pair with more than 9 trips, with the
        columns listed in `output_columns` below. If the query returns no
        rows, an empty frame with those same columns is returned. If anything
        raises, the error is logged and a column-less empty DataFrame is
        returned instead.
    """
    output_columns = ['start_geohash5', 'end_geohash5', 'trip_count', 
                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                      'm_points_no', 'mdn_points_no', 'sd_points_no']
    try:
        # Log before the query runs so the timestamp marks when it started
        logging.info(f"Executing SQL query for date {event_date}")

        # Read data from the SQL table
        pe_tj_df = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                start_lat,
                start_lng,
                end_lat,
                end_lng,
                duration_minutes,
                length_meters,
                number_of_points
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # A day without trajectories is a valid "no data" result, not an error:
        # return an empty frame that still carries the expected columns.
        if pe_tj_df.empty:
            logging.info(f"No trajectories found for date {event_date}")
            return pd.DataFrame(columns=output_columns)

        # Encode geohashes column-wise (same values as a row-wise apply, without
        # building a Series per row)
        pe_tj_df['start_geohash5'] = [
            geohash.encode(lat, lng, precision=5)
            for lat, lng in zip(pe_tj_df['start_lat'], pe_tj_df['start_lng'])
        ]
        pe_tj_df['end_geohash5'] = [
            geohash.encode(lat, lng, precision=5)
            for lat, lng in zip(pe_tj_df['end_lat'], pe_tj_df['end_lng'])
        ]

        # Per origin-destination pair: number of rows (trips) plus
        # mean / median / std of duration, length and point count
        aggregated_df5 = pe_tj_df.groupby(['start_geohash5', 'end_geohash5']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()
        # Flatten the two-level column index produced by the multi-stat agg
        aggregated_df5.columns = output_columns

        # Keep only pairs with more than 9 trips
        filtered_df5 = aggregated_df5.loc[aggregated_df5['trip_count'] > 9]
        return filtered_df5

    except Exception as e:
        logging.error(f"Error processing data for date {event_date}: {e}")
        return pd.DataFrame()

# Function to insert data in chunks
def insert_data_in_chunks(df, table_name, engine, chunk_size):
    """
    Append `df` to `table_name` in consecutive slices of `chunk_size` rows.

    Each slice is written with its own `DataFrame.to_sql` call using
    `if_exists='append'` and `method='multi'` (one multi-row INSERT per
    slice), without writing the index.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to insert; an empty frame results in no writes.
    table_name : str
        Target table name passed to `to_sql`.
    engine : connectable accepted by `DataFrame.to_sql`.
    chunk_size : int
        Maximum number of rows per slice; must be at least 1.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1. A negative value used to produce
        an empty range and silently insert nothing.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql(table_name, engine, index=False, if_exists='append', method='multi')

In [10]:
# Test with 502 handling

# Main processing loop
def process_date_range(start_date, end_date, country_code, sql_engine):
    """
    Run the daily origin-destination aggregation for every day in
    [start_date, end_date] (inclusive) and store each day in its own table.

    For each day the function calls `process_day`, creates the table
    `od_{country}_{YYYYMMDD}_agg5_10` (if missing) in the `presence_data`
    schema of the `dedicated` catalog, and appends the rows in chunks of 200.
    A failure on one day is logged and recorded, and processing continues
    with the next day. Total run time and the list of failed dates are logged
    at the end.

    Parameters
    ----------
    start_date, end_date : datetime
        First and last day to process.
    country_code : str
        Forwarded to `process_day`; lower-cased for the table name.
    sql_engine : object
        Forwarded to `process_day` for reading the source data.

    Returns
    -------
    None
    """
    start_time = time.time()  # Record start time before processing loop
    errored_dates = []  # List to store dates that encounter errors

    # output_schema_name = "od_matrix_10"
    output_schema_name = "presence_data"
    # Output engine: built on first use and then reused for every day, instead
    # of being re-created per iteration. Creation stays inside the try block
    # so a failure there is still reported per date rather than raised.
    con = None

    current_date = start_date
    while current_date <= end_date:
        event_date = current_date.strftime('%Y%m%d')
        try:
            filtered_df5 = process_day(event_date, country_code, sql_engine)

            # process_day signals a failure by returning a column-less frame.
            # Treat that as an error up front, before any table is created,
            # rather than tripping over a KeyError in astype() further down.
            if 'trip_count' not in filtered_df5.columns:
                raise RuntimeError("process_day returned no result (see the error logged above)")

            final_table_5 = f"od_{country_code.lower()}_{event_date}_agg5_10"
            if con is None:
                con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

            # Create the SQL table with the correct name for 5-level geohash
            create_table_query_5 = f"""
            CREATE TABLE IF NOT EXISTS {final_table_5} (
                start_geohash5 varchar,
                end_geohash5 varchar,
                trip_count bigint,
                m_duration_min double,
                mdn_duration_min double,
                sd_duration_min double,
                m_length_m double,
                mdn_length_m double,
                sd_length_m double,
                m_points_no double,
                mdn_points_no double,
                sd_points_no double
            )
            WITH (
              bucketed_by = ARRAY['end_geohash5'],
              bucket_count = 70
            )
            """

            with con.connect() as connection:
                connection.execute(create_table_query_5)
                
            # The table declares trip_count as bigint; make the frame match
            filtered_df5 = filtered_df5.astype({
                'trip_count': 'int'
            })
            
            # Insert data into the table with the correct name
            if not filtered_df5.empty:
                insert_data_in_chunks(filtered_df5, final_table_5, con, 200)
                logging.info(f"Data inserted into {final_table_5}")
            else:
                logging.info(f"No data to insert for {final_table_5} for 5-level geohash")
        except Exception as e:
            logging.error(f"Failed to process data for date {event_date}: {e}")
            errored_dates.append(event_date)  # Record the errored date

        # Move to the next day
        current_date += timedelta(days=1)
    
    end_time = time.time()  # Record end time after processing loop
    total_time = end_time - start_time
    logging.info(f"Total processing time: {total_time:.2f} seconds")

    # Log the errored dates
    if errored_dates:
        logging.info(f"Errored dates: {', '.join(errored_dates)}")
    else:
        logging.info("No errors encountered during processing.")


In [11]:
# Single-day run (2019-03-04) for country code 'IN'
process_date_range(
    start_date=datetime(2019, 3, 4),
    end_date=datetime(2019, 3, 4),
    country_code='IN',
    sql_engine=sql_engine,
)

2024-06-20 21:28:53,828 - INFO - Executing SQL query for date 20190304
2024-06-20 21:29:54,661 - INFO - failed after 3 attempts
2024-06-20 21:29:54,996 - INFO - failed after 3 attempts
2024-06-20 21:29:54,997 - ERROR - Failed to process data for date 20190304: error 502: bad gateway
2024-06-20 21:29:54,997 - INFO - Total processing time: 74.36 seconds
2024-06-20 21:29:54,998 - INFO - Errored dates: 20190304


In [53]:
# Ad-hoc check: pull one day of trajectories for a single country
country_code, event_date = 'IN', 20191109

# Same projection and filters as the per-day query: trips that both start
# and end in `country_code` on `event_date`
single_day_query = f"""
    SELECT 
        cuebiq_id,
        start_lat,
        start_lng,
        end_lat,
        end_lng,
        duration_minutes,
        length_meters,
        number_of_points
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
pe_tj_df = sql_engine.read_sql(single_day_query)

pe_tj_df

Unnamed: 0,cuebiq_id,start_lat,start_lng,end_lat,end_lng,duration_minutes,length_meters,number_of_points
0,2578212103,31.510998,75.899543,31.510941,75.899569,579.233333,8476.455056,12
1,2578155448,22.466588,71.048813,22.466588,71.048813,214.950000,201.807830,4
2,2578110769,28.717822,77.219517,28.683205,77.204921,221.483333,4328.210385,5
3,2578110769,28.683205,77.204921,28.682530,77.206540,10.166667,175.157676,3
4,2578083192,28.826117,78.531765,29.200157,78.959050,91.966667,77508.600372,17
...,...,...,...,...,...,...,...,...
188924,2134256059,17.443464,78.463963,17.444313,78.462857,2.183333,150.778878,2
188925,2134256059,17.444313,78.462857,17.443592,78.463494,8.283333,591.979270,4
188926,2024708335,17.529377,78.482651,17.529297,78.482715,640.550000,1915.862259,31
188927,2024708335,17.529297,78.482715,17.529228,78.482685,371.500000,4939.743312,68


In [50]:
# file_name = 'TJ_IN_20190428.csv'
# pe_tj_df.to_csv('/home/jovyan/Data/TJ/IN/' + file_name, index=False)

In [54]:
# Tag every trip with the precision-5 geohash of its start and end point
encode_start = lambda trip: geohash.encode(trip['start_lat'], trip['start_lng'], precision=5)
encode_end = lambda trip: geohash.encode(trip['end_lat'], trip['end_lng'], precision=5)
pe_tj_df['start_geohash5'] = pe_tj_df.apply(encode_start, axis=1)
pe_tj_df['end_geohash5'] = pe_tj_df.apply(encode_end, axis=1)

# Per origin-destination pair: row (trip) count plus mean / median / std
# of duration, length and number of points
summary_stats = ['mean', 'median', 'std']
od_pairs = pe_tj_df.groupby(['start_geohash5', 'end_geohash5'])
aggregated_df5 = od_pairs.agg({
    'cuebiq_id': 'count',
    'duration_minutes': summary_stats,
    'length_meters': summary_stats,
    'number_of_points': summary_stats
}).reset_index()
# Replace the two-level column index from agg() with flat names
aggregated_df5.columns = [
    'start_geohash5', 'end_geohash5', 'trip_count',
    'm_duration_min', 'mdn_duration_min', 'sd_duration_min',
    'm_length_m', 'mdn_length_m', 'sd_length_m',
    'm_points_no', 'mdn_points_no', 'sd_points_no',
]

# Keep pairs with more than 9 trips and store the count as an integer
has_enough_trips = aggregated_df5['trip_count'] > 9
filtered_df5 = aggregated_df5.loc[has_enough_trips].astype({'trip_count': 'int'})

filtered_df5

Unnamed: 0,start_geohash5,end_geohash5,trip_count,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
4,t9qxz,t9qxz,14,64.471429,15.708333,134.230909,3607.570955,690.369261,6265.760908,4.071429,3.0,2.335818
45,t9qz8,t9qz8,14,75.270238,65.325000,63.911568,571.681225,458.087500,520.744839,4.500000,4.0,3.180469
86,t9rn3,t9rn3,12,135.525000,38.558333,277.979627,1057.045158,998.828855,1141.320494,4.833333,3.5,3.927371
93,t9rn6,t9rn6,13,75.692308,46.033333,80.725737,1140.178636,268.848004,2018.288340,2.692308,2.0,0.947331
108,t9rn9,t9rn9,37,74.747748,20.300000,174.512479,4809.908558,954.147143,13833.808092,5.432432,4.0,5.434154
...,...,...,...,...,...,...,...,...,...,...,...,...
45301,whghr,whghr,11,38.448485,39.583333,29.046395,2922.908713,1190.227930,5372.684859,4.181818,4.0,2.136267
45312,whgkr,whgkr,25,76.657333,31.950000,93.563187,1916.595387,1117.115806,1985.959775,5.600000,5.0,3.013857
45321,whgs2,whgs2,13,83.200000,101.766667,79.663735,2045.042773,565.552162,3877.985647,4.307692,2.0,3.065524
45337,whgu3,whgu3,21,73.632540,40.366667,74.531991,5506.486950,1235.796937,18741.372398,4.571429,4.0,2.314550


In [55]:
# Write the filtered OD summary to CSV; the file name encodes country and date
agg_file_name = f"od_{country_code.lower()}_{event_date}_agg5_10"
agg_file_path = '/home/jovyan/Data/TJ/IN/' + agg_file_name + '.csv'
filtered_df5.to_csv(agg_file_path, index=False)