In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import time
import geohash
import logging
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [9]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# Export to local

In [7]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [8]:
# Function to process data for a specific date
def process_data_for_date(event_date, country_code, export_path):
    errors = []  # List to keep track of errors
    
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through 24 hours in increments of 3 hours
    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {start_hour}-{end_hour}")

        try:
            # Fetch the data for the specified event date and country code
            pe_tj_df3 = sql_engine.read_sql(
                f"""
                SELECT 
                    cuebiq_id,
                    duration_minutes,
                    length_meters,
                    number_of_points,
                    TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                    EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE 
                    event_date = {event_date}
                    AND end_country = '{country_code}' 
                    AND start_country = '{country_code}' 
                """
            )

            # Filter the DataFrame for the current 3-hour interval
            logging.info(f"Processing data for date: {event_date}, hours: {start_hour}-{end_hour}")
            filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= start_hour) & (pe_tj_df3['event_hour'] < end_hour)]

            # Add user numbers to the aggregated data for geohash3
            aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

            # Append the results to the CSV file for geohash3
            logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv3_file_path):
                aggregated_df3.to_csv(csv3_file_path, index=False)
            else:
                aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

            # Add user numbers to the aggregated data for geohash5
            aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                           f" {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

            # Append the results to the CSV file for geohash5
            logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {start_hour}-{end_hour}")
            if not os.path.isfile(csv5_file_path):
                aggregated_df5.to_csv(csv5_file_path, index=False)
            else:
                aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)
        
        except Exception as e:
            error_message = f"Error processing date: {event_date}, hours: {start_hour}-{end_hour} - {str(e)}"
            logging.error(error_message)
            errors.append(error_message)
    
    return errors

# Function to process data for a range of dates
def process_data_for_date_range(start_date, end_date, country_code, export_path):
    current_date = start_date
    all_errors = []
    while current_date <= end_date:
        event_date = current_date.strftime("%Y%m%d")
        logging.info(f"Processing data for date: {event_date}")
        errors = process_data_for_date(event_date, country_code, export_path)
        all_errors.extend(errors)
        current_date += timedelta(days=1)
    
    # Log any errors that occurred
    if all_errors:
        logging.info("Errors occurred during processing:")
        for error in all_errors:
            logging.info(error)


In [11]:
# Specify the date range
start_date = datetime.strptime("20190809", "%Y%m%d")
end_date = datetime.strptime("20190819", "%Y%m%d")
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'


# Process data for the specified date range
process_data_for_date_range(start_date, end_date, country_code, export_path)


2024-07-02 19:50:40,770 - INFO - Processing data for date: 20190809
2024-07-02 19:50:40,771 - INFO - Querying data for date: 20190809, hours: 0-3
2024-07-02 19:51:16,147 - INFO - Processing data for date: 20190809, hours: 0-3
2024-07-02 19:51:16,159 - INFO - Exporting aggregated data (geohash3) for date: 20190809, hours: 0-3
2024-07-02 19:51:16,183 - INFO - Exporting aggregated data (geohash5) for date: 20190809, hours: 0-3
2024-07-02 19:51:16,197 - INFO - Querying data for date: 20190809, hours: 3-6
2024-07-02 19:51:41,186 - INFO - Processing data for date: 20190809, hours: 3-6
2024-07-02 19:51:41,204 - INFO - Exporting aggregated data (geohash3) for date: 20190809, hours: 3-6
2024-07-02 19:51:41,233 - INFO - Exporting aggregated data (geohash5) for date: 20190809, hours: 3-6
2024-07-02 19:51:41,248 - INFO - Querying data for date: 20190809, hours: 6-9
2024-07-02 19:52:06,038 - INFO - Processing data for date: 20190809, hours: 6-9
2024-07-02 19:52:06,071 - INFO - Exporting aggregated 

# Fillin Gaps

Missing info: 
Missing hours for each date:
         Date        Missing Hours
0  2019-01-02              [0, 21]
1  2019-01-03                  [0]
2  2019-01-07                  [0]
3  2019-01-09                  [0]
4  2019-01-10                  [0]
5  2019-01-11                  [0]
6  2019-01-14                  [0]
7  2019-01-15                  [0]
8  2019-01-16                  [0]
9  2019-01-18                  [0]
10 2019-01-21                  [0]
11 2019-01-22                  [0]
12 2019-01-25                  [0]
13 2019-01-28                  [0]
14 2019-01-29                  [0]
15 2019-02-02                  [0]
16 2019-04-04                  [0]
17 2019-04-12                  [3]
18 2019-04-29                  [0]
19 2019-05-26                 [21]
20 2019-06-06                 [12]
21 2019-08-01             [18, 21]
22 2019-08-31                 [18]
23 2019-09-08                  [3]
24 2019-10-17                 [21]
25 2019-10-22  [9, 12, 15, 18, 21]
26 2019-10-28                  [6]
27 2019-11-26                  [0]
28 2019-11-27                  [0]
29 2019-11-28                  [0]
30 2019-12-02                  [0]
31 2019-12-03                  [0]
32 2019-12-05                  [0]
33 2019-12-06                  [0]
34 2019-12-07                  [0]
35 2019-12-08                  [0]
36 2019-12-09                  [0]
37 2019-12-10                  [0]
38 2019-12-11                  [0]
39 2019-12-12                  [0]
40 2019-12-13                  [0]
41 2019-12-17                  [0]
42 2019-12-18                  [0]
43 2019-12-19                  [0]
44 2019-12-20                  [0]
45 2019-12-21                  [0]
46 2019-12-23                  [0]
47 2019-12-24                  [0]
48 2019-12-25                  [0]
49 2019-12-26                  [0]
50 2019-12-27                  [0]
51 2019-12-28                  [0]
52 2019-12-29                  [0]
53 2019-12-30                  [0]

In [12]:
import time
import pandas as pd
import os
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a specific date and time intervals
def process_data_for_date(event_date, country_code, export_path, start_hour, end_hour):
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}3_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}3_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through the specified hours in increments of 3 hours
    for hour in range(start_hour, end_hour, 3):
        next_hour = hour + 3  # end hour for the 3-hour interval
        logging.info(f"Querying data for date: {event_date}, hours: {hour}-{next_hour}")

        # Fetch the data for the specified event date and country code
        pe_tj_df3 = sql_engine.read_sql(
            f"""
            SELECT 
                cuebiq_id,
                duration_minutes,
                length_meters,
                number_of_points,
                TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE 
                event_date = {event_date}
                AND end_country = '{country_code}' 
                AND start_country = '{country_code}' 
            """
        )

        # Filter the DataFrame for the current 3-hour interval
        logging.info(f"Processing data for date: {event_date}, hours: {hour}-{next_hour}")
        filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= hour) & (pe_tj_df3['event_hour'] < next_hour)]

        # Add user numbers to the aggregated data for geohash3
        aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

        # Append the results to the CSV file for geohash3
        logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv3_file_path):
            aggregated_df3.to_csv(csv3_file_path, index=False)
        else:
            aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

        # Add user numbers to the aggregated data for geohash5
        aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
            'cuebiq_id': 'count',
            'duration_minutes': ['mean', 'median', 'std'],
            'length_meters': ['mean', 'median', 'std'],
            'number_of_points': ['mean', 'median', 'std']
        }).reset_index()

        # Flatten the MultiIndex columns
        aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                  'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                  'm_length_m', 'mdn_length_m', 'sd_length_m',
                                  'm_points_no', 'mdn_points_no', 'sd_points_no']

        # Add the local_time column
        aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                       f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

        # Filter out rows where trip_count is less than or equal to 9
        aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

        # Append the results to the CSV file for geohash5
        logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {hour}-{next_hour}")
        if not os.path.isfile(csv5_file_path):
            aggregated_df5.to_csv(csv5_file_path, index=False)
        else:
            aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)


In [13]:
# Specify the date and initial time interval
event_date = "20190102"
start_hour = 0
end_hour = 24 
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:27:07,056 - INFO - Querying data for date: 20190102, hours: 0-3
2024-07-02 22:27:56,958 - INFO - Processing data for date: 20190102, hours: 0-3
2024-07-02 22:27:56,969 - INFO - Exporting aggregated data (geohash3) for date: 20190102, hours: 0-3
2024-07-02 22:27:56,989 - INFO - Exporting aggregated data (geohash5) for date: 20190102, hours: 0-3
2024-07-02 22:27:56,991 - INFO - Querying data for date: 20190102, hours: 3-6
2024-07-02 22:28:17,642 - INFO - Processing data for date: 20190102, hours: 3-6
2024-07-02 22:28:17,655 - INFO - Exporting aggregated data (geohash3) for date: 20190102, hours: 3-6
2024-07-02 22:28:17,678 - INFO - Exporting aggregated data (geohash5) for date: 20190102, hours: 3-6
2024-07-02 22:28:17,684 - INFO - Querying data for date: 20190102, hours: 6-9
2024-07-02 22:28:36,134 - INFO - Processing data for date: 20190102, hours: 6-9
2024-07-02 22:28:36,156 - INFO - Exporting aggregated data (geohash3) for date: 20190102, hours: 6-9
2024-07-02 22:28:36,1

In [14]:
# Specify the date and initial time interval
event_date = "20190412"
start_hour = 3
end_hour = 6
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:30:16,327 - INFO - Querying data for date: 20190412, hours: 3-6
2024-07-02 22:31:01,187 - INFO - Processing data for date: 20190412, hours: 3-6
2024-07-02 22:31:01,206 - INFO - Exporting aggregated data (geohash3) for date: 20190412, hours: 3-6
2024-07-02 22:31:01,238 - INFO - Exporting aggregated data (geohash5) for date: 20190412, hours: 3-6


In [15]:
# Specify the date and initial time interval
event_date = "20190526"
start_hour = 21
end_hour = 24
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:31:01,296 - INFO - Querying data for date: 20190526, hours: 21-24
2024-07-02 22:31:33,634 - INFO - Processing data for date: 20190526, hours: 21-24
2024-07-02 22:31:33,651 - INFO - Exporting aggregated data (geohash3) for date: 20190526, hours: 21-24
2024-07-02 22:31:33,678 - INFO - Exporting aggregated data (geohash5) for date: 20190526, hours: 21-24


In [16]:
# Specify the date and initial time interval
event_date = "20190606"
start_hour = 12
end_hour = 15
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:31:33,717 - INFO - Querying data for date: 20190606, hours: 21-24
2024-07-02 22:32:09,983 - INFO - Processing data for date: 20190606, hours: 21-24
2024-07-02 22:32:10,003 - INFO - Exporting aggregated data (geohash3) for date: 20190606, hours: 21-24
2024-07-02 22:32:10,052 - INFO - Exporting aggregated data (geohash5) for date: 20190606, hours: 21-24


In [17]:
# Specify the date and initial time interval
event_date = "20190801"
start_hour = 18
end_hour = 24
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:32:10,096 - INFO - Querying data for date: 20190801, hours: 18-21
2024-07-02 22:32:50,603 - INFO - Processing data for date: 20190801, hours: 18-21
2024-07-02 22:32:50,635 - INFO - Exporting aggregated data (geohash3) for date: 20190801, hours: 18-21
2024-07-02 22:32:50,676 - INFO - Exporting aggregated data (geohash5) for date: 20190801, hours: 18-21
2024-07-02 22:32:50,699 - INFO - Querying data for date: 20190801, hours: 21-24
2024-07-02 22:33:26,361 - INFO - Processing data for date: 20190801, hours: 21-24
2024-07-02 22:33:26,388 - INFO - Exporting aggregated data (geohash3) for date: 20190801, hours: 21-24
2024-07-02 22:33:26,417 - INFO - Exporting aggregated data (geohash5) for date: 20190801, hours: 21-24


In [18]:
# Specify the date and initial time interval
event_date = "20190831"
start_hour = 18
end_hour = 21
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:33:26,475 - INFO - Querying data for date: 20190831, hours: 18-21
2024-07-02 22:34:08,674 - INFO - Processing data for date: 20190831, hours: 18-21
2024-07-02 22:34:08,715 - INFO - Exporting aggregated data (geohash3) for date: 20190831, hours: 18-21
2024-07-02 22:34:08,773 - INFO - Exporting aggregated data (geohash5) for date: 20190831, hours: 18-21


In [19]:
# Specify the date and initial time interval
event_date = "20190908"
start_hour = 3
end_hour = 6
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)


2024-07-02 22:34:08,858 - INFO - Querying data for date: 20190908, hours: 3-6
2024-07-02 22:34:50,203 - INFO - Processing data for date: 20190908, hours: 3-6
2024-07-02 22:34:50,223 - INFO - Exporting aggregated data (geohash3) for date: 20190908, hours: 3-6
2024-07-02 22:34:50,265 - INFO - Exporting aggregated data (geohash5) for date: 20190908, hours: 3-6


In [20]:
# Specify the date and initial time interval
event_date = "20191017"
start_hour = 21
end_hour = 24
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)

2024-07-02 22:34:50,343 - INFO - Querying data for date: 20191017, hours: 21-24
2024-07-02 22:35:33,092 - INFO - Processing data for date: 20191017, hours: 21-24
2024-07-02 22:35:33,115 - INFO - Exporting aggregated data (geohash3) for date: 20191017, hours: 21-24
2024-07-02 22:35:33,154 - INFO - Exporting aggregated data (geohash5) for date: 20191017, hours: 21-24


In [21]:
# Specify the date and initial time interval
event_date = "20191022"
start_hour = 9
end_hour = 24
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)

2024-07-02 22:35:33,233 - INFO - Querying data for date: 20191022, hours: 9-12
2024-07-02 22:35:55,764 - INFO - Processing data for date: 20191022, hours: 9-12
2024-07-02 22:35:55,774 - INFO - Exporting aggregated data (geohash3) for date: 20191022, hours: 9-12
2024-07-02 22:35:55,784 - INFO - Exporting aggregated data (geohash5) for date: 20191022, hours: 9-12
2024-07-02 22:35:55,788 - INFO - Querying data for date: 20191022, hours: 12-15
2024-07-02 22:36:15,461 - INFO - Processing data for date: 20191022, hours: 12-15
2024-07-02 22:36:15,471 - INFO - Exporting aggregated data (geohash3) for date: 20191022, hours: 12-15
2024-07-02 22:36:15,480 - INFO - Exporting aggregated data (geohash5) for date: 20191022, hours: 12-15
2024-07-02 22:36:15,483 - INFO - Querying data for date: 20191022, hours: 15-18
2024-07-02 22:36:32,597 - INFO - Processing data for date: 20191022, hours: 15-18
2024-07-02 22:36:32,607 - INFO - Exporting aggregated data (geohash3) for date: 20191022, hours: 15-18
202

In [22]:
# Specify the date and initial time interval
event_date = "20191028"
start_hour = 6
end_hour = 9
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified date and time intervals
process_data_for_date(event_date, country_code, export_path, start_hour, end_hour)

2024-07-02 22:37:02,658 - INFO - Querying data for date: 20191028, hours: 6-9
2024-07-02 22:37:36,733 - INFO - Processing data for date: 20191028, hours: 6-9
2024-07-02 22:37:36,769 - INFO - Exporting aggregated data (geohash3) for date: 20191028, hours: 6-9
2024-07-02 22:37:36,820 - INFO - Exporting aggregated data (geohash5) for date: 20191028, hours: 6-9


In [10]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to process data for a specific date and time intervals
def process_data_for_date(event_dates, country_code, export_path, start_hour, end_hour):
    # Define the export file paths
    csv3_file = f"od_{country_code.lower()}3_agg3_3h.csv"
    csv5_file = f"od_{country_code.lower()}3_agg5_3h.csv"
    csv3_file_path = f"{export_path}{csv3_file}"
    csv5_file_path = f"{export_path}{csv5_file}"

    # Loop through each date
    for event_date in event_dates:
        # Loop through the specified hours in increments of 3 hours
        for hour in range(start_hour, end_hour, 3):
            next_hour = hour + 3  # end hour for the 3-hour interval
            logging.info(f"Querying data for date: {event_date}, hours: {hour}-{next_hour}")

            # Fetch the data for the specified event date and country code
            pe_tj_df3 = sql_engine.read_sql(
                f"""
                SELECT 
                    cuebiq_id,
                    duration_minutes,
                    length_meters,
                    number_of_points,
                    TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                    EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE 
                    event_date = {event_date}
                    AND end_country = '{country_code}' 
                    AND start_country = '{country_code}' 
                """
            )

            # Filter the DataFrame for the current 3-hour interval
            logging.info(f"Processing data for date: {event_date}, hours: {hour}-{next_hour}")
            filtered_df = pe_tj_df3[(pe_tj_df3['event_hour'] >= hour) & (pe_tj_df3['event_hour'] < next_hour)]

            # Add user numbers to the aggregated data for geohash3
            aggregated_df3 = filtered_df.groupby(['start_geohash3', 'end_geohash3', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df3.columns = ['start_geohash3', 'end_geohash3', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df3['local_time'] = aggregated_df3['local_date'].astype(str) + \
                                           f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df3 = aggregated_df3[aggregated_df3['trip_count'] > 9]

            # Append the results to the CSV file for geohash3
            logging.info(f"Exporting aggregated data (geohash3) for date: {event_date}, hours: {hour}-{next_hour}")
            if not os.path.isfile(csv3_file_path):
                aggregated_df3.to_csv(csv3_file_path, index=False)
            else:
                aggregated_df3.to_csv(csv3_file_path, mode='a', header=False, index=False)

            # Add user numbers to the aggregated data for geohash5
            aggregated_df5 = filtered_df.groupby(['start_geohash5', 'end_geohash5', 'local_date']).agg({
                'cuebiq_id': 'count',
                'duration_minutes': ['mean', 'median', 'std'],
                'length_meters': ['mean', 'median', 'std'],
                'number_of_points': ['mean', 'median', 'std']
            }).reset_index()

            # Flatten the MultiIndex columns
            aggregated_df5.columns = ['start_geohash5', 'end_geohash5', 'local_date', 'trip_count', 
                                      'm_duration_min', 'mdn_duration_min', 'sd_duration_min', 
                                      'm_length_m', 'mdn_length_m', 'sd_length_m',
                                      'm_points_no', 'mdn_points_no', 'sd_points_no']

            # Add the local_time column
            aggregated_df5['local_time'] = aggregated_df5['local_date'].astype(str) + \
                                           f" {hour:02d}:00:00 - {next_hour:02d}:00:00"

            # Filter out rows where trip_count is less than or equal to 9
            aggregated_df5 = aggregated_df5[aggregated_df5['trip_count'] > 9]

            # Append the results to the CSV file for geohash5
            logging.info(f"Exporting aggregated data (geohash5) for date: {event_date}, hours: {hour}-{next_hour}")
            if not os.path.isfile(csv5_file_path):
                aggregated_df5.to_csv(csv5_file_path, index=False)
            else:
                aggregated_df5.to_csv(csv5_file_path, mode='a', header=False, index=False)

In [11]:
# For all 0s:

# event_dates = ["20190103", "20190107", "20190109", "20190110", "20190111", "20190114"]
event_dates = [
     # "20190115", "20190116", "20190118", "20190121", 
    "20190122", 
     "20190125", "20190128", "20190129", "20190202", "20190404", 
     "20190429", "20191126", "20191127", "20191128", "20191202", 
     "20191203", "20191205", "20191206", "20191207", "20191208", 
     "20191209", "20191210", "20191211", "20191212", "20191213", 
     "20191217", "20191218", "20191219", "20191220", "20191221", 
     "20191223", "20191224", "20191225", "20191226", "20191227", 
     "20191228", "20191229", "20191230"]

start_hour = 0
end_hour = 3 
country_code = 'ID'
export_path = '/home/jovyan/Data/TJ/3h/'

# Process data for the specified dates and time intervals
process_data_for_date(event_dates, country_code, export_path, start_hour, end_hour)


2024-07-02 21:59:54,830 - INFO - Querying data for date: 20190122, hours: 0-3
2024-07-02 22:00:09,080 - INFO - Processing data for date: 20190122, hours: 0-3
2024-07-02 22:00:09,092 - INFO - Exporting aggregated data (geohash3) for date: 20190122, hours: 0-3
2024-07-02 22:00:09,112 - INFO - Exporting aggregated data (geohash5) for date: 20190122, hours: 0-3
2024-07-02 22:00:09,114 - INFO - Querying data for date: 20190125, hours: 0-3
2024-07-02 22:00:34,751 - INFO - Processing data for date: 20190125, hours: 0-3
2024-07-02 22:00:34,763 - INFO - Exporting aggregated data (geohash3) for date: 20190125, hours: 0-3
2024-07-02 22:00:34,779 - INFO - Exporting aggregated data (geohash5) for date: 20190125, hours: 0-3
2024-07-02 22:00:34,782 - INFO - Querying data for date: 20190128, hours: 0-3
2024-07-02 22:01:15,562 - INFO - Processing data for date: 20190128, hours: 0-3
2024-07-02 22:01:15,574 - INFO - Exporting aggregated data (geohash3) for date: 20190128, hours: 0-3
2024-07-02 22:01:15,5