In [1]:
# Load (or reload) the `sql` IPython extension and set its SqlMagic options for this session.
# NOTE(review): the exact meaning of each option depends on the installed
# ipython-sql / jupysql version — confirm against its documentation.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
# Open the %sql magic connection to the Trino endpoint on localhost:9090 (path "cuebiq").
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install geohash2

Note: you may need to restart the kernel to use updated packages.


In [4]:
import gc
import os
import time
import logging
import geohash2
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect 

In [5]:
# SQL engine
class TrinoEngine():
    """Pair of Trino handles: a DB-API cursor for statements and a SQLAlchemy
    engine for pandas reads.

    Args:
        host: Trino coordinator host name.
        port: Trino coordinator port.
        catalog: Catalog used for both the DB-API connection and the engine URL.

    The defaults reproduce the previously hard-coded endpoint
    (``localhost:9090``, catalog ``cuebiq``), so ``TrinoEngine()`` behaves as before.
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep a reference to the DB-API connection so callers can close it explicitly.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Build the engine URL from the same settings so the two handles cannot drift apart.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Runs ``query`` on the DB-API cursor and returns all rows from ``fetchall()``.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.

        Runs ``query`` through ``pd.read_sql`` on the SQLAlchemy engine and
        returns the whole result as one DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Chunked variant of ``read_sql``.

        Passes ``chunksize`` to ``pd.read_sql`` so the result is delivered in
        pieces of at most ``chunksize`` rows instead of a single DataFrame.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

# Shared engine instance; the query cells below read data through sql_engine.read_sql(...).
sql_engine = TrinoEngine()

In [6]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` with ``to_sql``, retrying when it raises.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame in this notebook).
        table_name: Name of the destination table.
        con: Connection or engine handed straight to ``to_sql``.
        retries: Maximum number of attempts.
        delay: Seconds to wait after a failed attempt before trying again.

    Returns:
        True as soon as one attempt succeeds; False when every attempt failed
        (or when ``retries`` allows no attempt at all).
    """
    to_sql_options = dict(index=False, if_exists="append", method="multi")

    for attempt_no in range(1, retries + 1):
        try:
            df.to_sql(table_name, con, **to_sql_options)
            logging.info(f"Inserted data into table {table_name}")
            return True
        except Exception as e:
            logging.error(f"Attempt {attempt_no} failed with error: {e}")
            if attempt_no == retries:
                # Last attempt: report the overall failure instead of sleeping.
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
            else:
                time.sleep(delay)
    return False

In [7]:
# Input schema lookup (keyed by alias) and the table that the queries below select FROM
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_dl_table = "{}.device_location_uplevelled".format(schema_name['cda'])

# Export to jupyter notebook
This export should work for CO, ID, IN, and MX.

In [8]:
# Days to export (one entry per calendar day from start_date through end_date)
start_date, end_date = '2019-01-01', '2019-01-02'
date_range = pd.date_range(start=start_date, end=end_date)

# Country filter applied in the SQL WHERE clause; also embedded in the file names
country_code = 'MX'

# Output CSV names, one per geohash precision (5 and 3)
export_file_name_5 = "pd_{}_2019_agg5_3h.csv".format(country_code.lower())
export_file_name_3 = "pd_{}_2019_agg3_3h.csv".format(country_code.lower())

# Full output paths: export directory (already ends with '/') + file name
export_path = '/home/jovyan/Data/pd3_test/'
export_file_path_5 = export_path + export_file_name_5
export_file_path_3 = export_path + export_file_name_3

# Create a function to determine the 3-hour interval based on a given date
def get_3_hour_interval(start_hour, current_date):
    """Describe the 3-hour window that opens at ``start_hour`` on ``current_date``.

    Args:
        start_hour: Hour offset added to ``current_date`` to get the window start.
        current_date: Anything ``pd.Timestamp`` accepts (date string, Timestamp, ...).

    Returns:
        ``"<start>/<end>"`` with both ends rendered as ``%Y-%m-%d %H:%M:%S``;
        the end is exactly three hours after the start.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    window_open = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
    window_close = window_open + pd.Timedelta(hours=3)
    return "/".join(edge.strftime(stamp_format) for edge in (window_open, window_close))

# A CSV header is written only when the target file does not exist yet
write_header_5, write_header_3 = (
    not os.path.exists(csv_path)
    for csv_path in (export_file_path_5, export_file_path_3)
)

# Configure logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

In [10]:
import logging
from datetime import timedelta
import pandas as pd

def get_3_hour_interval(start_hour, formatted_current_date):
    """Label a 3-hour bucket by its date and zero-padded start hour.

    Args:
        start_hour: Integer hour at which the bucket starts.
        formatted_current_date: Date text placed verbatim at the front of the label.

    Returns:
        ``"<date> HH:00:00"`` — only the start of the bucket is encoded.
    """
    return "{} {:02d}:00:00".format(formatted_current_date, start_hour)

# Export per-geohash aggregates to CSV, issuing one Trino query per (day, 3-hour interval).
# A failure in one interval is logged and the loop continues with the next interval.
for current_date in date_range:
    formatted_current_date = current_date.strftime('%Y%m%d')

    # The processing_date window depends only on the day, so it is computed once
    # per day rather than once per 3-hour interval.
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query in 'yyyymmdd' format
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3

        try:
            # SQL Query to fetch data for the current 3-hour interval.
            # The hour filter is inclusive on both ends, hence end_hour-1.
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(lat, lng, 5) AS geohash5,
                geohash_encode(lat, lng, 3) AS geohash3
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
            pe_dl_table_gen = sql_engine.read_sql(query)
            
            # Convert event_datetime_local to datetime once
            pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
            
            # Constant label columns; 'first' in the aggregation below just carries them through
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            pe_dl_table_gen['3_hour_interval'] = interval
            pe_dl_table_gen['local_date'] = formatted_current_date
            
            for geohash_col, export_file_path, write_header in [
                ('geohash5', export_file_path_5, write_header_5),
                ('geohash3', export_file_path_3, write_header_3)
            ]:
                # Aggregate data for geohash
                logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
                aggregated_data = pe_dl_table_gen.groupby(geohash_col).agg(
                    no_of_points=(geohash_col, 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                    local_time=('3_hour_interval', 'first'),
                    local_date=('local_date', 'first')
                ).reset_index()
                
                # Filter rows with no_of_unique_users > 10
                filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
                
                # Append the DataFrame to the CSV file
                logging.info(f"Exporting data for interval {start_hour} to {end_hour}")
                filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
                
                # After the first write, set the header flag to False
                if geohash_col == 'geohash5':
                    write_header_5 = False
                else:
                    write_header_3 = False
                
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
        except Exception as e:
            logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")
    # No manual date increment here: the for statement already advances current_date.
# test query by 3 hour

2024-06-28 21:17:14,876 - INFO - Executing SQL query for date 20190101 and interval 0 to 3
2024-06-28 21:26:34,620 - INFO - Aggregating data for geohash5 for interval 0 to 3
2024-06-28 21:26:35,335 - INFO - Exporting data for interval 0 to 3
2024-06-28 21:26:35,359 - INFO - Aggregating data for geohash3 for interval 0 to 3
2024-06-28 21:26:35,967 - INFO - Exporting data for interval 0 to 3
2024-06-28 21:26:35,973 - INFO - Appended data for date 20190101 interval 0 to 3 to pd_mx_2019_agg5_3h.csv and pd_mx_2019_agg3_3h.csv
2024-06-28 21:26:35,974 - INFO - Executing SQL query for date 20190101 and interval 3 to 6


KeyboardInterrupt: 

# Export to schema

In [14]:
def get_3_hour_interval(start_hour, formatted_current_date):
    """Label a 3-hour bucket as ``"<date> HH:00:00 - HH:00:00"``.

    Args:
        start_hour: Integer hour at which the bucket starts.
        formatted_current_date: Date text placed verbatim at the front of the label.

    Returns:
        The label with the start hour and ``start_hour + 3`` as the end hour,
        both zero-padded to two digits (so a bucket starting at 21 ends at "24").
    """
    return "{} {:02d}:00:00 - {:02d}:00:00".format(
        formatted_current_date, start_hour, start_hour + 3
    )

# Configure logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# SQLAlchemy engine for the output location; the schema name is the last URL segment
output_schema_name = 'presence_data'
# output_schema_name = 'pop_density'
con = create_engine("trino://localhost:9090/dedicated/{}".format(output_schema_name))

# Days to process and the country to filter on
start_date, end_date = '2019-11-01', '2019-11-02'
date_range = pd.date_range(start=start_date, end=end_date)

country_code = 'CO'
country_abbre = country_code.lower()  
master_table_3 = "test_pd_{}_2019_3h_agg3".format(country_abbre)
master_table_5 = "test_pd_{}_2019_3h_agg5".format(country_abbre)

# DDL for the two master tables (geohash precision 3 and 5); IF NOT EXISTS makes re-runs harmless
create_table_query_3 = f"""
CREATE TABLE IF NOT EXISTS {master_table_3}(
    geohash_3 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""
create_table_query_5 = f"""
CREATE TABLE IF NOT EXISTS {master_table_5}(
    geohash_5 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""

# Run both DDL statements on a single connection, precision 3 first
with con.connect() as connection:
    for master_table, create_table_query in (
        (master_table_3, create_table_query_3),
        (master_table_5, create_table_query_5),
    ):
        logging.info(f"Creating master table: {master_table}")
        connection.execute(create_table_query)


2024-06-28 23:06:54,052 - INFO - Creating master table: test_pd_co_2019_3h_agg3
2024-06-28 23:06:54,873 - INFO - Creating master table: test_pd_co_2019_3h_agg5


In [None]:
# Aggregate each (day, 3-hour interval) by geohash and append the result to the master tables.
# A failure in one interval is logged and the loop continues with the next interval.
for current_date in date_range:
    formatted_current_date = current_date.strftime('%Y%m%d')

    # The processing_date window depends only on the day, so it is computed once
    # per day rather than once per 3-hour interval.
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query in 'yyyymmdd' format
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3

        try:
            # SQL Query to fetch data for the current 3-hour interval.
            # The hour filter is inclusive on both ends, hence end_hour-1.
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(lat, lng, 5) AS geohash_5,
                geohash_encode(lat, lng, 3) AS geohash_3
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
            pe_dl_table_gen = sql_engine.read_sql(query)
            
            # Convert event_datetime_local to datetime once
            pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
            
            # Constant label columns; 'first' in the aggregations below just carries them through
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            pe_dl_table_gen['3_hour_interval'] = interval
            pe_dl_table_gen['local_date'] = formatted_current_date
            
            # Process for geohash_5
            logging.info(f"Aggregating data for geohash_5 for interval {start_hour} to {end_hour}")
            aggregated_data_5 = pe_dl_table_gen.groupby('geohash_5').agg(
                no_of_points=('geohash_5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            # NOTE(review): this insert omits method='multi' while the geohash_3 insert below
            # uses it — confirm whether the difference is intentional.
            filtered_data_5.to_sql(master_table_5, con, if_exists='append', index=False)
            # Report the table that was actually written (master_table_5)
            logging.info(f"Inserted aggregated data for date {formatted_current_date} interval {start_hour} to {end_hour} into {master_table_5}")

            # Process for geohash_3
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            aggregated_data_3 = pe_dl_table_gen.groupby('geohash_3').agg(
                no_of_points=('geohash_3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            filtered_data_3.to_sql(master_table_3, con, if_exists='append', index=False, method='multi')
            logging.info(f"Inserted aggregated data for date {formatted_current_date} interval {start_hour} to {end_hour} into {master_table_3}")
        
        except Exception as e:
            logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")
    # No manual date increment here: the for statement already advances current_date.


2024-06-28 23:06:55,862 - INFO - Executing SQL query for date 20191101 and interval 0 to 3
2024-06-28 23:08:02,072 - INFO - Aggregating data for geohash_5 for interval 0 to 3
2024-06-28 23:12:15,179 - INFO - Aggregating data for geohash3 for interval 0 to 3
2024-06-28 23:12:16,368 - INFO - Inserted aggregated data for date 20191101 interval 0 to 3 into the master tables
2024-06-28 23:12:16,369 - INFO - Executing SQL query for date 20191101 and interval 3 to 6
2024-06-28 23:12:38,950 - INFO - Aggregating data for geohash_5 for interval 3 to 6


In [None]:
# Worked query per day. - not working for MX
# One Trino query per day; the eight 3-hour intervals are then cut from that day's rows
# in pandas. A failure anywhere in a day is logged and the loop continues with the next day.
for current_date in date_range:
    # Formatted before the try so the error handler can always reference it
    formatted_current_date = current_date.strftime('%Y%m%d')
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date with geohashes calculated in the query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(lat, lng, 5) AS geohash5,
            geohash_encode(lat, lng, 3) AS geohash3
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)
        
        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

        # Hour of day per row, computed once and reused for every interval
        local_hour = pe_dl_table_gen['event_datetime_local'].dt.hour
        
        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3
            
            # Rows whose local hour falls in [start_hour, end_hour)
            interval_data = pe_dl_table_gen[
                (local_hour >= start_hour) & (local_hour < end_hour)
            ].copy()
            
            # Constant label columns; 'first' in the aggregation below just carries them through
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            interval_data['3_hour_interval'] = interval
            interval_data['local_date'] = formatted_current_date
            
            for geohash_col, export_file_name, export_file_path, write_header in [
                ('geohash5', export_file_name_5, export_file_path_5, write_header_5),
                ('geohash3', export_file_name_3, export_file_path_3, write_header_3)
            ]:
                # Aggregate data for this geohash precision
                logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
                aggregated_data = interval_data.groupby(geohash_col).agg(
                    no_of_points=(geohash_col, 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                    local_time=('3_hour_interval', 'first'),
                    local_date=('local_date', 'first')
                ).reset_index()
                
                # Filter rows with no_of_unique_users > 10
                filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
                
                # Append to the CSV for this precision; the log names the file actually written
                logging.info(f"Exporting data to {export_file_name} for interval {start_hour} to {end_hour}")
                filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
                
                # After the first write, set the header flag to False
                if geohash_col == 'geohash5':
                    write_header_5 = False
                else:
                    write_header_3 = False
            
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    # No manual date increment here: the for statement already advances current_date.


2024-06-28 18:42:23,805 - INFO - Executing SQL query for date 20190101


# !!!!!!!!!!!!!!! This one

In [None]:
# Set up logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# SQLAlchemy engine for the output location; the schema name is the last URL segment
output_schema_name = 'presence_data'
# output_schema_name = 'pop_density'
con = create_engine("trino://localhost:9090/dedicated/{}".format(output_schema_name))

start_time = time.time()  # Start timing

# Input parameters: country filter and the date bounds as yyyymmdd integers
country_code = 'CO'
start_date, end_date = 20191101, 20191102
# longitude_ranges = [(-82, -74.53125), (-74.53125, -65)]  # CO specific longitude ranges

# Input schema lookup and the source table name built from it
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_dl_table = "{}.device_location_uplevelled".format(schema_name['cda'])

# The same bounds parsed into datetime objects
start_date_dt, end_date_dt = (
    datetime.strptime(str(day_int), '%Y%m%d') for day_int in (start_date, end_date)
)

# Starts empty; nothing in this cell appends to it
failed_inserts = []

In [7]:
# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    """Add ``geohash5`` and ``geohash3`` columns computed from two coordinate columns.

    Args:
        df: DataFrame holding the coordinates; it is modified in place.
        lat_col: Name of the latitude column.
        lng_col: Name of the longitude column.

    Returns:
        The same ``df`` object, now carrying the ``geohash5`` (precision 5) and
        ``geohash3`` (precision 3) columns.
    """
    # Walk the two coordinate columns directly instead of building a Series per row
    # with DataFrame.apply(axis=1); this also yields plain empty columns for an empty frame.
    coordinates = list(zip(df[lat_col], df[lng_col]))
    df['geohash5'] = [geohash2.encode(lat, lng, precision=5) for lat, lng in coordinates]
    df['geohash3'] = [geohash2.encode(lat, lng, precision=3) for lat, lng in coordinates]
    return df

# Create a function to determine the 3-hour interval based on a given date
def get_3_hour_interval(start_hour, current_date):
    """Build the ``"<start>/<end>"`` text for a 3-hour window.

    Args:
        start_hour: Hour offset added to ``current_date`` to get the window start.
        current_date: Anything ``pd.Timestamp`` accepts (date string, Timestamp, ...).

    Returns:
        Start and end (start + 3 hours), each rendered as ``%Y-%m-%d %H:%M:%S``
        and separated by a slash.
    """
    window_edges = [pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)]
    window_edges.append(window_edges[0] + pd.Timedelta(hours=3))
    rendered = [edge.strftime('%Y-%m-%d %H:%M:%S') for edge in window_edges]
    return rendered[0] + "/" + rendered[1]

# Define the date range.
# start_date must not be later than end_date: pd.date_range returns an empty index for an
# inverted range, which makes the processing loop below do nothing without any error.
# NOTE(review): the bounds were previously '2019-11-12' -> '2019-11-11'; they are swapped
# here on the assumption that both days should be processed — confirm the intended window.
start_date = '2019-11-11'
end_date = '2019-11-12'
date_range = pd.date_range(start=start_date, end=end_date)

country_code = 'CO'

# Define the export file names (one per geohash precision)
export_file_name_5 = f"pd_{country_code.lower()}_2019_agg5_3h.csv"
export_file_name_3 = f"pd_{country_code.lower()}_2019_agg3_3h.csv"

# Define the export file paths
export_path = '/home/jovyan/Data/pd3_test/'
export_file_path_5 = f"{export_path}{export_file_name_5}"
export_file_path_3 = f"{export_path}{export_file_name_3}"

# Check if files already exist to determine header writing
write_header_5 = not os.path.exists(export_file_path_5)
write_header_3 = not os.path.exists(export_file_path_3)


In [9]:
# # Loop through each day in the date range
# for current_date in date_range:
#     try:
#         formatted_current_date = current_date.strftime('%Y-%m-%d')
        
#         # Calculate the lookback and lookahead dates
#         lookback_date = current_date - timedelta(days=1)
#         lookahead_date = current_date + timedelta(days=35)

#         # Format dates for the SQL query in 'yyyymmdd' format
#         formatted_lookback_date = lookback_date.strftime('%Y%m%d')
#         formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
#         formatted_current_date = current_date.strftime('%Y%m%d')
        
#         # SQL Query to fetch data for the current date with geohashes calculated in the query
#         query = f"""
#         SELECT 
#             cuebiq_id, 
#             event_zoned_datetime, 
#             processing_date,
#             lat,
#             lng,
#             TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
#             EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
#             geohash_encode(lat, lng, 5) AS geohash5,
#             geohash_encode(lat, lng, 3) AS geohash3
#         FROM {pe_dl_table}
#         WHERE 
#             processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
#             AND country_code = '{country_code}' 
#             AND event_zoned_datetime IS NOT NULL
#             AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
#             AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
#         """
        
#         logging.info(f"Executing SQL query for date {formatted_current_date}")
#         pe_dl_table_gen = sql_engine.read_sql(query)
        
#         # Convert event_datetime_local to datetime
#         pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
        
#         # Loop through each 3-hour interval
#         for start_hour in range(0, 24, 3):
#             end_hour = start_hour + 3
            
#             # Filter data for the current 3-hour interval
#             interval_data = pe_dl_table_gen[
#                 (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
#                 (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
#             ].copy()
            
#             # Create 3-hour interval column
#             interval = get_3_hour_interval(start_hour, formatted_current_date)
#             interval_data['3_hour_interval'] = interval
            
#             # Aggregate data for geohash5
#             logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
#             aggregated_data_5 = interval_data.groupby('geohash5').agg(
#                 no_of_points=('geohash5', 'size'),
#                 no_of_unique_users=('cuebiq_id', 'nunique'),
#                 local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
#             ).reset_index()
            
#             # Filter rows with no_of_unique_users > 10
#             filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
#             # Append the DataFrame to the CSV file for geohash5
#             logging.info(f"Exporting data to {export_file_path_5} for interval {start_hour} to {end_hour}")
#             filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
            
#             # After the first write, set the header flag to False for geohash5
#             write_header_5 = False
            
#             # Aggregate data for geohash3
#             logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
#             aggregated_data_3 = interval_data.groupby('geohash3').agg(
#                 no_of_points=('geohash3', 'size'),
#                 no_of_unique_users=('cuebiq_id', 'nunique'),
#                 local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
#             ).reset_index()
            
#             # Filter rows with no_of_unique_users > 10
#             filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
#             # Append the DataFrame to the CSV file for geohash3
#             logging.info(f"Exporting data to {export_file_path_3} for interval {start_hour} to {end_hour}")
#             filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
            
#             # After the first write, set the header flag to False for geohash3
#             write_header_3 = False
            
#             logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")
#     except Exception as e:
#         logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
#     # Move to the next day
#     current_date += timedelta(days=1)

In [12]:
# Loop through each day in the date range.
# One Trino query per day; the eight 3-hour intervals are then cut from that day's rows
# in pandas. A failure anywhere in a day is logged and the loop continues with the next day.

for current_date in date_range:
    # Formatted before the try so the error handler can always reference it
    formatted_current_date = current_date.strftime('%Y%m%d')
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date with geohashes calculated in the query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(lat, lng, 5) AS geohash5,
            geohash_encode(lat, lng, 3) AS geohash3
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)
        
        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

        # Hour of day per row, computed once and reused for every interval
        local_hour = pe_dl_table_gen['event_datetime_local'].dt.hour
        
        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3
            
            # Rows whose local hour falls in [start_hour, end_hour)
            interval_data = pe_dl_table_gen[
                (local_hour >= start_hour) & (local_hour < end_hour)
            ].copy()
            
            # Constant label columns; 'first' in the aggregation below just carries them through
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            interval_data['3_hour_interval'] = interval
            interval_data['local_date'] = formatted_current_date
            
            for geohash_col, export_file_path, write_header in [
                ('geohash5', export_file_path_5, write_header_5),
                ('geohash3', export_file_path_3, write_header_3)
            ]:
                # Aggregate data for this geohash precision
                logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
                aggregated_data = interval_data.groupby(geohash_col).agg(
                    no_of_points=(geohash_col, 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                    local_time=('3_hour_interval', 'first'),
                    local_date=('local_date', 'first')
                ).reset_index()
                
                # Filter rows with no_of_unique_users > 10
                filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
                
                # Append the DataFrame to the CSV file for this precision
                logging.info(f"Exporting data to {export_file_path} for interval {start_hour} to {end_hour}")
                filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
                
                # After the first write, set the header flag to False
                if geohash_col == 'geohash5':
                    write_header_5 = False
                else:
                    write_header_3 = False
            
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    # No manual date increment here: the for statement already advances current_date.
    


In [16]:
def build_daily_geohash_query(precision, current_day, lookback_day, lookahead_day):
    """Build the per-day aggregation query for one geohash precision.

    Args:
        precision: Geohash length passed to ``geohash_encode`` (5 or 3 in this
            notebook); it also names the output column ``geohash<precision>``.
        current_day: Local calendar day to keep, as a ``yyyymmdd`` string.
        lookback_day: First ``processing_date`` value to scan (``yyyymmdd``).
        lookahead_day: Last ``processing_date`` value to scan (``yyyymmdd``).

    Returns:
        The SQL text. It interpolates the notebook-level ``pe_dl_table`` and
        ``country_code`` and keeps only geohash cells with more than 10
        distinct ``cuebiq_id`` values.
    """
    geohash_col = f"geohash{precision}"
    # NOTE(review): the query groups by geohash only, so no_of_points and
    # no_of_unique_users are totals for the whole day. The interval_* columns
    # are constant labels, not per-3-hour aggregates, although the export files
    # are named *_3h. Confirm whether per-interval grouping was intended.
    interval_columns = ",\n            ".join(
        f"'{get_3_hour_interval(start_hour, current_day)}' as interval_{start_hour}_{start_hour + 3}"
        for start_hour in range(0, 24, 3)
    )
    return f"""
        SELECT
            {geohash_col},
            COUNT(*) as no_of_points,
            COUNT(DISTINCT cuebiq_id) as no_of_unique_users,
            '{current_day}' as event_date,
            {interval_columns}
        FROM (
            SELECT
                cuebiq_id,
                geohash_encode(lat, lng, {precision}) AS {geohash_col},
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
            FROM {pe_dl_table}
            WHERE
                processing_date BETWEEN {lookback_day} AND {lookahead_day}
                AND country_code = '{country_code}'
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{current_day}', '%Y%m%d')
        )
        GROUP BY {geohash_col}
        HAVING COUNT(DISTINCT cuebiq_id) > 10
        """


# Loop through each day in the date range. The for statement advances
# current_date itself, so no manual increment is needed.
for current_date in date_range:
    # Bound before the try block so the except handler can always name the day.
    formatted_current_date = current_date.strftime('%Y%m%d')
    try:
        # processing_date window: one day before through 35 days after the local day
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        # Aggregate the day at geohash precision 5
        query = build_daily_geohash_query(
            5, formatted_current_date, formatted_lookback_date, formatted_lookahead_date
        )
        logging.info(f"Executing SQL5 query for date {formatted_current_date}")
        pe_dl_table_gen_5 = sql_engine.read_sql(query)

        # Aggregate the day at geohash precision 3
        query = build_daily_geohash_query(
            3, formatted_current_date, formatted_lookback_date, formatted_lookahead_date
        )
        logging.info(f"Executing SQL3 query for date {formatted_current_date}")
        pe_dl_table_gen_3 = sql_engine.read_sql(query)

        # Append to the geohash5 CSV; the header is written only on the first write
        logging.info(f"Exporting data to {export_file_path_5} for date {formatted_current_date}")
        pe_dl_table_gen_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
        write_header_5 = False

        # Append to the geohash3 CSV; the header is written only on the first write
        logging.info(f"Exporting data to {export_file_path_3} for date {formatted_current_date}")
        pe_dl_table_gen_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
        write_header_3 = False

        logging.info(f"Appended data for date {formatted_current_date} to {export_file_path_5} and {export_file_path_3}")
    except Exception as e:
        # Best effort: a failing day is logged and the loop moves on to the next one.
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")

2024-06-28 17:15:28,436 - INFO - Executing SQL query for date 20191110
2024-06-28 17:16:27,608 - INFO - Executing SQL query for date 20191110
2024-06-28 17:17:30,670 - INFO - Exporting data to /home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv for date 20191110
2024-06-28 17:17:30,715 - INFO - Exporting data to /home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv for date 20191110
2024-06-28 17:17:30,725 - INFO - Appended data for date 20191110 to /home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv and /home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv
2024-06-28 17:17:30,726 - INFO - Executing SQL query for date 20191111
2024-06-28 17:18:26,612 - INFO - Executing SQL query for date 20191111
2024-06-28 17:19:16,483 - INFO - Exporting data to /home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv for date 20191111
2024-06-28 17:19:16,531 - INFO - Exporting data to /home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv for date 20191111
2024-06-28 17:19:16,549 - INFO - Appended data for date 20191111 to 

In [12]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def aggregate_interval(interval_data, geohash_col, min_unique_users=10):
    """Aggregate one 3-hour slice per geohash cell and drop sparsely populated cells.

    Args:
        interval_data: Rows of a single 3-hour slice; must contain
            ``geohash_col``, ``cuebiq_id`` and ``3_hour_interval``.
        geohash_col: Column to group by (``'geohash5'`` or ``'geohash3'``).
        min_unique_users: Cells are kept only when they have strictly more
            distinct ``cuebiq_id`` values than this threshold.

    Returns:
        A new DataFrame with columns ``geohash_col``, ``no_of_points``,
        ``no_of_unique_users`` and ``local_time``.
    """
    aggregated = interval_data.groupby(geohash_col).agg(
        no_of_points=(geohash_col, 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique'),
        # The label is the same for every row of the slice, so 'first' yields
        # the same value as joining the unique labels, without a Python lambda.
        local_time=('3_hour_interval', 'first'),
    ).reset_index()
    return aggregated[aggregated['no_of_unique_users'] > min_unique_users].copy()


# Loop through each day in the date range. The for statement advances
# current_date itself, so no manual increment is needed.
for current_date in date_range:
    # Bound before the try block so the except handler can always name the day.
    formatted_current_date = current_date.strftime('%Y%m%d')
    try:
        # processing_date window: one day before through 35 days after the local day
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date
        query = f"""
        SELECT
            cuebiq_id,
            event_zoned_datetime,
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
        FROM {pe_dl_table}
        WHERE
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}'
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)

        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

        # Calculate geohashes
        logging.info(f"Processing geohashes for date {formatted_current_date}")
        pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

        # Extract the local hour once per day instead of twice per interval.
        event_hours = pe_dl_table_gen['event_datetime_local'].dt.hour

        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3

            # Rows whose local hour falls in [start_hour, end_hour)
            interval_data = pe_dl_table_gen[
                (event_hours >= start_hour) & (event_hours < end_hour)
            ].copy()
            interval_data['3_hour_interval'] = get_3_hour_interval(start_hour, formatted_current_date)

            # geohash5: aggregate, filter, append (header only on the first write)
            logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
            filtered_data_5 = aggregate_interval(interval_data, 'geohash5')
            logging.info(f"Exporting data to {export_file_name_5} for interval {start_hour} to {end_hour}")
            filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
            write_header_5 = False

            # geohash3: aggregate, filter, append (header only on the first write)
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            filtered_data_3 = aggregate_interval(interval_data, 'geohash3')
            logging.info(f"Exporting data to {export_file_name_3} for interval {start_hour} to {end_hour}")
            filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
            write_header_3 = False

    except Exception as e:
        # Best effort: a failing day is logged and the loop moves on to the next one.
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
        
        

2024-06-28 16:37:15,928 - INFO - Executing SQL query for date 20191106
2024-06-28 16:38:45,487 - INFO - Processing geohashes for date 20191106
2024-06-28 16:40:17,660 - INFO - Aggregating data for geohash5 for interval 0 to 3
2024-06-28 16:40:17,804 - INFO - Exporting data to pd_co_2019_agg5_3h.csv for interval 0 to 3
2024-06-28 16:40:17,817 - INFO - Aggregating data for geohash3 for interval 0 to 3
2024-06-28 16:40:17,855 - INFO - Exporting data to pd_co_2019_agg3_3h.csv for interval 0 to 3
2024-06-28 16:40:17,862 - INFO - Appended data for date 20191106 interval 0 to 3 to /home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv and /home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv
2024-06-28 16:40:18,370 - INFO - Aggregating data for geohash5 for interval 3 to 6
2024-06-28 16:40:18,546 - INFO - Exporting data to pd_co_2019_agg5_3h.csv for interval 3 to 6
2024-06-28 16:40:18,565 - INFO - Aggregating data for geohash3 for interval 3 to 6
2024-06-28 16:40:18,608 - INFO - Exporting data to pd_c

In [None]:
# This one seems to be working

# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` via ``df.to_sql``, retrying on failure.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame in this notebook).
        table_name: Target table; rows are appended (``if_exists="append"``).
        con: Connection/engine handed to ``to_sql`` unchanged.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between failed attempts.

    Returns:
        True as soon as one attempt succeeds, False when every attempt failed,
        so callers can tell the two outcomes apart.
    """
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi"
            )
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            # No point sleeping after the last attempt.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True
    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False

# Walk the inclusive range start_date_dt..end_date_dt one calendar day at a time.
# NOTE(review): `con` and `start_time` are not defined in this cell; they must
# already exist in the kernel - confirm which earlier cell provides them.
current_date = start_date_dt
while current_date <= end_date_dt:
    try:
        # processing_date window: one day before through 35 days after the local day
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # yyyymmdd renderings interpolated into the SQL text
        formatted_lookback_date, formatted_current_date, formatted_lookahead_date = (
            day.strftime('%Y%m%d') for day in (lookback_date, current_date, lookahead_date)
        )

        # Raw pings whose local date equals the current day
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")

        pe_dl_table_gen = sql_engine.read_sql_chunked(query)

        # Drain the chunk iterator; an empty list means there is nothing to do for this day
        chunks = list(pe_dl_table_gen)
        if chunks:
            pe_dl_table_df = pd.concat(chunks, ignore_index=True)

            # Row-wise geohash encoding at precision 5, then precision 3
            pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(
                lambda ping: geohash2.encode(ping['lat'], ping['lng'], precision=5),
                axis=1,
            )
            pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(
                lambda ping: geohash2.encode(ping['lat'], ping['lng'], precision=3),
                axis=1,
            )

            # geohash5: count points and distinct users per cell
            aggregated_data_5 = (
                pe_dl_table_df
                .groupby('geohash5')
                .agg(
                    no_of_points=('geohash5', 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                )
                .reset_index()
            )

            # Keep cells with more than 10 distinct users and stamp them with the day
            enough_users_5 = aggregated_data_5['no_of_unique_users'] > 10
            filtered_data_5 = aggregated_data_5.loc[enough_users_5].copy()
            filtered_data_5['local_date'] = formatted_current_date

            # One table per day and precision; skipped when nothing survived the filter
            if not filtered_data_5.empty:
                table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
                insert_data_with_retry(filtered_data_5, table_name_agg5, con)

            # geohash3: count points and distinct users per cell
            aggregated_data_3 = (
                pe_dl_table_df
                .groupby('geohash3')
                .agg(
                    no_of_points=('geohash3', 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                )
                .reset_index()
            )

            # Keep cells with more than 10 distinct users and stamp them with the day
            enough_users_3 = aggregated_data_3['no_of_unique_users'] > 10
            filtered_data_3 = aggregated_data_3.loc[enough_users_3].copy()
            filtered_data_3['local_date'] = formatted_current_date

            if not filtered_data_3.empty:
                table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                insert_data_with_retry(filtered_data_3, table_name_agg3, con)

    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")

    # Advance to the next day (required: this is a while loop)
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

# Stop the clock and report the elapsed wall time
end_time = time.time()
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")


# Check a single date

In [18]:
# Fetch the raw pings for one local calendar day. The local timestamp is parsed
# from the first 19 characters of event_zoned_datetime (the part before the
# UTC-offset suffix); rows whose timestamp fails to parse are dropped by TRY(...).
# NOTE(review): pe_dl_table, country_code and the formatted_* dates are not
# defined in this cell - they are whatever values the kernel currently holds.
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
# Load the full result into memory as a DataFrame and display it.
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

Unnamed: 0,cuebiq_id,event_zoned_datetime,processing_date,lat,lng,event_datetime_local,event_hour
0,1557947231,2019-03-04T21:50:50+07:00,20190304,-7.314244,112.718623,2019-03-04 21:50:50,21
1,1557947231,2019-03-04T22:08:02+07:00,20190304,-7.314244,112.718623,2019-03-04 22:08:02,22
2,1557947231,2019-03-04T22:08:03+07:00,20190304,-7.314244,112.718623,2019-03-04 22:08:03,22
3,1557947231,2019-03-04T22:16:41+07:00,20190304,-7.314244,112.718623,2019-03-04 22:16:41,22
4,1557947231,2019-03-04T22:32:34+07:00,20190304,-7.314244,112.718623,2019-03-04 22:32:34,22
...,...,...,...,...,...,...,...
9610153,1860654372,2019-03-04T18:16:46+08:00,20190305,-8.655824,115.126922,2019-03-04 18:16:46,18
9610154,1860654372,2019-03-04T19:53:50+08:00,20190305,-8.655824,115.126922,2019-03-04 19:53:50,19
9610155,1860654372,2019-03-04T15:56:32+08:00,20190305,-8.655472,115.126692,2019-03-04 15:56:32,15
9610156,1860654372,2019-03-04T17:18:17+08:00,20190305,-8.655308,115.126956,2019-03-04 17:18:17,17


In [None]:
import pandas as pd
import geohash2
import os

# Define the export file paths: one CSV per geohash precision (3 and 5).
# Both files are opened in append mode by the export code in this notebook.
# NOTE(review): absolute, machine-specific paths - consider a configurable base directory.
export_file_path_3 = '/home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv'
export_file_path_5 = '/home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv'

# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    """Add ``geohash5`` and ``geohash3`` columns to ``df`` in place.

    Args:
        df: DataFrame holding the coordinates; it is mutated and returned.
        lat_col: Name of the latitude column.
        lng_col: Name of the longitude column.

    Returns:
        The same ``df`` object with the two geohash columns added.
    """
    # Encode each row once at precision 5 by iterating the two columns directly;
    # this avoids two row-wise DataFrame.apply passes, which build a Series per row.
    geohash5 = pd.Series(
        [
            geohash2.encode(lat, lng, precision=5)
            for lat, lng in zip(df[lat_col], df[lng_col])
        ],
        index=df.index,
        dtype=object,  # keeps .str usable even when df is empty
    )
    df['geohash5'] = geohash5
    # A geohash is built one character at a time, so the precision-3 code is
    # exactly the first three characters of the precision-5 code.
    df['geohash3'] = geohash5.str[:3]
    return df

# Create a function to determine the 3-hour interval based on a given date
def get_3_hour_interval(start_hour, current_date=None):
    """Return the ``'start/end'`` label of a 3-hour window.

    This notebook defines ``get_3_hour_interval`` in several cells with two
    different call forms; this definition accepts both so the result does not
    depend on which cell was executed last.

    Args:
        start_hour: Hour offset (0, 3, ..., 21) added to ``current_date``.
            When ``current_date`` is omitted, this is instead a timestamp and
            the window containing it is returned.
        current_date: Day the window belongs to (anything ``pd.Timestamp``
            accepts, e.g. ``'20191101'``), or None for the one-argument form.

    Returns:
        ``'YYYY-MM-DD HH:MM:SS/YYYY-MM-DD HH:MM:SS'`` for the window start and
        the instant three hours later.
    """
    if current_date is None:
        # One-argument form: floor the timestamp's hour to a multiple of three.
        moment = start_hour
        start_time = moment.replace(
            hour=(moment.hour // 3) * 3, minute=0, second=0, microsecond=0
        )
    else:
        start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Convert event_datetime_local to datetime
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Calculate geohashes
pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

# Write the CSV header only when the target file is missing or empty. The files
# are opened in append mode, so an unconditional header would be repeated in the
# middle of the file every time this cell is re-run.
write_header_5 = not (os.path.exists(export_file_path_5) and os.path.getsize(export_file_path_5) > 0)
write_header_3 = not (os.path.exists(export_file_path_3) and os.path.getsize(export_file_path_3) > 0)

# Day the intervals belong to, as a Timestamp. Kept under its own name so that
# formatted_current_date stays a 'yyyymmdd' string for the SQL cells that
# interpolate it into date_parse('...', '%Y%m%d').
interval_base_date = pd.to_datetime(formatted_current_date)

# Extract the local hour once instead of twice per interval.
event_hours = pe_dl_table_gen['event_datetime_local'].dt.hour

# Loop through each 3-hour interval
for start_hour in range(0, 24, 3):
    end_hour = start_hour + 3

    # Rows whose local hour falls in [start_hour, end_hour)
    interval_data = pe_dl_table_gen[
        (event_hours >= start_hour) & (event_hours < end_hour)
    ].copy()

    # Create 3-hour interval column (same label for every row of the slice)
    interval = get_3_hour_interval(start_hour, interval_base_date)
    interval_data['3_hour_interval'] = interval

    # Aggregate data for geohash5; 'first' is enough because the label is constant
    aggregated_data_5 = interval_data.groupby('geohash5').agg(
        no_of_points=('geohash5', 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique'),
        intervals=('3_hour_interval', 'first')
    ).reset_index()

    # Filter rows with no_of_unique_users > 10
    filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()

    # Append the DataFrame to the CSV file for geohash5
    filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
    write_header_5 = False

    # Aggregate data for geohash3; 'first' is enough because the label is constant
    aggregated_data_3 = interval_data.groupby('geohash3').agg(
        no_of_points=('geohash3', 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique'),
        intervals=('3_hour_interval', 'first')
    ).reset_index()

    # Filter rows with no_of_unique_users > 10
    filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()

    # Append the DataFrame to the CSV file for geohash3
    filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
    write_header_3 = False

    print(f"Appended data for interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")


# Test

In [10]:
# Fetch the raw pings of one local calendar day, restricted to local hours
# 0, 1 and 2 (the first 3-hour interval). The local timestamp is parsed from
# the first 19 characters of event_zoned_datetime; unparseable rows are dropped.
# NOTE(review): pe_dl_table, country_code and the formatted_* dates are not
# defined in this cell - they are whatever values the kernel currently holds.
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) IN (0, 1, 2)
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
# Load the full result into memory as a DataFrame and display it.
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

2024-06-28 00:36:53,634 - INFO - Executing SQL query for date 20191101


Unnamed: 0,cuebiq_id,event_zoned_datetime,processing_date,lat,lng,event_datetime_local,event_hour
0,2488437649,2019-11-01T00:55:26-05:00,20191103,3.449487,-76.541778,2019-11-01 00:55:26,0
1,2488437649,2019-11-01T00:57:29-05:00,20191103,3.450623,-76.540825,2019-11-01 00:57:29,0
2,2488437649,2019-11-01T00:57:29-05:00,20191103,3.450623,-76.540825,2019-11-01 00:57:29,0
3,2488423975,2019-11-01T00:26:27-05:00,20191103,3.018457,-76.478375,2019-11-01 00:26:27,0
4,2488423975,2019-11-01T00:01:01-05:00,20191103,3.018794,-76.478607,2019-11-01 00:01:01,0
...,...,...,...,...,...,...,...
125556,2492387999,2019-11-01T00:25:02-05:00,20191120,3.423894,-76.528569,2019-11-01 00:25:02,0
125557,2492387999,2019-11-01T00:25:02-05:00,20191120,3.423894,-76.528569,2019-11-01 00:25:02,0
125558,2492387999,2019-11-01T00:20:54-05:00,20191120,3.424044,-76.528218,2019-11-01 00:20:54,0
125559,2489786886,2019-11-01T01:59:57-05:00,20191120,4.757085,-74.035650,2019-11-01 01:59:57,1


In [20]:
# Calculate geohashes
# Encode each row once at precision 5 by iterating the two coordinate columns;
# this replaces two row-wise DataFrame.apply passes over the whole frame.
pe_dl_table_gen['geohash5'] = pd.Series(
    [
        geohash2.encode(lat, lng, precision=5)
        for lat, lng in zip(pe_dl_table_gen['lat'], pe_dl_table_gen['lng'])
    ],
    index=pe_dl_table_gen.index,
    dtype=object,  # keeps .str usable even when the frame is empty
)
# A geohash is built one character at a time, so the precision-3 code is
# exactly the first three characters of the precision-5 code.
pe_dl_table_gen['geohash3'] = pe_dl_table_gen['geohash5'].str[:3]
pe_dl_table_gen

Unnamed: 0,cuebiq_id,event_zoned_datetime,lat,lng,event_datetime_local,event_hour,geohash5,geohash3
0,1517207294,2019-03-04T00:52:01+08:00,-8.528349,115.243144,2019-03-04 00:52:01,0,qw3zq,qw3
1,1517207294,2019-03-04T00:45:32+08:00,-8.223645,115.396520,2019-03-04 00:45:32,0,qwd11,qwd
2,1522013587,2019-03-04T01:51:05+07:00,-7.669646,111.107331,2019-03-04 01:51:05,1,qqxh2,qqx
3,1522013587,2019-03-04T01:59:12+07:00,-7.669308,111.107703,2019-03-04 01:59:12,1,qqxh2,qqx
4,1818627892,2019-03-04T01:07:10+07:00,-6.189541,106.813124,2019-03-04 01:07:10,1,qqguy,qqg
...,...,...,...,...,...,...,...,...
599273,1750154524,2019-03-04T02:25:59+07:00,-6.151897,106.895090,2019-03-04 02:25:59,2,qquj0,qqu
599274,1750154524,2019-03-04T02:59:32+07:00,-6.151897,106.895090,2019-03-04 02:59:32,2,qquj0,qqu
599275,1752656232,2019-03-04T00:16:04+07:00,-0.057598,109.356632,2019-03-04 00:16:04,0,qrvz8,qrv
599276,1583178821,2019-03-04T02:07:45+08:00,-8.211994,124.528087,2019-03-04 02:07:45,2,qy893,qy8


In [15]:
# Convert event_datetime_local to pandas datetimes so each value exposes
# `.hour` and `.replace(...)`, which the interval helper defined below relies on.
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Create a function to determine the 3-hour interval
def get_3_hour_interval(dt, current_date=None):
    """Return the ``'start/end'`` label of a 3-hour window.

    This notebook defines ``get_3_hour_interval`` in several cells with two
    different call forms; this definition accepts both so the result does not
    depend on which cell was executed last.

    Args:
        dt: Timestamp whose containing window is wanted. When ``current_date``
            is given, this is instead the window's start hour (0, 3, ..., 21).
        current_date: None for the one-argument form; otherwise the day the
            window belongs to (anything ``pd.Timestamp`` accepts).

    Returns:
        ``'YYYY-MM-DD HH:MM:SS/YYYY-MM-DD HH:MM:SS'`` for the window start and
        the instant three hours later.
    """
    if current_date is not None:
        # Two-argument form used by the export loops: (start_hour, current_date).
        start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=dt)
    else:
        # Floor the hour to a multiple of three and zero the smaller fields.
        start_hour = (dt.hour // 3) * 3
        start_time = dt.replace(hour=start_hour, minute=0, second=0, microsecond=0)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Label every row with the 3-hour window its local timestamp falls into
interval_labels = pe_dl_table_gen['event_datetime_local'].apply(get_3_hour_interval)
pe_dl_table_gen['3_hour_interval'] = interval_labels

# Per geohash5 cell: row count, distinct users, and the distinct window labels
# seen in that cell joined with '; '
geohash5_groups = pe_dl_table_gen.groupby('geohash5')
aggregated_data_5 = geohash5_groups.agg(
    **{
        'no_of_points': ('geohash5', 'size'),
        'no_of_unique_users': ('cuebiq_id', 'nunique'),
        'local_time': ('3_hour_interval', lambda labels: '; '.join(labels.unique())),
    }
).reset_index()

# Keep only cells observed with more than 10 distinct users
has_enough_users = aggregated_data_5['no_of_unique_users'] > 10
filtered_data_5 = aggregated_data_5.loc[has_enough_users].copy()

filtered_data_5

Unnamed: 0,geohash5,no_of_points,no_of_unique_users,intervals
30,6rgpz,26,12,2019-11-01 00:00:00/2019-11-01 03:00:00
34,6rgrg,21,15,2019-11-01 00:00:00/2019-11-01 03:00:00
35,6rgru,36,23,2019-11-01 00:00:00/2019-11-01 03:00:00
36,6rgrv,27,19,2019-11-01 00:00:00/2019-11-01 03:00:00
37,6rgry,32,14,2019-11-01 00:00:00/2019-11-01 03:00:00
...,...,...,...,...
2353,d650p,65,12,2019-11-01 00:00:00/2019-11-01 03:00:00
2359,d6h1s,199,27,2019-11-01 00:00:00/2019-11-01 03:00:00
2360,d6h1t,66,16,2019-11-01 00:00:00/2019-11-01 03:00:00
2363,d6h8e,59,15,2019-11-01 00:00:00/2019-11-01 03:00:00


In [19]:
import pandas as pd
import geohash2

# Fetch the raw pings of one local calendar day, restricted to local hours
# 0, 1 and 2 (the first 3-hour interval); processing_date is used only as a
# filter here and is not selected. The local timestamp is parsed from the first
# 19 characters of event_zoned_datetime; unparseable rows are dropped.
# NOTE(review): pe_dl_table, country_code and the formatted_* dates are not
# defined in this cell - they are whatever values the kernel currently holds.
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) IN (0, 1, 2)
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
# Load the full result into memory as a DataFrame and display it.
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

Unnamed: 0,cuebiq_id,event_zoned_datetime,lat,lng,event_datetime_local,event_hour
0,1517207294,2019-03-04T00:52:01+08:00,-8.528349,115.243144,2019-03-04 00:52:01,0
1,1517207294,2019-03-04T00:45:32+08:00,-8.223645,115.396520,2019-03-04 00:45:32,0
2,1522013587,2019-03-04T01:51:05+07:00,-7.669646,111.107331,2019-03-04 01:51:05,1
3,1522013587,2019-03-04T01:59:12+07:00,-7.669308,111.107703,2019-03-04 01:59:12,1
4,1818627892,2019-03-04T01:07:10+07:00,-6.189541,106.813124,2019-03-04 01:07:10,1
...,...,...,...,...,...,...
599273,1750154524,2019-03-04T02:25:59+07:00,-6.151897,106.895090,2019-03-04 02:25:59,2
599274,1750154524,2019-03-04T02:59:32+07:00,-6.151897,106.895090,2019-03-04 02:59:32,2
599275,1752656232,2019-03-04T00:16:04+07:00,-0.057598,109.356632,2019-03-04 00:16:04,0
599276,1583178821,2019-03-04T02:07:45+08:00,-8.211994,124.528087,2019-03-04 02:07:45,2


In [19]:
# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    """Add ``geohash5`` and ``geohash3`` columns to ``df`` in place.

    Args:
        df: DataFrame holding the coordinates; it is mutated and returned.
        lat_col: Name of the latitude column.
        lng_col: Name of the longitude column.

    Returns:
        The same ``df`` object with the two geohash columns added.
    """
    # Encode each row once at precision 5 by iterating the two columns directly;
    # this avoids two row-wise DataFrame.apply passes, which build a Series per row.
    geohash5 = pd.Series(
        [
            geohash2.encode(lat, lng, precision=5)
            for lat, lng in zip(df[lat_col], df[lng_col])
        ],
        index=df.index,
        dtype=object,  # keeps .str usable even when df is empty
    )
    df['geohash5'] = geohash5
    # A geohash is built one character at a time, so the precision-3 code is
    # exactly the first three characters of the precision-5 code.
    df['geohash3'] = geohash5.str[:3]
    return df

# Calculate geohashes: adds the geohash5 and geohash3 columns from lat/lng
# (calculate_geohashes mutates the frame it is given and returns it).
pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

# Convert event_datetime_local to pandas datetimes so each value exposes
# `.hour` and `.replace(...)` for the interval bucketing that follows.
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Create a function to determine the 3-hour interval
def get_3_hour_interval(dt, current_date=None):
    """Return the ``'start/end'`` label of a 3-hour window.

    This notebook defines ``get_3_hour_interval`` in several cells with two
    different call forms; this definition accepts both so the result does not
    depend on which cell was executed last.

    Args:
        dt: Timestamp whose containing window is wanted. When ``current_date``
            is given, this is instead the window's start hour (0, 3, ..., 21).
        current_date: None for the one-argument form; otherwise the day the
            window belongs to (anything ``pd.Timestamp`` accepts).

    Returns:
        ``'YYYY-MM-DD HH:MM:SS/YYYY-MM-DD HH:MM:SS'`` for the window start and
        the instant three hours later.
    """
    if current_date is not None:
        # Two-argument form used by the export loops: (start_hour, current_date).
        start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=dt)
    else:
        # Floor the hour to a multiple of three and zero the smaller fields.
        start_hour = (dt.hour // 3) * 3
        start_time = dt.replace(hour=start_hour, minute=0, second=0, microsecond=0)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Label every point with the 3-hour window its event_datetime_local falls in
pe_dl_table_gen['3_hour_interval'] = pe_dl_table_gen['event_datetime_local'].apply(get_3_hour_interval)

# Per-geohash5 summary: number of points, number of distinct users,
# and the distinct 3-hour windows observed (joined with '; ')
aggregated_data_5 = (
    pe_dl_table_gen
    .groupby('geohash5')
    .agg(
        no_of_points=pd.NamedAgg(column='geohash5', aggfunc='size'),
        no_of_unique_users=pd.NamedAgg(column='cuebiq_id', aggfunc='nunique'),
        intervals=pd.NamedAgg(
            column='3_hour_interval',
            aggfunc=lambda labels: '; '.join(labels.unique()),
        ),
    )
    .reset_index()
)

# Keep only the geohash5 cells with more than 10 distinct users
filtered_data_5 = aggregated_data_5.loc[lambda agg: agg['no_of_unique_users'] > 10].copy()

filtered_data_5

     geohash5  no_of_points  no_of_unique_users  \
30      6rgpz            26                  12   
34      6rgrg            21                  15   
35      6rgru            36                  23   
36      6rgrv            27                  19   
37      6rgry            32                  14   
...       ...           ...                 ...   
2353    d650p            65                  12   
2359    d6h1s           199                  27   
2360    d6h1t            66                  16   
2363    d6h8e            59                  15   
2365    d6h8s           100                  17   

                                    intervals  
30    2019-11-01 00:00:00/2019-11-01 03:00:00  
34    2019-11-01 00:00:00/2019-11-01 03:00:00  
35    2019-11-01 00:00:00/2019-11-01 03:00:00  
36    2019-11-01 00:00:00/2019-11-01 03:00:00  
37    2019-11-01 00:00:00/2019-11-01 03:00:00  
...                                       ...  
2353  2019-11-01 00:00:00/2019-11-01 03:00:00  
235

In [20]:
# Render the filtered geohash5 aggregates as a rich table.
filtered_data_5

Unnamed: 0,geohash5,no_of_points,no_of_unique_users,intervals
30,6rgpz,26,12,2019-11-01 00:00:00/2019-11-01 03:00:00
34,6rgrg,21,15,2019-11-01 00:00:00/2019-11-01 03:00:00
35,6rgru,36,23,2019-11-01 00:00:00/2019-11-01 03:00:00
36,6rgrv,27,19,2019-11-01 00:00:00/2019-11-01 03:00:00
37,6rgry,32,14,2019-11-01 00:00:00/2019-11-01 03:00:00
...,...,...,...,...
2353,d650p,65,12,2019-11-01 00:00:00/2019-11-01 03:00:00
2359,d6h1s,199,27,2019-11-01 00:00:00/2019-11-01 03:00:00
2360,d6h1t,66,16,2019-11-01 00:00:00/2019-11-01 03:00:00
2363,d6h8e,59,15,2019-11-01 00:00:00/2019-11-01 03:00:00
