In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install geohash2

Note: you may need to restart the kernel to use updated packages.


In [4]:
import gc
import os
import time
import logging
import geohash2
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect 

In [5]:
# SQL engine
class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        return pd.read_sql(query, self.engine, chunksize=chunksize)

sql_engine = TrinoEngine()

In [6]:
# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Export to jupyter notebook
should be working for CO, ID, IN, MX

In [7]:
# # Create a function to determine the 3-hour interval based on a given date
# def get_3_hour_interval(start_hour, current_date):
#     start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
#     end_time = start_time + pd.Timedelta(hours=3)
#     return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

def get_3_hour_interval(start_hour, formatted_current_date):
    end_hour = start_hour + 3
    return f"{formatted_current_date} {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [10]:
# Define the date range
start_date = '2019-11-20'
end_date = '2019-12-31'
date_range = pd.date_range(start=start_date, end=end_date)

country_code = 'ID'

# Define the export file paths
export_file_name_5 = f"pd_{country_code.lower()}_2019_agg5_3h.csv"
export_file_name_3 = f"pd_{country_code.lower()}_2019_agg3_3h.csv"

# Define the export file paths
# export_path = '/home/jovyan/Data/pd3_test/'
export_path = '/home/jovyan/Data/pd3/'
export_file_path_5 = f"{export_path}{export_file_name_5}"
export_file_path_3 = f"{export_path}{export_file_name_3}"

# Check if files already exist to determine header writing
write_header_5 = not os.path.exists(export_file_path_5)
write_header_3 = not os.path.exists(export_file_path_3)

# List to record errors
error_records = []

In [None]:
for current_date in date_range:
    formatted_current_date = current_date.strftime('%Y%m%d')
    
    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3
        
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        try:
            # SQL Query to fetch data for the current 3-hour interval
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(lat, lng, 5) AS geohash_5,
                geohash_encode(lat, lng, 3) AS geohash_3
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
            pe_dl_table_gen = sql_engine.read_sql(query)
            
            # Convert event_datetime_local to datetime once
            pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
            
            # Create 3-hour interval column
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            pe_dl_table_gen['3_hour_interval'] = interval
            pe_dl_table_gen['local_date'] = formatted_current_date
            
            for geohash_col, export_file_path, write_header in [
                ('geohash_5', export_file_path_5, write_header_5),
                ('geohash_3', export_file_path_3, write_header_3)
            ]:
                # Aggregate data for geohash
                logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
                aggregated_data = pe_dl_table_gen.groupby(geohash_col).agg(
                    no_of_points=(geohash_col, 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                    local_time=('3_hour_interval', 'first'),
                    local_date=('local_date', 'first')
                ).reset_index()
                
                # Filter rows with no_of_unique_users > 10
                filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
                
                # Append the DataFrame to the CSV file
                logging.info(f"Exporting data for interval {start_hour} to {end_hour}")
                filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
                
                # After the first write, set the header flag to False
                if geohash_col == 'geohash5':
                    write_header_5 = False
                else:
                    write_header_3 = False
                
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
        
        except Exception as e:
            error_records.append((formatted_current_date, start_hour, end_hour, str(e))) 
            logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

# Log all error records at the end
logging.info("Error records:")
for record in error_records:
    logging.info(f"Date: {record[0]}, Interval: {record[1]}:00 - {record[2]}:00, Error: {record[3]}")

2024-07-03 06:35:37,576 - INFO - Executing SQL query for date 20191120 and interval 0 to 3
2024-07-03 06:36:52,867 - INFO - Aggregating data for geohash_5 for interval 0 to 3
2024-07-03 06:36:53,247 - INFO - Exporting data for interval 0 to 3
2024-07-03 06:36:53,276 - INFO - Aggregating data for geohash_3 for interval 0 to 3
2024-07-03 06:36:53,603 - INFO - Exporting data for interval 0 to 3
2024-07-03 06:36:53,614 - INFO - Appended data for date 20191120 interval 0 to 3 to pd_id_2019_agg5_3h.csv and pd_id_2019_agg3_3h.csv
2024-07-03 06:36:53,615 - INFO - Executing SQL query for date 20191120 and interval 3 to 6
2024-07-03 06:38:04,985 - INFO - Aggregating data for geohash_5 for interval 3 to 6
2024-07-03 06:38:05,147 - INFO - Exporting data for interval 3 to 6
2024-07-03 06:38:05,165 - INFO - Aggregating data for geohash_3 for interval 3 to 6
2024-07-03 06:38:05,304 - INFO - Exporting data for interval 3 to 6
2024-07-03 06:38:05,318 - INFO - Appended data for date 20191120 interval 3 

2024-07-01 15:20:09,850 - INFO - Executing SQL query for date 20190626 and interval 18 to 21

## Filling dates

In [12]:
def get_3_hour_interval(start_hour, formatted_current_date):
    end_hour = start_hour + 3
    return f"{formatted_current_date} {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Inputs
inputs = [
    # {'date': '20190118', 'interval': '18:00 - 21:00'},
    # {'date': '20190626', 'interval': '18:00 - 21:00'},
    # {'date': '20190710', 'interval': '12:00 - 15:00'},
    # {'date': '20190713', 'interval': '18:00 - 21:00'},
    # {'date': '20190726', 'interval': '15:00 - 18:00'},
    # {'date': '20190810', 'interval': '9:00 - 12:00'},
    # {'date': '20190827', 'interval': '15:00 - 18:00'},
    # {'date': '20190830', 'interval': '0:00 - 3:00'},
    # {'date': '20190920', 'interval': '6:00 - 9:00'},
    # {'date': '20190927', 'interval': '21:00 - 24:00'},
    # {'date': '20190928', 'interval': '0:00 - 3:00'},
    # {'date': '20190929', 'interval': '18:00 - 21:00'},
    # {'date': '20191009', 'interval': '9:00 - 12:00'},
    # {'date': '20191012', 'interval': '9:00 - 12:00'},
    # {'date': '20191013', 'interval': '15:00 - 18:00'}
    # {'date': '20191014', 'interval': '18:00 - 21:00'},
    # {'date': '20191101', 'interval': '12:00 - 15:00'},
    # {'date': '20191122', 'interval': '18:00 - 21:00'},
    {'date': '20191220', 'interval': '12:00 - 15:00'}
]
country_code = 'ID'

# Define the export file paths
export_file_name_5 = f"pd_{country_code.lower()}_2019_agg5_3h.csv"
export_file_name_3 = f"pd_{country_code.lower()}_2019_agg3_3h.csv"

# Define the export file paths
# export_path = '/home/jovyan/Data/pd3_test/'
export_path = '/home/jovyan/Data/pd3/'
export_file_path_5 = f"{export_path}{export_file_name_5}"
export_file_path_3 = f"{export_path}{export_file_name_3}"

write_header_5 = False
write_header_3 = False


In [13]:
# List to record errors
error_records = []

for input in inputs:
    input_date = input['date']
    input_interval = input['interval']

    # Parse the input
    formatted_current_date = input_date
    start_hour = int(input_interval.split(':')[0])
    end_hour = start_hour + 3

    # Calculate the lookback and lookahead dates
    current_date = datetime.strptime(input_date, '%Y%m%d')
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query in 'yyyymmdd' format
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    try:
        # SQL Query to fetch data for the specified date and interval
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(lat, lng, 5) AS geohash_5,
            geohash_encode(lat, lng, 3) AS geohash_3
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
            AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
        """

        logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
        pe_dl_table_gen = sql_engine.read_sql(query)
        
        # Convert event_datetime_local to datetime once
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
        
        # Create 3-hour interval column
        interval = get_3_hour_interval(start_hour, formatted_current_date)
        pe_dl_table_gen['3_hour_interval'] = interval
        pe_dl_table_gen['local_date'] = formatted_current_date
        
        for geohash_col, export_file_path, write_header in [
            ('geohash_5', export_file_path_5, write_header_5),
            ('geohash_3', export_file_path_3, write_header_3)
        ]:
            # Aggregate data for geohash
            logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
            aggregated_data = pe_dl_table_gen.groupby(geohash_col).agg(
                no_of_points=(geohash_col, 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
            
            # Append the DataFrame to the CSV file
            logging.info(f"Exporting data for interval {start_hour} to {end_hour}")
            filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
            
            # After the first write, set the header flag to False
            if geohash_col == 'geohash_5':
                write_header_5 = False
            else:
                write_header_3 = False
            
        logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
    
    except Exception as e:
        error_records.append((formatted_current_date, start_hour, end_hour, str(e)))
        logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")

# Log all error records at the end
logging.info("Error records:")
for record in error_records:
    logging.info(f"Date: {record[0]}, Interval: {record[1]}:00 - {record[2]}:00, Error: {record[3]}")

2024-07-03 20:41:00,654 - INFO - Executing SQL query for date 20191220 and interval 12 to 15
2024-07-03 20:42:36,732 - INFO - Aggregating data for geohash_5 for interval 12 to 15
2024-07-03 20:42:36,908 - INFO - Exporting data for interval 12 to 15
2024-07-03 20:42:36,925 - INFO - Aggregating data for geohash_3 for interval 12 to 15
2024-07-03 20:42:37,091 - INFO - Exporting data for interval 12 to 15
2024-07-03 20:42:37,101 - INFO - Appended data for date 20191220 interval 12 to 15 to pd_id_2019_agg5_3h.csv and pd_id_2019_agg3_3h.csv
2024-07-03 20:42:37,102 - INFO - Error records:


# Export to schema

In [None]:
def get_3_hour_interval(start_hour, formatted_current_date):
    end_hour = start_hour + 3
    return f"{formatted_current_date} {start_hour:02d}:00:00 - {end_hour:02d}:00:00"

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
# output_schema_name = 'presence_data'
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_date = '2019-11-01'
end_date = '2019-12-31'
date_range = pd.date_range(start=start_date, end=end_date)

country_code = 'CO'
country_abbre = country_code.lower()  
master_table_3 = f"pd_{country_abbre}_2019_3h_agg3"
master_table_5 = f"pd_{country_abbre}_2019_3h_agg5"

# Create the master tables if they do not exist
create_table_query_3 = f"""
CREATE TABLE IF NOT EXISTS {master_table_3}(
    geohash_3 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""
create_table_query_5 = f"""
CREATE TABLE IF NOT EXISTS {master_table_5}(
    geohash_5 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""
    
with con.connect() as connection:
    logging.info(f"Creating master table: {master_table_3}")
    connection.execute(create_table_query_3)
    logging.info(f"Creating master table: {master_table_5}")
    connection.execute(create_table_query_5)

# List to record errors
error_records = []

In [None]:
# Loop through each day in the date range
for current_date in date_range:
    formatted_current_date = current_date.strftime('%Y%m%d')
    
    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3
        
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        try:
            # SQL Query to fetch data for the current 3-hour interval
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
                geohash_encode(lat, lng, 5) AS geohash_5,
                geohash_encode(lat, lng, 3) AS geohash_3
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
            pe_dl_table_gen = sql_engine.read_sql(query)
            
            # Convert event_datetime_local to datetime once
            pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
            
            # Create 3-hour interval column
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            pe_dl_table_gen['3_hour_interval'] = interval
            pe_dl_table_gen['local_date'] = formatted_current_date

            # Process for geohash_5
            logging.info(f"Aggregating data for geohash_5 for interval {start_hour} to {end_hour}")
            aggregated_data_5 = pe_dl_table_gen.groupby('geohash_5').agg(
                no_of_points=('geohash_5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            filtered_data_5.to_sql(master_table_5, con, if_exists='append', index=False)
            logging.info(f"Inserted aggregated data for date {formatted_current_date} interval {start_hour} to {end_hour} into {master_table_3}")

            # Process for geohash_3
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            aggregated_data_3 = pe_dl_table_gen.groupby('geohash_3').agg(
                no_of_points=('geohash_3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            filtered_data_3.to_sql(master_table_3, con, if_exists='append', index=False, method='multi')
            logging.info(f"Inserted aggregated data for date {formatted_current_date} interval {start_hour} to {end_hour} into the {master_table_5}")
        
        except Exception as e:
            error_records.append((formatted_current_date, start_hour, end_hour, str(e)))
            logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

# Log all error records at the end
logging.info("Error records:")
for record in error_records:
    logging.info(f"Date: {record[0]}, Interval: {record[1]}:00 - {record[2]}:00, Error: {record[3]}")

# Check output

In [33]:
# Define the file paths
export_file_path_5 = '/home/jovyan/Data/pd3/pd_co_2019_agg5_3h.csv'
export_file_path_3 = '/home/jovyan/Data/pd3/pd_co_2019_agg3_3h.csv'

# Read the geohash_5 CSV file to get the column names
data_5 = pd.read_csv(export_file_path_5)
data_5['local_date'] = pd.to_numeric(data_5['local_date'], errors='coerce')
data_5 = data_5.dropna(subset=['local_date'])
data_5['local_date'] = data_5['local_date'].astype(int)

data_5

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,local_time,local_date
0,d0rfr,106,21,20191101 00:00:00 - 03:00:00,20191101
1,d20u8,69,16,20191101 00:00:00 - 03:00:00,20191101
2,d20u9,307,48,20191101 00:00:00 - 03:00:00,20191101
3,d21nc,1073,170,20191101 00:00:00 - 03:00:00,20191101
4,d21p1,23,11,20191101 00:00:00 - 03:00:00,20191101
...,...,...,...,...,...
232296,d650p,149,27,20191108 09:00:00 - 12:00:00,20191108
232297,d6h1s,634,80,20191108 09:00:00 - 12:00:00,20191108
232298,d6h1t,280,54,20191108 09:00:00 - 12:00:00,20191108
232299,d6h8e,87,21,20191108 09:00:00 - 12:00:00,20191108


In [34]:
# Read the geohash_3 CSV file without headers
data_3 = pd.read_csv(export_file_path_3, header=None)

data_3.columns = data_5.columns # Assign column names from geohash_5 to geohash_3 for CO

data_3['local_date'] = pd.to_numeric(data_3['local_date'], errors='coerce')
data_3 = data_3.dropna(subset=['local_date'])
data_3['local_date'] = data_3['local_date'].astype(int)

data_3

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,local_time,local_date
0,d0r,125,24,20191101 00:00:00 - 03:00:00,20191101
1,d20,678,107,20191101 00:00:00 - 03:00:00,20191101
2,d21,1498,242,20191101 00:00:00 - 03:00:00,20191101
3,d22,253,38,20191101 00:00:00 - 03:00:00,20191101
4,d23,1636,289,20191101 00:00:00 - 03:00:00,20191101
...,...,...,...,...,...
19582,d3s,210,20,20191108 09:00:00 - 12:00:00,20191108
19583,d3u,1091,100,20191108 09:00:00 - 12:00:00,20191108
19584,d4j,1140,94,20191108 09:00:00 - 12:00:00,20191108
19585,d65,247,41,20191108 09:00:00 - 12:00:00,20191108


## Format Correction

In [49]:
# Extract the start time from local_time and convert to datetime
data_5['start_time'] = data_5['local_time'].str.split(' - ').str[0]
data_3['start_time'] = data_3['local_time'].str.split(' - ').str[0]

data_5['start_time'] = pd.to_datetime(data_5['start_time'], format='%Y%m%d %H:%M:%S')
data_3['start_time'] = pd.to_datetime(data_3['start_time'], format='%Y%m%d %H:%M:%S')

# Sort the data by start_time
data_5_sorted = data_5.sort_values(by='start_time').reset_index(drop=True)
data_3_sorted = data_3.sort_values(by='start_time').reset_index(drop=True)
data_3_sorted

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,local_time,local_date,start_time
0,d0r,125,24,20191101 00:00:00 - 03:00:00,20191101,2019-11-01 00:00:00
1,d34,21285,3198,20191101 00:00:00 - 03:00:00,20191101,2019-11-01 00:00:00
2,d35,1191,197,20191101 00:00:00 - 03:00:00,20191101,2019-11-01 00:00:00
3,d36,591,89,20191101 00:00:00 - 03:00:00,20191101,2019-11-01 00:00:00
4,d37,2493,392,20191101 00:00:00 - 03:00:00,20191101,2019-11-01 00:00:00
...,...,...,...,...,...,...
19582,d3d,7267,695,20191231 21:00:00 - 24:00:00,20191231,2019-12-31 21:00:00
19583,d39,497,43,20191231 21:00:00 - 24:00:00,20191231,2019-12-31 21:00:00
19584,d37,3988,462,20191231 21:00:00 - 24:00:00,20191231,2019-12-31 21:00:00
19585,d3j,148,13,20191231 21:00:00 - 24:00:00,20191231,2019-12-31 21:00:00


## Check output missing dates

In [36]:
# Generate the full range of 3-hour intervals
start_date = '2019-11-01'
end_date = '2019-12-31'
date_range = pd.date_range(start=start_date, end=end_date, freq='3H')

In [37]:
# Check for missing intervals in geohash_5 data
data_5_intervals = data_5_sorted['start_time']
missing_intervals_5 = date_range.difference(data_5_intervals)
print("Missing intervals in geohash_5 data:")
print(missing_intervals_5)

Missing intervals in geohash_5 data:
DatetimeIndex([], dtype='datetime64[ns]', freq=None)


In [38]:
# Check for missing intervals in geohash_3 data
data_3_intervals = data_3_sorted['start_time']
missing_intervals_3 = date_range.difference(data_3_intervals)
print("Missing intervals in geohash_3 data:")
print(missing_intervals_3)

Missing intervals in geohash_3 data:
DatetimeIndex([], dtype='datetime64[ns]', freq=None)


## Overwrite output

In [50]:
data_3_sorted = data_3_sorted.drop(['start_time'], axis=1)
data_5_sorted = data_5_sorted.drop(['start_time'], axis=1)
data_5_sorted

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,local_time,local_date
0,d0rfr,106,21,20191101 00:00:00 - 03:00:00,20191101
1,d3473,166,40,20191101 00:00:00 - 03:00:00,20191101
2,d3478,2829,516,20191101 00:00:00 - 03:00:00,20191101
3,d3479,57,18,20191101 00:00:00 - 03:00:00,20191101
4,d347b,2333,420,20191101 00:00:00 - 03:00:00,20191101
...,...,...,...,...,...
231810,d345y,30,11,20191231 21:00:00 - 24:00:00,20191231
231811,d345z,294,62,20191231 21:00:00 - 24:00:00,20191231
231812,d3467,304,43,20191231 21:00:00 - 24:00:00,20191231
231813,d3401,165,18,20191231 21:00:00 - 24:00:00,20191231


In [51]:
# Save the cleaned and sorted data back to the original paths, overwriting the existing files
data_5_sorted.to_csv(export_file_path_5, index=False)
data_3_sorted.to_csv(export_file_path_3, index=False)

# tests 

In [None]:
# Worked query per day. - not working for MX
for current_date in date_range:
    try:
        formatted_current_date = current_date.strftime('%Y-%m-%d')
        
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date with geohashes calculated in the query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(lat, lng, 5) AS geohash5,
            geohash_encode(lat, lng, 3) AS geohash3
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)
        
        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
        
        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3
            
            # Filter data for the current 3-hour interval
            interval_data = pe_dl_table_gen[
                (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
                (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
            ].copy()
            
            # Create 3-hour interval column
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            interval_data['3_hour_interval'] = interval
            interval_data['local_date'] = formatted_current_date
            
            # Aggregate data for geohash5
            logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
            aggregated_data_5 = interval_data.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
            # Append the DataFrame to the CSV file for geohash5
            logging.info(f"Exporting data to {export_file_name_3} for interval {start_hour} to {end_hour}")
            filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
            
            # After the first write, set the header flag to False for geohash5
            write_header_5 = False
            
            # Aggregate data for geohash3
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            aggregated_data_3 = interval_data.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
            # Append the DataFrame to the CSV file for geohash3
            logging.info(f"Exporting data to {export_file_name_5} for interval {start_hour} to {end_hour}")
            filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
            
            # After the first write, set the header flag to False for geohash3
            write_header_3 = False
            
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)


# !!!!!!!!!!!!!!! 这个

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
output_schema_name = 'presence_data'
# output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'CO'
start_date = 20191101
end_date = 20191102
# longitude_ranges = [(-82, -74.53125), (-74.53125, -65)]  # CO specific longitude ranges

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

failed_inserts = []

In [None]:
# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    df['geohash5'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=5), axis=1)
    df['geohash3'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=3), axis=1)
    return df

# Create a function to determine the 3-hour interval based on a given date
def get_3_hour_interval(start_hour, current_date):
    start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Define the date range
start_date = '2019-11-12'
end_date = '2019-11-11'
date_range = pd.date_range(start=start_date, end=end_date)

country_code = 'CO'

# Define the export file paths
export_file_name_5 = f"pd_{country_code.lower()}_2019_agg5_3h.csv"
export_file_name_3 = f"pd_{country_code.lower()}_2019_agg3_3h.csv"

# Define the export file paths
export_path = '/home/jovyan/Data/pd3_test/'
export_file_path_5 = f"{export_path}{export_file_name_5}"
export_file_path_3 = f"{export_path}{export_file_name_3}"

# Check if files already exist to determine header writing
write_header_5 = not os.path.exists(export_file_path_5)
write_header_3 = not os.path.exists(export_file_path_3)


In [None]:
# # Loop through each day in the date range
# for current_date in date_range:
#     try:
#         formatted_current_date = current_date.strftime('%Y-%m-%d')
        
#         # Calculate the lookback and lookahead dates
#         lookback_date = current_date - timedelta(days=1)
#         lookahead_date = current_date + timedelta(days=35)

#         # Format dates for the SQL query in 'yyyymmdd' format
#         formatted_lookback_date = lookback_date.strftime('%Y%m%d')
#         formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
#         formatted_current_date = current_date.strftime('%Y%m%d')
        
#         # SQL Query to fetch data for the current date with geohashes calculated in the query
#         query = f"""
#         SELECT 
#             cuebiq_id, 
#             event_zoned_datetime, 
#             processing_date,
#             lat,
#             lng,
#             TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
#             EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
#             geohash_encode(lat, lng, 5) AS geohash5,
#             geohash_encode(lat, lng, 3) AS geohash3
#         FROM {pe_dl_table}
#         WHERE 
#             processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
#             AND country_code = '{country_code}' 
#             AND event_zoned_datetime IS NOT NULL
#             AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
#             AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
#         """
        
#         logging.info(f"Executing SQL query for date {formatted_current_date}")
#         pe_dl_table_gen = sql_engine.read_sql(query)
        
#         # Convert event_datetime_local to datetime
#         pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
        
#         # Loop through each 3-hour interval
#         for start_hour in range(0, 24, 3):
#             end_hour = start_hour + 3
            
#             # Filter data for the current 3-hour interval
#             interval_data = pe_dl_table_gen[
#                 (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
#                 (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
#             ].copy()
            
#             # Create 3-hour interval column
#             interval = get_3_hour_interval(start_hour, formatted_current_date)
#             interval_data['3_hour_interval'] = interval
            
#             # Aggregate data for geohash5
#             logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
#             aggregated_data_5 = interval_data.groupby('geohash5').agg(
#                 no_of_points=('geohash5', 'size'),
#                 no_of_unique_users=('cuebiq_id', 'nunique'),
#                 local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
#             ).reset_index()
            
#             # Filter rows with no_of_unique_users > 10
#             filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
#             # Append the DataFrame to the CSV file for geohash5
#             logging.info(f"Exporting data to {export_file_path_5} for interval {start_hour} to {end_hour}")
#             filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
            
#             # After the first write, set the header flag to False for geohash5
#             write_header_5 = False
            
#             # Aggregate data for geohash3
#             logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
#             aggregated_data_3 = interval_data.groupby('geohash3').agg(
#                 no_of_points=('geohash3', 'size'),
#                 no_of_unique_users=('cuebiq_id', 'nunique'),
#                 local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
#             ).reset_index()
            
#             # Filter rows with no_of_unique_users > 10
#             filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
#             # Append the DataFrame to the CSV file for geohash3
#             logging.info(f"Exporting data to {export_file_path_3} for interval {start_hour} to {end_hour}")
#             filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
            
#             # After the first write, set the header flag to False for geohash3
#             write_header_3 = False
            
#             logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")
#     except Exception as e:
#         logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
#     # Move to the next day
#     current_date += timedelta(days=1)

In [None]:
# Loop through each day in the date range

for current_date in date_range:
    try:
        formatted_current_date = current_date.strftime('%Y-%m-%d')
        
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date with geohashes calculated in the query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(lat, lng, 5) AS geohash5,
            geohash_encode(lat, lng, 3) AS geohash3
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)
        
        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
        
        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3
            
            # Filter data for the current 3-hour interval
            interval_data = pe_dl_table_gen[
                (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
                (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
            ].copy()
            
            # Create 3-hour interval column
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            interval_data['3_hour_interval'] = interval
            interval_data['local_date'] = formatted_current_date
            
            # Aggregate data for geohash5
            logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
            aggregated_data_5 = interval_data.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
            # Append the DataFrame to the CSV file for geohash5
            logging.info(f"Exporting data to {export_file_path_5} for interval {start_hour} to {end_hour}")
            filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
            
            # After the first write, set the header flag to False for geohash5
            write_header_5 = False
            
            # Aggregate data for geohash3
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            aggregated_data_3 = interval_data.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', 'first'),
                local_date=('local_date', 'first')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
            # Append the DataFrame to the CSV file for geohash3
            logging.info(f"Exporting data to {export_file_path_3} for interval {start_hour} to {end_hour}")
            filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
            
            # After the first write, set the header flag to False for geohash3
            write_header_3 = False
            
            logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)
    


In [None]:
# Loop through each day in the date range
for current_date in date_range:
    try:
        formatted_current_date = current_date.strftime('%Y-%m-%d')
        
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')

        # SQL Query to fetch and aggregate data for geohash5 and geohash3
        query = f"""
        SELECT 
            geohash5,
            COUNT(*) as no_of_points,
            COUNT(DISTINCT cuebiq_id) as no_of_unique_users,
            '{formatted_current_date}' as event_date,
            '{get_3_hour_interval(0, formatted_current_date)}' as interval_0_3,
            '{get_3_hour_interval(3, formatted_current_date)}' as interval_3_6,
            '{get_3_hour_interval(6, formatted_current_date)}' as interval_6_9,
            '{get_3_hour_interval(9, formatted_current_date)}' as interval_9_12,
            '{get_3_hour_interval(12, formatted_current_date)}' as interval_12_15,
            '{get_3_hour_interval(15, formatted_current_date)}' as interval_15_18,
            '{get_3_hour_interval(18, formatted_current_date)}' as interval_18_21,
            '{get_3_hour_interval(21, formatted_current_date)}' as interval_21_24
        FROM (
            SELECT 
                cuebiq_id, 
                geohash_encode(lat, lng, 5) AS geohash5,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        )
        GROUP BY geohash5
        HAVING COUNT(DISTINCT cuebiq_id) > 10
        """
        
        logging.info(f"Executing SQL5 query for date {formatted_current_date}")
        pe_dl_table_gen_5 = sql_engine.read_sql(query)

        # SQL Query to fetch and aggregate data for geohash3
        query = f"""
        SELECT 
            geohash3,
            COUNT(*) as no_of_points,
            COUNT(DISTINCT cuebiq_id) as no_of_unique_users,
            '{formatted_current_date}' as event_date,
            '{get_3_hour_interval(0, formatted_current_date)}' as interval_0_3,
            '{get_3_hour_interval(3, formatted_current_date)}' as interval_3_6,
            '{get_3_hour_interval(6, formatted_current_date)}' as interval_6_9,
            '{get_3_hour_interval(9, formatted_current_date)}' as interval_9_12,
            '{get_3_hour_interval(12, formatted_current_date)}' as interval_12_15,
            '{get_3_hour_interval(15, formatted_current_date)}' as interval_15_18,
            '{get_3_hour_interval(18, formatted_current_date)}' as interval_18_21,
            '{get_3_hour_interval(21, formatted_current_date)}' as interval_21_24
        FROM (
            SELECT 
                cuebiq_id, 
                geohash_encode(lat, lng, 3) AS geohash3,
                TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
                EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        )
        GROUP BY geohash3
        HAVING COUNT(DISTINCT cuebiq_id) > 10
        """
        
        logging.info(f"Executing SQL3 query for date {formatted_current_date}")
        pe_dl_table_gen_3 = sql_engine.read_sql(query)
        
        # Append the DataFrame to the CSV file for geohash5
        logging.info(f"Exporting data to {export_file_path_5} for date {formatted_current_date}")
        pe_dl_table_gen_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
        
        # After the first write, set the header flag to False for geohash5
        write_header_5 = False
        
        # Append the DataFrame to the CSV file for geohash3
        logging.info(f"Exporting data to {export_file_path_3} for date {formatted_current_date}")
        pe_dl_table_gen_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
        
        # After the first write, set the header flag to False for geohash3
        write_header_3 = False

        logging.info(f"Appended data for date {formatted_current_date} to {export_file_path_5} and {export_file_path_3}")
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Loop through each day in the date range
for current_date in date_range:
    try:
        formatted_current_date = current_date.strftime('%Y-%m-%d')

        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query in 'yyyymmdd' format
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')

        # SQL Query to fetch data for the current date
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng,
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """

        logging.info(f"Executing SQL query for date {formatted_current_date}")
        pe_dl_table_gen = sql_engine.read_sql(query)

        # Convert event_datetime_local to datetime
        pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

        # Calculate geohashes
        logging.info(f"Processing geohashes for date {formatted_current_date}")
        pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

        # Loop through each 3-hour interval
        for start_hour in range(0, 24, 3):
            end_hour = start_hour + 3

            # Filter data for the current 3-hour interval
            interval_data = pe_dl_table_gen[
                (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
                (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
            ].copy()

            # Create 3-hour interval column
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            interval_data['3_hour_interval'] = interval

            # Aggregate data for geohash5
            logging.info(f"Aggregating data for geohash5 for interval {start_hour} to {end_hour}")
            aggregated_data_5 = interval_data.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
            ).reset_index()

            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()

            # Append the DataFrame to the CSV file for geohash5
            logging.info(f"Exporting data to {export_file_name_5} for interval {start_hour} to {end_hour}")
            filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)

            # After the first write, set the header flag to False for geohash5
            write_header_5 = False

            # Aggregate data for geohash3
            logging.info(f"Aggregating data for geohash3 for interval {start_hour} to {end_hour}")
            aggregated_data_3 = interval_data.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique'),
                local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
            ).reset_index()

            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()

            # Append the DataFrame to the CSV file for geohash3
            logging.info(f"Exporting data to {export_file_name_3} for interval {start_hour} to {end_hour}")
            filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)

            # After the first write, set the header flag to False for geohash3
            write_header_3 = False

            # logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")
    
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)
        
        

In [None]:
# This one seemms to be working 

# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name, 
                con, 
                index=False, 
                if_exists="append", 
                method="multi"
            )
            logging.info(f"Inserted data into table {table_name}")
            break
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")

# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        
        # Construct the SQL query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """
        
        logging.info(f"Executing SQL query for date {formatted_current_date}")
        
        pe_dl_table_gen = sql_engine.read_sql_chunked(query)
        
        # Convert the generator to a DataFrame
        chunks = [chunk for chunk in pe_dl_table_gen]
        if chunks:
            pe_dl_table_df = pd.concat(chunks, ignore_index=True)
            
            # Calculate geohashes
            pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
            pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
            
            # Aggregate data for geohash5
            aggregated_data_5 = pe_dl_table_df.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()

            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_5.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash5 into SQL table
            if not filtered_data_5.empty:
                table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
                insert_data_with_retry(filtered_data_5, table_name_agg5, con)
            
            # Aggregate data for geohash3
            aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_3.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash3 into SQL table
            if not filtered_data_3.empty:
                table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                insert_data_with_retry(filtered_data_3, table_name_agg3, con)
        
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")


# Check by single

In [None]:
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

In [None]:
import pandas as pd
import geohash2
import os

# Define the export file path
export_file_path_3 = '/home/jovyan/Data/pd3_test/pd_co_2019_agg3_3h.csv'
export_file_path_5 = '/home/jovyan/Data/pd3_test/pd_co_2019_agg5_3h.csv'

# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    df['geohash5'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=5), axis=1)
    df['geohash3'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=3), axis=1)
    return df

# Create a function to determine the 3-hour interval based on a given date
def get_3_hour_interval(start_hour, current_date):
    start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Convert event_datetime_local to datetime
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Calculate geohashes
pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

# Initialize flags to write the headers only once
write_header_5 = True
write_header_3 = True

# Loop through each 3-hour interval
formatted_current_date = pd.to_datetime(formatted_current_date)  # Ensure it's a datetime object
for start_hour in range(0, 24, 3):
    end_hour = start_hour + 3
    
    # Filter data for the current 3-hour interval
    interval_data = pe_dl_table_gen[
        (pe_dl_table_gen['event_datetime_local'].dt.hour >= start_hour) & 
        (pe_dl_table_gen['event_datetime_local'].dt.hour < end_hour)
    ].copy()
    
    # Create 3-hour interval column
    interval = get_3_hour_interval(start_hour, formatted_current_date)
    interval_data['3_hour_interval'] = interval
    
    # Aggregate data for geohash5
    aggregated_data_5 = interval_data.groupby('geohash5').agg(
        no_of_points=('geohash5', 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique'),
        intervals=('3_hour_interval', lambda x: '; '.join(x.unique()))
    ).reset_index()
    
    # Filter rows with no_of_unique_users > 10
    filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
    
    # Append the DataFrame to the CSV file for geohash5
    filtered_data_5.to_csv(export_file_path_5, mode='a', header=write_header_5, index=False)
    
    # After the first write, set the header flag to False for geohash5
    write_header_5 = False
    
    # Aggregate data for geohash3
    aggregated_data_3 = interval_data.groupby('geohash3').agg(
        no_of_points=('geohash3', 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique'),
        intervals=('3_hour_interval', lambda x: '; '.join(x.unique()))
    ).reset_index()
    
    # Filter rows with no_of_unique_users > 10
    filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
    
    # Append the DataFrame to the CSV file for geohash3
    filtered_data_3.to_csv(export_file_path_3, mode='a', header=write_header_3, index=False)
    
    # After the first write, set the header flag to False for geohash3
    write_header_3 = False
    
    print(f"Appended data for interval {start_hour} to {end_hour} to {export_file_path_5} and {export_file_path_3}")


# Test

In [None]:
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) IN (0, 1, 2)
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

In [None]:
# Calculate geohashes
pe_dl_table_gen['geohash5'] = pe_dl_table_gen.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
pe_dl_table_gen['geohash3'] = pe_dl_table_gen.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
pe_dl_table_gen

In [None]:
# Convert event_datetime_local to datetime
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Create a function to determine the 3-hour interval
def get_3_hour_interval(dt):
    start_hour = (dt.hour // 3) * 3
    start_time = dt.replace(hour=start_hour, minute=0, second=0, microsecond=0)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Apply the function to create the 3-hour interval column
pe_dl_table_gen['3_hour_interval'] = pe_dl_table_gen['event_datetime_local'].apply(get_3_hour_interval)

# Aggregate data for geohash5
aggregated_data_5 = pe_dl_table_gen.groupby('geohash5').agg(
    no_of_points=('geohash5', 'size'),
    no_of_unique_users=('cuebiq_id', 'nunique'),
    local_time=('3_hour_interval', lambda x: '; '.join(x.unique()))
).reset_index()

# Filter rows with no_of_unique_users > 10
filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()

filtered_data_5

In [None]:
import pandas as pd
import geohash2

query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    lat,
    lng,
    TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
    EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) IN (0, 1, 2)
"""
logging.info(f"Executing SQL query for date {formatted_current_date}")
pe_dl_table_gen = sql_engine.read_sql(query)
pe_dl_table_gen

In [None]:
# Function to calculate geohashes
def calculate_geohashes(df, lat_col, lng_col):
    df['geohash5'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=5), axis=1)
    df['geohash3'] = df.apply(lambda row: geohash2.encode(row[lat_col], row[lng_col], precision=3), axis=1)
    return df

# Calculate geohashes
pe_dl_table_gen = calculate_geohashes(pe_dl_table_gen, 'lat', 'lng')

# Convert event_datetime_local to datetime
pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

# Create a function to determine the 3-hour interval
def get_3_hour_interval(dt):
    start_hour = (dt.hour // 3) * 3
    start_time = dt.replace(hour=start_hour, minute=0, second=0, microsecond=0)
    end_time = start_time + pd.Timedelta(hours=3)
    return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# Apply the function to create the 3-hour interval column
pe_dl_table_gen['3_hour_interval'] = pe_dl_table_gen['event_datetime_local'].apply(get_3_hour_interval)

# Aggregate data for geohash5
aggregated_data_5 = pe_dl_table_gen.groupby('geohash5').agg(
    no_of_points=('geohash5', 'size'),
    no_of_unique_users=('cuebiq_id', 'nunique'),
    intervals=('3_hour_interval', lambda x: '; '.join(x.unique()))
).reset_index()

# Filter rows with no_of_unique_users > 10
filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()

filtered_data_5

In [None]:
filtered_data_5