In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install geohash2

Note: you may need to restart the kernel to use updated packages.


In [4]:
import gc
import os
import time
import logging
import geohash2
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect 

In [5]:
# SQL engine
class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        return pd.read_sql(query, self.engine, chunksize=chunksize)

sql_engine = TrinoEngine()

In [6]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

In [7]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name, 
                con, 
                index=False, 
                if_exists="append", 
                method="multi"
            )
            logging.info(f"Inserted data into table {table_name}")
            return True
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False

In [None]:
# test seperate by lon, and record error: 

In [8]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'CO'
start_date = 20191020
end_date = 20191031
longitude_ranges = [(-82, -74.53125), (-74.53125, -65)]  # CO specific longitude ranges

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

failed_inserts = []

In [9]:
# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        
        # Loop through each longitude range
        for min_lon, max_lon in longitude_ranges:
            
            # Construct the SQL query
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND lng BETWEEN {min_lon} AND {max_lon}
            """
            
            logging.info(f"Executing SQL query for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")
            
            try:
                pe_dl_table_gen = sql_engine.read_sql_chunked(query)
                
                # Convert the generator to a DataFrame
                chunks = [chunk for chunk in pe_dl_table_gen]
                if chunks:
                    pe_dl_table_df = pd.concat(chunks, ignore_index=True)
                    
                    # Calculate geohashes
                    pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
                    pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
                    
                    # Aggregate data for geohash5
                    aggregated_data_5 = pe_dl_table_df.groupby('geohash5').agg(
                        no_of_points=('geohash5', 'size'),
                        no_of_unique_users=('cuebiq_id', 'nunique')
                    ).reset_index()

                    # Filter rows with no_of_unique_users > 10
                    filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
                    
                    # Add the local_date column
                    filtered_data_5.loc[:, 'local_date'] = formatted_current_date
                    
                    # Insert filtered aggregated data for geohash5 into SQL table
                    if not filtered_data_5.empty:
                        table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
                        if not insert_data_with_retry(filtered_data_5, table_name_agg5, con):
                            failed_inserts.append((formatted_current_date, min_lon, max_lon, 'agg5'))
                    
                    # Aggregate data for geohash3
                    aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                        no_of_points=('geohash3', 'size'),
                        no_of_unique_users=('cuebiq_id', 'nunique')
                    ).reset_index()
                    
                    # Filter rows with no_of_unique_users > 10
                    filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
                    
                    # Add the local_date column
                    filtered_data_3.loc[:, 'local_date'] = formatted_current_date
                    
                    # Insert filtered aggregated data for geohash3 into SQL table
                    if not filtered_data_3.empty:
                        table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                        if not insert_data_with_retry(filtered_data_3, table_name_agg3, con):
                            failed_inserts.append((formatted_current_date, min_lon, max_lon, 'agg3'))
            
            except Exception as e:
                logging.error(f"Error while processing data for date {formatted_current_date} and longitude range {min_lon} to {max_lon}: {e}")
                failed_inserts.append((formatted_current_date, min_lon, max_lon, 'both'))
    
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
        failed_inserts.append((formatted_current_date, None, None, 'both'))
    
    # Move to the next day
    current_date += timedelta(days=1)

# Print the failed inserts
if failed_inserts:
    print("Failed inserts:")
    for failed_insert in failed_inserts:
        print(failed_insert)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")

2024-06-12 23:02:26,324 - INFO - Executing SQL query for date 20191020 and longitude range -82 to -74.53125
2024-06-12 23:03:18,645 - INFO - failed after 3 attempts
2024-06-12 23:03:19,326 - INFO - failed after 3 attempts
2024-06-12 23:03:19,326 - ERROR - Attempt 1 failed with error: error 502: bad gateway
2024-06-12 23:03:24,808 - INFO - failed after 3 attempts
2024-06-12 23:03:24,809 - ERROR - Error closing cursor
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1900, in _execute_context
    self.dialect.do_execute(
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/sqlalchemy/dialect.py", line 365, in do_execute
    cursor.execute(statement, parameters)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 482, in execute
    self._prepare_statement(operation, statement_name)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 37

Failed inserts:
('20191020', -82, -74.53125, 'agg5')
('20191020', -82, -74.53125, 'agg3')
('20191020', -74.53125, -65, 'agg5')
('20191020', -74.53125, -65, 'agg3')
('20191021', -82, -74.53125, 'agg5')
('20191021', -82, -74.53125, 'agg3')
('20191021', -74.53125, -65, 'agg5')
('20191021', -74.53125, -65, 'agg3')
('20191022', -82, -74.53125, 'agg5')
('20191022', -82, -74.53125, 'agg3')
('20191022', -74.53125, -65, 'agg5')
('20191022', -74.53125, -65, 'agg3')
('20191023', -82, -74.53125, 'agg5')
('20191023', -82, -74.53125, 'agg3')
('20191023', -74.53125, -65, 'agg5')
('20191023', -74.53125, -65, 'agg3')
('20191024', -82, -74.53125, 'agg5')
('20191024', -82, -74.53125, 'agg3')
('20191024', -74.53125, -65, 'agg5')
('20191024', -74.53125, -65, 'agg3')
('20191025', -82, -74.53125, 'agg5')
('20191025', -82, -74.53125, 'agg3')
('20191025', -74.53125, -65, 'agg5')
('20191025', -74.53125, -65, 'agg3')
('20191026', -82, -74.53125, 'agg5')
('20191026', -82, -74.53125, 'agg3')
('20191026', -74.53125

In [8]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'CO'
start_date = 20191015
end_date = 20191015

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name, 
                con, 
                index=False, 
                if_exists="append", 
                method="multi"
            )
            logging.info(f"Inserted data into table {table_name}")
            break
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")

# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        
        # Construct the SQL query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """
        
        logging.info(f"Executing SQL query for date {formatted_current_date}")
        
        pe_dl_table_gen = sql_engine.read_sql_chunked(query)
        
        # Convert the generator to a DataFrame
        chunks = [chunk for chunk in pe_dl_table_gen]
        if chunks:
            pe_dl_table_df = pd.concat(chunks, ignore_index=True)
            
            # Calculate geohashes
            pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
            pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
            
            # Aggregate data for geohash5
            aggregated_data_5 = pe_dl_table_df.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()

            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_5.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash5 into SQL table
            if not filtered_data_5.empty:
                table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
                insert_data_with_retry(filtered_data_5, table_name_agg5, con)
            
            # Aggregate data for geohash3
            aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_3.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash3 into SQL table
            if not filtered_data_3.empty:
                table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                insert_data_with_retry(filtered_data_3, table_name_agg3, con)
        
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")


2024-06-07 17:58:41,889 - INFO - Executing SQL query for date 20191015
2024-06-07 17:59:16,778 - INFO - failed after 3 attempts
2024-06-07 17:59:17,191 - INFO - failed after 3 attempts
2024-06-07 17:59:17,193 - ERROR - Attempt 1 failed with error: error 502: bad gateway
2024-06-07 17:59:22,693 - INFO - failed after 3 attempts
2024-06-07 17:59:22,694 - ERROR - Error closing cursor
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1900, in _execute_context
    self.dialect.do_execute(
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/sqlalchemy/dialect.py", line 365, in do_execute
    cursor.execute(statement, parameters)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 482, in execute
    self._prepare_statement(operation, statement_name)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 379, in _prepare_statement
    query.ex

In [9]:
# This one seemms to be working 

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'CO'
start_date = 20191015
end_date = 20191015

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name, 
                con, 
                index=False, 
                if_exists="append", 
                method="multi"
            )
            logging.info(f"Inserted data into table {table_name}")
            break
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")

# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    try:
        # Calculate the lookback and lookahead dates
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)
        
        # Format dates for the SQL query
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_current_date = current_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')
        
        # Construct the SQL query
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng
        FROM {pe_dl_table}
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """
        
        logging.info(f"Executing SQL query for date {formatted_current_date}")
        
        pe_dl_table_gen = sql_engine.read_sql_chunked(query)
        
        # Convert the generator to a DataFrame
        chunks = [chunk for chunk in pe_dl_table_gen]
        if chunks:
            pe_dl_table_df = pd.concat(chunks, ignore_index=True)
            
            # Calculate geohashes
            pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
            pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
            
            # Aggregate data for geohash5
            aggregated_data_5 = pe_dl_table_df.groupby('geohash5').agg(
                no_of_points=('geohash5', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()

            # Filter rows with no_of_unique_users > 10
            filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_5.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash5 into SQL table
            if not filtered_data_5.empty:
                table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
                insert_data_with_retry(filtered_data_5, table_name_agg5, con)
            
            # Aggregate data for geohash3
            aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()
            
            # Filter rows with no_of_unique_users > 10
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
            
            # Add the local_date column
            filtered_data_3.loc[:, 'local_date'] = formatted_current_date
            
            # Insert filtered aggregated data for geohash3 into SQL table
            if not filtered_data_3.empty:
                table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                insert_data_with_retry(filtered_data_3, table_name_agg3, con)
        
    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
    
    # Move to the next day
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")


2024-06-06 22:27:10,473 - INFO - Executing SQL query for date 20191015
2024-06-06 22:27:29,402 - INFO - failed after 3 attempts
2024-06-06 22:27:30,069 - INFO - failed after 3 attempts
2024-06-06 22:27:30,070 - ERROR - Attempt 1 failed with error: error 502: bad gateway
2024-06-06 22:27:35,994 - INFO - failed after 3 attempts
2024-06-06 22:27:35,995 - ERROR - Error closing cursor
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1900, in _execute_context
    self.dialect.do_execute(
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/sqlalchemy/dialect.py", line 365, in do_execute
    cursor.execute(statement, parameters)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 482, in execute
    self._prepare_statement(operation, statement_name)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 379, in _prepare_statement
    query.ex

In [12]:
# Database connection setup
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'CO'
start_date = 20191015
end_date = 20191015

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Calculate the lookback and lookahead dates
lookback_date = current_date - timedelta(days=1)
lookahead_date = current_date + timedelta(days=35)

# Format dates for the SQL query
formatted_lookback_date = lookback_date.strftime('%Y%m%d')
formatted_current_date = current_date.strftime('%Y%m%d')
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
"""

logging.info(f"Executing SQL query for date {formatted_current_date}")

pe_dl_table_gen = sql_engine.read_sql_chunked(query)

# Convert the generator to a DataFrame
chunks = [chunk for chunk in pe_dl_table_gen]
if chunks:
    pe_dl_table_df = pd.concat(chunks, ignore_index=True)
pe_dl_table_df

2024-06-06 17:15:06,916 - INFO - Executing SQL query for date 20191017


Unnamed: 0,cuebiq_id,event_zoned_datetime,processing_date,lat,lng
0,2380378528,2019-10-17T16:57:25-05:00,20191105,3.330445,-76.543109
1,2380378528,2019-10-17T16:50:32-05:00,20191105,3.330306,-76.542735
2,2380378528,2019-10-17T16:50:32-05:00,20191105,3.330306,-76.542735
3,2380378528,2019-10-17T16:48:25-05:00,20191105,3.332553,-76.541478
4,2380378528,2019-10-17T17:10:33-05:00,20191105,3.337887,-76.534914
...,...,...,...,...,...
35792,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334
35793,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334
35794,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334
35795,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334


In [13]:
# Calculate geohashes
pe_dl_table_df['geohash5'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=5), axis=1)
pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
pe_dl_table_df

Unnamed: 0,cuebiq_id,event_zoned_datetime,processing_date,lat,lng,geohash5,geohash3
0,2380378528,2019-10-17T16:57:25-05:00,20191105,3.330445,-76.543109,d29df,d29
1,2380378528,2019-10-17T16:50:32-05:00,20191105,3.330306,-76.542735,d29df,d29
2,2380378528,2019-10-17T16:50:32-05:00,20191105,3.330306,-76.542735,d29df,d29
3,2380378528,2019-10-17T16:48:25-05:00,20191105,3.332553,-76.541478,d29df,d29
4,2380378528,2019-10-17T17:10:33-05:00,20191105,3.337887,-76.534914,d29df,d29
...,...,...,...,...,...,...,...
35792,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334,d21nc,d21
35793,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334,d21nc,d21
35794,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334,d21nc,d21
35795,2488939223,2019-10-17T23:13:02-05:00,20191114,1.215279,-77.288334,d21nc,d21


In [14]:
# Aggregate data for geohash5
aggregated_data_5 = pe_dl_table_df.groupby('geohash5').agg(
    no_of_points=('geohash5', 'size'),
    no_of_unique_users=('cuebiq_id', 'nunique')
).reset_index()

# Filter rows with no_of_unique_users > 10
filtered_data_5 = aggregated_data_5[aggregated_data_5['no_of_unique_users'] > 10].copy()

# Add the local_date column
filtered_data_5.loc[:, 'local_date'] = formatted_current_date

# Aggregate data for geohash3
aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
    no_of_points=('geohash3', 'size'),
    no_of_unique_users=('cuebiq_id', 'nunique')
).reset_index()

# Filter rows with no_of_unique_users > 10
filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()

# Add the local_date column
filtered_data_3.loc[:, 'local_date'] = formatted_current_date


In [15]:
filtered_data_3

Unnamed: 0,geohash3,no_of_points,no_of_unique_users,local_date
0,6rf,55,35,20191017
1,6rg,112,71,20191017
3,6ru,64,25,20191017
4,6rv,23,18,20191017
6,d20,214,20,20191017
7,d21,270,35,20191017
8,d22,89,13,20191017
9,d23,747,63,20191017
10,d26,273,21,20191017
12,d29,3285,292,20191017


In [16]:
filtered_data_5

Unnamed: 0,geohash5,no_of_points,no_of_unique_users,local_date
115,d21nc,156,22,20191017
182,d23tb,274,22,20191017
185,d23w1,55,12,20191017
257,d29e3,79,12,20191017
258,d29e4,302,34,20191017
...,...,...,...,...
1178,d3k00,129,22,20191017
1180,d3k02,504,48,20191017
1181,d3k08,182,24,20191017
1197,d3kky,138,17,20191017


In [17]:
# Insert filtered aggregated data for geohash5 into SQL table
if not filtered_data_5.empty:
    table_name_agg5 = f"pd_{country_code.lower()}_{formatted_current_date}_agg5"
    insert_data_with_retry(filtered_data_5, table_name_agg5, con)

# Insert filtered aggregated data for geohash3 into SQL table
if not filtered_data_3.empty:
    table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
    insert_data_with_retry(filtered_data_3, table_name_agg3, con)

2024-06-06 17:16:57,110 - INFO - failed after 3 attempts
2024-06-06 17:16:58,128 - INFO - failed after 3 attempts
2024-06-06 17:16:58,129 - ERROR - Attempt 1 failed with error: error 502: bad gateway
2024-06-06 17:17:03,340 - INFO - failed after 3 attempts
2024-06-06 17:17:03,341 - ERROR - Error closing cursor
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 1900, in _execute_context
    self.dialect.do_execute(
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/sqlalchemy/dialect.py", line 365, in do_execute
    cursor.execute(statement, parameters)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 482, in execute
    self._prepare_statement(operation, statement_name)
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/trino/dbapi.py", line 379, in _prepare_statement
    query.execute()
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/tr