In [1]:
# (Re)load the `sql` IPython extension and set the SqlMagic options used by %sql cells.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
# Connect the %sql magic to the local Trino endpoint (same host/port/catalog as TrinoEngine below).
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install geohash2

Note: you may need to restart the kernel to use updated packages.


In [4]:
import gc
import logging
import os
import time
from datetime import datetime, timedelta

import geohash2
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import exc
from trino.dbapi import connect

In [5]:
class TrinoEngine:
    """Hold a Trino DB-API cursor and a SQLAlchemy engine for the ``cuebiq`` catalog.

    Attributes are set in ``__init__``:
      * ``cur``    - cursor obtained from ``trino.dbapi.connect``
      * ``engine`` - SQLAlchemy engine created for the same host/port/catalog
    """

    def __init__(self):
        # DB-API side: used by execute_statement().
        connection = connect(host="localhost", port=9090, catalog="cuebiq")
        self.cur = connection.cursor()
        # SQLAlchemy side: used by read_sql_chunked().
        self.engine = create_engine("trino://localhost:9090/cuebiq/")

    def execute_statement(self, query: str):
        """Execute ``query`` on the stored cursor and return ``fetchall()``'s result."""
        cursor = self.cur
        cursor.execute(query)
        rows = cursor.fetchall()
        return rows

    def read_sql_chunked(self, query: str, chunksize: int = 100000):
        """Return what ``pd.read_sql`` yields for ``query`` with the given ``chunksize``."""
        chunk_iter = pd.read_sql(query, self.engine, chunksize=chunksize)
        return chunk_iter

# Module-level TrinoEngine instance; constructing it calls connect() and create_engine().
sql_engine = TrinoEngine()

In [6]:
# Configure root logging: INFO level, "<time> - <level> - <message>" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [7]:
# Database connection setup
# `con` is the engine passed to both pd.read_sql_query and DataFrame.to_sql below;
# its URL path is dedicated/<output_schema_name>.
output_schema_name = 'presence_data'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing (read again after the daily loop to report elapsed seconds)

# Define the input parameters
country_code = 'IN'
start_date = 20190101  # first day to process, YYYYMMDD integer
end_date = 20190103  # last day to process (the daily loop uses <=), YYYYMMDD integer

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"  # fully qualified source table

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')


In [8]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` with ``DataFrame.to_sql``, retrying on failure.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame in this notebook).
        table_name: Target table; rows are appended (``if_exists="append"``).
        con: Connection/engine handed straight to ``to_sql``.
        retries: Total number of attempts to make.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        bool: ``True`` as soon as one attempt succeeds, ``False`` when every
        attempt failed (or ``retries`` allowed none). Errors are logged rather
        than raised, so callers that ignore the return value behave as before.
    """
    for attempt in range(1, retries + 1):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi",
            )
        except Exception as e:  # best-effort: log, wait, try again
            logging.error(f"Attempt {attempt} failed with error: {e}")
            if attempt < retries:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    # Reached only when no attempt succeeded; make that visible to the caller.
    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False


In [11]:
# Function to process and aggregate data
def _geohash_chunk_keys(chunk):
    """Return a frame with ``cuebiq_id``, ``geohash5`` and ``geohash3`` for one chunk."""
    keys = chunk[['cuebiq_id']].copy()
    keys['geohash5'] = [
        geohash2.encode(lat, lng, precision=5)
        for lat, lng in zip(chunk['lat'], chunk['lng'])
    ]
    # A geohash is a prefix code: the precision-3 cell is the first three
    # characters of the precision-5 cell, so it does not need a second encode.
    keys['geohash3'] = keys['geohash5'].str[:3]
    return keys


def _summarise_geohash(point_counts, user_pairs, geohash_col, local_date, min_unique_users):
    """Combine per-chunk partial results into one row per geohash for the whole day."""
    no_of_points = pd.concat(point_counts).groupby(level=0).sum()
    no_of_unique_users = (
        pd.concat(user_pairs, ignore_index=True)
        .drop_duplicates()  # a user seen in several chunks must count once
        .groupby(geohash_col)['cuebiq_id']
        .nunique()
    )
    summary = (
        pd.DataFrame({'no_of_points': no_of_points, 'no_of_unique_users': no_of_unique_users})
        .rename_axis(geohash_col)
        .reset_index()
    )
    summary = summary[summary['no_of_unique_users'] > min_unique_users].copy()
    summary['local_date'] = local_date
    return summary


def process_and_aggregate_data(date, country_code, con, chunksize=100000, min_unique_users=10):
    """Aggregate one day of device locations by geohash and append the result to SQL tables.

    Reads rows from ``pe_dl_table`` whose ``processing_date`` lies between
    ``date - 1 day`` and ``date + 35 days`` and whose ``event_zoned_datetime``
    (first 19 characters) falls on ``date``. For geohash precisions 5 and 3 it
    computes the number of points and of distinct ``cuebiq_id`` values per
    cell, keeps cells with more than ``min_unique_users`` users, and appends
    them to ``pd_<country>_<YYYYMMDD>_agg5`` / ``..._agg3``.

    The aggregation is done over the whole day: per-chunk partial results are
    accumulated and combined once after the last chunk, so each geohash yields
    at most one row and the user threshold applies to the day's total.

    Args:
        date: ``datetime`` of the day to process.
        country_code: Value compared against the ``country_code`` column.
        con: Engine/connection used for both reading and writing.
        chunksize: Rows per chunk requested from ``pd.read_sql_query``.
        min_unique_users: Cells need strictly more distinct users than this.

    Returns:
        None. Any exception is logged and swallowed so a multi-day loop continues.
    """
    # Calculate the lookback and lookahead dates
    lookback_date = date - timedelta(days=1)
    lookahead_date = date + timedelta(days=35)

    # Format dates for the SQL query
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # Construct the SQL query
    query = f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM {pe_dl_table}
    WHERE 
        processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    """

    logging.info(f"Executing SQL query for date {formatted_current_date}: {query}")

    # geohash column -> table-name suffix
    levels = {'geohash5': 'agg5', 'geohash3': 'agg3'}
    # Partial results per level: point counts per cell, and distinct (cell, user) pairs.
    point_counts = {col: [] for col in levels}
    user_pairs = {col: [] for col in levels}

    try:
        pe_dl_table_gen = pd.read_sql_query(query, con, chunksize=chunksize)

        for chunk in pe_dl_table_gen:
            if chunk.empty:
                continue
            keys = _geohash_chunk_keys(chunk)
            for col in levels:
                point_counts[col].append(keys.groupby(col).size())
                user_pairs[col].append(keys[[col, 'cuebiq_id']].drop_duplicates())

        if not point_counts['geohash5']:
            logging.info(f"No rows returned for date {formatted_current_date}")
            return

        for col, suffix in levels.items():
            summary = _summarise_geohash(
                point_counts[col],
                user_pairs[col],
                col,
                formatted_current_date,
                min_unique_users,
            )
            # Insert the day's filtered aggregate once per level
            if not summary.empty:
                table_name = f"pd_{country_code.lower()}_{formatted_current_date}_{suffix}"
                insert_data_with_retry(summary, table_name, con)

    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")

In [13]:
# Loop through each day from start_date to end_date (both inclusive)
# Restart the timer here so the reported total covers only this loop, not the
# time that passed since the configuration cell was executed.
start_time = time.time()

current_date = start_date_dt
while current_date <= end_date_dt:
    process_and_aggregate_data(current_date, country_code, con)
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and log the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")

2024-06-06 01:22:03,997 - INFO - Executing SQL query for date 20190101: 
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM cuebiq.paas_cda_pe_v3.device_location_uplevelled
    WHERE 
        processing_date BETWEEN 20181231 AND 20190205
        AND country_code = 'IN' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('20190101', '%Y%m%d')
    
2024-06-06 01:22:04,579 - INFO - failed after 3 attempts
2024-06-06 01:22:04,580 - ERROR - Error while processing data for date 20190101: error 502: bad gateway
2024-06-06 01:22:04,581 - INFO - Executing SQL query for date 20190102: 
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM cuebiq.paas_cda_pe_v3.devic

In [20]:
# try with one day
# Single day to process: a YYYYMMDD integer, parsed into a datetime for the processing function.
specific_date = 20190105
specific_date_dt = datetime.strptime(str(specific_date), '%Y%m%d')


In [21]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` with ``DataFrame.to_sql``, retrying on failure.

    NOTE: this cell re-defines a function that an earlier cell already
    defines; the definition executed last is the one in effect.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame in this notebook).
        table_name: Target table; rows are appended (``if_exists="append"``).
        con: Connection/engine handed straight to ``to_sql``.
        retries: Total number of attempts to make.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        bool: ``True`` as soon as one attempt succeeds, ``False`` when every
        attempt failed (or ``retries`` allowed none). Errors are logged rather
        than raised, so callers that ignore the return value behave as before.
    """
    for attempt in range(1, retries + 1):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi",
            )
        except Exception as e:  # best-effort: log, wait, try again
            logging.error(f"Attempt {attempt} failed with error: {e}")
            if attempt < retries:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    # Reached only when no attempt succeeded; make that visible to the caller.
    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False


In [22]:
# Function to process and aggregate data for a single day
def _geohash_chunk_keys(chunk):
    """Return a frame with ``cuebiq_id``, ``geohash5`` and ``geohash3`` for one chunk."""
    keys = chunk[['cuebiq_id']].copy()
    # The imported module is ``geohash2``; there is no ``geohash`` name in this notebook.
    keys['geohash5'] = [
        geohash2.encode(lat, lng, precision=5)
        for lat, lng in zip(chunk['lat'], chunk['lng'])
    ]
    # A geohash is a prefix code: the precision-3 cell is the first three
    # characters of the precision-5 cell, so it does not need a second encode.
    keys['geohash3'] = keys['geohash5'].str[:3]
    return keys


def _summarise_geohash(point_counts, user_pairs, geohash_col, local_date, min_unique_users):
    """Combine per-chunk partial results into one row per geohash for the whole day."""
    no_of_points = pd.concat(point_counts).groupby(level=0).sum()
    no_of_unique_users = (
        pd.concat(user_pairs, ignore_index=True)
        .drop_duplicates()  # a user seen in several chunks must count once
        .groupby(geohash_col)['cuebiq_id']
        .nunique()
    )
    summary = (
        pd.DataFrame({'no_of_points': no_of_points, 'no_of_unique_users': no_of_unique_users})
        .rename_axis(geohash_col)
        .reset_index()
    )
    summary = summary[summary['no_of_unique_users'] > min_unique_users].copy()
    summary['local_date'] = local_date
    return summary


# Function to process and aggregate data for a single day
def process_and_aggregate_data(date, country_code, con, chunksize=1000, min_unique_users=10):
    """Aggregate one day of device locations by geohash and append the result to SQL tables.

    Reads rows from ``pe_dl_table`` whose ``processing_date`` lies between
    ``date - 1 day`` and ``date + 35 days`` and whose ``event_zoned_datetime``
    (first 19 characters) falls on ``date``. For geohash precisions 5 and 3 it
    computes the number of points and of distinct ``cuebiq_id`` values per
    cell, keeps cells with more than ``min_unique_users`` users, and appends
    them to ``pd_<country>_<YYYYMMDD>_agg5`` / ``..._agg3``.

    The aggregation is done over the whole day: per-chunk partial results are
    accumulated and combined once after the last chunk, so each geohash yields
    at most one row and the user threshold applies to the day's total.

    Args:
        date: ``datetime`` of the day to process.
        country_code: Value compared against the ``country_code`` column.
        con: Engine/connection used for both reading and writing.
        chunksize: Rows per chunk requested from ``pd.read_sql_query``.
        min_unique_users: Cells need strictly more distinct users than this.

    Returns:
        None. Any exception is logged and swallowed.
    """
    # Calculate the lookback and lookahead dates
    lookback_date = date - timedelta(days=1)
    lookahead_date = date + timedelta(days=35)

    # Format dates for the SQL query
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # Construct the SQL query
    query = f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM {pe_dl_table}
    WHERE 
        processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    """

    logging.info(f"Executing SQL query for date {formatted_current_date}: {query}")

    # geohash column -> table-name suffix
    levels = {'geohash5': 'agg5', 'geohash3': 'agg3'}
    # Partial results per level: point counts per cell, and distinct (cell, user) pairs.
    point_counts = {col: [] for col in levels}
    user_pairs = {col: [] for col in levels}

    try:
        pe_dl_table_gen = pd.read_sql_query(query, con, chunksize=chunksize)

        for chunk in pe_dl_table_gen:
            if chunk.empty:
                continue
            keys = _geohash_chunk_keys(chunk)
            for col in levels:
                point_counts[col].append(keys.groupby(col).size())
                user_pairs[col].append(keys[[col, 'cuebiq_id']].drop_duplicates())

        if not point_counts['geohash5']:
            logging.info(f"No rows returned for date {formatted_current_date}")
            return

        for col, suffix in levels.items():
            summary = _summarise_geohash(
                point_counts[col],
                user_pairs[col],
                col,
                formatted_current_date,
                min_unique_users,
            )
            # Insert the day's filtered aggregate once per level
            if not summary.empty:
                table_name = f"pd_{country_code.lower()}_{formatted_current_date}_{suffix}"
                insert_data_with_retry(summary, table_name, con)

    except Exception as e:
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")


In [19]:
# Process data for the specific date
# NOTE(review): the log captured under this cell is for 20191015, while specific_date is
# set to 20190105 above, and this cell's execution count (19) is lower than that of the
# cells before it. The saved output is stale; re-run the notebook top to bottom.
process_and_aggregate_data(specific_date_dt, country_code, con)

2024-06-06 01:24:50,658 - INFO - Executing SQL query for date 20191015: 
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM cuebiq.paas_cda_pe_v3.device_location_uplevelled
    WHERE 
        processing_date BETWEEN 20191014 AND 20191119
        AND country_code = 'IN' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('20191015', '%Y%m%d')
    
2024-06-06 01:24:51,031 - INFO - failed after 3 attempts
2024-06-06 01:24:51,032 - ERROR - Error while processing data for date 20191015: error 502: bad gateway


In [25]:
# Set up logging
# (same call as in the earlier logging cell)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup
# NOTE(review): an earlier cell used output_schema_name = 'presence_data'; this cell
# rebinds it (and `con`) to 'pop_density' — confirm which schema is intended.
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# Define the input parameters for a single day
country_code = 'IN'
specific_date = 20190105  # YYYYMMDD integer

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"  # fully qualified source table

# Convert integer date to datetime object
specific_date_dt = datetime.strptime(str(specific_date), '%Y%m%d')

# Calculate the lookback and lookahead dates
# (processing_date window: one day before to 35 days after the target day)
lookback_date = specific_date_dt - timedelta(days=1)
lookahead_date = specific_date_dt + timedelta(days=35)

# Format dates for the SQL query
formatted_lookback_date = lookback_date.strftime('%Y%m%d')
formatted_current_date = specific_date_dt.strftime('%Y%m%d')
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')


In [26]:
# Construct the SQL query
# Selects raw points for `country_code` whose processing_date lies in the
# lookback..lookahead window and whose event_zoned_datetime (first 19 characters,
# i.e. without the trailing offset) parses to a timestamp on the target day.
# Rows whose timestamp fails to parse are excluded via TRY(...) IS NOT NULL.
query = f"""
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng
FROM {pe_dl_table}
WHERE 
    processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
    AND country_code = '{country_code}' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
"""

# Log the fully rendered query so the run can be reproduced from the output.
logging.info(f"Executing SQL query for date {formatted_current_date}: {query}")


2024-06-06 01:31:12,842 - INFO - Executing SQL query for date 20190105: 
SELECT 
    cuebiq_id, 
    event_zoned_datetime, 
    processing_date,
    lat,
    lng
FROM cuebiq.paas_cda_pe_v3.device_location_uplevelled
WHERE 
    processing_date BETWEEN 20190104 AND 20190209
    AND country_code = 'IN' 
    AND event_zoned_datetime IS NOT NULL
    AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('20190105', '%Y%m%d')



In [27]:
# Fetch the query result chunk by chunk and assemble a single DataFrame.
# Start from an empty frame so pe_dl_table_df is always defined for later cells,
# even when the query fails part-way.
pe_dl_table_df = pd.DataFrame()
try:
    pe_dl_table_gen = pd.read_sql_query(query, con, chunksize=1000)  # Adjust chunksize as needed
    chunks = list(pe_dl_table_gen)  # Collect all chunks
    if chunks:
        pe_dl_table_df = pd.concat(chunks, ignore_index=True)
        logging.info(f"Retrieved {len(pe_dl_table_df)} records for date {formatted_current_date}")
    else:
        logging.info(f"No data retrieved for date {formatted_current_date}")
except exc.SQLAlchemyError as e:  # `exc` is sqlalchemy.exc, imported with the other dependencies
    logging.error(f"Error executing query for date {formatted_current_date}: {e}")


2024-06-06 01:35:10,568 - INFO - Retrieved 12571991 records for date 20190105


In [28]:
# Check the first few rows of the retrieved DataFrame
# The preview goes through the logger, so it appears as plain text in the cell output;
# nothing is logged when the frame is empty.
if not pe_dl_table_df.empty:
    logging.info(pe_dl_table_df.head())


2024-06-06 01:35:10,574 - INFO -     cuebiq_id       event_zoned_datetime  processing_date        lat  \
0  1708989676  2019-01-05T11:07:02+05:30         20190105  28.414726   
1  1708989676  2019-01-05T07:48:52+05:30         20190105  28.414726   
2  1708989676  2019-01-05T07:55:01+05:30         20190105  28.414726   
3  1708989676  2019-01-05T07:55:37+05:30         20190105  28.414726   
4  1708989676  2019-01-05T08:21:12+05:30         20190105  28.414726   

         lng  
0  77.312957  
1  77.312957  
2  77.312957  
3  77.312957  
4  77.312957  
