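This notebook builds the 2020 population-density dataset for India (IN), aggregated at geohash precision 3.  
Run it in three-month batches.  
The date range is currently set to 20200101–20200331; change `start_date` and `end_date` before submitting each job.  
After each run, check the logging output and the printed list of failed inserts so that any missing dates can be filled in.  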

In [1]:
# Load (or reload) the `sql` IPython extension that provides the %sql magic.
%reload_ext sql
# SqlMagic options (see the ipython-sql documentation for exact semantics):
# do not auto-commit after each statement
%config SqlMagic.autocommit=False
# 0 is understood to mean "no automatic row limit on result sets" — confirm
%config SqlMagic.autolimit=0
# return %sql results as pandas DataFrames
%config SqlMagic.autopandas=True
# cap on the number of rows rendered when a result is displayed
%config SqlMagic.displaylimit=200

In [2]:
# Point the %sql magic at the local Trino endpoint, `cuebiq` catalog.
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install geohash2

Collecting geohash2
  Using cached geohash2-1.1-py3-none-any.whl
Collecting docutils>=0.3
  Using cached docutils-0.21.2-py3-none-any.whl (587 kB)
Installing collected packages: docutils, geohash2
Successfully installed docutils-0.21.2 geohash2-1.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import gc
import os
import time
import logging
import geohash2
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect

In [5]:
# SQL engine class
class TrinoEngine():
    """Convenience wrapper around a Trino DB-API cursor and a SQLAlchemy engine.

    The DB-API cursor (``self.cur``) is used for statements whose rows are
    fetched directly; the SQLAlchemy engine (``self.engine``) is what pandas
    reads from. The DB-API connection is kept on ``self.conn`` so that it can
    be released with :meth:`close`.

    Parameters
    ----------
    host, port, catalog:
        Location of the Trino coordinator and the catalog to use. The defaults
        reproduce the previously hard-coded ``localhost:9090/cuebiq``.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep a reference to the connection; without it the handle could
        # never be closed explicitly.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Run a statement (e.g. CREATE / DROP) on the DB-API cursor and return
        every row the cursor yields for it.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run ``query`` through the SQLAlchemy engine and return the full result
        as a single DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Run ``query`` through the SQLAlchemy engine and return an iterator of
        DataFrames holding at most ``chunksize`` rows each.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

    def close(self) -> None:
        """
        Release the cursor, the DB-API connection and the engine's pooled
        connections. The instance must not be used afterwards.
        """
        self.cur.close()
        self.conn.close()
        self.engine.dispose()

# Shared engine instance; the extraction loop reads its source rows through it.
sql_engine = TrinoEngine()

In [69]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup: aggregated results are written to this schema
output_schema_name = 'pop_density20'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

start_time = time.time()  # Start timing

# Define the input parameters.
# start_date / end_date are inclusive and written as YYYYMMDD integers;
# edit them before submitting each job.
country_code = 'IN'
start_date = 20200101
end_date = 20200331

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects (strptime rejects malformed dates)
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Fail fast: an inverted range would otherwise make the daily loop do nothing
# and still report completion.
if start_date_dt > end_date_dt:
    raise ValueError(f"start_date {start_date} is after end_date {end_date}")

# Define the sorted longitude boundaries used to split each day's query into
# slices. Consecutive values are 1.40625 degrees apart.
# NOTE(review): that spacing looks like the longitude width of a precision-3
# geohash cell (360 / 256), which would keep every cell inside one slice — confirm.
sorted_lons = [67.5, 68.90625, 70.3125, 71.71875, 73.125, 74.53125, 75.9375, 77.34375, 78.75, 80.15625, 81.5625, 82.96875, 84.375, 85.78125, 87.1875, 88.59375]

# The slicing logic pairs neighbouring boundaries, so they must be ascending.
if sorted_lons != sorted(sorted_lons):
    raise ValueError("sorted_lons must be in ascending order")

# Initialize a list to store failed insertions as (date, min_lon, max_lon)
failed_inserts = []
# Define the buffer value: degrees of longitude added west of the first
# boundary and east of the last one to form the two outermost slices
buffer_value = 20

In [8]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name``, retrying on failure.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to append (the index is not written).
    table_name : str
        Target table; rows are appended if it already exists.
    con :
        Connection or engine accepted by ``DataFrame.to_sql``.
    retries : int
        Maximum number of attempts. Values below 1 mean no attempt is made.
    delay : int or float
        Seconds to wait between two attempts.

    Returns
    -------
    bool
        ``True`` once an attempt succeeds, ``False`` if every attempt failed
        or none was made. Exceptions from ``to_sql`` are logged, not raised.

    Notes
    -----
    The insert is retried as a whole; whether a failed attempt can leave
    partial rows behind depends on the backend — verify before relying on
    exactly-once semantics.
    """
    if retries < 1:
        # Previously this fell through and returned None without any log line.
        logging.error(f"No insert attempted for table {table_name}: retries={retries}")
        return False

    for attempt in range(1, retries + 1):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi"
            )
        except Exception as e:
            logging.error(f"Attempt {attempt} failed with error: {e}")
            # Only wait when another attempt will follow.
            if attempt < retries:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False


In [9]:
# Daily extraction: for every day in the configured range, read that day's
# device locations one longitude slice at a time, aggregate them per geohash3
# cell and append the cells that pass the user threshold to a per-day table.

# processing_date window scanned around each event day.
# NOTE(review): presumably events can be processed well after they occur,
# hence the long look-ahead — confirm.
LOOKBACK_DAYS = 1
LOOKAHEAD_DAYS = 35
# A cell is kept only when it has MORE than this many distinct cuebiq_id values.
MIN_UNIQUE_USERS = 10


def _longitude_slices(boundaries, buffer):
    """Return the (min_lon, max_lon) pairs that tile the sorted ``boundaries``.

    The first slice starts ``buffer`` degrees west of the first boundary, the
    last slice ends ``buffer`` degrees east of the last boundary, and every
    slice in between spans two consecutive boundaries.
    """
    if not boundaries:
        raise ValueError("at least one longitude boundary is required")
    edges = [boundaries[0] - buffer, *boundaries, boundaries[-1] + buffer]
    return list(zip(edges[:-1], edges[1:]))


def _build_slice_query(table, country, day, lookback_day, lookahead_day, min_lon, max_lon):
    """Build the SELECT for one event day (YYYYMMDD string) and one longitude slice.

    The longitude filter is half-open, ``min_lon < lng <= max_lon``, so a point
    lying exactly on a shared boundary is read by one slice only (the inclusive
    BETWEEN used before returned it to both neighbours).
    NOTE(review): the open side was chosen on the understanding that
    geohash2.encode assigns a point on a cell edge to the western cell — confirm.
    """
    return f"""
    SELECT
        cuebiq_id,
        event_zoned_datetime,
        processing_date,
        lat,
        lng
    FROM {table}
    WHERE
        processing_date BETWEEN {lookback_day} AND {lookahead_day}
        AND country_code = '{country}'
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{day}', '%Y%m%d')
        AND lng > {min_lon} AND lng <= {max_lon}
    """


def _aggregate_geohash3(points):
    """Add a ``geohash3`` column to ``points`` (in place) and count per cell.

    Returns one row per geohash3 cell with ``no_of_points`` (row count) and
    ``no_of_unique_users`` (distinct ``cuebiq_id``).
    """
    # Plain iteration over the two columns instead of a row-wise apply:
    # same geohashes, without building a Series object for every row.
    points['geohash3'] = [
        geohash2.encode(lat, lng, precision=3)
        for lat, lng in zip(points['lat'], points['lng'])
    ]
    return points.groupby('geohash3').agg(
        no_of_points=('geohash3', 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique')
    ).reset_index()


# The slices do not depend on the date, so build them once.
lon_slices = _longitude_slices(sorted_lons, buffer_value)

# Loop through each day from start_date to end_date (both inclusive)
current_date = start_date_dt
while current_date <= end_date_dt:
    # Calculate the lookback and lookahead dates
    lookback_date = current_date - timedelta(days=LOOKBACK_DAYS)
    lookahead_date = current_date + timedelta(days=LOOKAHEAD_DAYS)

    # Format dates for the SQL query and the table name
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = current_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # All slices of one day are appended to the same table.
    # NOTE: rows are appended, so re-running a day that already succeeded
    # writes its rows a second time.
    table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"

    # Loop through each longitude range
    for min_lon, max_lon in lon_slices:
        try:
            query = _build_slice_query(
                pe_dl_table,
                country_code,
                formatted_current_date,
                formatted_lookback_date,
                formatted_lookahead_date,
                min_lon,
                max_lon,
            )

            logging.info(f"Executing SQL query for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")

            pe_dl_table_gen = sql_engine.read_sql_chunked(query)

            # Materialise the chunks; pandas may yield a single empty frame
            # for an empty result set, so drop empty chunks here.
            chunks = [chunk for chunk in pe_dl_table_gen if not chunk.empty]
            if not chunks:
                # An empty slice is a normal outcome, not a failure.
                logging.info(f"No rows for date {formatted_current_date} and longitude range {min_lon} to {max_lon}; skipping")
                continue

            pe_dl_table_df = pd.concat(chunks, ignore_index=True)

            # Calculate geohashes and aggregate data for geohash3
            aggregated_data_3 = _aggregate_geohash3(pe_dl_table_df)

            # Filter rows with no_of_unique_users > MIN_UNIQUE_USERS
            filtered_data_3 = aggregated_data_3[
                aggregated_data_3['no_of_unique_users'] > MIN_UNIQUE_USERS
            ].copy()

            # Add the local_date column
            filtered_data_3.loc[:, 'local_date'] = formatted_current_date

            # Insert filtered aggregated data for geohash3 into SQL table
            if not filtered_data_3.empty:
                if not insert_data_with_retry(filtered_data_3, table_name_agg3, con):
                    failed_inserts.append((formatted_current_date, min_lon, max_lon))

        except Exception as e:
            # Best effort: record the slice and carry on with the next one.
            logging.error(f"Error while processing data for date {formatted_current_date} and longitude range {min_lon} to {max_lon}: {e}")
            failed_inserts.append((formatted_current_date, min_lon, max_lon))

    # Move to the next day
    current_date += timedelta(days=1)

# Print the failed inserts so the missing (date, min_lon, max_lon) slices can be re-run
if failed_inserts:
    print("Failed inserts:")
    for failed_insert in failed_inserts:
        print(failed_insert)

logging.info("Data extraction, aggregation, and saving completed.")

2024-06-08 16:13:10,603 - INFO - Executing SQL query for date 20190323
2024-06-08 16:23:10,924 - INFO - Inserted data into table pd_in_20190323_agg3
2024-06-08 16:23:10,925 - INFO - Data extraction, aggregation, and saving completed.
