In [None]:
# Load (or reload) the `sql` IPython extension, which provides the %sql magic
# and the SqlMagic options configured below.
%reload_ext sql
# Session-wide SqlMagic options.
# NOTE(review): autolimit=0 presumably means "no automatic row limit" and
# autopandas=True presumably returns %sql results as pandas DataFrames —
# confirm against the ipython-sql documentation.
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [None]:
# Point the %sql magic at Trino: same host, port and catalog that the
# TrinoEngine class in this notebook connects to.
%sql trino://localhost:9090/cuebiq/

In [None]:
# Use the %pip magic explicitly (instead of relying on IPython's automagic for
# a bare `pip`) so the package is installed into the running kernel's environment.
%pip install geohash2

In [None]:
import gc
import logging
import os
import time
from datetime import datetime, timedelta

import geohash2
import pandas as pd
from sqlalchemy import create_engine, text
from trino.dbapi import connect

In [None]:
# SQL engine
class TrinoEngine():
    """Small wrapper that bundles two ways of talking to Trino.

    * ``self.cur`` is a DB-API cursor, used by :meth:`execute_statement`.
    * ``self.engine`` is a SQLAlchemy engine, handed to ``pandas.read_sql`` by
      :meth:`read_sql` and :meth:`read_sql_chunked`.

    Both are built from the same host, port and catalog.

    Args:
        host: Host name passed to ``trino.dbapi.connect`` and used in the engine URL.
        port: Port passed to ``trino.dbapi.connect`` and used in the engine URL.
        catalog: Catalog passed to ``trino.dbapi.connect`` and used in the engine URL.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so it can be closed explicitly;
        # previously only the cursor was retained.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # With the default arguments this is "trino://localhost:9090/cuebiq/".
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Run ``query`` on the DB-API cursor and return every row of its result.

        Intended for statements such as CREATE and DROP.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run ``query`` through the SQLAlchemy engine and return the result as a
        single DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Like :meth:`read_sql`, but forwards ``chunksize`` to ``pandas.read_sql``
        so the result is delivered as an iterator of DataFrames with at most
        ``chunksize`` rows each.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

    def close(self) -> None:
        """Release the DB-API connection and the engine's pooled connections."""
        self.conn.close()
        self.engine.dispose()

# Shared engine instance; the export loop further down reads its input with
# sql_engine.read_sql(...). Instantiating it opens the Trino connection.
sql_engine = TrinoEngine()

In [None]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` through ``df.to_sql``, retrying on errors.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame in this notebook).
        table_name: Destination table name, forwarded to ``to_sql``.
        con: Connection or engine, forwarded to ``to_sql``.
        retries: Maximum number of attempts.
        delay: Seconds slept between two consecutive attempts.

    Returns:
        True as soon as one attempt succeeds; False when every attempt raised,
        or when ``retries`` is not positive (nothing is attempted then).
    """
    for attempt_no in range(1, retries + 1):
        try:
            df.to_sql(table_name, con, index=False, if_exists="append", method="multi")
            logging.info(f"Inserted data into table {table_name}")
            return True
        except Exception as err:
            logging.error(f"Attempt {attempt_no} failed with error: {err}")
            out_of_attempts = attempt_no >= retries
            if out_of_attempts:
                logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
            else:
                # Give the backend a moment before the next try
                time.sleep(delay)
    return False

In [None]:
# Input location: schema holding the source data, keyed by a short alias
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
# Fully qualified name of the device-location table read by the export loop
pe_dl_table = ".".join((schema_name['cda'], "device_location_uplevelled"))

# Export to jupyter notebook
Expected to work for the following country codes: CO, ID, IN, MX.

In [None]:
# # Define the date range
# start_date = '2019-01-01'
# end_date = '2019-01-02'
# date_range = pd.date_range(start=start_date, end=end_date)

# country_code = 'MX'

# # Define the export file paths
# export_file_name_5 = f"pd_{country_code.lower()}_2019_agg5_3h.csv"
# export_file_name_3 = f"pd_{country_code.lower()}_2019_agg3_3h.csv"

# # Define the export file paths
# export_path = '/home/jovyan/Data/pd3_test/'
# export_file_path_5 = f"{export_path}{export_file_name_5}"
# export_file_path_3 = f"{export_path}{export_file_name_3}"

# # Create a function to determine the 3-hour interval based on a given date
# def get_3_hour_interval(start_hour, current_date):
#     start_time = pd.Timestamp(current_date) + pd.Timedelta(hours=start_hour)
#     end_time = start_time + pd.Timedelta(hours=3)
#     return f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}/{end_time.strftime('%Y-%m-%d %H:%M:%S')}"

# # Check if files already exist to determine header writing
# write_header_5 = not os.path.exists(export_file_path_5)
# write_header_3 = not os.path.exists(export_file_path_3)

# # Configure logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# import logging
# from datetime import timedelta
# import pandas as pd

# def get_3_hour_interval(start_hour, formatted_current_date):
#     return f"{formatted_current_date} {start_hour:02d}:00:00"

# # Loop through each day in the date range
# for current_date in date_range:
#     formatted_current_date = current_date.strftime('%Y%m%d')
    
#     for start_hour in range(0, 24, 3):
#         end_hour = start_hour + 3
        
#         # Calculate the lookback and lookahead dates
#         lookback_date = current_date - timedelta(days=1)
#         lookahead_date = current_date + timedelta(days=35)
        
#         # Format dates for the SQL query in 'yyyymmdd' format
#         formatted_lookback_date = lookback_date.strftime('%Y%m%d')
#         formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

#         try:
#             # SQL Query to fetch data for the current 3-hour interval
#             query = f"""
#             SELECT 
#                 cuebiq_id, 
#                 event_zoned_datetime, 
#                 processing_date,
#                 lat,
#                 lng,
#                 TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
#                 EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
#                 geohash_encode(lat, lng, 5) AS geohash5,
#                 geohash_encode(lat, lng, 3) AS geohash3
#             FROM {pe_dl_table}
#             WHERE 
#                 processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
#                 AND country_code = '{country_code}' 
#                 AND event_zoned_datetime IS NOT NULL
#                 AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
#                 AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
#                 AND EXTRACT(HOUR FROM TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) BETWEEN {start_hour} AND {end_hour-1}
#             """

#             logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
#             pe_dl_table_gen = sql_engine.read_sql(query)
            
#             # Convert event_datetime_local to datetime once
#             pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])
            
#             # Create 3-hour interval column
#             interval = get_3_hour_interval(start_hour, formatted_current_date)
#             pe_dl_table_gen['3_hour_interval'] = interval
#             pe_dl_table_gen['local_date'] = formatted_current_date
            
#             for geohash_col, export_file_path, write_header in [
#                 ('geohash5', export_file_path_5, write_header_5),
#                 ('geohash3', export_file_path_3, write_header_3)
#             ]:
#                 # Aggregate data for geohash
#                 logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
#                 aggregated_data = pe_dl_table_gen.groupby(geohash_col).agg(
#                     no_of_points=(geohash_col, 'size'),
#                     no_of_unique_users=('cuebiq_id', 'nunique'),
#                     local_time=('3_hour_interval', 'first'),
#                     local_date=('local_date', 'first')
#                 ).reset_index()
                
#                 # Filter rows with no_of_unique_users > 10
#                 filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > 10].copy()
                
#                 # Append the DataFrame to the CSV file
#                 logging.info(f"Exporting data for interval {start_hour} to {end_hour}")
#                 filtered_data.to_csv(export_file_path, mode='a', header=write_header, index=False)
                
#                 # After the first write, set the header flag to False
#                 if geohash_col == 'geohash5':
#                     write_header_5 = False
#                 else:
#                     write_header_3 = False
                
#             logging.info(f"Appended data for date {formatted_current_date} interval {start_hour} to {end_hour} to {export_file_name_5} and {export_file_name_3}")
#         except Exception as e:
#             logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")
    
#     # Move to the next day
#     current_date += timedelta(days=1)
# # test query by 3 hour

# Export to schema

In [None]:
def get_3_hour_interval(start_hour, formatted_current_date):
    """Build the text label of the 3-hour window that starts at ``start_hour``.

    Args:
        start_hour: Integer hour at which the window starts.
        formatted_current_date: Date text placed in front of the window.

    Returns:
        ``"<date> HH:00:00 - HH:00:00"``. The end hour is ``start_hour + 3`` and
        is not wrapped at midnight, so a start of 21 ends at ``24:00:00``.
    """
    window_start = "{:02d}:00:00".format(start_hour)
    window_end = "{:02d}:00:00".format(start_hour + 3)
    return "{} {} - {}".format(formatted_current_date, window_start, window_end)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup: SQLAlchemy engine pointing at the output schema.
# Alternative output schema used previously: 'presence_data'
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# Days to export; pd.date_range includes both start and end
start_date = '2019-01-01'
end_date = '2019-03-31'
date_range = pd.date_range(start=start_date, end=end_date)

# Country to export and the per-country output tables (one per geohash precision)
country_code = 'MX'
country_abbre = country_code.lower()
master_table_3 = f"pd_{country_abbre}_2019_3h_agg3"
master_table_5 = f"pd_{country_abbre}_2019_3h_agg5"

# Create the master tables if they do not exist.
# Column names must match the DataFrame columns appended by the export loop.
create_table_query_3 = f"""
CREATE TABLE IF NOT EXISTS {master_table_3}(
    geohash_3 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""
create_table_query_5 = f"""
CREATE TABLE IF NOT EXISTS {master_table_5}(
    geohash_5 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_time varchar,
    local_date varchar
)
"""

with con.connect() as connection:
    for master_table, create_table_query in (
        (master_table_3, create_table_query_3),
        (master_table_5, create_table_query_5),
    ):
        logging.info(f"Creating master table: {master_table}")
        # Wrap the DDL in text(): SQLAlchemy 2.x no longer accepts plain strings
        # in Connection.execute(), while text() works on both 1.x and 2.x.
        connection.execute(text(create_table_query))


In [None]:
# Export loop: for every day and every 3-hour window, pull the raw points from
# Trino, aggregate them per geohash cell in pandas and append the cells with
# enough distinct users to the master tables.

# Local timestamp parsed from the first 19 characters of event_zoned_datetime
# (NULL when parsing fails). Defined once so SELECT and WHERE cannot drift apart.
local_ts_expr = "TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))"

# (geohash column in the query result, destination table)
geohash_targets = (
    ('geohash_5', master_table_5),
    ('geohash_3', master_table_3),
)

# A cell is only exported when it has MORE than this many distinct users
min_unique_users = 10

# Loop through each day in the date range
for current_date in date_range:
    formatted_current_date = current_date.strftime('%Y%m%d')

    # processing_date window in 'yyyymmdd' format: 1 day back to 35 days ahead.
    # It only depends on the day, so compute it once rather than per interval.
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    for start_hour in range(0, 24, 3):
        end_hour = start_hour + 3

        try:
            # SQL Query to fetch data for the current 3-hour interval
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng,
                {local_ts_expr} AS event_datetime_local,
                EXTRACT(HOUR FROM {local_ts_expr}) AS event_hour,
                geohash_encode(lat, lng, 5) AS geohash_5,
                geohash_encode(lat, lng, 3) AS geohash_3
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND {local_ts_expr} IS NOT NULL
                AND DATE({local_ts_expr}) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND EXTRACT(HOUR FROM {local_ts_expr}) BETWEEN {start_hour} AND {end_hour-1}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and interval {start_hour} to {end_hour}")
            pe_dl_table_gen = sql_engine.read_sql(query)

            # Convert event_datetime_local to datetime once
            pe_dl_table_gen['event_datetime_local'] = pd.to_datetime(pe_dl_table_gen['event_datetime_local'])

            # Constant label columns for this window; carried through the
            # aggregation with 'first' because every row holds the same value.
            interval = get_3_hour_interval(start_hour, formatted_current_date)
            pe_dl_table_gen['3_hour_interval'] = interval
            pe_dl_table_gen['local_date'] = formatted_current_date

            for geohash_col, master_table in geohash_targets:
                logging.info(f"Aggregating data for {geohash_col} for interval {start_hour} to {end_hour}")
                aggregated_data = pe_dl_table_gen.groupby(geohash_col).agg(
                    no_of_points=(geohash_col, 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique'),
                    local_time=('3_hour_interval', 'first'),
                    local_date=('local_date', 'first')
                ).reset_index()

                # Keep only cells with more than min_unique_users distinct users
                filtered_data = aggregated_data[aggregated_data['no_of_unique_users'] > min_unique_users].copy()
                if filtered_data.empty:
                    logging.info(f"No {geohash_col} cells above the user threshold for date {formatted_current_date} interval {start_hour} to {end_hour}; nothing to insert")
                    continue

                # Both tables go through the shared retry helper (append, multi-row
                # insert). A failure is logged by the helper and does not prevent
                # the other table from being written.
                if insert_data_with_retry(filtered_data, master_table, con):
                    logging.info(f"Inserted aggregated data for date {formatted_current_date} interval {start_hour} to {end_hour} into {master_table}")

        except Exception as e:
            # Best effort: log the failure and carry on with the next interval
            logging.error(f"Error while processing data for date {formatted_current_date} interval {start_hour} to {end_hour}: {e}")


# 1