Move files (pd) to the `dedicated` catalog

# In SQL

In [None]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [None]:
%sql trino://localhost:9090/cuebiq/

In [None]:
from sqlalchemy.engine import create_engine
output_schema_name = 'presence_data'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

In [None]:
import time
import pandas as pd
from sqlalchemy import create_engine
from trino.dbapi import connect
from datetime import datetime, timedelta

In [None]:
%%sql
show schemas from dedicated

In [None]:
%%sql
show tables from dedicated.pop_density

In [None]:
%%sql
show tables from dedicated.od_matrix

In [None]:
%%sql
show tables from dedicated.presence_data

## Create schema

In [None]:
# output_schema_name = "OD_matrix"

In [None]:
# %sql create schema if not exists dedicated.$output_schema_name

## Create table 

In [None]:
# output_schema_name = 'presence_data'
# table_name = f"dedicated.{output_schema_name}.new_table_demo"

In [None]:
# %sql create table if not exists $table_name (col_0 varchar, col_1 bigint, col_2 varchar)

In [None]:
# %sql show tables from dedicated.presence_data

## Delete table

In [None]:
%%sql
DROP TABLE IF EXISTS hive.od_matrix.od_id_20190301_agg5

## Rename table

In [None]:
# Give the "_test_" OD table its final name (date + country code).
output_schema_name = 'od_matrix'

# The statement has no placeholders, so a plain string yields the same SQL text.
rename_table_query = """
ALTER TABLE "od_id_20190301_test_agg5" RENAME TO "od_id_20190301_agg5"
"""

# The table names above are unqualified; the engine URL carries the
# `dedicated` catalog and the schema chosen above.
con = create_engine("trino://localhost:9090/dedicated/" + output_schema_name)

# Run the rename on a short-lived connection
with con.connect() as conn:
    conn.execute(rename_table_query)

## Check table

In [None]:
# check
q2 = f"select * from dedicated.presence_data.od_id_20190301_agg5"

%sql $q2

In [None]:
# check
q2 = f"select * from dedicated.od_matrix.od_id_20190303_agg5"
%sql $q2

In [None]:
# check
q1 = f"select * from dedicated.pop_density.pd_in_20190401_agg3_2"
%sql $q1

In [None]:
import pandas as pd
from sqlalchemy.engine import create_engine

# Target schema (replace with your actual schema name) and the fully
# qualified name of the staging table created below.
output_schema_name = "presence_data"
bucketized_table = "dedicated." + output_schema_name + ".bucketized_table"

# Number of consecutive index values that share one partition_key
partition_size = 5000

# Data preparation.
# NOTE(review): `df` is not created in this cell; it must already exist in the
# notebook session. The integer division presumably assumes a default
# 0..n-1 integer index -- confirm.
df['partition_key'] = df.index // partition_size + 1
integer_columns = ['start_geohash_user', 'end_geohash_user', 'trip_count', 'partition_key']
df = df.astype(dict.fromkeys(integer_columns, 'int'))

# DDL for the table, partitioned by partition_key and bucketed by end_geohash5
create_table_query = f"""
CREATE TABLE {bucketized_table} (
    start_geohash5 varchar,
    start_geohash_user bigint,
    end_geohash5 varchar,
    end_geohash_user bigint,
    trip_count bigint,
    m_duration_min double,
    mdn_duration_min double,
    sd_duration_min double,
    m_length_m double,
    mdn_length_m double,
    sd_length_m double,
    partition_key bigint
)
WITH (
  partitioned_by = ARRAY['partition_key'],
  bucketed_by = ARRAY['end_geohash5'],
  bucket_count = 5
)
"""

# SQL engine bound to the output schema
con = create_engine("trino://localhost:9090/dedicated/" + output_schema_name)

# Create the table on a short-lived connection
with con.connect() as conn:
    conn.execute(create_table_query)

def insert_data_in_chunks(df, table_name, engine, chunk_size=5000):
    """Append ``df`` to ``table_name`` in consecutive row slices.

    Args:
        df: DataFrame whose rows are written, in positional order.
        table_name: Name of the destination table, passed to ``to_sql``.
        engine: Connection or engine object accepted by ``DataFrame.to_sql``.
        chunk_size: Maximum number of rows written per ``to_sql`` call.
    """
    total_rows = len(df)
    for lower in range(0, total_rows, chunk_size):
        upper = lower + chunk_size
        # Each slice is appended with a multi-row INSERT and without the index.
        df.iloc[lower:upper].to_sql(
            table_name,
            engine,
            index=False,
            if_exists='append',
            method='multi',
        )

# Insert data into the bucketized table.
# `partition_size` (defined with the data preparation above) is used as the
# chunk size; the previous `chunk_size` name was never defined at this level.
insert_data_in_chunks(df, "bucketized_table", con, partition_size)

country_code = 'id'
date_str = "20190101"
# The final name is built from `date_str`; the previous `event_date` name was
# never defined, so this line raised NameError.
final_bucketized_table = f"od_{country_code.lower()}_{date_str}_agg3"

# Rename the staging table to its final name (country code + date)
rename_table_query = f"""
ALTER TABLE "bucketized_table" RENAME TO "{final_bucketized_table}"
"""

# Execute the rename table query
with con.connect() as connection:
    connection.execute(rename_table_query)

## From existing file (in jupyter)

In [None]:
# Build the list of days to load: every date from 2019-07-14 through
# 2019-07-31, both ends included (as datetime objects).
start_date = datetime.strptime("20190714", "%Y%m%d")
end_date = datetime.strptime("20190731", "%Y%m%d")
day_count = (end_date - start_date).days + 1
date_generated = [start_date + timedelta(days=offset) for offset in range(day_count)]

def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name``, retrying when the insert raises.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame).
        table_name: Destination table name passed to ``to_sql``.
        con: Connection/engine passed to ``to_sql``.
        retries: Maximum number of insert attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        True when an attempt succeeded, False when every attempt failed.
        Failures are reported on stdout and never re-raised, so callers can
        keep processing other tables.
    """
    # NOTE(review): if a failed attempt had already written part of the rows,
    # a retry appends them again -- confirm the backend's behaviour.
    for attempt in range(1, retries + 1):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi"
            )
        except Exception as e:
            print(f"Attempt {attempt} failed with error: {e}")
            # Only wait when another attempt is still coming.
            if attempt < retries:
                time.sleep(delay)
        else:
            print(f"Inserted data into table {table_name}")
            return True

    print(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False

# Start timing the process
start_time = time.time()

# For each date, read the agg5 and then the agg3 CSV file and append its rows
# to the matching table. Both levels share the same steps, so they are handled
# by one inner loop instead of two copied branches.
for date in date_generated:
    date_str = date.strftime("%Y%m%d")

    for agg_level in ("5", "3"):
        file_path = f"/home/jovyan/Data/Agg_DL/ID{agg_level}/{date_str}_ID_pe_dl_agg{agg_level}.csv"
        table_name = f"pd_id_{date_str}_agg{agg_level}"

        try:
            # Read the CSV file for this date and aggregation level
            df_agg = pd.read_csv(file_path)

            # Convert all column names to lowercase
            df_agg.columns = [col.lower() for col in df_agg.columns]

            # Insert DataFrame into the table with retry mechanism
            insert_data_with_retry(df_agg, table_name, con)
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(f"Failed to process {file_path}: {e}")

# End timing the process
end_time = time.time()
total_time = end_time - start_time

print(f"Total time taken: {total_time} seconds")

# 638s for 20190511 - 20190713

In [None]:
# Single one for testing

# # Generate date strings from 20190101 to 20190430
# start_date = datetime.strptime("20190101", "%Y%m%d")
# end_date = datetime.strptime("20190104", "%Y%m%d")
# date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days+1)]

# # Iterate over each date, read the corresponding CSV file, and insert into the database
# for date in date_generated:
#     date_str = date.strftime("%Y%m%d")
#     file_path = f"/home/jovyan/Data/Agg_DL/CO5/{date_str}_CO_pe_dl_agg5.csv"
#     table_name = f"pd_co_{date_str}_agg5"
    
#     try:
#         # Read the CSV file
#         df = pd.read_csv(file_path)
        
#         # Convert all column names to lowercase
#         df.columns = [col.lower() for col in df.columns]
        
#         # Insert DataFrame into the table
#         df.to_sql(
#             table_name, 
#             con, 
#             index=False, 
#             if_exists="append", 
#             method="multi"
#         )
#         print(f"Inserted data into table {table_name} from {file_path}")
#     except Exception as e:
#         print(f"Failed to process {file_path}: {e}")

# CO agg5 does not work -- reason unknown???


### test 

In [None]:
# import pandas as pd
# df = pd.read_csv('/home/jovyan/Data/Agg_DL/ID3/20190101_ID_pe_dl_agg3.csv')
# df

In [None]:
# %%time
# df.to_sql(
#     "my_processed_dataset3", 
#     con, 
#     index=False, 
#     if_exists="append", 
#     method="multi"
# )

1. if_exists='fail': The method will raise a ValueError if the table already exists. This is the default behavior.
2. if_exists='replace': If the table exists, it will be dropped and replaced with the new data.
3. if_exists='append': If the table exists, the new data will be inserted into the existing table. If the table does not exist, it will be created.

# test

In [None]:
df = pd.read_csv('/home/jovyan/Data/DL/MX/20190128_MX_pe_dl.csv')
# df.sort_values('event_datetime_local')
df

In [None]:
import gc
import os
import time
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """Bundle a Trino DB-API cursor and a SQLAlchemy engine for `cuebiq`.

    Both handles point at localhost:9090 with the `cuebiq` catalog.
    """

    def __init__(self):
        # Keep the connection on the instance so it stays reachable (e.g. to
        # close it) instead of surviving only through the cursor.
        self.conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")

    def execute_statement(self, query: str) -> list:
        """
        Create and drop statements.

        Executes ``query`` on the DB-API cursor and returns all fetched rows.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs ``query`` through ``pandas.read_sql`` on the SQLAlchemy engine
        and returns the result as a single DataFrame.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [None]:
# Catalog.schema prefix of the source data, keyed by a short alias
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}

# Fully qualified table names used when building queries.
# dl_table = f"{schema_name['cda']}.device_location"
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

tj_table = f"{schema_name['cda']}.trajectory"
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# stop_table = f"{schema_name['cda']}.stop"
pe_stop_table = f"{schema_name['cda']}.stop_uplevelled"

# No trailing blank inside the name: it would end up in any identifier or
# string built from this value.
visit_table = f"{schema_name['cda']}.visit"

In [None]:
class TrinoEngine:
    """Bundle a Trino DB-API cursor and a SQLAlchemy engine for `cuebiq`.

    Both handles point at localhost:9090 with the `cuebiq` catalog.
    """

    def __init__(self):
        self.conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = self.conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")

    def execute_statement(self, query: str):
        """Execute ``query`` on the DB-API cursor and return all fetched rows."""
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """Run ``query`` via ``pandas.read_sql`` and return one DataFrame.

        Kept so that code written against the earlier definition of this
        class (which offered ``read_sql``) still works after this cell runs.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """Run ``query`` via ``pandas.read_sql`` and return a chunk iterator.

        Each item yielded by the iterator is a DataFrame of at most
        ``chunksize`` rows; the iterator can be consumed only once.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

sql_engine = TrinoEngine()


In [None]:
# Query parameters: country and the date range given as YYYYMMDD integers
country_code = 'ID'
start_date, end_date = 20190101, 20190103

# Parse both integer dates into datetime objects
start_date_dt, end_date_dt = (
    datetime.strptime(f"{yyyymmdd}", '%Y%m%d') for yyyymmdd in (start_date, end_date)
)

# Start day as YYYYMMDD text, and the day after it as YYYY-MM-DD text
formatted_date = f"{start_date_dt:%Y%m%d}"
next_date = f"{start_date_dt + timedelta(days=1):%Y-%m-%d}"

In [None]:
# Fetch the device-location rows of one processing date whose local event
# time falls on the start day, and preview the first chunk.
# The upper bound is exclusive (`<`): with `<=` a row stamped exactly at
# midnight of the following day was also returned.
# NOTE(review): the result is stored in `pe_tj_table`, which elsewhere holds
# the trajectory table name -- that string is overwritten here.
pe_tj_table = sql_engine.read_sql_chunked(
    f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng, 
        TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local
    FROM {pe_dl_table}
    WHERE 
        processing_date = {formatted_date} 
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) >= date_parse('{start_date_dt.strftime('%Y%m%d')}', '%Y%m%d')
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) < date_parse('{next_date}', '%Y-%m-%d')
    """
)

# read_sql_chunked returns an iterator of DataFrames; take only the first one
first_chunk = next(pe_tj_table)
first_chunk

In [None]:
# Fetch the device-location rows of one processing date whose local event
# time falls on the start day, adding the local event date as a column.
# The upper bound is exclusive (`<`): with `<=` a row stamped exactly at
# midnight of the following day was also returned.
# NOTE(review): the result is stored in `pe_tj_table`, which elsewhere holds
# the trajectory table name -- that string is overwritten here.
pe_tj_table = sql_engine.read_sql_chunked(
    f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng, 
        TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_date_local

    FROM {pe_dl_table}
    WHERE 
        processing_date = {formatted_date} 
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) >= date_parse('{start_date_dt.strftime('%Y%m%d')}', '%Y%m%d')
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) < date_parse('{next_date}', '%Y-%m-%d')
    """
)

# Drain the chunk iterator and combine the pieces into one DataFrame
chunks = list(pe_tj_table)
pe_tj_table_df = pd.concat(chunks, ignore_index=True)

pe_tj_table_df

# Process date: use the earlier version

In [None]:
# NOTE: this cell is probably not correct!
# Author's reason (translated): because it requires time < start time.
# Purpose: time a query over processing dates from 45 days before start_date
# up to end_date, keeping rows whose parsed local event time lies between
# start_date and end_date, and load the result into one DataFrame.

from datetime import datetime, timedelta
import pandas as pd

import time

# Start timing
start_time = time.time()

# Define the input parameters
country_code = 'ID'
start_date = 20190101
end_date = 20190103

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Calculate the date 45 days before the start_date
lookback_date_dt = start_date_dt - timedelta(days=45)
formatted_lookback_date = lookback_date_dt.strftime('%Y%m%d')
formatted_end_date = end_date_dt.strftime('%Y%m%d')

# Construct the SQL query
# NOTE(review): the last condition compares against midnight at the start of
# end_date, so later events on end_date itself are excluded -- confirm intent.
pe_tj_table = sql_engine.read_sql_chunked(
    f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng, 
        TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_date_local

    FROM {pe_dl_table}
    WHERE 
        processing_date >= {formatted_lookback_date}
        AND processing_date <= {formatted_end_date}
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) >= date_parse('{start_date_dt.strftime('%Y%m%d')}', '%Y%m%d')
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) <= date_parse('{end_date_dt.strftime('%Y-%m-%d')}', '%Y-%m-%d')
    """
)

# Convert the generator to a DataFrame
chunks = [chunk for chunk in pe_tj_table]
pe_tj_table_df = pd.concat(chunks, ignore_index=True)


# End timing
end_time = time.time()

# Calculate and print the total time taken
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

pe_tj_table_df

# NOTE: probably not correct!

In [None]:
# This one seems correct.
# For every local event day between start_date and end_date, scan the
# processing dates from one day before to 35 days after that day, keep the
# rows whose local event date equals the day, and save them to one CSV file.
from datetime import datetime, timedelta
import pandas as pd
import time

# Start timing
start_time = time.time()

# Define the input parameters
country_code = 'MX'
start_date = 20190102
end_date = 20190103

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    # Calculate the lookback and lookahead dates
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = current_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # Construct and execute the SQL query.
    # The result goes into its own name: assigning it to `pe_dl_table` would
    # replace the table-name string and break the query of the next day.
    day_chunks = sql_engine.read_sql_chunked(
        f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng, 
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_date_local

        FROM {pe_dl_table}
        WHERE 
            processing_date >= {formatted_lookback_date} 
            AND processing_date <= {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
        """
    )

    # Drain this day's chunk iterator (not a leftover one from another cell)
    chunks = list(day_chunks)
    if chunks:
        pe_tj_table_df = pd.concat(chunks, ignore_index=True)

        # Save the DataFrame to a CSV file
        output_filename = f'/home/jovyan/Data/Test/0604/pe_tj_table_{formatted_current_date}.csv'
        pe_tj_table_df.to_csv(output_filename, index=False)
        print(f"Saved data for {formatted_current_date} to {output_filename}")

    # Move to the next day
    current_date += timedelta(days=1)

print("Data extraction and saving completed.")


end_time = time.time() # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")


In [None]:
# Parse the local event time of every row for the country (no processing_date
# filter), then keep only the rows of a single local day.
import time

country_code = 'MX'
start_date = 20190101

# Day to extract, and the day after it formatted as YYYY-MM-DD
start_date_dt = datetime.strptime(f"{start_date}", '%Y%m%d')
next_date = f"{start_date_dt + timedelta(days=1):%Y-%m-%d}"

# Timer starts right before the query is issued
start_time = time.time()

pe_tj_table = sql_engine.read_sql_chunked(
    f"""
    WITH event_data AS (
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            timezoneoffset_secs,
            lat,
            lng, 
            TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local
        FROM {pe_dl_table}
        WHERE 
            country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
    )
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        timezoneoffset_secs,
        lat,
        lng
    FROM event_data
    WHERE 
        event_datetime_local >= date_parse('{start_date_dt.strftime('%Y%m%d')}', '%Y%m%d')
        AND event_datetime_local < date_parse('{next_date}', '%Y-%m-%d')
    """
)

# Drain the chunk iterator and stitch the pieces into one DataFrame
chunks = list(pe_tj_table)
pe_tj_table_df = pd.concat(chunks, ignore_index=True)

# Timer stops once all rows are in memory
end_time = time.time()

# Report the elapsed wall-clock time
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")


pe_tj_table_df


# transform all event date and get one day. 
# 877 for one day in ID 
# 1980 for one day in MX

In [None]:
pip install geohash2

In [None]:
pip install python-geohash

In [None]:
# Export per-day geohash aggregates to CSV files, with logging.
# Timing note: 1933.101960659027 s for 3 days in IN
# processing_date window scanned per event day: day - 1 to day + 35
from datetime import datetime, timedelta
import pandas as pd
import time
import geohash2
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

start_time = time.time()  # Start timing

# Define the input parameters
country_code = 'IN'
start_date = 20190101
end_date = 20190103

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')


def _count_points_and_users(points, geohash_column):
    """Return one row per geohash cell with its point and distinct-user counts."""
    return points.groupby(geohash_column).agg(
        no_of_points=(geohash_column, 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique')
    ).reset_index()


# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    # Calculate the lookback and lookahead dates
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = current_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # Construct the SQL query
    query = f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM {pe_dl_table}
    WHERE 
        processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    """

    logging.info(f"Executing SQL query for date {formatted_current_date}: {query}")

    try:
        pe_dl_table_gen = sql_engine.read_sql_chunked(query)

        # Drain the chunk iterator into a single DataFrame
        chunks = list(pe_dl_table_gen)
        pe_dl_table_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

        if pe_dl_table_df.empty:
            logging.warning(f"No rows returned for {formatted_current_date}; nothing saved")
        else:
            # Encode each point once at precision 5 (a plain loop over the two
            # columns avoids building a Series per row as apply(axis=1) does).
            pe_dl_table_df['geohash5'] = [
                geohash2.encode(lat, lng, precision=5)
                for lat, lng in zip(pe_dl_table_df['lat'], pe_dl_table_df['lng'])
            ]
            # A geohash of lower precision is a prefix of the higher-precision
            # one, so the precision-3 value is the first three characters.
            pe_dl_table_df['geohash3'] = pe_dl_table_df['geohash5'].str[:3]

            # Aggregate and save, first for geohash5 and then for geohash3
            for precision in (5, 3):
                geohash_column = f'geohash{precision}'
                aggregated_data = _count_points_and_users(pe_dl_table_df, geohash_column)

                # The output directory follows the country code (IN5 / IN3 for 'IN')
                output_filename = (
                    f'/home/jovyan/Data/Agg_DL/{country_code}{precision}/'
                    f'aggregated_pe_tj_table_{geohash_column}_{formatted_current_date}.csv'
                )
                aggregated_data.to_csv(output_filename, index=False)
                logging.info(f"Saved aggregated data for {geohash_column} for {formatted_current_date} to {output_filename}")

    except Exception as e:
        # Best effort per day: record the traceback and continue with the next day
        logging.exception(f"Error while processing data for date {formatted_current_date}: {e}")

    # Move to the next day
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")


In [None]:
# Write in to the table

from datetime import datetime, timedelta
import pandas as pd
import time
import geohash2
import logging
from sqlalchemy import create_engine

# Logging: INFO level, timestamp - level - message
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Engine for the output side (catalog `dedicated`, schema below)
output_schema_name = 'presence_data'
con = create_engine("trino://localhost:9090/dedicated/" + output_schema_name)

# Wall-clock reference for the timing reported at the end of the cell
start_time = time.time()

# Input parameters: country and the date range as YYYYMMDD integers
country_code = 'IN'
start_date, end_date = 20190101, 20190103

# Source schema and the fully qualified input table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = schema_name['cda'] + ".device_location_uplevelled"

# Parse both integer dates into datetime objects
start_date_dt, end_date_dt = (
    datetime.strptime(f"{yyyymmdd}", '%Y%m%d') for yyyymmdd in (start_date, end_date)
)

def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name``, retrying when the insert raises.

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame).
        table_name: Destination table name passed to ``to_sql``.
        con: Connection/engine passed to ``to_sql``.
        retries: Maximum number of insert attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        True when an attempt succeeded, False when every attempt failed.
        Failures are logged and never re-raised, so callers can keep
        processing other tables.
    """
    # NOTE(review): if a failed attempt had already written part of the rows,
    # a retry appends them again -- confirm the backend's behaviour.
    for attempt in range(1, retries + 1):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi"
            )
        except Exception as e:
            logging.error(f"Attempt {attempt} failed with error: {e}")
            # Only wait when another attempt is still coming.
            if attempt < retries:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False

def _count_points_and_users(points, geohash_column):
    """Return one row per geohash cell with its point and distinct-user counts."""
    return points.groupby(geohash_column).agg(
        no_of_points=(geohash_column, 'size'),
        no_of_unique_users=('cuebiq_id', 'nunique')
    ).reset_index()


# Loop through each day from start_date to end_date
current_date = start_date_dt
while current_date <= end_date_dt:
    # Calculate the lookback and lookahead dates
    lookback_date = current_date - timedelta(days=1)
    lookahead_date = current_date + timedelta(days=35)

    # Format dates for the SQL query
    formatted_lookback_date = lookback_date.strftime('%Y%m%d')
    formatted_current_date = current_date.strftime('%Y%m%d')
    formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

    # Construct the SQL query
    query = f"""
    SELECT 
        cuebiq_id, 
        event_zoned_datetime, 
        processing_date,
        lat,
        lng
    FROM {pe_dl_table}
    WHERE 
        processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
        AND country_code = '{country_code}' 
        AND event_zoned_datetime IS NOT NULL
        AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
        AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
    """

    logging.info(f"Executing SQL query for date {formatted_current_date}: {query}")

    try:
        pe_dl_table_gen = sql_engine.read_sql_chunked(query)

        # Drain the chunk iterator into a single DataFrame
        chunks = list(pe_dl_table_gen)
        pe_dl_table_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

        if pe_dl_table_df.empty:
            logging.warning(f"No rows returned for {formatted_current_date}; nothing inserted")
        else:
            # Encode each point once at precision 5 (a plain loop over the two
            # columns avoids building a Series per row as apply(axis=1) does).
            pe_dl_table_df['geohash5'] = [
                geohash2.encode(lat, lng, precision=5)
                for lat, lng in zip(pe_dl_table_df['lat'], pe_dl_table_df['lng'])
            ]
            # A geohash of lower precision is a prefix of the higher-precision
            # one, so the precision-3 value is the first three characters.
            pe_dl_table_df['geohash3'] = pe_dl_table_df['geohash5'].str[:3]

            # Aggregate and insert, first for geohash5 and then for geohash3
            for precision in (5, 3):
                aggregated_data = _count_points_and_users(pe_dl_table_df, f'geohash{precision}')

                # Lower-case country code, matching the pd_id_<date>_agg<N>
                # tables loaded from CSV elsewhere in this notebook.
                table_name = f"pd_{country_code.lower()}_{formatted_current_date}_agg{precision}"
                insert_data_with_retry(aggregated_data, table_name, con)

    except Exception as e:
        # Best effort per day: record the traceback and continue with the next day
        logging.exception(f"Error while processing data for date {formatted_current_date}: {e}")

    # Move to the next day
    current_date += timedelta(days=1)

logging.info("Data extraction, aggregation, and saving completed.")

end_time = time.time()  # End timing

# Calculate and print the total time taken
total_time = end_time - start_time
logging.info(f"Total time taken: {total_time} seconds")
