In [1]:
pip install geohash2

Collecting geohash2
  Using cached geohash2-1.1-py3-none-any.whl
Collecting docutils>=0.3
  Using cached docutils-0.21.2-py3-none-any.whl (587 kB)
Installing collected packages: docutils, geohash2
Successfully installed docutils-0.21.2 geohash2-1.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load (or reload) the extension that provides the %sql magic and SqlMagic options.
%reload_ext sql
# SqlMagic options for this session.
# NOTE(review): meanings below follow the option names / ipython-sql docs — confirm
# against the installed version.
%config SqlMagic.autocommit=False
# 0 = do not cap the number of rows fetched.
%config SqlMagic.autolimit=0
# Return query results as pandas DataFrames.
%config SqlMagic.autopandas=True
# Cap on the number of rows rendered in the cell output.
%config SqlMagic.displaylimit=200

In [3]:
# Connect the %sql magic to the Trino endpoint on localhost:9090, catalog `cuebiq`.
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [4]:
import gc
import os
import time
import logging
import geohash2
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from trino.dbapi import connect

In [5]:
# SQL engine class
class TrinoEngine():
    """Thin wrapper around a Trino DB-API cursor and a SQLAlchemy engine.

    The DB-API cursor serves ``execute_statement``; the SQLAlchemy engine backs
    the pandas readers (``read_sql`` / ``read_sql_chunked``).

    Args:
        host: Trino host name.
        port: Trino port.
        catalog: Catalog used for both the DB-API connection and the engine URL.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so close() can release it later.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Same endpoint as the DB-API connection, expressed as a SQLAlchemy URL.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Create and drop statements.

        Executes ``query`` on the DB-API cursor and returns everything
        ``fetchall()`` yields.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs ``query`` through ``pd.read_sql`` on the SQLAlchemy engine and
        returns the result as a single DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def read_sql_chunked(self, query: str, chunksize: int = 10000):
        """
        Run ``query`` through ``pd.read_sql`` with ``chunksize`` set.

        Returns an iterator of DataFrames holding at most ``chunksize`` rows
        each, instead of one DataFrame.
        """
        return pd.read_sql(query, self.engine, chunksize=chunksize)

    def close(self) -> None:
        """Close the DB-API connection and dispose of the SQLAlchemy engine."""
        self.conn.close()
        self.engine.dispose()

# Notebook-wide engine instance; the query cells call sql_engine.read_sql_chunked(...).
sql_engine = TrinoEngine()

In [6]:
# Function to insert data with retry mechanism
def insert_data_with_retry(df, table_name, con, retries=3, delay=5):
    """Append ``df`` to ``table_name`` with ``DataFrame.to_sql``, retrying on failure.

    Args:
        df: Frame to write (anything exposing a pandas-style ``to_sql``).
        table_name: Destination table name.
        con: Connection/engine handed to ``to_sql``.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        bool: True as soon as one attempt succeeds; False when every attempt
        failed, or when ``retries`` is zero or negative so nothing was attempted.
    """
    for attempt in range(retries):
        try:
            df.to_sql(
                table_name,
                con,
                index=False,
                if_exists="append",
                method="multi"
            )
        except Exception as e:
            logging.error(f"Attempt {attempt+1} failed with error: {e}")
            # Only wait when another attempt is actually coming.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            logging.info(f"Inserted data into table {table_name}")
            return True

    # Reached when all attempts raised, or when the loop never ran; always
    # answer with an explicit False so callers get a real boolean.
    logging.error(f"Failed to insert data into table {table_name} after {retries} attempts")
    return False


In [7]:
# Set up logging: INFO level, "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Database connection setup: SQLAlchemy engine pointing at the output schema;
# it is passed as `con` when aggregated results are written.
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# NOTE(review): start_time is not read by any cell visible in this notebook.
start_time = time.time()  # Start timing

# Define the input parameters
# country_code filters the source rows and is lower-cased into the output table name.
country_code = 'MX'
# First and last local date to process (both inclusive), as YYYYMMDD integers.
start_date = 20191215
end_date = 20191231

# Define the input schema and table name
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Convert integer dates to datetime objects
start_date_dt = datetime.strptime(str(start_date), '%Y%m%d')
end_date_dt = datetime.strptime(str(end_date), '%Y%m%d')

# Define the sorted longitude boundaries (ascending, evenly spaced 1.40625 apart).
# Neighbouring pairs form the longitude ranges that are queried one at a time.
sorted_lons = [-115.3125, -113.90625, -112.5, -111.09375, -109.6875, -108.28125, -106.875, -105.46875, -104.0625, -102.65625, -101.25, -99.84375, -98.4375, -97.03125, -95.625, -94.21875, -92.8125, -91.40625]

# Initialize a list to store failed insertions as (date, min_lon, max_lon) tuples
failed_inserts = []
# Define the buffer value: how far the first and last longitude ranges extend
# beyond the outermost boundaries in sorted_lons.
buffer_value = 20

In [8]:
# Loop through each day from start_date to end_date.
#
# For every day, each longitude range is queried separately, the returned
# points are geohashed at precision 3, aggregated per geohash, filtered to
# cells with more than 10 unique users and appended to the per-day output table.
# Any range that errors or cannot be inserted is recorded in failed_inserts.

# The longitude ranges do not depend on the date, so build them once:
# one buffered range below the first boundary, one range per pair of
# neighbouring boundaries, and one buffered range above the last boundary.
lon_ranges = (
    [(sorted_lons[0] - buffer_value, sorted_lons[0])]
    + list(zip(sorted_lons[:-1], sorted_lons[1:]))
    + [(sorted_lons[-1], sorted_lons[-1] + buffer_value)]
)

current_date = start_date_dt
while current_date <= end_date_dt:
    # Bind the date label before the try block so the outer handler always
    # reports the day that is actually being processed.
    formatted_current_date = current_date.strftime('%Y%m%d')
    try:
        # Calculate the lookback and lookahead dates: the processing_date
        # window runs from one day before to 35 days after the local date.
        lookback_date = current_date - timedelta(days=1)
        lookahead_date = current_date + timedelta(days=35)

        # Format dates for the SQL query
        formatted_lookback_date = lookback_date.strftime('%Y%m%d')
        formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

        # Loop through each longitude range
        for min_lon, max_lon in lon_ranges:
            # Construct the SQL query.
            # NOTE(review): BETWEEN is inclusive on both ends and neighbouring
            # ranges share a boundary value, so a point lying exactly on a
            # boundary is selected by two ranges — confirm this is acceptable.
            query = f"""
            SELECT 
                cuebiq_id, 
                event_zoned_datetime, 
                processing_date,
                lat,
                lng
            FROM {pe_dl_table}
            WHERE 
                processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
                AND country_code = '{country_code}' 
                AND event_zoned_datetime IS NOT NULL
                AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
                AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
                AND lng BETWEEN {min_lon} AND {max_lon}
            """

            logging.info(f"Executing SQL query for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")

            try:
                pe_dl_table_gen = sql_engine.read_sql_chunked(query)

                # Convert the generator to a DataFrame
                chunks = [chunk for chunk in pe_dl_table_gen]
                if not chunks:
                    logging.info(f"No rows returned for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")
                    continue

                pe_dl_table_df = pd.concat(chunks, ignore_index=True)

                # A query without matching rows can still come back as a single
                # empty chunk; there is nothing to aggregate, so skip the range
                # instead of letting it fail further down.
                if pe_dl_table_df.empty:
                    logging.info(f"No rows returned for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")
                    continue

                # Calculate geohashes (plain iteration over the two columns
                # avoids building a Series object for every row).
                pe_dl_table_df['geohash3'] = [
                    geohash2.encode(lat, lng, precision=3)
                    for lat, lng in zip(pe_dl_table_df['lat'], pe_dl_table_df['lng'])
                ]

                # Aggregate data for geohash3
                aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                    no_of_points=('geohash3', 'size'),
                    no_of_unique_users=('cuebiq_id', 'nunique')
                ).reset_index()

                # Filter rows with no_of_unique_users > 10
                filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()

                # Add the local_date column
                filtered_data_3.loc[:, 'local_date'] = formatted_current_date

                # Insert filtered aggregated data for geohash3 into SQL table
                if not filtered_data_3.empty:
                    table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                    if not insert_data_with_retry(filtered_data_3, table_name_agg3, con):
                        failed_inserts.append((formatted_current_date, min_lon, max_lon))

            except Exception as e:
                # Keep going with the remaining ranges; remember this one for a re-run.
                logging.error(f"Error while processing data for date {formatted_current_date} and longitude range {min_lon} to {max_lon}: {e}")
                failed_inserts.append((formatted_current_date, min_lon, max_lon))

    except Exception as e:
        # Failure outside a specific range: record the whole day.
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
        failed_inserts.append((formatted_current_date, None, None))

    # Move to the next day
    current_date += timedelta(days=1)

# Print the failed inserts
if failed_inserts:
    print("Failed inserts:")
    for failed_insert in failed_inserts:
        print(failed_insert)

logging.info("Data extraction, aggregation, and saving completed.")

2024-06-13 14:21:10,425 - INFO - Executing SQL query for date 20191215 and longitude range -135.3125 to -115.3125
2024-06-13 14:22:05,519 - INFO - Inserted data into table pd_mx_20191215_agg3
2024-06-13 14:22:05,520 - INFO - Executing SQL query for date 20191215 and longitude range -115.3125 to -113.90625
2024-06-13 14:22:33,049 - INFO - Inserted data into table pd_mx_20191215_agg3
2024-06-13 14:22:33,050 - INFO - Executing SQL query for date 20191215 and longitude range -113.90625 to -112.5
2024-06-13 14:23:01,826 - INFO - Inserted data into table pd_mx_20191215_agg3
2024-06-13 14:23:01,827 - INFO - Executing SQL query for date 20191215 and longitude range -112.5 to -111.09375
2024-06-13 14:23:19,816 - INFO - Inserted data into table pd_mx_20191215_agg3
2024-06-13 14:23:19,817 - INFO - Executing SQL query for date 20191215 and longitude range -111.09375 to -109.6875
2024-06-13 14:23:55,685 - INFO - Inserted data into table pd_mx_20191215_agg3
2024-06-13 14:23:55,686 - INFO - Executing

Failed inserts:
('20191231', -98.4375, -97.03125)


# MISSING DATES: re-run of individual (date, longitude range) sections

In [7]:
# Set up logging: INFO level, "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the SQL engine (used to read the source rows in chunks)
sql_engine = TrinoEngine()

# Database connection setup: SQLAlchemy engine pointing at the output schema;
# it is passed as `con` when aggregated results are written.
output_schema_name = 'pop_density'
con = create_engine(f"trino://localhost:9090/dedicated/{output_schema_name}")

# Define the buffer value
# NOTE(review): buffer_value is not read by the re-run cells visible below.
buffer_value = 20
# Collects (date, min_lon, max_lon) tuples for sections that fail again.
failed_inserts = []

In [8]:
# Function to process data for a given date and longitude range
def process_data_for_date_and_lon(formatted_current_date, min_lon, max_lon):
    """Query, geohash, aggregate and store one (date, longitude range) section.

    Selects the rows whose event date equals ``formatted_current_date`` and whose
    ``lng`` lies between ``min_lon`` and ``max_lon``, restricted to a
    ``processing_date`` window from one day before to 35 days after that date.
    The points are geohashed at precision 3, aggregated per geohash (point count
    and unique ``cuebiq_id`` count), filtered to geohashes with more than 10
    unique users and appended to ``pd_<country>_<date>_agg3``.

    Relies on notebook-level names that must exist when the function is called:
    ``country_code``, ``sql_engine``, ``con``, ``failed_inserts``, ``geohash2``
    and ``insert_data_with_retry``.

    Args:
        formatted_current_date: Date to process, as a ``YYYYMMDD`` string.
        min_lon: Lower longitude bound (inclusive, SQL ``BETWEEN``).
        max_lon: Upper longitude bound (inclusive, SQL ``BETWEEN``).

    Returns:
        None. Failures are logged and appended to ``failed_inserts`` as
        ``(date, min_lon, max_lon)``, or ``(date, None, None)`` when the failure
        happens before the query is run (for example an unparsable date).
    """
    try:
        # Parse the date once and derive the processing_date window from it.
        current_date = datetime.strptime(formatted_current_date, '%Y%m%d')
        formatted_lookback_date = (current_date - timedelta(days=1)).strftime('%Y%m%d')
        formatted_lookahead_date = (current_date + timedelta(days=35)).strftime('%Y%m%d')

        # NOTE(review): BETWEEN is inclusive on both ends, so sections that
        # share a boundary value both select points lying exactly on it.
        query = f"""
        SELECT 
            cuebiq_id, 
            event_zoned_datetime, 
            processing_date,
            lat,
            lng
        FROM cuebiq.paas_cda_pe_v3.device_location_uplevelled
        WHERE 
            processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
            AND country_code = '{country_code}' 
            AND event_zoned_datetime IS NOT NULL
            AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
            AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
            AND lng BETWEEN {min_lon} AND {max_lon}
        """

        logging.info(f"Executing SQL query for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")

        try:
            pe_dl_table_gen = sql_engine.read_sql_chunked(query)

            chunks = [chunk for chunk in pe_dl_table_gen]
            if not chunks:
                logging.info(f"No rows returned for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")
                return

            pe_dl_table_df = pd.concat(chunks, ignore_index=True)

            # A query without matching rows can still come back as a single
            # empty chunk; there is nothing to aggregate, so stop here instead
            # of letting the section fail further down.
            if pe_dl_table_df.empty:
                logging.info(f"No rows returned for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")
                return

            # Plain iteration over the two columns avoids building a Series
            # object for every row.
            pe_dl_table_df['geohash3'] = [
                geohash2.encode(lat, lng, precision=3)
                for lat, lng in zip(pe_dl_table_df['lat'], pe_dl_table_df['lng'])
            ]

            aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
                no_of_points=('geohash3', 'size'),
                no_of_unique_users=('cuebiq_id', 'nunique')
            ).reset_index()

            # Keep only geohashes seen for more than 10 distinct users.
            filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()

            filtered_data_3.loc[:, 'local_date'] = formatted_current_date

            if not filtered_data_3.empty:
                table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
                if not insert_data_with_retry(filtered_data_3, table_name_agg3, con):
                    failed_inserts.append((formatted_current_date, min_lon, max_lon))

        except Exception as e:
            # Query/aggregation/insert failure for this section: record it for a re-run.
            logging.error(f"Error while processing data for date {formatted_current_date} and longitude range {min_lon} to {max_lon}: {e}")
            failed_inserts.append((formatted_current_date, min_lon, max_lon))

    except Exception as e:
        # Failure before the query was run (e.g. date parsing).
        logging.error(f"Error while processing data for date {formatted_current_date}: {e}")
        failed_inserts.append((formatted_current_date, None, None))


In [9]:
# Define the input parameters
# country_code is read as a global by process_data_for_date_and_lon, so it must be
# set before the loop below runs.
country_code = 'MX'
# Sections to re-run, each as (date as YYYYMMDD string, min_lon, max_lon).
missing_sections = [
    ('20190621', -98.4375, -97.03125),
    ('20190621', -97.03125, -95.625),
    ('20190629', -95.625, -94.21875),
    ('20190703', -104.0625, -102.65625),
    ('20190703', -102.65625, -101.25),
    ('20190711', -95.625, -94.21875),
    ('20190711', -94.21875, -92.8125),
    ('20190717', -101.25, -99.84375),
    ('20190825', -94.21875, -92.8125)
]

# Loop through each missing section and process it on its own
for section in missing_sections:
    formatted_current_date, min_lon, max_lon = section
    process_data_for_date_and_lon(formatted_current_date, min_lon, max_lon)

# Print the failed inserts (sections that failed again during this re-run)
if failed_inserts:
    print("Failed inserts:")
    for failed_insert in failed_inserts:
        print(failed_insert)

logging.info("Data extraction, aggregation, and saving completed for missing sections.")

2024-06-14 15:13:02,766 - INFO - Executing SQL query for date 20190621 and longitude range -98.4375 to -97.03125
2024-06-14 15:14:34,184 - INFO - Inserted data into table pd_mx_20190621_agg3
2024-06-14 15:14:34,251 - INFO - Executing SQL query for date 20190621 and longitude range -97.03125 to -95.625
2024-06-14 15:15:18,561 - INFO - Inserted data into table pd_mx_20190621_agg3
2024-06-14 15:15:18,587 - INFO - Executing SQL query for date 20190629 and longitude range -95.625 to -94.21875
2024-06-14 15:15:48,342 - INFO - Inserted data into table pd_mx_20190629_agg3
2024-06-14 15:15:48,349 - INFO - Executing SQL query for date 20190703 and longitude range -104.0625 to -102.65625
2024-06-14 15:17:10,532 - INFO - Inserted data into table pd_mx_20190703_agg3
2024-06-14 15:17:10,607 - INFO - Executing SQL query for date 20190703 and longitude range -102.65625 to -101.25
2024-06-14 15:18:10,940 - INFO - Inserted data into table pd_mx_20190703_agg3
2024-06-14 15:18:10,996 - INFO - Executing SQ

# BREAKDOWN FOR CHECKING: one day and one longitude range (debug cells, commented out)

In [11]:
# # Process data for one day and one longitude range
# current_date = start_date_dt

# # Calculate the lookback and lookahead dates
# lookback_date = current_date - timedelta(days=1)
# lookahead_date = current_date + timedelta(days=35)

# # Format dates for the SQL query
# formatted_lookback_date = lookback_date.strftime('%Y%m%d')
# formatted_current_date = current_date.strftime('%Y%m%d')
# formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

# # Use the first longitude range for testing
# min_lon = sorted_lons[0]
# max_lon = sorted_lons[1]


In [14]:
# # Construct the SQL query
# query = f"""
# SELECT 
#     cuebiq_id, 
#     event_zoned_datetime, 
#     processing_date,
#     lat,
#     lng
# FROM {pe_dl_table}
# WHERE 
#     processing_date BETWEEN {formatted_lookback_date} AND {formatted_lookahead_date}
#     AND country_code = '{country_code}' 
#     AND event_zoned_datetime IS NOT NULL
#     AND TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) IS NOT NULL
#     AND DATE(TRY(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) = date_parse('{formatted_current_date}', '%Y%m%d')
#     AND lng BETWEEN {min_lon} AND {max_lon}
# """

# logging.info(f"Executing SQL query for date {formatted_current_date} and longitude range {min_lon} to {max_lon}")

# # Execute the query and fetch data in chunks
# pe_dl_table_gen = sql_engine.read_sql_chunked(query)

# # Convert the generator to a DataFrame
# chunks = [chunk for chunk in pe_dl_table_gen]
# if chunks:
#     pe_dl_table_df = pd.concat(chunks, ignore_index=True)
    
#     # Calculate geohashes
#     pe_dl_table_df['geohash3'] = pe_dl_table_df.apply(lambda row: geohash2.encode(row['lat'], row['lng'], precision=3), axis=1)
    
#     # Aggregate data for geohash3
#     aggregated_data_3 = pe_dl_table_df.groupby('geohash3').agg(
#         no_of_points=('geohash3', 'size'),
#         no_of_unique_users=('cuebiq_id', 'nunique')
#     ).reset_index()
    
#     # Filter rows with no_of_unique_users > 10
#     filtered_data_3 = aggregated_data_3[aggregated_data_3['no_of_unique_users'] > 10].copy()
    
#     # Add the local_date column
#     filtered_data_3.loc[:, 'local_date'] = formatted_current_date
    
#     # Insert filtered aggregated data for geohash3 into SQL table
#     if not filtered_data_3.empty:
#         table_name_agg3 = f"pd_{country_code.lower()}_{formatted_current_date}_agg3"
#         insert_data_with_retry(filtered_data_3, table_name_agg3, con)

# logging.info("Data processing for one day and one longitude range completed.")


2024-06-08 19:40:27,978 - INFO - Executing SQL query for date 20190101 and longitude range -115.3125 to -113.90625
2024-06-08 19:41:10,228 - INFO - Inserted data into table pd_mx_20190101_agg3
2024-06-08 19:41:10,229 - INFO - Data processing for one day and one longitude range completed.


In [15]:
# pe_dl_table_df

Unnamed: 0,cuebiq_id,event_zoned_datetime,processing_date,lat,lng,geohash3
0,1343567179,2019-01-01T14:02:33-07:00,20190101,32.449442,-114.787196,9my
1,1343567179,2019-01-01T05:06:31-07:00,20190101,32.459550,-114.796062,9my
2,1343567179,2019-01-01T05:06:35-07:00,20190101,32.459550,-114.796062,9my
3,1343567179,2019-01-01T12:46:02-07:00,20190101,32.459550,-114.796062,9my
4,1343567179,2019-01-01T12:50:02-07:00,20190101,32.459550,-114.796062,9my
...,...,...,...,...,...,...
159406,1337007288,2019-01-01T18:05:30-08:00,20190116,32.403619,-115.188200,9my
159407,888303874,2019-01-01T14:57:41-07:00,20190120,32.459543,-114.723860,9my
159408,1350491159,2019-01-01T07:07:10-08:00,20190112,31.131851,-114.889627,9mw
159409,757754946,2019-01-01T10:26:13-07:00,20190112,32.441030,-114.722863,9my
