Combine the daily population-density tables into a single master table per country (worked for CO, ID, IN). Master tables:  
pd_co_20191_agg3  
pd_in_2019_agg3  
pd_mx_2019_agg3  
pd_id_2019_agg3  

In [1]:
import os
import time
import logging
import datetime
import pandas as pd
from trino.dbapi import connect 
from sqlalchemy import create_engine
from datetime import timedelta
# Configure the root logger: INFO level, each record rendered as "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
# (Re)load the `sql` IPython extension and set its SqlMagic options.
# NOTE(review): presumably autolimit=0 disables the automatic row limit and autopandas=True
# makes %sql results come back as pandas DataFrames — confirm against the ipython-sql docs.
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [3]:
# Point the %sql magic at the Trino endpoint on localhost:9090, catalog `cuebiq`
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [4]:
class TrinoEngine:
    """Bundle a Trino DB-API cursor and a SQLAlchemy engine for the ``cuebiq`` catalog.

    Attributes:
        cur: DB-API cursor obtained from ``trino.dbapi.connect`` (localhost:9090).
        engine: SQLAlchemy engine created for ``trino://localhost:9090/cuebiq/``.
    """

    def __init__(self):
        # The DB-API connection is only needed to obtain the cursor.
        dbapi_connection = connect(host="localhost", port=9090, catalog="cuebiq")
        self.cur = dbapi_connection.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")

    def execute_statement(self, query: str) -> list:
        """
        Run ``query`` on the DB-API cursor and return everything it fetches.

        Intended for create and drop statements.
        """
        cursor = self.cur
        cursor.execute(query)
        fetched_rows = cursor.fetchall()
        return fetched_rows

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run ``query`` through ``pandas.read_sql`` on the SQLAlchemy engine.

        Intended for select and insert-into operations; the result is returned
        as a DataFrame.
        """
        result_frame = pd.read_sql(sql=query, con=self.engine)
        return result_frame

# Notebook-wide engine instance; the insert loop later in this notebook calls sql_engine.read_sql()
sql_engine = TrinoEngine()

In [11]:
# Country whose daily tables are combined (lower-cased; it is interpolated into table names)
country_abbreviation = "MX".lower()

# Inclusive date window to process
start_date = datetime.date(2019, 1, 1)
end_date = datetime.date(2019, 12, 31)

# One 'YYYYMMDD' string per calendar day from start_date through end_date
date_list = [
    datetime.date.fromordinal(day_ordinal).strftime('%Y%m%d')
    for day_ordinal in range(start_date.toordinal(), end_date.toordinal() + 1)
]

# Show how many days will be processed
len(date_list)

365

In [12]:
# Create master table
# Target location and name of the combined table.
# NOTE(review): the year "2019" in the table name is hard-coded independently of
# start_date/end_date — keep them in sync when changing the date window.
output_catalog = 'dedicated'
output_schema_name = "pop_density"
master_table = f"pd_{country_abbreviation}_2019_agg3"

# Build the engine URL from output_catalog/output_schema_name (instead of a hard-coded
# catalog) so this cell and the fully-qualified INSERTs always target the same place.
con = create_engine(f"trino://localhost:9090/{output_catalog}/{output_schema_name}")

# Create the master table in {output_catalog}.{output_schema_name}. The name is fully
# qualified so the statement does not depend on the connection's default catalog/schema.
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {output_catalog}.{output_schema_name}.{master_table}(
    geohash_3 varchar,
    no_of_points bigint,
    no_of_unique_users bigint,
    local_date varchar
)
WITH (
    format = 'TEXTFILE'
)
"""

with con.connect() as connection:
    logging.info(f"Creating master table: {master_table}")
    # NOTE(review): passing a plain string to Connection.execute() is accepted by
    # SQLAlchemy 1.x only; 2.x expects sqlalchemy.text(...) — confirm the installed version.
    connection.execute(create_table_query)

2024-06-18 21:20:44,533 - INFO - Creating master table: pd_mx_2019_agg3


In [13]:
# Loop through each date and append that day's table to the master table.
# NOTE(review): this cell only ever INSERTs, so re-running it appears to append every
# day a second time — confirm before re-executing against an already-filled table.
failed_dates = []  # dates whose INSERT raised, reported once the loop finishes

for date in date_list:
    logging.info(f"Processing data for date: {date}")
    # SELECT * relies on the daily table's column order matching the target column list.
    # NOTE(review): the daily table schema is not visible here — confirm it matches.
    query = f"""
    INSERT INTO {output_catalog}.{output_schema_name}.{master_table} (geohash_3, no_of_points, no_of_unique_users, local_date)
        SELECT *
        FROM {output_catalog}.{output_schema_name}.pd_{country_abbreviation}_{date}_agg3
    """

    # Execute the insertion query
    try:
        sql_engine.read_sql(query)
    except Exception as e:
        # Best effort: record the failure and continue so one bad day does not abort the run.
        failed_dates.append(date)
        logging.error(f"Error inserting data for date: {date} - {e}")
    else:
        logging.info(f"Successfully inserted data for date: {date}")

# Only claim success when every date was inserted; otherwise list what is missing.
if failed_dates:
    logging.warning(
        f"Data combined with errors: {len(failed_dates)} of {len(date_list)} dates failed: {failed_dates}"
    )
else:
    logging.info("Data combined and inserted successfully.")

2024-06-18 21:20:54,576 - INFO - Processing data for date: 20190101
2024-06-18 21:20:56,159 - INFO - Successfully inserted data for date: 20190101
2024-06-18 21:20:56,160 - INFO - Processing data for date: 20190102
2024-06-18 21:20:57,579 - INFO - Successfully inserted data for date: 20190102
2024-06-18 21:20:57,580 - INFO - Processing data for date: 20190103
2024-06-18 21:20:58,899 - INFO - Successfully inserted data for date: 20190103
2024-06-18 21:20:58,900 - INFO - Processing data for date: 20190104
2024-06-18 21:21:00,573 - INFO - Successfully inserted data for date: 20190104
2024-06-18 21:21:00,574 - INFO - Processing data for date: 20190105
2024-06-18 21:21:01,949 - INFO - Successfully inserted data for date: 20190105
2024-06-18 21:21:01,950 - INFO - Processing data for date: 20190106
2024-06-18 21:21:03,395 - INFO - Successfully inserted data for date: 20190106
2024-06-18 21:21:03,396 - INFO - Processing data for date: 20190107
2024-06-18 21:21:04,834 - INFO - Successfully inse