In [1]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time
import pandas as pd
from tqdm import tqdm
from datetime import datetime

class TrinoEngine:
    """Thin wrapper around a Trino DB-API cursor and a SQLAlchemy engine.

    Args:
        host: Host name of the Trino endpoint.
        port: Port of the Trino endpoint.
        catalog: Catalog used for both the DB-API connection and the engine URL.

    The defaults reproduce the previously hard-coded connection
    (``localhost:9090``, catalog ``cuebiq``).
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so it can be closed explicitly;
        # as a local variable it was unreachable after construction.
        self.conn = connect(host=host, port=port, catalog=catalog)
        self.cur = self.conn.cursor()
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """Run ``query`` on the DB-API cursor and return every fetched row.

        Intended for create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """Run ``query`` through the SQLAlchemy engine and return a DataFrame.

        Intended for select and insert-into operations.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """Release the cursor, the DB-API connection and the engine's pool."""
        self.cur.close()
        self.conn.close()
        self.engine.dispose()


sql_engine = TrinoEngine()

In [3]:
# Schema prefix shared by the table names defined below.
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}

# Device-location tables (base and uplevelled).
dl_table = f"{schema_name['cda']}.device_location"
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

# Trajectory tables (base and uplevelled).
tj_table = f"{schema_name['cda']}.trajectory"
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# TODO(review): the name of the non-uplevelled stop table is unconfirmed,
# so only the uplevelled one is defined.
pe_stop_table = f"{schema_name['cda']}.stop_uplevelled"

# The identifier previously ended with a stray space (".visit "), which breaks
# any exact comparison or quoting of the table name.
visit_table = f"{schema_name['cda']}.visit"


# pe_tj

In [3]:
# Query parameters: country filter and the event_date bounds (YYYYMMDD
# integers) used with BETWEEN in the count queries.
country_code, start_date, end_date = 'CO', 20190101, 20191231

In [None]:
# Show the column layout of the uplevelled trajectory table.
sql_engine.read_sql(query=f"desc {pe_tj_table}")

In [4]:
# Daily row counts and distinct cuebiq_id counts from the uplevelled trajectory
# table, for rows whose start_country and end_country both equal
# `country_code` and whose event_date lies in [start_date, end_date].
#
# The source table name is rebuilt from `schema_name` instead of being read
# from `pe_tj_table`: this cell rebinds `pe_tj_table` to the resulting
# DataFrame (the CSV export cell reads it under that name), so reading the
# variable it overwrites would interpolate a DataFrame into the SQL on re-run.
pe_tj_source = f"{schema_name['cda']}.trajectory_uplevelled"

start_time = time.time()

pe_tj_table = sql_engine.read_sql(
    f"""
    SELECT 
        event_date AS event_date,
        count(*) AS Observations, 
        count(DISTINCT cuebiq_id) AS Unique_Users
    
    FROM {pe_tj_source}
    WHERE 
        start_country = '{country_code}'
        AND end_country = '{country_code}'
        AND event_date BETWEEN {start_date} AND {end_date}
    GROUP BY 
        event_date
    ORDER BY 
        event_date ASC
    """
)

end_time = time.time()
execution_time = end_time - start_time
print(f"Query executed in: {execution_time:.2f} seconds")

pe_tj_table

Query executed in: 867.38 seconds


Unnamed: 0,event_date,Observations,Unique_Users
0,20191022,51190,26748
1,20191023,168894,45682
2,20191024,165601,44668
3,20191025,172919,44883
4,20191026,150994,40389
...,...,...,...
66,20191227,65933,18190
67,20191228,60694,17141
68,20191229,49744,15255
69,20191230,60589,16945


In [None]:
# Write the daily pe_tj counts for this country to CSV, without the row index.
pe_tj_csv_path = f'/home/jovyan/Data/2019_{country_code}_pe_tj.csv'
pe_tj_table.to_csv(pe_tj_csv_path, index=False)

In [None]:
# All uplevelled trajectory rows with start/end country CO and
# event_date 20191201, ordered by cuebiq_id.
#
# `pe_tj_table` was rebound to a DataFrame by the daily-count cell above, so
# interpolating it here would put a DataFrame repr into the SQL. The table
# name is rebuilt from `schema_name` instead.
pe_tj_source = f"{schema_name['cda']}.trajectory_uplevelled"

sql_engine.read_sql(
    f"""
    SELECT *
    FROM {pe_tj_source}
    WHERE 
        start_country = 'CO'
        AND end_country = 'CO'
        AND event_date = 20191201
    ORDER BY 
        cuebiq_id ASC
    """
)

# tj

In [None]:
# All trajectory rows with start/end country CO and event_date 20191201,
# ordered by cuebiq_id.
tj_rows_sql = f"""
    SELECT *
    FROM {tj_table}
    WHERE 
        start_country = 'CO'
        AND end_country = 'CO'
        AND event_date = 20191201
    ORDER BY 
        cuebiq_id ASC
    """
sql_engine.read_sql(tj_rows_sql)

In [None]:
# Show the column layout of the trajectory table.
sql_engine.read_sql(query=f"desc {tj_table}")

In [3]:
# Query parameters for the trajectory counts: country filter and the
# event_date bounds (YYYYMMDD integers).
# Done: IN, ID, MX,
country_code, start_date, end_date = 'CO', 20190101, 20191231

In [None]:
# Daily row counts and distinct cuebiq_id counts from the trajectory table,
# for rows whose start_country and end_country both equal `country_code` and
# whose event_date lies in [start_date, end_date].
#
# The source table name is rebuilt from `schema_name` instead of being read
# from `tj_table`: this cell rebinds `tj_table` to the resulting DataFrame
# (the CSV export cell reads it under that name), so reading the variable it
# overwrites would interpolate a DataFrame into the SQL on re-run.
tj_source = f"{schema_name['cda']}.trajectory"

start_time = time.time()
# Print a wall-clock timestamp; the raw epoch float previously shown here was
# labelled "seconds" and was not meaningful to a reader.
print(f"Query started at: {datetime.fromtimestamp(start_time):%Y-%m-%d %H:%M:%S}")

tj_table = sql_engine.read_sql(
    f"""
    SELECT 
        event_date AS event_date,
        count(*) AS Observations, 
        count(DISTINCT cuebiq_id) AS Unique_Users
    
    FROM {tj_source}
    WHERE 
        start_country = '{country_code}'
        AND end_country = '{country_code}'
        AND event_date BETWEEN {start_date} AND {end_date}
    GROUP BY 
        event_date
    ORDER BY 
        event_date ASC
    """
)

end_time = time.time()
execution_time = end_time - start_time

print(f"Query executed in: {execution_time:.2f} seconds")


tj_table

In [None]:
# Write the daily trajectory counts for this country to CSV, without the row index.
tj_csv_path = f'/home/jovyan/Data/2019_{country_code}_tj.csv'
tj_table.to_csv(tj_csv_path, index=False)

# pe_dl

In [4]:
# Show the column layout of the uplevelled device-location table.
sql_engine.read_sql(query=f"desc {pe_dl_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,event_timestamp,bigint,,
1,cuebiq_id,bigint,,
2,device_type_code,varchar(5),,
3,os_name,varchar,,
4,lat,double,,
5,lng,double,,
6,accuracy_meters,double,,
7,device_manufacturer_name,varchar,,
8,device_model_code,varchar,,
9,carrier_code,varchar,,


## Group

In [None]:
# Country filter for the grouped device-location query below.
country_code = "CO"

In [None]:
# Daily row counts and distinct cuebiq_id counts from the uplevelled
# device-location table for `country_code`, grouped by a date derived from
# event_zoned_datetime and limited to 2019-01-01 .. 2019-12-31.
#
# NOTE(review): the date is computed by ADDING timezoneoffset_secs to the first
# 19 characters of event_zoned_datetime. If that string holds local wall-clock
# time, converting to UTC would need a subtraction instead; confirm against
# the table documentation.
#
# An earlier variant of this query used to sit here commented out. It differed
# by lacking the try() wrapper and the length(...) >= 19 guard, and by using
# 20181201 as the lower processing_date bound.
start_time = time.time()  # Start the timer

pe_dl_daily_sql = f"""
    SELECT 
        event_date_utc,
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    FROM (
        SELECT 
            date(
                try(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezoneoffset_secs)
            ) AS event_date_utc,
            cuebiq_id
        FROM {pe_dl_table}
        WHERE 
            country_code = '{country_code}'
            AND processing_date BETWEEN 20181210 AND 20200131
            AND event_zoned_datetime IS NOT NULL
            AND length(event_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_date_utc IS NOT NULL
        AND event_date_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    GROUP BY event_date_utc
    ORDER BY event_date_utc
    """
result = sql_engine.read_sql(pe_dl_daily_sql)

end_time = time.time()  # Stop the timer
execution_time = end_time - start_time  # Total time taken
print(f"Query executed in: {execution_time:.2f} seconds")

result

In [None]:
# Write the daily pe_dl counts for this country to CSV, without the row index.
pe_dl_csv_path = f'/home/jovyan/Data/2019_{country_code}_pe_dl.csv'
result.to_csv(pe_dl_csv_path, index=False)

## by individual

In [None]:
# Empty accumulator for per-day counts; a later cell concatenates onto it.
result_columns = ['date_str', 'Observations', 'Unique_Users']
result_df = pd.DataFrame(columns=result_columns)

In [None]:
# Row-level pull from the uplevelled device-location table for one country
# and a processing_date window, with three derived datetime columns.
country_code, start_date, end_date = 'CO', 20200101, 20200102

# NOTE(review): event_date_utc / event_datetime_utc ADD timezoneoffset_secs to
# the wall-clock part of event_zoned_datetime; confirm the sign is intended.
query = f"""
    SELECT 
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs,
        
        -- Extract only the date part
        date(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs) AS event_date_utc,
        
        -- Extract the date and time part and adjust by the timezone offset 
        date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs AS event_datetime_utc,
        
        date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') AS event_date  -- without consider timezone
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        -- AND processing_date = {start_date}
        AND processing_date BETWEEN {start_date} AND {end_date}
"""

pe_dl_table_df = sql_engine.read_sql(query=query)
# Alternative views used while exploring: .dtypes, or
# .sort_values(by='event_zoned_datetime').
pe_dl_table_df

In [None]:
# Row-level pull from the uplevelled device-location table for IN over a
# five-day processing_date window, with a derived event_date_utc column.
country_code, start_date, end_date = 'IN', 20190101, 20190105

# NOTE(review): event_date_utc ADDS timezoneoffset_secs to the wall-clock part
# of event_zoned_datetime; confirm the sign is intended.
query = f"""
    SELECT 
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs,
        -- Extract only the date part
        date(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs) AS event_date_utc
        -- -- Extract the date and time part and adjust by the timezone offset 
        -- date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        -- interval '1' second * timezoneoffset_secs AS event_datetime_utc
        
        -- date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') AS event_date # without consider timezone
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        -- AND processing_date = {start_date}
        AND processing_date BETWEEN {start_date} AND {end_date}

"""

pe_dl_table_df = sql_engine.read_sql(query=query)
# Display the rows ordered by the raw zoned datetime string.
pe_dl_table_df.sort_values('event_zoned_datetime')

In [None]:
# Convert event_date_utc to datetime
pe_dl_table_df['event_date_utc'] = pd.to_datetime(pe_dl_table_df['event_date_utc'], errors='coerce')

# Define the date to compare
compare_date = pd.to_datetime('2019-12-31')

# Filter rows where event_date_utc is equal to the compare_date
selected = pe_dl_table_df[pe_dl_table_df['event_date_utc'] == compare_date]

# Display the filtered DataFrame
selected

In [None]:
# Parse event_zoned_datetime into timezone-aware UTC timestamps; values that
# cannot be parsed become NaT.
zoned_as_utc = pd.to_datetime(
    pe_dl_table_df['event_zoned_datetime'],
    utc=True,
    errors='coerce',
)
pe_dl_table_df['event_zoned_datetime'] = zoned_as_utc
pe_dl_table_df

In [None]:
# Convert event_zoned_datetime to datetime format
pe_dl_table_df['date_str'] = pe_dl_table_df['event_zoned_datetime'].dt.strftime('%Y%m%d')

# pe_dl_table_df
pe_dl_table_df.sort_values(by='date_str')

In [None]:
# Group by date_str, calculate observations and unique users
grouped_data = pe_dl_table_df.groupby('date_str').agg({
    'event_zoned_datetime': 'count',  # Count all rows (observations)
    'cuebiq_id': 'nunique'            # Count unique cuebiq_id (unique users)
})

# Rename columns (optional)
grouped_data.columns = ['Observations', 'Unique_Users']
grouped_data = grouped_data.reset_index()

grouped_data

In [None]:
# Append the per-day counts that fall inside [start_date, end_date] to the
# running `result_df` accumulator.
#
# The numeric key is built on a copy (via assign) so `grouped_data` itself is
# not mutated by this cell.
daily_counts = grouped_data.assign(date_str=pd.to_numeric(grouped_data['date_str']))
in_window = daily_counts['date_str'].between(start_date, end_date)  # inclusive on both ends
selected_df = daily_counts[in_window]

# pandas has deprecated concatenating with empty frames (the resulting dtypes
# will change in a future version), so empty inputs are skipped explicitly.
non_empty_frames = [frame for frame in (result_df, selected_df) if not frame.empty]
if non_empty_frames:
    result_df = pd.concat(non_empty_frames, ignore_index=True)

result_df

In [None]:
# Write the accumulated per-day counts to CSV, without the row index.
result_csv_path = f'/home/jovyan/Data/2019_{country_code}_pe_dl_12_3.csv'
result_df.to_csv(result_csv_path, index=False)

In [None]:
# Show the column layout of the uplevelled device-location table.
sql_engine.read_sql(query=f"desc {pe_dl_table}")

In [None]:
# Row-level pull (coordinates, timestamps, ids) from the uplevelled
# device-location table for MX on a single processing_date.
# `end_date` is still interpolated into the SQL, but only inside a SQL
# comment, so it does not affect the filter.
country_code, start_date, end_date = 'MX', 20190101, 20190105

query = f"""
    SELECT 
        lat,
        lng,
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        AND processing_date = {start_date}
        -- AND processing_date BETWEEN {start_date} AND {end_date}
"""

pe_dl_table_df = sql_engine.read_sql(query=query)
pe_dl_table_df

# pe_stop

In [None]:
# Show the column layout of the uplevelled stop table.
sql_engine.read_sql(query=f"desc {pe_stop_table}")

In [None]:
# Five rows from the uplevelled stop table for MX on processing_date 20190105.
stop_sample_sql = f"""
    select *
    from {pe_stop_table}
    where 
        country_code = 'MX'
        and processing_date = 20190105
    limit 5
    """
df_points = sql_engine.read_sql(stop_sample_sql)
df_points

In [None]:
# Daily row counts and distinct cuebiq_id counts from the uplevelled stop
# table for `country_code`, grouped by a date derived from stop_zoned_datetime
# and limited to 2019-01-01 .. 2019-12-31.
#
# NOTE(review): the date is computed by ADDING timezone_offset_seconds to the
# first 19 characters of stop_zoned_datetime; confirm the sign is intended.
country_code = 'ID'

start_time = time.time()

pe_stop_daily_sql = f"""
    SELECT 
        event_date_utc,
        count(*) AS Observations,
        count(DISTINCT cuebiq_id) AS Unique_Users
    FROM (
        SELECT 
            date(
                try(date_parse(substr(stop_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezone_offset_seconds)
            ) AS event_date_utc,
            cuebiq_id
        FROM {pe_stop_table}
        WHERE 
            country_code = '{country_code}'
            AND stop_zoned_datetime IS NOT NULL
            AND processing_date BETWEEN 20181201 AND 20200131
            AND length(stop_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_date_utc IS NOT NULL
        AND event_date_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    GROUP BY event_date_utc
    ORDER BY event_date_utc
    """
result = sql_engine.read_sql(pe_stop_daily_sql)

end_time = time.time()
execution_time = end_time - start_time
print(f"Query executed in: {execution_time:.2f} seconds")

result


In [None]:
# Write the daily pe_stop counts for this country to CSV, without the row index.
pe_stop_csv_path = f'/home/jovyan/Data/2019_{country_code}_pe_stop.csv'
result.to_csv(pe_stop_csv_path, index=False)

# poi / poi_history - no data

In [4]:
# Schema prefix and the two POI table names built from it.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')

cda_schema = schema_name['cda']
poi_table = cda_schema + ".poi"
poi_his_table = cda_schema + ".poi_history"

In [5]:
# Show the column layout of the poi table.
sql_engine.read_sql(query=f"desc {poi_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,brand_id,bigint,,
1,geoset_id,integer,,
2,geoset_name,varchar,,
3,place_id,bigint,,
4,place_name,varchar,,
5,place_address_desc,varchar,,
6,zipcode_id,varchar,,
7,admin1_id,varchar,,
8,admin2_id,varchar,,
9,country_code,varchar,,


In [11]:
# Up to ten poi_history rows for MX.
query = f"""
    SELECT *
    FROM {poi_his_table}
    WHERE 
        country_code = 'MX'
    LIMIT 10
"""
df = sql_engine.read_sql(query=query)
df

Unnamed: 0,brand_id,place_id,geoset_id,geoset_name,dma_id,zipcode_id,country_code,admin1_id,admin2_id,geometry_wkt,...,open_hours,popular_times,min_dwell_time_minutes,max_dwell_time_minutes,radius_meters,valid_from,valid_from_timestamp,valid_to,valid_to_timestamp,distributor_flag


# Combine

In [None]:
# Load the three partial CO pe_dl exports that the next cell concatenates.
partial_csv_paths = (
    '/home/jovyan/Data/2019_CO_pe_dl_10.csv',
    '/home/jovyan/Data/2019_CO_pe_dl_11.csv',
    '/home/jovyan/Data/2019_CO_pe_dl_11_2.csv',
)
df_1, df_2, df_3 = map(pd.read_csv, partial_csv_paths)

In [None]:
# Stack the three partial exports in order and renumber the rows from 0.
partial_frames = (df_1, df_2, df_3)
combined_df = pd.concat(partial_frames, ignore_index=True)
combined_df

In [None]:
# Write the combined CO pe_dl counts to CSV, without the row index.
combined_csv_path = '/home/jovyan/Data/2019_CO_pe_dl_11_til2.csv'
combined_df.to_csv(combined_csv_path, index=False)