Country Code:  
India - IN; Indonesia - ID; Mexico - MX; Colombia - CO 

In [1]:
!pip install pydeck -q -q

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [3]:
pd.set_option("display.max_columns", None)

In [4]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}

# dl_table = f"{schema_name['cda']}.device_location"  
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

tj_table = f"{schema_name['cda']}.trajectory"     
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# stop_table = f"{schema['cda']}.stop" 
pe_stop_table = f"{schema_name['cda']}.stop_uplevelled"

visit_table = f"{schema_name['cda']}.visit " 


# Example

In [5]:
# Documentaion
sql_engine.read_sql(f"desc {pe_dl_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,event_timestamp,bigint,,
1,cuebiq_id,bigint,,
2,device_type_code,varchar(5),,
3,os_name,varchar,,
4,lat,double,,
5,lng,double,,
6,accuracy_meters,double,,
7,device_manufacturer_name,varchar,,
8,device_model_code,varchar,,
9,carrier_code,varchar,,


In [None]:
# Check One Day Data
sql_engine.read_sql(
    f"""
    select * 
    from {pe_dl_table}
    where 
        country_code = 'CO'
        and processing_date = 20220201
        and provider_id = '700199'
    limit 5
    """
)

In [None]:
# Get provider_id
sql_engine.read_sql(
    f"""
    select 
        distinct provider_id
    from {pe_dl_table}
    where 
        country_code = 'ID'
        and processing_date BETWEEN 20190101 AND 20191231
    """
)

# MX & CO & ID & IN: 700199

# Column

In [None]:
from datetime import datetime
import pytz

# df['event_zoned_datetime'][0]
datetime.fromisoformat(df['event_zoned_datetime'][0]).timestamp()

In [None]:
sql_engine.read_sql(
    f"""
    select
        array_distinct(array_agg(classification_type)) classification_type,
        array_distinct(array_agg(transformation_type)) transformation_type
    from {pe_dl_table}
    where 
        country_code = 'MX'
        and processing_date = 20190101
        and provider_id = '700199'
        and cuebiq_id % 1000 = 0
    group by 
        country_code
    """
)

# the privacy enhancement

# Overall yearly 

In [None]:
# Check single data
pe_dl_df = sql_engine.read_sql(
    f"""
    select *
    from {pe_dl_table}
    where 
        country_code = 'CO'
        and processing_date = 20190101

    """
)
# and cuebiq_id % 1000 = 0

pe_dl_df

## pe_dl_table

In [None]:
sql_engine.read_sql(f"desc {pe_dl_table}")

In [None]:
pe_dl_df = sql_engine.read_sql(
    f"""
    select *
    from {pe_stop_table}
    where 
        country_code = 'MX'
        and processing_date = 20190101

    """
)

pe_dl_df

In [13]:
country_code = 'CO'

In [14]:
start_time = time.time() # Start the timer

result = sql_engine.read_sql(
    f"""
    SELECT 
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    FROM (
        SELECT 
            try(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezoneoffset_secs) AS event_datetime_utc,
            cuebiq_id
        FROM {pe_dl_table}
        WHERE 
            country_code = '{country_code}'
            AND event_zoned_datetime IS NOT NULL
            AND processing_date BETWEEN 20181201 AND 20200131
            AND length(event_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_datetime_utc IS NOT NULL
        AND event_datetime_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    """
)

end_time = time.time() # Stop the timer
execution_time = end_time - start_time # Calculate the total time taken
print(f"Query executed in: {execution_time:.2f} seconds")

result

Query executed in: 26.09 seconds


Unnamed: 0,Total_Observations,Total_Unique_Users
0,152150003,521012


## tj_table

In [6]:
sql_engine.read_sql(f"desc {tj_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,cuebiq_id,bigint,,
1,device_type_code,varchar(5),,
2,os_name,varchar,,
3,start_zoned_datetime,varchar,,
4,start_geohash,varchar,,
5,start_lat,double,,
6,start_lng,double,,
7,start_country,varchar,,
8,start_admin1,varchar,,
9,start_admin2,varchar,,


In [7]:
country_code = 'CO'

start_time = time.time() # Start the timer

result = sql_engine.read_sql(
    f"""
    SELECT 
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    
    FROM {tj_table}
    WHERE 
        start_country = '{country_code}'
        AND end_country = '{country_code}'
        AND event_date BETWEEN 20190101 AND 20191231
    """)

end_time = time.time() # Stop the timer

execution_time = end_time - start_time # Calculate the total time taken

# Extract the total observations and unique users from the result
total_observations = result.iloc[0]['Total_Observations']
total_unique_users = result.iloc[0]['Total_Unique_Users']

# Print the results
print(f"Total Observations: {total_observations}")
print(f"Total Unique Users: {total_unique_users}")

# Print the execution time
print(f"Query executed in: {execution_time:.2f} seconds")

Total Observations: 6483701
Total Unique Users: 167252
Query executed in: 557.61 seconds


## pe_stop_table

In [8]:
sql_engine.read_sql(f"desc {pe_stop_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,cuebiq_id,bigint,,
1,device_type_code,varchar(5),,
2,os_name,varchar,,
3,lat,double,,
4,lng,double,,
5,geohash_id,varchar,,
6,cluster_size,integer,,
7,avg_distance_meters,double,,
8,avg_accuracy_meters,double,,
9,std_accuracy_meters,double,,


In [None]:
pe_dl_df = sql_engine.read_sql(
    f"""
    select *
    from {pe_stop_table}
    where 
        country_code = 'MX'
        and processing_date = 20190101

    """
)

pe_dl_df

In [10]:
country_code = 'MX'

In [11]:
start_time = time.time() # Start the timer

result = sql_engine.read_sql(
    f"""
    SELECT 
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    FROM (
        SELECT 
            try(date_parse(substr(stop_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezone_offset_seconds) AS event_datetime_utc,
            cuebiq_id
        FROM {pe_stop_table}
        WHERE 
            country_code = '{country_code}'
            AND stop_zoned_datetime IS NOT NULL
            AND processing_date BETWEEN 20181201 AND 20200131
            AND length(stop_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_datetime_utc IS NOT NULL
        AND event_datetime_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    """
)

end_time = time.time() # Stop the timer
execution_time = end_time - start_time # Calculate the total time taken
print(f"Query executed in: {execution_time:.2f} seconds")

result


Query executed in: 33.50 seconds


Unnamed: 0,Total_Observations,Total_Unique_Users
0,292956571,3437467


## visit_table

In [None]:
sql_engine.read_sql(f"desc {visit_table}")

In [None]:
df = sql_engine.read_sql(
    f"""
    select * 
    from {visit_table}
    where 
        country_code = 'IN'
    """
)
df.to_csv(f'/home/jovyan/Data/IN_visit.csv', index=False) 

In [None]:
country_code = 'CO'

start_time = time.time() # Start the timer

result = sql_engine.read_sql(
    f"""
    SELECT 
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    
    FROM {visit_table}
    WHERE 
        country_code = '{country_code}'
        AND event_date BETWEEN 20230101 AND 20231231
    """)

end_time = time.time()  # Stop the timer
execution_time = end_time - start_time  # Calculate the total time taken

# Extract the total observations and unique users from the result
total_observations = result.iloc[0]['Total_Observations']
total_unique_users = result.iloc[0]['Total_Unique_Users']

# Print the results
print(f"Total Observations: {total_observations}")
print(f"Total Unique Users: {total_unique_users}")

# Print the execution time
print(f"Query executed in: {execution_time:.2f} seconds")