In [1]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import time
import pandas as pd
from tqdm import tqdm
from datetime import datetime

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [2]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}

dl_table = f"{schema_name['cda']}.device_location"  
pe_dl_table = f"{schema_name['cda']}.device_location_uplevelled"

tj_table = f"{schema_name['cda']}.trajectory"     
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

# stop_table = f"{schema_name['cda']}.trajectory"    ?    
pe_stop_table = f"{schema_name['cda']}.stop_uplevelled"

visit_table = f"{schema_name['cda']}.visit " 


## count per day 
### pe_tj

In [11]:
country_code = 'ID'
start_date = 20190101
end_date = 20191231

In [4]:
sql_engine.read_sql(f"desc {pe_tj_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,cuebiq_id,bigint,,
1,device_type_code,varchar(5),,
2,os_name,varchar,,
3,start_zoned_datetime,varchar,,
4,start_geohash,varchar,,
5,start_lat,double,,
6,start_lng,double,,
7,start_country,varchar,,
8,start_admin1,varchar,,
9,start_admin2,varchar,,


In [7]:
start_time = time.time()

pe_tj_table = sql_engine.read_sql(
    f"""
    SELECT 
        event_date AS event_date,
        count(*) AS Observations, 
        count(DISTINCT cuebiq_id) AS Unique_Users
    
    FROM {pe_tj_table}
    WHERE 
        start_country = '{country_code}'
        AND end_country = '{country_code}'
        AND event_date BETWEEN {start_date} AND {end_date}
    GROUP BY 
        event_date
    ORDER BY 
        event_date ASC
    """
)

end_time = time.time()
execution_time = end_time - start_time
print(f"Query executed in: {execution_time:.2f} seconds")


pe_tj_table

Query executed from: 1715461049.30 seconds
Query executed in: 806.24 seconds


Unnamed: 0,event_date,Observations,Unique_Users
0,20190101,194621,65500
1,20190102,200840,66767
2,20190103,205471,68437
3,20190104,216970,70622
4,20190105,211913,68869
...,...,...,...
360,20191227,169349,56039
361,20191228,157960,52569
362,20191229,135968,47262
363,20191230,159653,53438


In [None]:
pe_tj_table.to_csv(f'/home/jovyan/Data/2019_{country_code}_pe_tj.csv', index=False) 

In [6]:
sql_engine.read_sql(
    f"""
    SELECT *
    FROM {pe_tj_table}
    WHERE 
        start_country = 'CO'
        AND end_country = 'CO'
        AND event_date = 20191201
    ORDER BY 
        cuebiq_id ASC
    """
)

Unnamed: 0,cuebiq_id,device_type_code,os_name,start_zoned_datetime,start_geohash,start_lat,start_lng,start_country,start_admin1,start_admin2,...,speed_gps_ms_min,speed_gps_ms_avg,speed_gps_ms_max,max_time_gap_seconds,length_meters,trajectory_wkt,start_classification_type,end_classification_type,provider_id,event_date
0,1553603708,GAID,ANDROID,2019-12-01T11:20:55-05:00,d3k02r8vc,7.117413,-73.112924,CO,CO.0021,CO.0021.0067,...,,,,1012,932.467763,"LINESTRING (-73.11292 7.11741, -73.1126 7.1177...",OTHER,OTHER,700199,20191201
1,1553603708,GAID,ANDROID,2019-12-01T12:53:25-05:00,d3k02p7tk,7.115938,-73.120017,CO,CO.0021,CO.0021.0067,...,,,,1313,24291.296855,"LINESTRING (-73.12002 7.11594, -73.12024 7.117...",OTHER,OTHER,700199,20191201
2,1554305121,GAID,ANDROID,2019-12-01T08:15:34-05:00,d2g6987bg,4.659726,-74.108473,CO,CO.0030,CO.0030.0001,...,,,,493,5808.983115,"LINESTRING (-74.10847 4.65973, -74.1089 4.6590...",OTHER,OTHER,700199,20191201
3,1554305121,GAID,ANDROID,2019-12-01T08:41:47-05:00,d2g68zk55,4.698574,-74.141107,CO,CO.0030,CO.0030.0001,...,,,,244,362.390361,"LINESTRING (-74.14111 4.69857, -74.14206 4.699...",OTHER,OTHER,700199,20191201
4,1557343708,GAID,ANDROID,2019-12-01T11:01:29-05:00,d3f70sgmz,10.398204,-75.559171,CO,CO.0014,CO.0014.0019,...,,,,7437,228.717454,"LINESTRING (-75.55917 10.3982, -75.55915 10.40...",OTHER,OTHER,700199,20191201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47087,2805892188,GAID,ANDROID,2019-12-01T17:39:26-05:00,d3fy40fdn,10.902938,-74.791201,CO,CO.0024,CO.0024.0015,...,0.0,0.0,0.0,6487,4719.406161,"LINESTRING (-74.7912 10.90294, -74.79101 10.90...",OTHER,OTHER,700199,20191201
47088,2805895034,GAID,ANDROID,2019-12-01T20:11:46-05:00,d2g68scjm,4.685229,-74.156104,CO,CO.0030,CO.0030.0001,...,,,,8512,7377.358462,"LINESTRING (-74.1561 4.68523, -74.20266 4.63791)",OTHER,OTHER,700199,20191201
47089,2805905610,GAID,ANDROID,2019-12-01T22:08:17-05:00,d34700dmr,6.156031,-75.582526,CO,CO.0013,CO.0013.0052,...,0.0,0.0,0.0,141,1901.327070,"LINESTRING (-75.58253 6.15603, -75.58175 6.159...",OTHER,OTHER,700199,20191201
47090,2805905610,GAID,ANDROID,2019-12-01T22:03:53-05:00,d34700d2q,6.155166,-75.582586,CO,CO.0013,CO.0013.0052,...,,,,134,96.490408,"LINESTRING (-75.58259 6.15517, -75.58253 6.15603)",OTHER,OTHER,700199,20191201


### tj

In [7]:
sql_engine.read_sql(
    f"""
    SELECT *
    FROM {tj_table}
    WHERE 
        start_country = 'CO'
        AND end_country = 'CO'
        AND event_date = 20191201
    ORDER BY 
        cuebiq_id ASC
    """
)

Unnamed: 0,cuebiq_id,device_type_code,os_name,start_zoned_datetime,start_geohash,start_lat,start_lng,start_country,start_admin1,start_admin2,...,speed_kinematic_ms_avg,speed_kinematic_ms_max,speed_gps_ms_min,speed_gps_ms_avg,speed_gps_ms_max,max_time_gap_seconds,length_meters,trajectory_wkt,provider_id,event_date
0,1553603708,GAID,ANDROID,2019-12-01T11:20:55-05:00,d3k02r8vc,7.117413,-73.112924,CO,CO.0021,CO.0021.0067,...,0.610760,1.165369,,,,1012,932.467763,"LINESTRING (-73.11292 7.11741, -73.1126 7.1177...",700199,20191201
1,1553603708,GAID,ANDROID,2019-12-01T12:53:25-05:00,d3k02p7tk,7.115938,-73.120017,CO,CO.0021,CO.0021.0067,...,3.659197,13.996349,,,,1313,24291.296855,"LINESTRING (-73.12002 7.11594, -73.12024 7.117...",700199,20191201
2,1554305121,GAID,ANDROID,2019-12-01T08:15:34-05:00,d2g6987bg,4.659726,-74.108473,CO,CO.0030,CO.0030.0001,...,4.406152,8.528048,,,,493,5808.983115,"LINESTRING (-74.10847 4.65973, -74.1089 4.6590...",700199,20191201
3,1554305121,GAID,ANDROID,2019-12-01T08:41:47-05:00,d2g68zk55,4.698574,-74.141107,CO,CO.0030,CO.0030.0001,...,0.746068,0.845489,,,,244,362.390361,"LINESTRING (-74.14111 4.69857, -74.14206 4.699...",700199,20191201
4,1557343708,GAID,ANDROID,2019-12-01T05:29:50-05:00,d3f70sgmr,10.398110,-75.559186,CO,CO.0014,CO.0014.0019,...,0.049577,0.112233,,,,2140,314.732154,"LINESTRING (-75.55919 10.39811, -75.5593 10.39...",700199,20191201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47087,2805892188,GAID,ANDROID,2019-12-01T17:39:26-05:00,d3fy40fdn,10.902938,-74.791201,CO,CO.0024,CO.0024.0015,...,1.042492,2.996952,0.0,0.0,0.0,6487,4719.406161,"LINESTRING (-74.7912 10.90294, -74.79101 10.90...",700199,20191201
47088,2805895034,GAID,ANDROID,2019-12-01T20:11:46-05:00,d2g68scjm,4.685229,-74.156104,CO,CO.0030,CO.0030.0001,...,0.866701,0.866701,,,,8512,7377.358462,"LINESTRING (-74.1561 4.68523, -74.20266 4.63791)",700199,20191201
47089,2805905610,GAID,ANDROID,2019-12-01T22:03:53-05:00,d34700d2q,6.155166,-75.582586,CO,CO.0013,CO.0013.0052,...,0.720078,0.720078,,,,134,96.490408,"LINESTRING (-75.58259 6.15517, -75.58253 6.15603)",700199,20191201
47090,2805905610,GAID,ANDROID,2019-12-01T22:08:17-05:00,d34700dmr,6.156031,-75.582526,CO,CO.0013,CO.0013.0052,...,2.450278,4.381821,0.0,0.0,0.0,141,1901.327070,"LINESTRING (-75.58253 6.15603, -75.58175 6.159...",700199,20191201


In [5]:
sql_engine.read_sql(f"desc {tj_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,cuebiq_id,bigint,,
1,device_type_code,varchar(5),,
2,os_name,varchar,,
3,start_zoned_datetime,varchar,,
4,start_geohash,varchar,,
5,start_lat,double,,
6,start_lng,double,,
7,start_country,varchar,,
8,start_admin1,varchar,,
9,start_admin2,varchar,,


In [3]:
country_code = 'CO'
# Done: IN, ID, MX,
start_date = 20190101
end_date = 20191231

In [4]:
start_time = time.time()
print(f"Query executed from: {start_time:.2f} seconds")

tj_table = sql_engine.read_sql(
    f"""
    SELECT 
        event_date AS event_date,
        count(*) AS Observations, 
        count(DISTINCT cuebiq_id) AS Unique_Users
    
    FROM {tj_table}
    WHERE 
        start_country = '{country_code}'
        AND end_country = '{country_code}'
        AND event_date BETWEEN {start_date} AND {end_date}
    GROUP BY 
        event_date
    ORDER BY 
        event_date ASC
    """
)

end_time = time.time()
execution_time = end_time - start_time

print(f"Query executed in: {execution_time:.2f} seconds")


tj_table

Query executed from: 1715623664.76 seconds
Query executed in: 486.34 seconds


Unnamed: 0,event_date,Observations,Unique_Users
0,20191022,51190,26748
1,20191023,168894,45682
2,20191024,165601,44668
3,20191025,172919,44883
4,20191026,150994,40389
...,...,...,...
66,20191227,65933,18190
67,20191228,60694,17141
68,20191229,49744,15255
69,20191230,60589,16945


In [5]:
tj_table.to_csv(f'/home/jovyan/Data/2019_{country_code}_tj.csv', index=False) 

In [20]:
# Only need if not sort within SQL

# tj_table_sorted = tj_table.sort_values(by='event_date', ascending=True)
# tj_table_sorted.reset_index(drop=True, inplace=True)
# tj_table_sorted

Unnamed: 0,event_date,Observations,Unique_Users
0,20190101,321993,109571
1,20190102,420810,124898
2,20190103,431074,125826
3,20190104,453939,129807
4,20190105,442477,127782
...,...,...,...
360,20191227,353510,103802
361,20191228,337890,99857
362,20191229,283928,89615
363,20191230,344246,101149


### pe_dl

In [4]:
sql_engine.read_sql(f"desc {pe_dl_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,event_timestamp,bigint,,
1,cuebiq_id,bigint,,
2,device_type_code,varchar(5),,
3,os_name,varchar,,
4,lat,double,,
5,lng,double,,
6,accuracy_meters,double,,
7,device_manufacturer_name,varchar,,
8,device_model_code,varchar,,
9,carrier_code,varchar,,


#### Test group

In [4]:
country_code = 'CO'

In [None]:
start_time = time.time() # Start the timer

result = sql_engine.read_sql(
    f"""
    SELECT 
        event_date_utc,
        count(*) AS Total_Observations,
        count(DISTINCT cuebiq_id) AS Total_Unique_Users
    FROM (
        SELECT 
            date(
                try(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezoneoffset_secs)
            ) AS event_date_utc,
            cuebiq_id
        FROM {pe_dl_table}
        WHERE 
            country_code = '{country_code}'
            AND processing_date BETWEEN 20181210 AND 20200131
            AND event_zoned_datetime IS NOT NULL
            AND length(event_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_date_utc IS NOT NULL
        AND event_date_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    GROUP BY event_date_utc
    ORDER BY event_date_utc
    """
)


# result = sql_engine.read_sql(
#     f"""
#     SELECT 
#         event_date_utc,
#         count(*) AS Total_Observations,
#         count(DISTINCT cuebiq_id) AS Total_Unique_Users
#     FROM (
#         SELECT 
#             date(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
#             interval '1' second * timezoneoffset_secs) AS event_date_utc,
#             cuebiq_id
#         FROM {pe_dl_table}
#         WHERE 
#             country_code = '{country_code}'
#             AND event_zoned_datetime IS NOT NULL
#             AND processing_date BETWEEN 20181201 AND 20200131
#     ) AS subquery
#     WHERE 
#         event_date_utc IS NOT NULL
#         AND event_date_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
#     GROUP BY event_date_utc
#     ORDER BY event_date_utc
#     """
# )


end_time = time.time() # Stop the timer
execution_time = end_time - start_time # Calculate the total time taken
print(f"Query executed in: {execution_time:.2f} seconds")

result

Query executed in: 37.34 seconds


Unnamed: 0,event_date_utc,Total_Observations,Total_Unique_Users
0,2019-01-01,38224,21753
1,2019-01-02,51911,29777
2,2019-01-03,52603,30268
3,2019-01-04,56426,32429
4,2019-01-05,48450,27640
...,...,...,...
109,2019-12-27,1814739,54480
110,2019-12-28,1725838,52297
111,2019-12-29,1522364,49173
112,2019-12-30,1703218,50889


In [None]:
result.to_csv(f'/home/jovyan/Data/2019_{country_code}_pe_dl.csv', index=False)  

#### by individual - 失败了 （in case 有用）

In [None]:
result_df = pd.DataFrame(columns=['date_str', 'Observations', 'Unique_Users'])

In [7]:
country_code = 'CO'
start_date = 20200101
end_date = 20200102

# Read data from SQL table
query = f"""
    SELECT 
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs,
        
        -- Extract only the date part
        date(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs) AS event_date_utc,
        
        -- Extract the date and time part and adjust by the timezone offset 
        date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs AS event_datetime_utc,
        
        date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') AS event_date  -- without consider timezone
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        -- AND processing_date = {start_date}
        AND processing_date BETWEEN {start_date} AND {end_date}
"""        

pe_dl_table_df = sql_engine.read_sql(query)
pe_dl_table_df
# pe_dl_table_df.dtypes
# pe_dl_table_df.sort_values(by='event_zoned_datetime')

Unnamed: 0,event_zoned_datetime,cuebiq_id,processing_date,timezoneoffset_secs,event_date_utc,event_datetime_utc,event_date
0,2020-01-02T17:14:54-05:00,2944944722,20200102,-18000,2020-01-02,2020-01-02 12:14:54,2020-01-02 17:14:54
1,2020-01-02T17:10:49-05:00,2944944722,20200102,-18000,2020-01-02,2020-01-02 12:10:49,2020-01-02 17:10:49
2,2020-01-02T17:08:48-05:00,2944944722,20200102,-18000,2020-01-02,2020-01-02 12:08:48,2020-01-02 17:08:48
3,2020-01-02T07:58:55-05:00,2944944722,20200102,-18000,2020-01-02,2020-01-02 02:58:55,2020-01-02 07:58:55
4,2020-01-02T12:31:48-05:00,2944944722,20200102,-18000,2020-01-02,2020-01-02 07:31:48,2020-01-02 12:31:48
...,...,...,...,...,...,...,...
6772466,2020-01-01T15:29:21-05:00,2421440357,20200101,-18000,2020-01-01,2020-01-01 10:29:21,2020-01-01 15:29:21
6772467,2020-01-01T15:29:21-05:00,2421440357,20200101,-18000,2020-01-01,2020-01-01 10:29:21,2020-01-01 15:29:21
6772468,2020-01-01T16:12:35-05:00,1731215661,20200101,-18000,2020-01-01,2020-01-01 11:12:35,2020-01-01 16:12:35
6772469,2020-01-01T12:49:00-05:00,2380344327,20200101,-18000,2020-01-01,2020-01-01 07:49:00,2020-01-01 12:49:00


In [None]:
country_code = 'IN'
start_date = 20190101
end_date = 20190105

query = f"""
    SELECT 
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs,
        -- Extract only the date part
        date(date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        interval '1' second * timezoneoffset_secs) AS event_date_utc
        -- -- Extract the date and time part and adjust by the timezone offset 
        -- date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
        -- interval '1' second * timezoneoffset_secs AS event_datetime_utc
        
        -- date_parse(substr(event_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') AS event_date # without consider timezone
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        -- AND processing_date = {start_date}
        AND processing_date BETWEEN {start_date} AND {end_date}

"""

pe_dl_table_df = sql_engine.read_sql(query)
# pe_dl_table_df
# pe_dl_table_df.dtypes
pe_dl_table_df.sort_values(by='event_zoned_datetime')

In [14]:
# Convert event_date_utc to datetime
pe_dl_table_df['event_date_utc'] = pd.to_datetime(pe_dl_table_df['event_date_utc'], errors='coerce')

# Define the date to compare
compare_date = pd.to_datetime('2019-12-31')

# Filter rows where event_date_utc is equal to the compare_date
selected = pe_dl_table_df[pe_dl_table_df['event_date_utc'] == compare_date]

# Display the filtered DataFrame
selected

Unnamed: 0,event_zoned_datetime,cuebiq_id,processing_date,timezoneoffset_secs,event_date_utc
0,2019-12-31T09:38:17-05:00,2936976869,20191231,-18000,2019-12-31
1,2019-12-31T09:29:57-05:00,2936976869,20191231,-18000,2019-12-31
2,2019-12-31T09:33:57-05:00,2936976869,20191231,-18000,2019-12-31
3,2019-12-31T09:35:57-05:00,2936976869,20191231,-18000,2019-12-31
4,2019-12-31T09:31:57-05:00,2936976869,20191231,-18000,2019-12-31
...,...,...,...,...,...
1741480,2019-12-31T07:52:02-05:00,2405999467,20191231,-18000,2019-12-31
1741481,2019-12-31T15:13:11-05:00,2165974266,20191231,-18000,2019-12-31
1741484,2019-12-31T18:46:00-05:00,1859781444,20191231,-18000,2019-12-31
1741485,2019-12-31T12:44:56-05:00,2392314573,20191231,-18000,2019-12-31


In [6]:
pe_dl_table_df['event_zoned_datetime'] = pd.to_datetime(pe_dl_table_df['event_zoned_datetime'], utc=True, errors='coerce')
pe_dl_table_df

KeyboardInterrupt: 

In [None]:
# Convert event_zoned_datetime to datetime format
pe_dl_table_df['date_str'] = pe_dl_table_df['event_zoned_datetime'].dt.strftime('%Y%m%d')

# pe_dl_table_df
pe_dl_table_df.sort_values(by='date_str')

In [7]:
# Group by date_str, calculate observations and unique users
grouped_data = pe_dl_table_df.groupby('date_str').agg({
    'event_zoned_datetime': 'count',  # Count all rows (observations)
    'cuebiq_id': 'nunique'            # Count unique cuebiq_id (unique users)
})

# Rename columns (optional)
grouped_data.columns = ['Observations', 'Unique_Users']
grouped_data = grouped_data.reset_index()

grouped_data

Unnamed: 0,date_str,Observations,Unique_Users
0,20011226,6576,1
1,20130121,2524,1
2,20190101,708,1
3,20191120,15,2
4,20191121,165,18
...,...,...,...
109,22011219,1,1
110,22011222,2,1
111,22011223,1,1
112,22011227,1,1


In [8]:
grouped_data.date_str = pd.to_numeric(grouped_data.date_str)
selected_df = grouped_data[(grouped_data.date_str >= start_date) & (grouped_data.date_str <= end_date)]
# selected_df
result_df = pd.concat([result_df, selected_df], ignore_index=True)

result_df

Unnamed: 0,date_str,Observations,Unique_Users
0,20191220,1805525,58014
1,20191221,1769515,56514
2,20191222,1642407,54728
3,20191223,1760908,56698
4,20191224,1785584,56071
5,20191225,1535532,50709
6,20191226,1663954,53620
7,20191227,1751878,53302
8,20191228,1698693,51591
9,20191229,1564260,48774


In [9]:
result_df.to_csv(f'/home/jovyan/Data/2019_{country_code}_pe_dl_12_3.csv', index=False) 

In [6]:
sql_engine.read_sql(f"desc {pe_dl_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,event_timestamp,bigint,,
1,cuebiq_id,bigint,,
2,device_type_code,varchar(5),,
3,os_name,varchar,,
4,lat,double,,
5,lng,double,,
6,accuracy_meters,double,,
7,device_manufacturer_name,varchar,,
8,device_model_code,varchar,,
9,carrier_code,varchar,,


In [None]:
country_code = 'MX'
start_date = 20190101
end_date = 20190105

query = f"""
    SELECT 
        lat,
        lng,
        event_zoned_datetime,
        cuebiq_id,  
        processing_date,
        timezoneoffset_secs
    FROM {pe_dl_table}
    WHERE 
        country_code = '{country_code}'
        AND event_zoned_datetime IS NOT NULL
        AND processing_date = {start_date}
        -- AND processing_date BETWEEN {start_date} AND {end_date}
"""

pe_dl_table_df = sql_engine.read_sql(query)
pe_dl_table_df

### pe_stop

In [3]:
sql_engine.read_sql(f"desc {pe_stop_table}")

Unnamed: 0,Column,Type,Extra,Comment
0,cuebiq_id,bigint,,
1,device_type_code,varchar(5),,
2,os_name,varchar,,
3,lat,double,,
4,lng,double,,
5,geohash_id,varchar,,
6,cluster_size,integer,,
7,avg_distance_meters,double,,
8,avg_accuracy_meters,double,,
9,std_accuracy_meters,double,,


In [6]:
df_points = sql_engine.read_sql(
    f"""
    select *
    from {pe_stop_table}
    where 
        country_code = 'MX'
        and processing_date = 20190105
    limit 5
    """
)
df_points

Unnamed: 0,cuebiq_id,device_type_code,os_name,lat,lng,geohash_id,cluster_size,avg_distance_meters,avg_accuracy_meters,std_accuracy_meters,...,stop_zoned_datetime,timezone_offset_seconds,admin1_id,admin2_id,classification_type,transformation_type,block_group_id,processing_date,country_code,provider_id
0,1708990686,GAID,ANDROID,18.507592,-88.29069,d531qchwt,1,0.0,9.0,0.0,...,2018-12-31T16:34:22-05:00,-18000,MX.0028,MX.0028.0007,OTHER,KEEP,,20190105,MX,700199
1,1708990686,GAID,ANDROID,18.507797,-88.290313,d531qchzw,1,1.500128e-09,14.0,0.0,...,2019-01-04T22:52:00-05:00,-18000,MX.0028,MX.0028.0007,OTHER,KEEP,,20190105,MX,700199
2,1708990390,GAID,ANDROID,17.548869,-99.489924,9g17b49ts,2,8.561209,27.5,3.535534,...,2019-01-04T16:12:20-06:00,-21600,MX.0016,MX.0016.0067,OTHER,KEEP,,20190105,MX,700199
3,1708990390,GAID,ANDROID,17.549158,-99.489831,9g17b49xq,6,19.87059,43.0,23.93324,...,2019-01-04T19:12:22-06:00,-21600,MX.0016,MX.0016.0067,OTHER,KEEP,,20190105,MX,700199
4,1708989457,GAID,ANDROID,19.162383,-98.382894,9g6j10rub,2,20.86179,37.0,7.071068,...,2019-01-05T02:32:05-06:00,-21600,MX.0010,MX.0010.0006,OTHER,KEEP,,20190105,MX,700199


In [14]:
country_code = 'ID'

start_time = time.time()

result = sql_engine.read_sql(
    f"""
    SELECT 
        event_date_utc,
        count(*) AS Observations,
        count(DISTINCT cuebiq_id) AS Unique_Users
    FROM (
        SELECT 
            date(
                try(date_parse(substr(stop_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s') +
                interval '1' second * timezone_offset_seconds)
            ) AS event_date_utc,
            cuebiq_id
        FROM {pe_stop_table}
        WHERE 
            country_code = '{country_code}'
            AND stop_zoned_datetime IS NOT NULL
            AND processing_date BETWEEN 20181201 AND 20200131
            AND length(stop_zoned_datetime) >= 19
    ) AS subquery
    WHERE 
        event_date_utc IS NOT NULL
        AND event_date_utc BETWEEN date '2019-01-01' AND date '2019-12-31'
    GROUP BY event_date_utc
    ORDER BY event_date_utc
    """
)

end_time = time.time()
execution_time = end_time - start_time
print(f"Query executed in: {execution_time:.2f} seconds")

result


Query executed in: 28.98 seconds


Unnamed: 0,event_date_utc,Observations,Unique_Users
0,2019-01-01,83029,35825
1,2019-01-02,126367,40432
2,2019-01-03,132134,41179
3,2019-01-04,136989,41607
4,2019-01-05,133429,41269
...,...,...,...
360,2019-12-27,154598,42950
361,2019-12-28,148956,42158
362,2019-12-29,145379,41299
363,2019-12-30,149709,42494


In [15]:
result.to_csv(f'/home/jovyan/Data/2019_{country_code}_pe_stop.csv', index=False)  

### count per year 

# Combine

In [3]:
df_1 = pd.read_csv('/home/jovyan/Data/2019_CO_pe_dl_10.csv')
df_2 = pd.read_csv('/home/jovyan/Data/2019_CO_pe_dl_11.csv')
df_3 = pd.read_csv('/home/jovyan/Data/2019_CO_pe_dl_11_2.csv')

In [4]:
combined_df = pd.concat([df_1, df_2, df_3], ignore_index=True)
combined_df

Unnamed: 0,date_str,Observations,Unique_Users
0,20190101,41209,23610
1,20190102,51834,29779
2,20190103,52601,30325
3,20190104,55854,32123
4,20190105,47762,27550
...,...,...,...
56,20191116,2549174,78708
57,20191117,2296789,75761
58,20191118,2520380,76597
59,20191119,2555878,75094


In [5]:
combined_df.to_csv(f'/home/jovyan/Data/2019_CO_pe_dl_11_til2.csv', index=False)  