In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
pip install python-geohash

Collecting python-geohash
  Using cached python_geohash-0.8.5-cp39-cp39-linux_x86_64.whl
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import time
import geohash
import logging
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

In [5]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [6]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [7]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Aggregate by weekday (Monday / Tuesday / ...)

In [20]:
country_code = 'CO'
event_date = '2020-02-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
formatted_current_date = event_date_dt.strftime('%Y%m%d')

lookahead_date = event_date_dt + timedelta(days=14)
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

In [21]:
pe_tj_df3 = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        duration_minutes,
        length_meters,
        number_of_points,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
        -- geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
        -- geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)

pe_tj_df3

Unnamed: 0,cuebiq_id,duration_minutes,length_meters,number_of_points,event_datetime_local,event_hour,start_geohash3,end_geohash3,local_date,day_of_week
0,2542421864,4.733333,887.332275,3,2019-11-01 16:38:58,16,d29,d29,20191101,Friday
1,2542421864,15.283333,885.803007,4,2019-11-01 17:04:42,17,d29,d29,20191101,Friday
2,2542094214,5.433333,267.882334,3,2019-11-01 21:20:58,21,d34,d34,20191101,Friday
3,2542094214,2.033333,123.426615,2,2019-11-01 21:31:41,21,d34,d34,20191101,Friday
4,2542094214,55.450000,152.322199,2,2019-11-01 21:36:03,21,d34,d34,20191101,Friday
...,...,...,...,...,...,...,...,...,...,...
1139309,2531233229,833.466667,4267.391503,12,2019-11-03 07:45:31,7,d2g,d2g,20191103,Sunday
1139310,1955753448,48.750000,1686.573642,12,2019-11-03 08:35:19,8,d2g,d2g,20191103,Sunday
1139311,1701335404,252.700000,30069.918737,10,2019-11-03 07:58:35,7,d29,d29,20191103,Sunday
1139312,2001169528,22.000000,280.950938,5,2019-11-03 06:23:03,6,d2g,d2g,20191103,Sunday


In [46]:
pe_tj_df3 = sql_engine.read_sql(
    f"""
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            duration_minutes,
            length_meters,
            number_of_points,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    )
    SELECT 
        day_of_week,
        start_geohash3,
        end_geohash3,
        COUNT(*) AS trip_count,
        COUNT(DISTINCT cuebiq_id) AS unique_cuebiq_ids,
        AVG(duration_minutes) AS m_duration_min,
        APPROX_PERCENTILE(duration_minutes, 0.5) AS mdn_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        AVG(length_meters) AS m_length_m,
        APPROX_PERCENTILE(length_meters, 0.5) AS mdn_length_m,
        STDDEV(length_meters) AS sd_length_m,
        AVG(number_of_points) AS m_points_no,
        APPROX_PERCENTILE(number_of_points, 0.5) AS mdn_points_no,
        STDDEV(number_of_points) AS sd_points_no
    FROM calculated_data
    GROUP BY day_of_week, start_geohash3, end_geohash3
    HAVING COUNT(*) > 9
    """
)

pe_tj_df3

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
0,Sunday,t9y,tdp,25,25,232.742667,189.716667,154.168553,120094.857637,95284.781449,67613.310013,11.080000,10,6.818602
1,Sunday,tg8,tgb,48,46,155.807639,65.450000,196.966896,87623.347963,14175.078218,340461.666107,11.958333,7,14.601018
2,Sunday,t9v,t9y,23,21,133.372464,84.133333,126.077162,54619.947743,31144.191658,60067.017894,11.000000,9,8.079154
3,Sunday,tdw,tdt,11,11,245.000000,218.200000,164.605948,61308.463633,60792.544241,45793.854842,8.818182,9,4.020403
4,Sunday,tf3,tf2,275,260,158.858364,96.189456,173.121444,32225.061234,14900.441948,61390.609431,9.821818,7,8.741684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2687,Saturday,tg5,tg7,10,10,190.205000,169.050000,117.477589,69592.097946,74010.274263,70247.664524,15.100000,9,13.067772
2688,Saturday,tdm,tdj,11,11,230.934848,145.716667,216.944580,61613.743316,36023.385424,55341.925611,14.181818,6,18.082136
2689,Wednesday,tdr,tf2,16,14,153.830208,96.100000,163.410386,300749.134725,28850.999664,831199.754500,14.937500,9,22.781480
2690,Monday,t9y,t9v,18,17,193.671296,148.766667,127.313951,45969.547281,43555.584063,41975.157211,11.611111,10,7.349581


In [47]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Thursday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
26,Thursday,tgb,tgb,1509,530,88.306417,34.694224,138.772566,6460.621963,1554.000039,55888.815927,5.711067,4,7.106710
29,Thursday,tdm,tdm,1354,502,98.750123,36.685348,157.098461,7865.108617,1570.538548,26996.840404,5.495569,4,4.986491
32,Thursday,t9y,t9z,39,35,133.349145,74.933333,186.843645,33137.747998,6139.753650,51990.133293,9.871795,7,8.697185
38,Thursday,tgd,tgd,51,29,138.585294,82.900000,159.007605,4658.005212,1889.363140,7119.933812,4.705882,4,3.651269
44,Thursday,tdr,tdp,20,19,195.785000,176.450000,127.395596,88365.723030,60462.426111,71944.872435,17.800000,18,12.738669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2653,Thursday,tf2,tf3,255,238,119.719869,72.652381,133.707162,24445.599354,16527.895182,25254.605247,11.462745,9,18.363285
2671,Thursday,tdr,tdq,27,27,198.198765,180.400000,154.495859,78161.753577,67445.970412,42293.950354,12.555556,9,11.318240
2677,Thursday,te7,tee,37,36,223.441892,136.216667,244.791486,94293.293480,68340.405110,99243.560424,9.972973,9,6.861675
2681,Thursday,tdq,tdm,10,10,251.896667,305.283333,131.491652,64056.563444,57056.865918,56424.646690,11.900000,10,12.422650


In [48]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Tuesday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
20,Tuesday,ttn,ttn,17428,5866,73.092481,30.439198,125.988568,6829.449917,1110.206942,29727.304717,6.181145,5,8.368901
24,Tuesday,ttp,ttp,4110,1565,81.432839,32.762603,137.152146,6574.258004,969.810476,28305.503737,5.682238,4,7.924427
30,Tuesday,ttn,ttp,409,392,109.419478,65.723525,128.261297,28665.504718,14845.344631,113829.665719,9.088020,7,6.894306
36,Tuesday,teg,teu,46,44,156.546739,124.666667,111.868474,73597.059928,41637.962625,161372.570303,8.826087,7,8.133072
91,Tuesday,tet,tes,15,15,178.523333,167.300000,114.824934,89562.546045,102818.576271,70638.501549,13.066667,9,13.729357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657,Tuesday,tsh,tsh,448,179,90.149219,38.232352,132.332034,10933.375906,1208.685954,42305.997861,4.955357,4,3.420267
2667,Tuesday,teg,tee,30,30,122.641667,78.566667,113.719446,43159.651692,35087.044000,60312.990702,10.000000,10,6.669540
2668,Tuesday,tsz,ttn,12,12,227.436111,227.166667,70.644178,222780.729049,174874.927805,279875.011063,23.666667,21,16.983058
2678,Tuesday,ts5,tef,11,10,338.675758,204.816667,310.192672,235080.843158,204810.598785,130711.347351,13.818182,12,10.980147


In [49]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Friday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
8,Friday,tej,tej,1133,421,111.483495,45.932558,166.565980,7687.023388,1328.667988,32931.571607,4.982348,4,3.768904
11,Friday,tdv,tdv,1202,425,114.429659,53.833064,152.649744,9914.995809,1715.402426,40863.975403,5.278702,4,4.693755
12,Friday,tus,tus,4252,1341,100.791040,45.094871,146.462573,9723.262921,1152.943451,74061.401072,5.459078,4,4.443161
14,Friday,wh1,wh1,1166,287,80.705660,37.702898,118.400321,3727.440322,1153.586322,9728.626031,6.166381,5,4.913826
35,Friday,tf2,tf0,38,36,208.598684,193.800000,159.980187,95434.554303,77828.841532,60973.715056,17.289474,13,18.754326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2639,Friday,tuz,tuz,326,108,116.228016,40.925000,180.932337,3775.247684,705.074061,12142.860017,5.239264,4,4.387162
2642,Friday,tsq,tsq,573,203,110.509250,46.108333,152.257215,8330.462591,1187.516841,44671.199219,4.834206,4,4.629498
2662,Friday,tud,tuc,12,12,395.901389,367.233333,324.262716,243380.640371,180316.045153,245719.016369,9.583333,10,4.718596
2663,Friday,tu7,tud,15,15,192.932222,101.766667,205.654177,134890.655815,22686.159901,248007.710633,12.200000,10,8.620573


# Aggregate weekly

In [8]:
country_code = 'CO'
event_date = '2020-02-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
formatted_current_date = event_date_dt.strftime('%Y%m%d')

lookahead_date = event_date_dt + timedelta(days=7)
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

In [9]:
pe_tj_df3 = sql_engine.read_sql(
    f"""
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            duration_minutes,
            length_meters,
            number_of_points,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            -- EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            h3_encode(start_lat, start_lng, 7) AS start_h3,
            h3_encode(end_lat, end_lng, 7) AS end_h3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    )
    SELECT 
        start_h3,
        end_h3,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        APPROX_PERCENTILE(duration_minutes, 0.5) AS mdn_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        AVG(length_meters) AS m_length_m,
        APPROX_PERCENTILE(length_meters, 0.5) AS mdn_length_m,
        STDDEV(length_meters) AS sd_length_m,
        AVG(number_of_points) AS m_points_no,
        APPROX_PERCENTILE(number_of_points, 0.5) AS mdn_points_no,
        STDDEV(number_of_points) AS sd_points_no
    FROM calculated_data
    GROUP BY start_h3, end_h3
    HAVING COUNT(*) > 9
    """
)

pe_tj_df3


KeyboardInterrupt



In [11]:
pe_tj_df7 = sql_engine.read_sql(
    f"""
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            -- EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            h3_encode(start_lat, start_lng, 7) AS start_h3,
            h3_encode(end_lat, end_lng, 7) AS end_h3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    )
    SELECT 
        start_h3,
        end_h3,
        COUNT(*) AS trip_count
    FROM calculated_data
    GROUP BY start_h3, end_h3
    HAVING COUNT(*) > 9
    """
)

pe_tj_df7

ProgrammingError: (trino.exceptions.TrinoUserError) TrinoUserError(type=USER_ERROR, name=PERMISSION_DENIED, message="Access Denied: Cannot execute function h3_encode", query_id=20240709_161751_00879_p2r3p)
[SQL: 
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            -- EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            h3_encode(start_lat, start_lng, 7) AS start_h3,
            h3_encode(end_lat, end_lng, 7) AS end_h3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20200210 AND 20200217
            AND end_country = 'CO' 
            AND start_country = 'CO'
    )
    SELECT 
        start_h3,
        end_h3,
        COUNT(*) AS trip_count
    FROM calculated_data
    GROUP BY start_h3, end_h3
    HAVING COUNT(*) > 9
    ]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
pe_tj_df7.to_csv('/home/jovyan/CO/Feb_h3_7.csv', index=False)

In [12]:
row_count_df = sql_engine.read_sql(
    f"""
    SELECT 
        COUNT(*) AS row_count
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
    """
)

row_count = row_count_df.iloc[0]['row_count']
print(f"Total number of rows: {row_count}")


Total number of rows: 353815


In [13]:
# Load a sample of the data (e.g., 1000 rows)
sample_df = sql_engine.read_sql(
    f"""
    SELECT *
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
    LIMIT 1000
    """
)

# Check the memory usage of the sample
sample_memory_usage = sample_df.memory_usage(deep=True).sum()
print(f"Memory usage of the sample (1000 rows): {sample_memory_usage / 1024 ** 2:.2f} MB")

# Estimate the total memory usage
total_rows = 353815
estimated_total_memory_usage = (sample_memory_usage / 1000) * total_rows
print(f"Estimated total memory usage: {estimated_total_memory_usage / 1024 ** 2:.2f} MB")


Memory usage of the sample (1000 rows): 1.31 MB
Estimated total memory usage: 464.69 MB


In [10]:
# Query to retrieve the necessary data without H3 encoding and aggregation
pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        start_lat,
        start_lng,
        end_lat,
        end_lng,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
    """
)

# Display the retrieved data
pe_tj_df7.to_csv('/home/jovyan/CO/Feb_h3_7.csv', index=False)
pe_tj_df7


Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210
1,3075479828,2020-02-10 14:12:41,7.997892,-76.630989,7.998976,-76.629274,20200210
2,3054856098,2020-02-10 00:11:58,10.434842,-75.536390,10.434915,-75.536349,20200210
3,3054856098,2020-02-10 14:01:42,10.434915,-75.536349,10.435014,-75.536212,20200210
4,3054856098,2020-02-10 20:01:01,10.435014,-75.536212,10.434998,-75.536393,20200210
...,...,...,...,...,...,...,...
353810,2280496728,2020-02-16 14:45:15,4.680406,-74.134529,4.679082,-74.134936,20200216
353811,2026212676,2020-02-16 15:27:17,11.228078,-74.198927,11.228502,-74.197873,20200216
353812,2026212676,2020-02-16 15:39:32,11.228502,-74.197873,11.232461,-74.199917,20200216
353813,2026212676,2020-02-16 16:26:42,11.232461,-74.199917,11.231872,-74.199845,20200216


In [15]:
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date
0,3087207169,2020-02-16 22:21:11,3.396430,-76.530012,3.437595,-76.472645,20200216
1,3081303010,2020-02-16 12:42:02,4.654254,-74.086806,4.651615,-74.092378,20200216
2,3079802315,2020-02-16 03:20:28,6.973649,-73.046410,6.973728,-73.046158,20200216
3,3079802315,2020-02-16 04:40:34,6.973728,-73.046158,6.973786,-73.046290,20200216
4,3079802315,2020-02-16 07:52:27,6.973786,-73.046290,6.975289,-73.046874,20200216
...,...,...,...,...,...,...,...
353810,2726829608,2020-02-10 03:58:46,6.202814,-75.562954,6.202990,-75.562711,20200210
353811,2726829608,2020-02-10 05:45:03,6.202990,-75.562711,6.202143,-75.562252,20200210
353812,2726829608,2020-02-10 09:23:30,6.202143,-75.562252,6.207976,-75.568022,20200210
353813,2726829608,2020-02-10 18:55:52,6.207976,-75.568022,6.202492,-75.562755,20200210


In [11]:
# pip install h3 --pre

Note: you may need to restart the kernel to use updated packages.


In [13]:
import h3
h3.versions()

{'c': '4.0.0', 'python': '4.0.0b5'}

In [19]:
pe_tj_df7['start_h37'] = pe_tj_df7.apply(lambda row: h3.latlng_to_cell(row['start_lat'], row['start_lng'], 7), axis=1)
pe_tj_df7['end_h37'] = pe_tj_df7.apply(lambda row: h3.latlng_to_cell(row['end_lat'], row['end_lng'], 7), axis=1)
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date,start_h37,end_h37
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210,8766e429dffffff,8766e429cffffff
1,3075479828,2020-02-10 14:12:41,7.997892,-76.630989,7.998976,-76.629274,20200210,876611311ffffff,876611315ffffff
2,3054856098,2020-02-10 00:11:58,10.434842,-75.536390,10.434915,-75.536349,20200210,8766313b1ffffff,8766313b1ffffff
3,3054856098,2020-02-10 14:01:42,10.434915,-75.536349,10.435014,-75.536212,20200210,8766313b1ffffff,8766313b1ffffff
4,3054856098,2020-02-10 20:01:01,10.435014,-75.536212,10.434998,-75.536393,20200210,8766313b1ffffff,8766313b1ffffff
...,...,...,...,...,...,...,...,...,...
353810,2280496728,2020-02-16 14:45:15,4.680406,-74.134529,4.679082,-74.134936,20200216,8766e429bffffff,8766e429bffffff
353811,2026212676,2020-02-16 15:27:17,11.228078,-74.198927,11.228502,-74.197873,20200216,876622b20ffffff,876622b20ffffff
353812,2026212676,2020-02-16 15:39:32,11.228502,-74.197873,11.232461,-74.199917,20200216,876622b20ffffff,876622b20ffffff
353813,2026212676,2020-02-16 16:26:42,11.232461,-74.199917,11.231872,-74.199845,20200216,876622b20ffffff,876622b20ffffff


In [21]:
pe_tj_df7['start_gh5'] = pe_tj_df7.apply(lambda row: geohash.encode(row['start_lat'], row['start_lng'], 5), axis=1)
pe_tj_df7['end_gh5'] = pe_tj_df7.apply(lambda row: geohash.encode(row['end_lat'], row['end_lng'], 5), axis=1)
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date,start_h37,end_h37,start_gh5,end_gh5
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210,8766e429dffffff,8766e429cffffff,d2g6c,d2g6c
1,3075479828,2020-02-10 14:12:41,7.997892,-76.630989,7.998976,-76.629274,20200210,876611311ffffff,876611315ffffff,d33t2,d33t8
2,3054856098,2020-02-10 00:11:58,10.434842,-75.536390,10.434915,-75.536349,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
3,3054856098,2020-02-10 14:01:42,10.434915,-75.536349,10.435014,-75.536212,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
4,3054856098,2020-02-10 20:01:01,10.435014,-75.536212,10.434998,-75.536393,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
...,...,...,...,...,...,...,...,...,...,...,...
353810,2280496728,2020-02-16 14:45:15,4.680406,-74.134529,4.679082,-74.134936,20200216,8766e429bffffff,8766e429bffffff,d2g69,d2g69
353811,2026212676,2020-02-16 15:27:17,11.228078,-74.198927,11.228502,-74.197873,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz
353812,2026212676,2020-02-16 15:39:32,11.228502,-74.197873,11.232461,-74.199917,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz
353813,2026212676,2020-02-16 16:26:42,11.232461,-74.199917,11.231872,-74.199845,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz


# Aggregation

## Only Bogota

In [3]:
import pandas as pd
pe_tj_df7 = pd.read_csv('/home/jovyan/CO/Feb.csv')
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date,start_h37,end_h37,start_gh5,end_gh5
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210,8766e429dffffff,8766e429cffffff,d2g6c,d2g6c
1,3075479828,2020-02-10 14:12:41,7.997892,-76.630989,7.998976,-76.629274,20200210,876611311ffffff,876611315ffffff,d33t2,d33t8
2,3054856098,2020-02-10 00:11:58,10.434842,-75.536390,10.434915,-75.536349,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
3,3054856098,2020-02-10 14:01:42,10.434915,-75.536349,10.435014,-75.536212,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
4,3054856098,2020-02-10 20:01:01,10.435014,-75.536212,10.434998,-75.536393,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73
...,...,...,...,...,...,...,...,...,...,...,...
353810,2280496728,2020-02-16 14:45:15,4.680406,-74.134529,4.679082,-74.134936,20200216,8766e429bffffff,8766e429bffffff,d2g69,d2g69
353811,2026212676,2020-02-16 15:27:17,11.228078,-74.198927,11.228502,-74.197873,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz
353812,2026212676,2020-02-16 15:39:32,11.228502,-74.197873,11.232461,-74.199917,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz
353813,2026212676,2020-02-16 16:26:42,11.232461,-74.199917,11.231872,-74.199845,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz


In [5]:
import geopandas as gpd

# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(
    pe_tj_df7, 
    geometry=gpd.points_from_xy(pe_tj_df7.start_lng, pe_tj_df7.start_lat),
    crs="EPSG:4326"  # assuming your data is in WGS84
)
gdf

Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date,start_h37,end_h37,start_gh5,end_gh5,geometry
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210,8766e429dffffff,8766e429cffffff,d2g6c,d2g6c,POINT (-74.12133 4.71823)
1,3075479828,2020-02-10 14:12:41,7.997892,-76.630989,7.998976,-76.629274,20200210,876611311ffffff,876611315ffffff,d33t2,d33t8,POINT (-76.63099 7.99789)
2,3054856098,2020-02-10 00:11:58,10.434842,-75.536390,10.434915,-75.536349,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73,POINT (-75.53639 10.43484)
3,3054856098,2020-02-10 14:01:42,10.434915,-75.536349,10.435014,-75.536212,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73,POINT (-75.53635 10.43491)
4,3054856098,2020-02-10 20:01:01,10.435014,-75.536212,10.434998,-75.536393,20200210,8766313b1ffffff,8766313b1ffffff,d3f73,d3f73,POINT (-75.53621 10.43501)
...,...,...,...,...,...,...,...,...,...,...,...,...
353810,2280496728,2020-02-16 14:45:15,4.680406,-74.134529,4.679082,-74.134936,20200216,8766e429bffffff,8766e429bffffff,d2g69,d2g69,POINT (-74.13453 4.68041)
353811,2026212676,2020-02-16 15:27:17,11.228078,-74.198927,11.228502,-74.197873,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz,POINT (-74.19893 11.22808)
353812,2026212676,2020-02-16 15:39:32,11.228502,-74.197873,11.232461,-74.199917,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz,POINT (-74.19787 11.22850)
353813,2026212676,2020-02-16 16:26:42,11.232461,-74.199917,11.231872,-74.199845,20200216,876622b20ffffff,876622b20ffffff,d3gpz,d3gpz,POINT (-74.19992 11.23246)


In [6]:
# Load the shapefile
shapefile_path = '/home/jovyan/CO/shp/gadm41_COL_bogota.shp'  # replace with your shapefile path
shapefile_gdf = gpd.read_file(shapefile_path)
shapefile_gdf

Unnamed: 0,GID_1,GID_0,COUNTRY,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,geometry
0,COL.5_2,COL,Colombia,Bogotá D.C.,Distrito Capital de Bogotá,,Distrito Capital,Capital District,,CO.DC,,"POLYGON ((-74.24053 4.18671, -74.24079 4.18739..."


In [9]:
# Perform the spatial join to filter points within the shapefile
points_within_shapefile = gpd.sjoin(gdf, shapefile_gdf, op='within')
columns_to_keep = gdf.columns.tolist()
points_within_shapefile = points_within_shapefile[columns_to_keep]
points_within_shapefile

  if await self.run_code(code, result, async_=asy):


Unnamed: 0,cuebiq_id,event_datetime_local,start_lat,start_lng,end_lat,end_lng,local_date,start_h37,end_h37,start_gh5,end_gh5,geometry
0,3075494696,2020-02-10 18:43:30,4.718235,-74.121335,4.721026,-74.124147,20200210,8766e429dffffff,8766e429cffffff,d2g6c,d2g6c,POINT (-74.12133 4.71823)
12,3037729580,2020-02-10 06:43:52,4.706017,-74.088743,4.745051,-74.117155,20200210,8766e428affffff,8766e4283ffffff,d2g6f,d2g6c,POINT (-74.08874 4.70602)
13,3037637587,2020-02-10 00:32:49,4.689980,-74.042259,4.690039,-74.042194,20200210,8766e42d4ffffff,8766e42d4ffffff,d2g6e,d2g6e,POINT (-74.04226 4.68998)
14,3037637587,2020-02-10 07:00:57,4.690039,-74.042194,4.688621,-74.046370,20200210,8766e42d4ffffff,8766e4289ffffff,d2g6e,d2g6e,POINT (-74.04219 4.69004)
15,3037637587,2020-02-10 09:54:56,4.688621,-74.046370,4.656315,-74.057828,20200210,8766e4289ffffff,8766e42d2ffffff,d2g6e,d2g66,POINT (-74.04637 4.68862)
...,...,...,...,...,...,...,...,...,...,...,...,...
353793,2488746225,2020-02-16 10:10:41,4.639784,-74.195141,4.642271,-74.192838,20200216,8766e4649ffffff,8766e4649ffffff,d2g4r,d2g4r,POINT (-74.19514 4.63978)
353794,2488640225,2020-02-16 18:51:02,4.690640,-74.036974,4.690707,-74.036597,20200216,8766e42d4ffffff,8766e42d4ffffff,d2g6e,d2g6e,POINT (-74.03697 4.69064)
353795,2488621826,2020-02-16 16:37:50,4.773400,-74.044647,4.689453,-74.053959,20200216,8766e42acffffff,8766e4289ffffff,d2g75,d2g6d,POINT (-74.04465 4.77340)
353802,2418016396,2020-02-16 18:44:20,4.740471,-74.064597,4.738359,-74.064597,20200216,8766e42abffffff,8766e42abffffff,d2g6f,d2g6f,POINT (-74.06460 4.74047)


In [10]:
points_within_shapefile.to_csv('/home/jovyan/CO/Feb_Bogata.csv', index=False)

In [11]:
# Add user numbers to the aggregated data for geohash3
aggregated_gh = points_within_shapefile.groupby(['start_gh5', 'end_gh5']).agg({
    'cuebiq_id': 'count'
}).reset_index()

aggregated_gh = aggregated_gh[aggregated_gh['cuebiq_id'] > 9]
aggregated_gh

Unnamed: 0,start_gh5,end_gh5,cuebiq_id
7,d2g33,d2g33,26
8,d2g33,d2g39,11
30,d2g39,d2g39,508
32,d2g39,d2g3c,41
40,d2g39,d2g61,29
...,...,...,...
1461,d2g77,d2g6f,14
1462,d2g77,d2g6g,14
1465,d2g77,d2g75,26
1466,d2g77,d2g76,13


In [12]:
aggregated_gh.to_csv('/home/jovyan/CO/Feb_agg_gh_Bogata.csv', index=False)

In [13]:
# Add user numbers to the aggregated data for geohash3
aggregated_h3 = points_within_shapefile.groupby(['start_h37', 'end_h37']).agg({
    'cuebiq_id': 'count'
}).reset_index()
aggregated_h3 = aggregated_h3[aggregated_h3['cuebiq_id'] > 9]
aggregated_h3

Unnamed: 0,start_h37,end_h37,cuebiq_id
13,8766e0820ffffff,8766e0820ffffff,244
16,8766e0820ffffff,8766e0824ffffff,29
17,8766e0820ffffff,8766e0825ffffff,18
54,8766e0820ffffff,8766e42d2ffffff,11
66,8766e0821ffffff,8766e0821ffffff,15
...,...,...,...
6126,8766e4649ffffff,8766e0932ffffff,22
6127,8766e4649ffffff,8766e0933ffffff,16
6149,8766e4649ffffff,8766e4649ffffff,299
6168,8766e464bffffff,8766e464bffffff,16


In [14]:
aggregated_h3.to_csv('/home/jovyan/CO/Feb_agg_h3_Bogata.csv', index=False)