In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [2]:
%sql trino://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [3]:
import os
import time
# import geohash
import logging
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

# pip install python-geohash

In [4]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [5]:
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [6]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Weekly Aggregation

## GH5

In [None]:
def extract_od_data(sql_engine, country_code, year):
    query = f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {year}0101 AND {year}1231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_geohash5, end_geohash5, week_number
    """
    
    pe_tj_df7 = sql_engine.read_sql(query)
    output_file_path = f'/home/jovyan/Data/Week/od_{country_code.lower()}_{year}_week_gh5.csv'
    pe_tj_df7.to_csv(output_file_path, index=False)
    
    return output_file_path


In [None]:
output_file = extract_od_data(sql_engine, 'IN', 2020)
print(f"Data saved to: {output_file}")
# didn't test tho

In [None]:
country_code = 'IN'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20200101 AND 20201231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_geohash5, end_geohash5, week_number
    """
)

# pe_tj_df7
pe_tj_df7.to_csv(f'/home/jovyan/Data/Week/od_{country_code.lower()}_{year}_week_gh5.csv', index=False)

## H3 7
need to break down to 3 times: 
0101-0503, 0427-0904, 0827-1231

In [9]:
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20200101 AND 20200703
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, week_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week1_h37.csv', index=False)

In [21]:
country_code = 'ID'
query = f"""
    SELECT 
        cuebiq_id,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
        geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
        duration_minutes,
        length_meters,
        number_of_points, 
        WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN 20191228 AND 20191231
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
        AND start_lat <> 0
        AND end_lat <> 0
"""

pe_tj_df = sql_engine.read_sql(query)
pe_tj_df

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number
0,2933109088,2019-12-29 15:46:56,qqws0,qqws0,20.666667,3481.067091,2,52
1,2933109088,2019-12-29 16:10:13,qqws0,qqweb,24.683333,5709.272299,6,52
2,2933109088,2019-12-29 17:37:28,qqweb,qqweb,14.066667,1171.797641,5,52
3,2933109088,2019-12-29 21:13:00,qqweb,qqw7x,137.766667,14394.922914,10,52
4,2932963298,2019-12-29 19:27:01,qquhe,qquhe,42.533333,369.295141,4,52
...,...,...,...,...,...,...,...,...
227063,2778499449,2019-12-28 23:27:45,rps19,rps19,5.250000,123.650918,2,52
227064,2395248189,2019-12-28 14:41:13,qrvxx,qrvz8,51.000000,2089.524072,6,52
227065,2395248189,2019-12-28 15:52:41,qrvz8,qrvz8,10.066667,149.941144,3,52
227066,2395248189,2019-12-28 16:21:41,qrvz8,qrvxx,38.166667,3110.325350,4,52


In [22]:
pe_tj_df['day_of_month'] = pe_tj_df['event_datetime_local'].dt.day
pe_tj_df

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number,day_of_month
0,2933109088,2019-12-29 15:46:56,qqws0,qqws0,20.666667,3481.067091,2,52,29
1,2933109088,2019-12-29 16:10:13,qqws0,qqweb,24.683333,5709.272299,6,52,29
2,2933109088,2019-12-29 17:37:28,qqweb,qqweb,14.066667,1171.797641,5,52,29
3,2933109088,2019-12-29 21:13:00,qqweb,qqw7x,137.766667,14394.922914,10,52,29
4,2932963298,2019-12-29 19:27:01,qquhe,qquhe,42.533333,369.295141,4,52,29
...,...,...,...,...,...,...,...,...,...
227063,2778499449,2019-12-28 23:27:45,rps19,rps19,5.250000,123.650918,2,52,28
227064,2395248189,2019-12-28 14:41:13,qrvxx,qrvz8,51.000000,2089.524072,6,52,28
227065,2395248189,2019-12-28 15:52:41,qrvz8,qrvz8,10.066667,149.941144,3,52,28
227066,2395248189,2019-12-28 16:21:41,qrvz8,qrvxx,38.166667,3110.325350,4,52,28


In [27]:
pe_tj_df[pe_tj_df['day_of_month'] == 31]

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number,day_of_month
19287,1738620109,2019-12-31 14:38:33,qqgv5,qqgv5,25.683333,419.367798,5,1,31
19288,1738620109,2019-12-31 15:14:23,qqgv5,qqgu3,82.966667,17891.819392,3,1,31
22874,2589300224,2019-12-31 23:49:11,qr4c6,qr4c6,2.4,630.876544,2,1,31


In [8]:
country_code = 'ID'
query = f"""
    SELECT 
        cuebiq_id,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
        geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
        duration_minutes,
        length_meters,
        number_of_points, 
        WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN 20200101 AND 20200110
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
        AND start_lat <> 0
        AND end_lat <> 0
"""

pe_tj_df7 = sql_engine.read_sql(query)
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number
0,2938220046,2020-01-01 22:35:14,qqu7g,qqu7g,28.533333,6169.292068,11,1
1,2938063370,2020-01-01 17:06:05,qwpyh,qwpyh,35.366667,173.140301,5,1
2,2938063370,2020-01-01 19:29:53,qwpyh,qwpyj,69.166667,6989.542886,3,1
3,2936940036,2020-01-01 09:26:55,qqgt2,qqgkq,326.800000,23291.484653,2,1
4,2936833033,2020-01-01 21:00:10,qxkud,qxkud,49.983333,479.830337,7,1
...,...,...,...,...,...,...,...,...
700866,2758846576,2020-01-03 16:48:49,qqggz,qqggz,52.300000,173.170719,4,1
700867,2758846576,2020-01-03 20:22:01,qqggz,qqggz,95.783333,283.070065,5,1
700868,2758846576,2020-01-03 22:04:32,qqggz,qqggz,16.083333,1916.808336,3,1
700869,2725096904,2020-01-03 10:32:17,qqguy,qqguy,683.333333,8892.672995,6,1


In [10]:
# Convert 'event_datetime_local' to datetime
pe_tj_df7['event_datetime_local'] = pd.to_datetime(pe_tj_df7['event_datetime_local'])
pe_tj_df7['local_date'] = pe_tj_df7['event_datetime_local'].dt.date
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number,local_date
0,2938220046,2020-01-01 22:35:14,qqu7g,qqu7g,28.533333,6169.292068,11,1,2020-01-01
1,2938063370,2020-01-01 17:06:05,qwpyh,qwpyh,35.366667,173.140301,5,1,2020-01-01
2,2938063370,2020-01-01 19:29:53,qwpyh,qwpyj,69.166667,6989.542886,3,1,2020-01-01
3,2936940036,2020-01-01 09:26:55,qqgt2,qqgkq,326.800000,23291.484653,2,1,2020-01-01
4,2936833033,2020-01-01 21:00:10,qxkud,qxkud,49.983333,479.830337,7,1,2020-01-01
...,...,...,...,...,...,...,...,...,...
700866,2758846576,2020-01-03 16:48:49,qqggz,qqggz,52.300000,173.170719,4,1,2020-01-03
700867,2758846576,2020-01-03 20:22:01,qqggz,qqggz,95.783333,283.070065,5,1,2020-01-03
700868,2758846576,2020-01-03 22:04:32,qqggz,qqggz,16.083333,1916.808336,3,1,2020-01-03
700869,2725096904,2020-01-03 10:32:17,qqguy,qqguy,683.333333,8892.672995,6,1,2020-01-03


In [15]:
pe_tj_df7['day_of_month'] = pe_tj_df7['event_datetime_local'].dt.day
pe_tj_df7

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number,local_date,day_of_month
0,2938220046,2020-01-01 22:35:14,qqu7g,qqu7g,28.533333,6169.292068,11,1,2020-01-01,1
1,2938063370,2020-01-01 17:06:05,qwpyh,qwpyh,35.366667,173.140301,5,1,2020-01-01,1
2,2938063370,2020-01-01 19:29:53,qwpyh,qwpyj,69.166667,6989.542886,3,1,2020-01-01,1
3,2936940036,2020-01-01 09:26:55,qqgt2,qqgkq,326.800000,23291.484653,2,1,2020-01-01,1
4,2936833033,2020-01-01 21:00:10,qxkud,qxkud,49.983333,479.830337,7,1,2020-01-01,1
...,...,...,...,...,...,...,...,...,...,...
700866,2758846576,2020-01-03 16:48:49,qqggz,qqggz,52.300000,173.170719,4,1,2020-01-03,3
700867,2758846576,2020-01-03 20:22:01,qqggz,qqggz,95.783333,283.070065,5,1,2020-01-03,3
700868,2758846576,2020-01-03 22:04:32,qqggz,qqggz,16.083333,1916.808336,3,1,2020-01-03,3
700869,2725096904,2020-01-03 10:32:17,qqguy,qqguy,683.333333,8892.672995,6,1,2020-01-03,3


In [20]:
pe_tj_df7[pe_tj_df7['day_of_month'] == 6]

Unnamed: 0,cuebiq_id,event_datetime_local,start_geohash5,end_geohash5,duration_minutes,length_meters,number_of_points,week_number,local_date,day_of_month
614,2951738845,2020-01-06 10:37:24,w8pdb,w8pdb,6.966667,3.293141e+02,2,2,2020-01-06,6
615,2951738845,2020-01-06 11:00:44,w8pdb,w8pdb,28.483333,3.287818e+02,2,2,2020-01-06,6
616,2951738845,2020-01-06 21:44:04,w8pdb,w8pdb,9.266667,1.694160e+02,4,2,2020-01-06,6
617,2951732938,2020-01-06 11:39:59,qqu80,qqu80,29.150000,2.003634e+02,2,2,2020-01-06,6
618,2951732938,2020-01-06 15:00:23,qqu80,qqsxb,116.533333,8.674817e+03,6,2,2020-01-06,6
...,...,...,...,...,...,...,...,...,...,...
535188,2724532333,2020-01-06 08:57:58,qxj50,qqgv4,165.266667,1.434040e+06,3,2,2020-01-06,6
535189,2724532333,2020-01-06 10:54:55,qqgv4,qqgv5,8.216667,4.359096e+02,3,2,2020-01-06,6
535190,2238293503,2020-01-06 06:26:59,qpz6e,qpz6e,205.016667,2.483859e+02,2,2,2020-01-06,6
535191,2238293503,2020-01-06 13:23:47,qpz6e,qpz6e,10.700000,2.504492e+03,5,2,2020-01-06,6


# Check - Week

2019: need to change week 1 from Dec to week 53  
2020: no need to change

## GH5

In [75]:
# country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN' 
country_code = 'MX' 

##### Remove week 1 since it is combination of 0101 and 1230

In [67]:
file1 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week.csv'

df1 = pd.read_csv(file1)
df1

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g995,9g995,6,203,60.405993,143.300773,17.025000,3718.925813,13731.502694,503.143027,5.241379,3.892855,4
1,9g3qr,9g3mw,6,74,139.002027,177.103662,80.183333,17513.415243,4611.038373,16677.289359,15.027027,11.188268,14
2,9u8d2,9u8d1,6,33,81.321212,117.584744,42.583333,7210.013726,4006.938820,6150.999834,6.060606,4.885817,5
3,9fyp5,9fyp5,6,1354,61.615990,100.332968,27.434377,2809.206251,7829.470119,930.018084,5.531758,4.244133,4
4,9ewtc,9ewt8,6,152,111.794956,154.009574,56.800000,7456.405306,6257.918197,6512.856717,8.394737,11.813203,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8064690,9g3rv,9g3qm,49,1,156.033333,,156.033333,31784.397040,,31784.397040,3.000000,,3
8064691,9er81,9erqw,49,1,161.916667,,161.916667,129950.357141,,129950.357141,7.000000,,7
8064692,d58w1,d58w1,49,1,20.966667,,20.966667,735.483018,,735.483018,6.000000,,6
8064693,9g859,9g856,49,1,79.600000,,79.600000,5226.034895,,5226.034895,5.000000,,5


In [68]:
df1 = df1[df1['week_number'] != 1]
# df1
sorted(df1['week_number'].unique())

[2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52]

In [None]:
df1.to_csv(f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week.csv', index=False)

##### 2020

In [76]:
file = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week_gh5.csv'
df = pd.read_csv(file)
# combined = df
combined

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g995,9g995,6,203,60.405993,143.300773,17.025000,3718.925813,13731.502694,503.143027,5.241379,3.892855,4
1,9g3qr,9g3mw,6,74,139.002027,177.103662,80.183333,17513.415243,4611.038373,16677.289359,15.027027,11.188268,14
2,9u8d2,9u8d1,6,33,81.321212,117.584744,42.583333,7210.013726,4006.938820,6150.999834,6.060606,4.885817,5
3,9fyp5,9fyp5,6,1354,61.615990,100.332968,27.434377,2809.206251,7829.470119,930.018084,5.531758,4.244133,4
4,9ewtc,9ewt8,6,152,111.794956,154.009574,56.800000,7456.405306,6257.918197,6512.856717,8.394737,11.813203,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8105621,9scg8,9scu3,53,1,74.616667,,74.616667,29783.837792,,29783.837792,256.000000,,256
8105622,9u1e0,9u1e0,53,2,15.458333,4.537269,18.666667,289.187465,70.721337,339.195002,4.500000,0.707107,5
8105623,9g92j,9g920,53,1,274.716667,,274.716667,42701.794801,,42701.794801,19.000000,,19
8105624,9sp19,9ezk3,53,1,428.716667,,428.716667,107770.469328,,107770.469328,3.000000,,3


##### for 2019 only

In [None]:
file = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week_gh5.csv'
df = pd.read_csv(file)
# df

In [41]:
file1 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week_gh5_w1.csv'
df1 = pd.read_csv(file1)
df1

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9gjyk,9gjyk,1,1178,63.081494,115.575374,26.536966,3352.922811,8213.118819,962.321671,6.612903,17.359253,4
1,9g989,9g98d,1,26,51.086538,83.171319,25.566667,4539.941363,5702.375901,3269.224633,5.615385,4.089762,5
2,9g8td,9g8ht,1,1,98.483333,,98.483333,81476.079254,,81476.079254,18.000000,,18
3,9g37c,9g37c,1,43,66.186434,76.898620,30.900000,5389.237474,16018.399670,610.254286,5.162791,4.684982,4
4,9g3jv,9g3jv,1,1143,63.473214,121.790597,23.564987,7832.011035,147477.045402,514.102718,5.738408,10.269434,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
169842,9exgp,9etz9,1,1,521.416667,,521.416667,335551.694698,,335551.694698,5.000000,,5
169843,9g3jm,9g170,1,1,206.000000,,206.000000,236513.212410,,236513.212410,15.000000,,15
169844,9gnn0,9gnjx,1,1,39.583333,,39.583333,40262.779415,,40262.779415,6.000000,,6
169845,9g2t7,9g2td,1,1,288.633333,,288.633333,10909.022743,,10909.022743,3.000000,,3


In [42]:
file53 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week_gh5_w53.csv'
df53 = pd.read_csv(file53)
df53

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g4ps,9g4ps,53,6,14.002778,9.581507,11.800000,532.314563,465.326937,549.895235,3.500000,1.048809,4
1,9ezhr,9ezhr,53,169,51.886193,106.052111,23.029630,3835.228273,8477.127540,1272.234385,5.207101,4.427828,4
2,9ez8w,9ez8n,53,17,126.983333,155.798871,62.566667,14890.078117,19668.988406,9404.672072,11.941176,20.311544,7
3,9gfqt,9gfqu,53,47,57.713830,51.827224,42.033333,6664.298959,2778.673777,6402.489167,9.468085,8.734865,7
4,9gbju,9gbju,53,99,48.871212,81.575069,21.662500,1295.581568,2297.601339,423.948116,4.545455,2.956393,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63158,9scg8,9scu3,53,1,74.616667,,74.616667,29783.837792,,29783.837792,256.000000,,256
63159,9u1e0,9u1e0,53,2,15.458333,4.537269,18.666667,289.187465,70.721337,339.195002,4.500000,0.707107,5
63160,9g92j,9g920,53,1,274.716667,,274.716667,42701.794801,,42701.794801,19.000000,,19
63161,9sp19,9ezk3,53,1,428.716667,,428.716667,107770.469328,,107770.469328,3.000000,,3


In [43]:
combined = pd.concat([df, df1, df53], ignore_index=True)
# combined = pd.concat([df, df53], ignore_index=True)

combined

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g995,9g995,6,203,60.405993,143.300773,17.025000,3718.925813,13731.502694,503.143027,5.241379,3.892855,4
1,9g3qr,9g3mw,6,74,139.002027,177.103662,80.183333,17513.415243,4611.038373,16677.289359,15.027027,11.188268,14
2,9u8d2,9u8d1,6,33,81.321212,117.584744,42.583333,7210.013726,4006.938820,6150.999834,6.060606,4.885817,5
3,9fyp5,9fyp5,6,1354,61.615990,100.332968,27.434377,2809.206251,7829.470119,930.018084,5.531758,4.244133,4
4,9ewtc,9ewt8,6,152,111.794956,154.009574,56.800000,7456.405306,6257.918197,6512.856717,8.394737,11.813203,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8105621,9scg8,9scu3,53,1,74.616667,,74.616667,29783.837792,,29783.837792,256.000000,,256
8105622,9u1e0,9u1e0,53,2,15.458333,4.537269,18.666667,289.187465,70.721337,339.195002,4.500000,0.707107,5
8105623,9g92j,9g920,53,1,274.716667,,274.716667,42701.794801,,42701.794801,19.000000,,19
8105624,9sp19,9ezk3,53,1,428.716667,,428.716667,107770.469328,,107770.469328,3.000000,,3


##### plot

In [77]:
sorted(combined['week_number'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53]

In [None]:
# Plot (1): Number of rows per week
rows_per_week = combined.groupby('week_number').size()
plt.figure(figsize=(8, 6))
rows_per_week.plot(kind='bar')
plt.title('Number of Rows per Week')
plt.xlabel('Week Number')
plt.ylabel('Number of Rows')
plt.show()

In [None]:
# Plot (2): Number of trip_count per week
trip_count_per_week = combined.groupby('week_number')['trip_count'].sum()
plt.figure(figsize=(8, 6))
trip_count_per_week.plot(kind='bar')
plt.title('Number of Trip Counts per Week')
plt.xlabel('Week Number')
plt.ylabel('Trip Count')
plt.show()

##### export 

In [78]:
# combined.to_csv(f"/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_2019_all.csv", index=False)
combined.to_csv(f"/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_2020_all.csv", index=False)

In [79]:
filtered = combined[combined['trip_count'] > 9]
filtered

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g995,9g995,6,203,60.405993,143.300773,17.025000,3718.925813,13731.502694,503.143027,5.241379,3.892855,4
1,9g3qr,9g3mw,6,74,139.002027,177.103662,80.183333,17513.415243,4611.038373,16677.289359,15.027027,11.188268,14
2,9u8d2,9u8d1,6,33,81.321212,117.584744,42.583333,7210.013726,4006.938820,6150.999834,6.060606,4.885817,5
3,9fyp5,9fyp5,6,1354,61.615990,100.332968,27.434377,2809.206251,7829.470119,930.018084,5.531758,4.244133,4
4,9ewtc,9ewt8,6,152,111.794956,154.009574,56.800000,7456.405306,6257.918197,6512.856717,8.394737,11.813203,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8104716,9smej,9smej,53,12,55.004167,53.718055,45.766667,3690.818590,7187.219261,634.951100,5.416667,4.100074,5
8104750,9swde,9swde,53,24,52.932639,67.284160,22.950000,2858.469697,9229.885261,628.018065,3.708333,1.966661,3
8104997,9ex2m,9ex2m,53,16,39.725000,41.528182,25.516667,480.617649,591.973133,298.952981,5.500000,4.993329,4
8105015,9sp3c,9sp3c,53,12,127.113889,217.525685,35.166667,781.320740,625.045905,631.033294,5.083333,2.998737,4


In [80]:
# filtered.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2019.csv', index=False)
filtered.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2020.csv', index=False)

In [17]:
# df1[(df1['week_number'] == 53) & (df1['start_geohash5']== 'd2g66') & (df1['end_geohash5']== 'd2g66')]

## H37

In [40]:
country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN'
# country_code = 'MX'

In [41]:
# file1 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week1_h37.csv'
file1 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week1_h37.csv'
df1 = pd.read_csv(file1)
# df1['week_number'] = df1['week_number'].replace(1, 53) # For CO

sorted(df1['week_number'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27]

In [42]:
fil1 = df1[df1['week_number'] < 27] # For ID
# fil1 = df1[df1['week_number'] < 18] # For MX, IN
sorted(fil1['week_number'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26]

In [43]:
# file2 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week2_h37.csv'
file2 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week2_h37.csv'
df2 = pd.read_csv(file2)
# df2['week_number'] = df2['week_number'].replace(1, 53) # For ID
# df2
sorted(df2['week_number'].unique())

[26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53]

In [44]:
# fil2 = df2[(df2['week_number']>17) & (df2['week_number']<36)] # For IN & MX
fil2 = df2[df2['week_number'] > 26]
sorted(fil2['week_number'].unique())

[27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53]

In [9]:
# # For MX, IN
# # file3 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week3_h37.csv'
# file3 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week3_h37.csv'
# df3 = pd.read_csv(file3)
# df3['week_number'] = df3['week_number'].replace(1, 53) # for 2019
# # df3
# sorted(df3['week_number'].unique())

[34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53]

In [10]:
# fil3 = df3[df3['week_number'] > 35]
# sorted(fil3['week_number'].unique())

[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

In [45]:
# combined_df = pd.concat([fil1, fil2, fil3], ignore_index=True)
combined_df = pd.concat([fil1, fil2], ignore_index=True)
combined_df

Unnamed: 0,start_h3_7,end_h3_7,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,8766f1cecffffff,8766f1c5affffff,4,9,40.872222,25.664192,32.333333,5204.429883,1181.579682,5863.729789,8.444444,4.558265,8
1,8766e4281ffffff,8766e42f6ffffff,4,12,127.323611,111.174746,116.350000,11479.141063,11029.532031,8664.211937,19.000000,24.011361,14
2,876618341ffffff,87661836dffffff,4,2,30.708333,0.695322,31.200000,10297.201243,2395.977195,11991.412965,9.000000,9.899495,16
3,8766e0900ffffff,8766e0901ffffff,4,46,40.069565,28.417027,31.883333,3213.223141,3119.658954,2756.092210,6.152174,3.729566,6
4,8766e42abffffff,8766e42abffffff,4,806,58.118859,123.551970,19.577903,1747.753945,5553.980481,342.003849,5.239454,5.462326,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062511,8766f1910ffffff,8766f1916ffffff,48,1,61.433333,,61.433333,1767.111265,,1767.111265,5.000000,,5
1062512,8766d53a9ffffff,8766e2105ffffff,48,1,41.566667,,41.566667,364241.853950,,364241.853950,8.000000,,8
1062513,8766e09a5ffffff,8766e0985ffffff,48,1,472.866667,,472.866667,5716.396675,,5716.396675,6.000000,,6
1062514,8766e2a21ffffff,8766e2a34ffffff,48,1,54.850000,,54.850000,10293.598724,,10293.598724,10.000000,,10


In [46]:
sorted(combined_df['week_number'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53]

In [47]:
combined_df = combined_df.drop_duplicates()
combined_df

Unnamed: 0,start_h3_7,end_h3_7,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,8766f1cecffffff,8766f1c5affffff,4,9,40.872222,25.664192,32.333333,5204.429883,1181.579682,5863.729789,8.444444,4.558265,8
1,8766e4281ffffff,8766e42f6ffffff,4,12,127.323611,111.174746,116.350000,11479.141063,11029.532031,8664.211937,19.000000,24.011361,14
2,876618341ffffff,87661836dffffff,4,2,30.708333,0.695322,31.200000,10297.201243,2395.977195,11991.412965,9.000000,9.899495,16
3,8766e0900ffffff,8766e0901ffffff,4,46,40.069565,28.417027,31.883333,3213.223141,3119.658954,2756.092210,6.152174,3.729566,6
4,8766e42abffffff,8766e42abffffff,4,806,58.118859,123.551970,19.577903,1747.753945,5553.980481,342.003849,5.239454,5.462326,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062511,8766f1910ffffff,8766f1916ffffff,48,1,61.433333,,61.433333,1767.111265,,1767.111265,5.000000,,5
1062512,8766d53a9ffffff,8766e2105ffffff,48,1,41.566667,,41.566667,364241.853950,,364241.853950,8.000000,,8
1062513,8766e09a5ffffff,8766e0985ffffff,48,1,472.866667,,472.866667,5716.396675,,5716.396675,6.000000,,6
1062514,8766e2a21ffffff,8766e2a34ffffff,48,1,54.850000,,54.850000,10293.598724,,10293.598724,10.000000,,10


In [48]:
# combined_df.to_csv(f'/home/jovyan/Data/Week/combined/od_week_h37_{country_code.lower()}_2019_all.csv', index=False)
combined_df.to_csv(f'/home/jovyan/Data/Week/combined/od_week_h37_{country_code.lower()}_2020_all.csv', index=False)

In [49]:
filter_df = combined_df[combined_df['trip_count'] > 9]
filter_df

Unnamed: 0,start_h3_7,end_h3_7,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
1,8766e4281ffffff,8766e42f6ffffff,4,12,127.323611,111.174746,116.350000,11479.141063,11029.532031,8664.211937,19.000000,24.011361,14
3,8766e0900ffffff,8766e0901ffffff,4,46,40.069565,28.417027,31.883333,3213.223141,3119.658954,2756.092210,6.152174,3.729566,6
4,8766e42abffffff,8766e42abffffff,4,806,58.118859,123.551970,19.577903,1747.753945,5553.980481,342.003849,5.239454,5.462326,4
6,8766f11b0ffffff,8766f11b0ffffff,4,652,44.578042,100.601666,16.081767,1159.378197,3205.097237,301.135005,4.875767,4.563755,4
7,8766f5172ffffff,8766f5172ffffff,4,133,76.648747,146.256915,26.433333,3129.215369,11604.100715,765.791803,5.105263,3.138865,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060532,87661d84bffffff,87661d84bffffff,53,27,56.995062,71.216507,27.033333,1177.254206,671.978343,958.208359,4.740741,2.246238,4
1061190,876603c23ffffff,876603c23ffffff,53,12,42.730556,26.347438,42.266667,2481.995649,1720.014121,2103.908800,6.750000,3.441062,6
1061237,87661d50affffff,87661d50affffff,53,15,114.052222,245.181969,17.850000,808.086962,640.690843,490.068844,8.533333,9.395034,5
1061246,87662b4d3ffffff,87662b4d3ffffff,47,12,66.151389,86.199994,50.900000,1102.235078,1083.195876,793.326726,3.583333,1.564279,3


In [50]:
# filter_df.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_h37_{country_code.lower()}_2019.csv', index=False)
filter_df.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_h37_{country_code.lower()}_2020.csv', index=False)

In [29]:
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20191101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month_h37.csv', index=False)
# 1133

In [None]:
# country_code = 'MX'
country_code = 'IN'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20190630
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month1_h37.csv', index=False)

In [None]:
# country_code = 'MX'
country_code = 'IN'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190701 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month2_h37.csv', index=False)

In [None]:
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20190630
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month1_h37.csv', index=False)

In [None]:
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190701 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month2_h37.csv', index=False)

In [None]:
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        MONTH(event_datetime_local) AS month,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_geohash5, end_geohash5, MONTH(event_datetime_local)
    """
)

pe_tj_df7


In [None]:
pe_tj_df7.to_csv('/home/jovyan/Data/Month/od_id_2019_month.csv', index=False)

In [None]:
country_code = 'MX'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        MONTH(event_datetime_local) AS month,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_geohash5, end_geohash5, MONTH(event_datetime_local)
    """
)

pe_tj_df7


In [None]:
pe_tj_df7.to_csv('/home/jovyan/Data/Month/od_mx_2019_month.csv', index=False)

In [7]:
country_code = 'CO'
event_date = '2020-11-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
formatted_current_date = event_date_dt.strftime('%Y%m%d')

lookahead_date = event_date_dt + timedelta(days=7)
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

In [8]:
pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_geohash5, 
            h3_encode(end_lat, end_lng, 7) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_geohash5, end_geohash5, week_number
    """
)

pe_tj_df7


Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,8766e6242ffffff,8766e626affffff,47,1,45.133333,,45.133333,8334.702517,,8334.702517,3.000000,,3
1,8766d52f4ffffff,8766d52e2ffffff,47,4,72.729167,36.792175,81.233333,4714.453808,637.034634,5209.268396,6.750000,1.258306,7
2,8766f1885ffffff,8766f1881ffffff,47,7,36.426190,18.304034,34.183333,4159.911810,5135.234339,2404.821424,7.000000,5.033223,6
3,8766f1cecffffff,8766f1c50ffffff,47,3,88.861111,76.749789,76.166667,8503.703903,6132.811973,4985.727077,10.333333,8.386497,6
4,8766f1cedffffff,8766f1cecffffff,47,1,14.650000,,14.650000,1737.819743,,1737.819743,4.000000,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23250,8766e651effffff,8766e6501ffffff,46,1,44.250000,,44.250000,8757.255776,,8757.255776,10.000000,,10
23251,8766e581effffff,8766e581effffff,46,4,255.237500,484.763593,20.766667,3629.046423,6730.149878,493.085830,6.500000,9.000000,2
23252,8766e0a43ffffff,8766e0a50ffffff,46,1,31.916667,,31.916667,5406.014678,,5406.014678,6.000000,,6
23253,8766f568bffffff,8766f568bffffff,46,1,181.833333,,181.833333,7227.974710,,7227.974710,6.000000,,6


In [None]:
country_code = 'CO'
event_date = '2020-02-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
formatted_current_date = event_date_dt.strftime('%Y%m%d')

lookahead_date = event_date_dt + timedelta(days=14)
formatted_lookahead_date = lookahead_date.strftime('%Y%m%d')

In [21]:
pe_tj_df3 = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        duration_minutes,
        length_meters,
        number_of_points,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
        -- geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
        -- geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)

pe_tj_df3

Unnamed: 0,cuebiq_id,duration_minutes,length_meters,number_of_points,event_datetime_local,event_hour,start_geohash3,end_geohash3,local_date,day_of_week
0,2542421864,4.733333,887.332275,3,2019-11-01 16:38:58,16,d29,d29,20191101,Friday
1,2542421864,15.283333,885.803007,4,2019-11-01 17:04:42,17,d29,d29,20191101,Friday
2,2542094214,5.433333,267.882334,3,2019-11-01 21:20:58,21,d34,d34,20191101,Friday
3,2542094214,2.033333,123.426615,2,2019-11-01 21:31:41,21,d34,d34,20191101,Friday
4,2542094214,55.450000,152.322199,2,2019-11-01 21:36:03,21,d34,d34,20191101,Friday
...,...,...,...,...,...,...,...,...,...,...
1139309,2531233229,833.466667,4267.391503,12,2019-11-03 07:45:31,7,d2g,d2g,20191103,Sunday
1139310,1955753448,48.750000,1686.573642,12,2019-11-03 08:35:19,8,d2g,d2g,20191103,Sunday
1139311,1701335404,252.700000,30069.918737,10,2019-11-03 07:58:35,7,d29,d29,20191103,Sunday
1139312,2001169528,22.000000,280.950938,5,2019-11-03 06:23:03,6,d2g,d2g,20191103,Sunday


In [46]:
pe_tj_df3 = sql_engine.read_sql(
    f"""
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            duration_minutes,
            length_meters,
            number_of_points,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    )
    SELECT 
        day_of_week,
        start_geohash3,
        end_geohash3,
        COUNT(*) AS trip_count,
        COUNT(DISTINCT cuebiq_id) AS unique_cuebiq_ids,
        AVG(duration_minutes) AS m_duration_min,
        APPROX_PERCENTILE(duration_minutes, 0.5) AS mdn_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        AVG(length_meters) AS m_length_m,
        APPROX_PERCENTILE(length_meters, 0.5) AS mdn_length_m,
        STDDEV(length_meters) AS sd_length_m,
        AVG(number_of_points) AS m_points_no,
        APPROX_PERCENTILE(number_of_points, 0.5) AS mdn_points_no,
        STDDEV(number_of_points) AS sd_points_no
    FROM calculated_data
    GROUP BY day_of_week, start_geohash3, end_geohash3
    HAVING COUNT(*) > 9
    """
)

pe_tj_df3

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
0,Sunday,t9y,tdp,25,25,232.742667,189.716667,154.168553,120094.857637,95284.781449,67613.310013,11.080000,10,6.818602
1,Sunday,tg8,tgb,48,46,155.807639,65.450000,196.966896,87623.347963,14175.078218,340461.666107,11.958333,7,14.601018
2,Sunday,t9v,t9y,23,21,133.372464,84.133333,126.077162,54619.947743,31144.191658,60067.017894,11.000000,9,8.079154
3,Sunday,tdw,tdt,11,11,245.000000,218.200000,164.605948,61308.463633,60792.544241,45793.854842,8.818182,9,4.020403
4,Sunday,tf3,tf2,275,260,158.858364,96.189456,173.121444,32225.061234,14900.441948,61390.609431,9.821818,7,8.741684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2687,Saturday,tg5,tg7,10,10,190.205000,169.050000,117.477589,69592.097946,74010.274263,70247.664524,15.100000,9,13.067772
2688,Saturday,tdm,tdj,11,11,230.934848,145.716667,216.944580,61613.743316,36023.385424,55341.925611,14.181818,6,18.082136
2689,Wednesday,tdr,tf2,16,14,153.830208,96.100000,163.410386,300749.134725,28850.999664,831199.754500,14.937500,9,22.781480
2690,Monday,t9y,t9v,18,17,193.671296,148.766667,127.313951,45969.547281,43555.584063,41975.157211,11.611111,10,7.349581


In [47]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Thursday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
26,Thursday,tgb,tgb,1509,530,88.306417,34.694224,138.772566,6460.621963,1554.000039,55888.815927,5.711067,4,7.106710
29,Thursday,tdm,tdm,1354,502,98.750123,36.685348,157.098461,7865.108617,1570.538548,26996.840404,5.495569,4,4.986491
32,Thursday,t9y,t9z,39,35,133.349145,74.933333,186.843645,33137.747998,6139.753650,51990.133293,9.871795,7,8.697185
38,Thursday,tgd,tgd,51,29,138.585294,82.900000,159.007605,4658.005212,1889.363140,7119.933812,4.705882,4,3.651269
44,Thursday,tdr,tdp,20,19,195.785000,176.450000,127.395596,88365.723030,60462.426111,71944.872435,17.800000,18,12.738669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2653,Thursday,tf2,tf3,255,238,119.719869,72.652381,133.707162,24445.599354,16527.895182,25254.605247,11.462745,9,18.363285
2671,Thursday,tdr,tdq,27,27,198.198765,180.400000,154.495859,78161.753577,67445.970412,42293.950354,12.555556,9,11.318240
2677,Thursday,te7,tee,37,36,223.441892,136.216667,244.791486,94293.293480,68340.405110,99243.560424,9.972973,9,6.861675
2681,Thursday,tdq,tdm,10,10,251.896667,305.283333,131.491652,64056.563444,57056.865918,56424.646690,11.900000,10,12.422650


In [48]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Tuesday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
20,Tuesday,ttn,ttn,17428,5866,73.092481,30.439198,125.988568,6829.449917,1110.206942,29727.304717,6.181145,5,8.368901
24,Tuesday,ttp,ttp,4110,1565,81.432839,32.762603,137.152146,6574.258004,969.810476,28305.503737,5.682238,4,7.924427
30,Tuesday,ttn,ttp,409,392,109.419478,65.723525,128.261297,28665.504718,14845.344631,113829.665719,9.088020,7,6.894306
36,Tuesday,teg,teu,46,44,156.546739,124.666667,111.868474,73597.059928,41637.962625,161372.570303,8.826087,7,8.133072
91,Tuesday,tet,tes,15,15,178.523333,167.300000,114.824934,89562.546045,102818.576271,70638.501549,13.066667,9,13.729357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657,Tuesday,tsh,tsh,448,179,90.149219,38.232352,132.332034,10933.375906,1208.685954,42305.997861,4.955357,4,3.420267
2667,Tuesday,teg,tee,30,30,122.641667,78.566667,113.719446,43159.651692,35087.044000,60312.990702,10.000000,10,6.669540
2668,Tuesday,tsz,ttn,12,12,227.436111,227.166667,70.644178,222780.729049,174874.927805,279875.011063,23.666667,21,16.983058
2678,Tuesday,ts5,tef,11,10,338.675758,204.816667,310.192672,235080.843158,204810.598785,130711.347351,13.818182,12,10.980147


In [49]:
pe_tj_df3[pe_tj_df3['day_of_week'] == 'Friday']

Unnamed: 0,day_of_week,start_geohash3,end_geohash3,trip_count,unique_cuebiq_ids,m_duration_min,mdn_duration_min,sd_duration_min,m_length_m,mdn_length_m,sd_length_m,m_points_no,mdn_points_no,sd_points_no
8,Friday,tej,tej,1133,421,111.483495,45.932558,166.565980,7687.023388,1328.667988,32931.571607,4.982348,4,3.768904
11,Friday,tdv,tdv,1202,425,114.429659,53.833064,152.649744,9914.995809,1715.402426,40863.975403,5.278702,4,4.693755
12,Friday,tus,tus,4252,1341,100.791040,45.094871,146.462573,9723.262921,1152.943451,74061.401072,5.459078,4,4.443161
14,Friday,wh1,wh1,1166,287,80.705660,37.702898,118.400321,3727.440322,1153.586322,9728.626031,6.166381,5,4.913826
35,Friday,tf2,tf0,38,36,208.598684,193.800000,159.980187,95434.554303,77828.841532,60973.715056,17.289474,13,18.754326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2639,Friday,tuz,tuz,326,108,116.228016,40.925000,180.932337,3775.247684,705.074061,12142.860017,5.239264,4,4.387162
2642,Friday,tsq,tsq,573,203,110.509250,46.108333,152.257215,8330.462591,1187.516841,44671.199219,4.834206,4,4.629498
2662,Friday,tud,tuc,12,12,395.901389,367.233333,324.262716,243380.640371,180316.045153,245719.016369,9.583333,10,4.718596
2663,Friday,tu7,tud,15,15,192.932222,101.766667,205.654177,134890.655815,22686.159901,248007.710633,12.200000,10,8.620573


In [17]:
country_code = 'MX'
file = f'/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_2020_all.csv'
df = pd.read_csv(file)
df

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g6gn,9g6gn,4,1123,52.303829,84.694805,24.492758,2478.805552,14313.106264,883.025480,5.249332,3.724011,4
1,9g3qn,9g3qn,4,11099,47.509409,79.890257,22.370194,2302.092784,11422.596758,482.089316,5.404451,5.083493,4
2,9g8tk,9g8th,4,92,73.238406,111.532303,42.133333,8763.635928,8687.627861,7923.722165,7.989130,10.384365,6
3,9erxm,9g800,4,2,90.758333,15.002449,101.366667,70775.134047,9390.730270,77415.383102,9.000000,1.414214,10
4,9g3qp,9g3qr,4,2608,66.169095,87.248769,39.887464,6692.192799,8537.310240,4666.388650,9.883052,10.514814,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3674029,9g9gd,9g9f3,45,1,169.150000,,169.150000,21111.143372,,21111.143372,2.000000,,2
3674030,9g3du,9g3ct,45,1,432.850000,,432.850000,46460.111681,,46460.111681,6.000000,,6
3674031,9ewt9,9ewt5,45,1,32.983333,,32.983333,16946.059808,,16946.059808,7.000000,,7
3674032,9ew76,9ewt8,45,1,93.250000,,93.250000,55031.737648,,55031.737648,13.000000,,13


In [18]:
filter_df = df[df['trip_count']>9]
filter_df

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g6gn,9g6gn,4,1123,52.303829,84.694805,24.492758,2478.805552,14313.106264,883.025480,5.249332,3.724011,4
1,9g3qn,9g3qn,4,11099,47.509409,79.890257,22.370194,2302.092784,11422.596758,482.089316,5.404451,5.083493,4
2,9g8tk,9g8th,4,92,73.238406,111.532303,42.133333,8763.635928,8687.627861,7923.722165,7.989130,10.384365,6
4,9g3qp,9g3qr,4,2608,66.169095,87.248769,39.887464,6692.192799,8537.310240,4666.388650,9.883052,10.514814,7
5,9g3js,9g3js,4,8770,47.854647,79.319795,23.523991,3436.889710,13181.683961,827.116864,5.695553,5.301225,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3671491,9g3fy,9g3fy,53,11,27.477273,20.350980,18.000000,419.303410,344.497136,292.460929,3.727273,1.272078,4
3671542,9eqwp,9eqwp,53,16,87.979167,99.758190,54.433333,780.806827,637.544687,670.684228,4.437500,2.988171,4
3671633,9erug,9erug,53,17,49.506863,28.567640,50.633333,632.690548,500.693706,498.353594,5.941176,2.925547,5
3671659,9fypm,9fypm,53,12,11.706944,9.712856,10.666667,860.711383,507.096099,755.410580,4.583333,3.449857,4


In [19]:
filter_df.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2020.csv', index=False)

In [16]:
df['trip_count'].sum()

32827456

In [4]:
file = f'/home/jovyan/Data/Week/cleaned/od_week_gh5_id_2020.csv'
df2 = pd.read_csv(file)
df2

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,9g995,9g995,6,203,60.405993,143.300773,17.025000,3718.925813,13731.502694,503.143027,5.241379,3.892855,4
1,9g3qr,9g3mw,6,74,139.002027,177.103662,80.183333,17513.415243,4611.038373,16677.289359,15.027027,11.188268,14
2,9u8d2,9u8d1,6,33,81.321212,117.584744,42.583333,7210.013726,4006.938820,6150.999834,6.060606,4.885817,5
3,9fyp5,9fyp5,6,1354,61.615990,100.332968,27.434377,2809.206251,7829.470119,930.018084,5.531758,4.244133,4
4,9ewtc,9ewt8,6,152,111.794956,154.009574,56.800000,7456.405306,6257.918197,6512.856717,8.394737,11.813203,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093239,9smej,9smej,53,12,55.004167,53.718055,45.766667,3690.818590,7187.219261,634.951100,5.416667,4.100074,5
1093240,9swde,9swde,53,24,52.932639,67.284160,22.950000,2858.469697,9229.885261,628.018065,3.708333,1.966661,3
1093241,9ex2m,9ex2m,53,16,39.725000,41.528182,25.516667,480.617649,591.973133,298.952981,5.500000,4.993329,4
1093242,9sp3c,9sp3c,53,12,127.113889,217.525685,35.166667,781.320740,625.045905,631.033294,5.083333,2.998737,4


In [9]:
file = f'/home/jovyan/Data/Week/od_co_2020_week_gh5.csv'
df3 = pd.read_csv(file)
df3

Unnamed: 0,start_geohash5,end_geohash5,week_number,trip_count,m_duration_min,sd_duration_min,mdn_duration_min,m_length_m,sd_length_m,mdn_length_m,m_points_no,sd_points_no,mdn_points_no
0,d2g69,d2g66,4,291,100.028121,135.639745,50.605144,9320.914519,6220.760817,7676.119142,11.271478,13.078484,8
1,d3fyd,d3fy9,4,187,54.585561,71.321449,33.668000,4073.167008,3480.072318,3218.587790,7.122995,4.907845,6
2,d2ewh,d2etg,4,4,85.737500,46.057543,77.816667,19658.851292,13776.338726,13933.991186,9.750000,5.377422,11
3,d2g66,d2g61,4,277,81.459085,97.542714,51.523958,8679.297887,5836.750163,7790.156048,11.155235,9.214498,10
4,d2g6f,d2g66,4,199,136.272948,196.721680,65.472222,13102.333753,5539.325497,11993.410301,12.984925,8.972166,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540775,d3475,d345x,50,1,53.566667,,53.566667,22658.612768,,22658.612768,11.000000,,11
540776,d3gpz,d3gpr,53,1,72.666667,,72.666667,10576.055503,,10576.055503,5.000000,,5
540777,d2fb0,d2d2x,50,1,161.233333,,161.233333,162355.974951,,162355.974951,19.000000,,19
540778,d3gpp,d3gpn,46,1,9.750000,,9.750000,3328.177843,,3328.177843,4.000000,,4


In [9]:
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_9, 
        end_h3_9, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 9) AS start_h3_9, 
            h3_encode(end_lat, end_lng, 9) AS end_h3_9, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20191101 AND 20191103
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_9, end_h3_9, week_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/CO/{country_code.lower()}_2019_h39.csv', index=False)