In [None]:
# Load (or reload) the `sql` IPython extension that provides the %sql magic.
%reload_ext sql
# SqlMagic options for this session: no autocommit, autolimit 0, results returned
# as pandas objects, and at most 200 rows displayed.
# NOTE(review): exact option semantics are defined by the installed ipython-sql version.
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [None]:
# Point the %sql magic at the local Trino endpoint (port 9090, catalog `cuebiq`).
# NOTE(review): the queries visible in this notebook go through `sql_engine`, not this magic.
%sql trino://localhost:9090/cuebiq/

In [None]:
import logging
import os
import time
from datetime import datetime, timedelta

# import geohash
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

# pip install python-geohash

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """Small helper bundling a Trino DB-API cursor and a SQLAlchemy engine.

    The cursor is used for statements executed directly (``execute_statement``);
    the SQLAlchemy engine is handed to pandas (``read_sql``).

    Parameters
    ----------
    host : str
        Trino coordinator host. Defaults to ``"localhost"``.
    port : int
        Trino coordinator port. Defaults to ``9090``.
    catalog : str
        Catalog to connect to. Defaults to ``"cuebiq"``.

    With the defaults the connection target is exactly the one that used to be
    hard-coded (``trino://localhost:9090/cuebiq/``).
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so it can be inspected or closed by the caller.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Build the engine URL from the same settings so cursor and engine cannot drift apart.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Execute ``query`` on the DB-API cursor and return all fetched rows.

        Intended for create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run ``query`` through ``pandas.read_sql`` on the SQLAlchemy engine and
        return the result as a DataFrame.

        Intended for select and insert-into operations.
        """
        return pd.read_sql(query, self.engine)

# Single shared engine instance used by the query cells below
# (localhost:9090, catalog `cuebiq`).
sql_engine = TrinoEngine()

In [None]:
# Fully-qualified schema per dataset key, and the trajectory table derived from it.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_tj_table = "{}.trajectory_uplevelled".format(schema_name['cda'])

In [None]:
# Configure root logging: INFO and above, formatted as "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Weekly Aggregation

## GH5

In [None]:
def extract_od_data(sql_engine, country_code, year, output_dir='/home/jovyan/Data/Week'):
    """Aggregate one year of domestic trips into weekly geohash-5 OD statistics and save them as CSV.

    Trips are kept when both ``start_country`` and ``end_country`` equal
    ``country_code``, ``event_date`` lies between Jan 1 and Dec 31 of ``year``
    and neither ``start_lat`` nor ``end_lat`` is 0. They are grouped by
    (start geohash-5, end geohash-5, week number); per group the query returns
    the trip count plus mean, standard deviation and approximate median of
    duration (minutes), length (meters) and number of points.

    Parameters
    ----------
    sql_engine : object
        Anything exposing ``read_sql(query) -> pandas.DataFrame``
        (e.g. the notebook's ``TrinoEngine``).
    country_code : str
        Country code compared against ``start_country`` / ``end_country``.
        Must be ASCII alphanumeric because it is interpolated into the SQL text.
    year : int or str
        Four-digit year; also interpolated into the SQL text and the file name.
    output_dir : str, optional
        Directory the CSV is written to. Defaults to the notebook's weekly
        data directory.

    Returns
    -------
    str
        Path of the written CSV: ``<output_dir>/od_<country>_<year>_week_gh5.csv``.

    Raises
    ------
    ValueError
        If ``country_code`` is not an ASCII alphanumeric string or ``year`` is
        not a four-digit integer.

    Notes
    -----
    ``week_number`` comes from ``WEEK()`` applied to the local start timestamp.
    Trino documents ``week()`` as the ISO week of year, so days around New Year
    can carry a week number belonging to the adjacent year; the "Check - Week"
    section of this notebook deals with that after export.
    """
    # Both values are pasted into the SQL string, so reject anything that could
    # break out of the quoted literal / numeric range.
    if not isinstance(country_code, str) or not (country_code.isascii() and country_code.isalnum()):
        raise ValueError(f"country_code must be an ASCII alphanumeric string, got {country_code!r}")
    year = int(year)
    if not 1000 <= year <= 9999:
        raise ValueError(f"year must be a four-digit year, got {year!r}")

    query = f"""
    SELECT
        start_geohash5,
        end_geohash5,
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM
    (
        SELECT
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
            duration_minutes,
            length_meters,
            number_of_points,
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {year}0101 AND {year}1231
            AND end_country = '{country_code}'
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_geohash5, end_geohash5, week_number
    """

    od_weekly = sql_engine.read_sql(query)
    output_file_path = os.path.join(output_dir, f'od_{country_code.lower()}_{year}_week_gh5.csv')
    od_weekly.to_csv(output_file_path, index=False)

    return output_file_path


In [None]:
# Example run: India, 2020 (author's note: this call has not been tested yet).
output_file = extract_od_data(sql_engine=sql_engine, country_code='IN', year=2020)
print("Data saved to: {}".format(output_file))

In [None]:
# Weekly geohash-5 OD aggregation for one country and year, written to the weekly data directory.
country_code = 'IN'
# `year` was referenced in the output file name without ever being defined at
# notebook scope (NameError on a fresh kernel); define it once and use it for
# both the date range and the file name.
year = 2020

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT
        start_geohash5,
        end_geohash5,
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM
    (
        SELECT
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
            duration_minutes,
            length_meters,
            number_of_points,
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {year}0101 AND {year}1231
            AND end_country = '{country_code}'
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_geohash5, end_geohash5, week_number
    """
)

# pe_tj_df7
pe_tj_df7.to_csv(f'/home/jovyan/Data/Week/od_{country_code.lower()}_{year}_week_gh5.csv', index=False)

## H3 7
The year has to be queried in three separate, overlapping date ranges (MMDD):
0101-0503, 0427-0904, 0827-1231

In [None]:
# Weekly OD aggregation on H3 resolution-7 cells for one country, covering
# event_date 2020-01-01 .. 2020-07-03. The result is written to the `week1` CSV
# that the "H37" check section later reads back.
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20200101 AND 20200703
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, week_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week1_h37.csv', index=False)

In [None]:
# Pull un-aggregated trips for event_date 2019-12-28 .. 2019-12-31 together with
# their WEEK() value — presumably to inspect how year-end days are labelled.
country_code = 'ID'
query = f"""
    SELECT 
        cuebiq_id,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
        geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
        duration_minutes,
        length_meters,
        number_of_points, 
        WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN 20191228 AND 20191231
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
        AND start_lat <> 0
        AND end_lat <> 0
"""

pe_tj_df = sql_engine.read_sql(query)
pe_tj_df

In [None]:
# Make sure the timestamp column is datetime64 before using the `.dt` accessor
# (the equivalent cell for `pe_tj_df7` below needs the same conversion); this is
# a no-op when the driver already returned datetimes.
pe_tj_df['event_datetime_local'] = pd.to_datetime(pe_tj_df['event_datetime_local'])
pe_tj_df['day_of_month'] = pe_tj_df['event_datetime_local'].dt.day
pe_tj_df

In [None]:
# Trips whose local start falls on the 31st.
pe_tj_df.loc[pe_tj_df['day_of_month'].eq(31)]

In [None]:
# Pull un-aggregated trips for event_date 2020-01-01 .. 2020-01-10 together with
# their WEEK() value — presumably to inspect week labels at the start of the year.
country_code = 'ID'
query = f"""
    SELECT 
        cuebiq_id,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
        geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
        duration_minutes,
        length_meters,
        number_of_points, 
        WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN 20200101 AND 20200110
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}'
        AND start_lat <> 0
        AND end_lat <> 0
"""

pe_tj_df7 = sql_engine.read_sql(query)
pe_tj_df7

In [None]:
# Parse the local start timestamp, then keep its calendar date in its own column.
pe_tj_df7['event_datetime_local'] = pe_tj_df7['event_datetime_local'].pipe(pd.to_datetime)
pe_tj_df7['local_date'] = pe_tj_df7['event_datetime_local'].dt.date
pe_tj_df7

In [None]:
# Day of month of the local start timestamp.
pe_tj_df7['day_of_month'] = pe_tj_df7.loc[:, 'event_datetime_local'].dt.day
pe_tj_df7

In [None]:
# Trips whose local start falls on the 6th.
pe_tj_df7.loc[pe_tj_df7['day_of_month'].eq(6)]

# Check - Week

2019: trips in late December are labelled week 1 and need to be relabelled as week 53  
2020: no change needed

## GH5

In [None]:
# Country whose GH5 weekly files are checked in this section; the active
# assignment is the one in effect, the commented lines are the other options.
# country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN' 
country_code = 'MX' 

##### Remove week 1, since it combines trips from early January (from 0101) and late December (from 1230)

In [None]:
# Load the 2019 weekly OD table for the selected country.
file1 = '/home/jovyan/Data/Week/od_{}_2019_week.csv'.format(country_code.lower())

df1 = pd.read_csv(filepath_or_buffer=file1)
df1

In [None]:
# Keep every row except week 1, then list the week numbers that remain.
df1 = df1.loc[df1['week_number'].ne(1)]
sorted(pd.unique(df1['week_number']))

In [None]:
# Write the filtered table back to the same path it was loaded from (overwrites the input file).
df1.to_csv(
    path_or_buf=f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week.csv',
    index=False,
)

##### 2020

In [None]:
# 2020 needs no week relabelling, so the loaded table is used as `combined` directly.
# (`combined` used to be displayed here while its assignment was commented out,
# which fails on a fresh kernel or silently shows a table from an earlier run.)
file = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week_gh5.csv'
df = pd.read_csv(file)
combined = df
combined

##### for 2019 only

In [None]:
# Base weekly table that the following cells concatenate with the week-1 and week-53 files.
# NOTE(review): this cell sits under the "for 2019 only" heading but loads the
# 2020 file — confirm the intended year before running the 2019 workflow.
file = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week_gh5.csv'
df = pd.read_csv(file)
# df

In [None]:
# Separately exported 2019 week-1 table.
file1 = '/home/jovyan/Data/Week/od_{}_2019_week_gh5_w1.csv'.format(country_code.lower())
df1 = pd.read_csv(filepath_or_buffer=file1)
df1

In [None]:
# Separately exported 2019 week-53 table.
file53 = '/home/jovyan/Data/Week/od_{}_2019_week_gh5_w53.csv'.format(country_code.lower())
df53 = pd.read_csv(filepath_or_buffer=file53)
df53

In [None]:
# Stack the base table with the week-1 and week-53 tables under a fresh index.
# Alternative without the week-1 table:
# combined = pd.concat([df, df53], ignore_index=True)
combined = pd.concat(objs=[df, df1, df53], ignore_index=True)

combined

##### plot

In [None]:
# Week numbers present in the combined table.
sorted(pd.unique(combined['week_number']))

In [None]:
# Plot (1): Number of rows per week
# `plt` comes from the notebook's import cell (it was previously used without
# being imported). The figure is drawn on an explicit Axes instead of the
# implicit pyplot state.
rows_per_week = combined.groupby('week_number').size()
fig, ax = plt.subplots(figsize=(8, 6))
rows_per_week.plot(kind='bar', ax=ax)
ax.set_title('Number of Rows per Week')
ax.set_xlabel('Week Number')
ax.set_ylabel('Number of Rows')
plt.show()

In [None]:
# Plot (2): Number of trip_count per week
# `plt` comes from the notebook's import cell (it was previously used without
# being imported). The figure is drawn on an explicit Axes instead of the
# implicit pyplot state.
trip_count_per_week = combined.groupby('week_number')['trip_count'].sum()
fig, ax = plt.subplots(figsize=(8, 6))
trip_count_per_week.plot(kind='bar', ax=ax)
ax.set_title('Number of Trip Counts per Week')
ax.set_xlabel('Week Number')
ax.set_ylabel('Trip Count')
plt.show()

##### export 

In [None]:
# Export the unfiltered combined table. For 2019 use the commented variant instead:
# combined.to_csv(f"/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_2019_all.csv", index=False)
combined.to_csv(
    path_or_buf=f"/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_2020_all.csv",
    index=False,
)

In [None]:
# Keep only OD/week rows with more than 9 trips.
filtered = combined.loc[combined['trip_count'].gt(9)]
filtered

In [None]:
# Export the filtered table. For 2019 use the commented variant instead:
# filtered.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2019.csv', index=False)
filtered.to_csv(
    path_or_buf=f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2020.csv',
    index=False,
)

In [None]:
# df1[(df1['week_number'] == 53) & (df1['start_geohash5']== 'd2g66') & (df1['end_geohash5']== 'd2g66')]

## H37

In [None]:
# Country whose H3-7 weekly chunk files are combined in this section; the
# commented lines are the other options.
country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN'
# country_code = 'MX'

In [None]:
# First H3-7 weekly chunk. 2019 variant of the path:
# file1 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week1_h37.csv'
file1 = '/home/jovyan/Data/Week/od_{}_2020_week1_h37.csv'.format(country_code.lower())
df1 = pd.read_csv(filepath_or_buffer=file1)
# df1['week_number'] = df1['week_number'].replace(1, 53) # For CO

sorted(pd.unique(df1['week_number']))

In [None]:
# Keep the weeks this chunk is responsible for (the overlap is taken from the next chunk).
fil1 = df1.loc[df1['week_number'].lt(27)] # For ID
# fil1 = df1[df1['week_number'] < 18] # For MX, IN
sorted(pd.unique(fil1['week_number']))

In [None]:
# Second H3-7 weekly chunk. 2019 variant of the path:
# file2 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week2_h37.csv'
file2 = '/home/jovyan/Data/Week/od_{}_2020_week2_h37.csv'.format(country_code.lower())
df2 = pd.read_csv(filepath_or_buffer=file2)
# df2['week_number'] = df2['week_number'].replace(1, 53) # For ID
# df2
sorted(pd.unique(df2['week_number']))

In [None]:
# Keep the weeks after the first chunk's cut-off.
# fil2 = df2[(df2['week_number']>17) & (df2['week_number']<36)] # For IN & MX
fil2 = df2.loc[df2['week_number'].gt(26)]
sorted(pd.unique(fil2['week_number']))

In [None]:
# # For MX, IN
# # file3 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2019_week3_h37.csv'
# file3 = f'/home/jovyan/Data/Week/od_{country_code.lower()}_2020_week3_h37.csv'
# df3 = pd.read_csv(file3)
# df3['week_number'] = df3['week_number'].replace(1, 53) # for 2019
# # df3
# sorted(df3['week_number'].unique())

In [None]:
# fil3 = df3[df3['week_number'] > 35]
# sorted(fil3['week_number'].unique())

In [None]:
# Stack the trimmed chunks under a fresh index. Three-chunk variant:
# combined_df = pd.concat([fil1, fil2, fil3], ignore_index=True)
combined_df = pd.concat(objs=[fil1, fil2], ignore_index=True)
combined_df

In [None]:
# Week numbers present after combining the chunks.
sorted(pd.unique(combined_df['week_number']))

In [None]:
# Drop fully identical rows, keeping the first occurrence.
combined_df = combined_df.loc[~combined_df.duplicated()]
combined_df

In [None]:
# Export the unfiltered combined table. For 2019 use the commented variant instead:
# combined_df.to_csv(f'/home/jovyan/Data/Week/combined/od_week_h37_{country_code.lower()}_2019_all.csv', index=False)
combined_df.to_csv(
    path_or_buf=f'/home/jovyan/Data/Week/combined/od_week_h37_{country_code.lower()}_2020_all.csv',
    index=False,
)

In [None]:
# Keep only OD/week rows with more than 9 trips.
filter_df = combined_df.loc[combined_df['trip_count'].gt(9)]
filter_df

In [None]:
# Export the filtered table. For 2019 use the commented variant instead:
# filter_df.to_csv(f'/home/jovyan/Data/Week/cleaned/od_week_h37_{country_code.lower()}_2019.csv', index=False)
filter_df.to_csv(
    path_or_buf=f'/home/jovyan/Data/Week/cleaned/od_week_h37_{country_code.lower()}_2020.csv',
    index=False,
)

In [None]:
# Monthly OD aggregation on H3 resolution-7 cells for one country.
# NOTE(review): the date filter covers only 2019-11-01 .. 2019-12-31 although the
# output file name says "2019_month" — confirm that a partial year is intended.
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20191101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month_h37.csv', index=False)
# 1133  (author's note; meaning not stated — TODO confirm)

In [None]:
# Monthly OD aggregation on H3 resolution-7 cells, first half of 2019
# (event_date 2019-01-01 .. 2019-06-30); written to the `month1` file.
# country_code = 'MX'
country_code = 'IN'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20190630
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month1_h37.csv', index=False)

In [None]:
# Monthly OD aggregation on H3 resolution-7 cells, second half of 2019
# (event_date 2019-07-01 .. 2019-12-31); written to the `month2` file.
# country_code = 'MX'
country_code = 'IN'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190701 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month2_h37.csv', index=False)

In [None]:
# Monthly OD aggregation on H3 resolution-7 cells, first half of 2019
# (event_date 2019-01-01 .. 2019-06-30); written to the `month1` file.
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20190630
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month1_h37.csv', index=False)

In [None]:
# Monthly OD aggregation on H3 resolution-7 cells, second half of 2019
# (event_date 2019-07-01 .. 2019-12-31); written to the `month2` file.
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_7, 
        end_h3_7, 
        month_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7, 
            h3_encode(end_lat, end_lng, 7) AS end_h3_7, 
            duration_minutes,
            length_meters,
            number_of_points, 
            MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190701 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_7, end_h3_7, month_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month2_h37.csv', index=False)

In [None]:
# Monthly geohash-5 OD aggregation for all of 2019; the month is derived in the
# outer query from the parsed local start timestamp.
# NOTE(review): unlike the weekly and H3 queries, this one has no
# `start_lat <> 0` / `end_lat <> 0` filter — confirm whether that is intended.
country_code = 'ID'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        MONTH(event_datetime_local) AS month,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_geohash5, end_geohash5, MONTH(event_datetime_local)
    """
)

pe_tj_df7


In [None]:
# Persist the monthly table (the country is fixed in the file name, not taken from `country_code`).
pe_tj_df7.to_csv(path_or_buf='/home/jovyan/Data/Month/od_id_2019_month.csv', index=False)

In [None]:
# Monthly geohash-5 OD aggregation for all of 2019; the month is derived in the
# outer query from the parsed local start timestamp.
# NOTE(review): unlike the weekly and H3 queries, this one has no
# `start_lat <> 0` / `end_lat <> 0` filter — confirm whether that is intended.
country_code = 'MX'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_geohash5, 
        end_geohash5, 
        MONTH(event_datetime_local) AS month,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5, 
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5, 
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20190101 AND 20191231
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_geohash5, end_geohash5, MONTH(event_datetime_local)
    """
)

pe_tj_df7


In [None]:
# Persist the monthly table (the country is fixed in the file name, not taken from `country_code`).
pe_tj_df7.to_csv(path_or_buf='/home/jovyan/Data/Month/od_mx_2019_month.csv', index=False)

In [None]:
# Query window: the event date plus a 7-day look-ahead, both rendered as YYYYMMDD.
country_code = 'CO'
event_date = '2020-11-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
lookahead_date = event_date_dt + timedelta(days=7)

formatted_current_date = f"{event_date_dt:%Y%m%d}"
formatted_lookahead_date = f"{lookahead_date:%Y%m%d}"

In [None]:
# Weekly OD aggregation over the look-ahead window defined in the previous cell.
# The cells are produced by h3_encode(..., 7), so the columns are named
# start_h3_7 / end_h3_7 (they used to be aliased *_geohash5, which mislabelled
# H3 indexes as geohashes).
# NOTE(review): if geohash-5 output was actually intended, switch the two
# h3_encode calls to geohash_encode(..., 5) and restore the old aliases.
pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT
        start_h3_7,
        end_h3_7,
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM
    (
        SELECT
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 7) AS start_h3_7,
            h3_encode(end_lat, end_lng, 7) AS end_h3_7,
            duration_minutes,
            length_meters,
            number_of_points,
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}'
            AND start_country = '{country_code}'
    ) subquery
    GROUP BY start_h3_7, end_h3_7, week_number
    """
)

pe_tj_df7


In [None]:
# Query window: the event date plus a 14-day look-ahead, both rendered as YYYYMMDD.
country_code = 'CO'
event_date = '2020-02-10'

event_date_dt = datetime.strptime(event_date, '%Y-%m-%d')
lookahead_date = event_date_dt + timedelta(days=14)

formatted_current_date = f"{event_date_dt:%Y%m%d}"
formatted_lookahead_date = f"{lookahead_date:%Y%m%d}"

In [None]:
# Un-aggregated trips in the look-ahead window with geohash-3 start/end cells,
# local hour, local date (YYYYMMDD) and weekday name.
# NOTE(review): the two DATE_FORMAT(date_parse(...)) expressions are not wrapped
# in TRY like the other timestamp parses — confirm that malformed timestamps
# cannot occur, otherwise the whole query may fail on them.
pe_tj_df3 = sql_engine.read_sql(
    f"""
    SELECT 
        cuebiq_id,
        duration_minutes,
        length_meters,
        number_of_points,
        TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
        EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
        -- geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
        -- geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
        DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
)

pe_tj_df3

In [None]:
# Aggregate the same window by weekday name and geohash-3 OD pair, keeping only
# groups with more than 9 trips (HAVING). Overwrites `pe_tj_df3` from the
# previous cell with the aggregated result.
pe_tj_df3 = sql_engine.read_sql(
    f"""
    WITH calculated_data AS (
        SELECT 
            cuebiq_id,
            duration_minutes,
            length_meters,
            number_of_points,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%Y%m%d') AS local_date,
            DATE_FORMAT(date_parse(substr(start_zoned_datetime, 1, 10), '%Y-%m-%d'), '%W') AS day_of_week
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN {formatted_current_date} AND {formatted_lookahead_date}
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
    )
    SELECT 
        day_of_week,
        start_geohash3,
        end_geohash3,
        COUNT(*) AS trip_count,
        COUNT(DISTINCT cuebiq_id) AS unique_cuebiq_ids,
        AVG(duration_minutes) AS m_duration_min,
        APPROX_PERCENTILE(duration_minutes, 0.5) AS mdn_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        AVG(length_meters) AS m_length_m,
        APPROX_PERCENTILE(length_meters, 0.5) AS mdn_length_m,
        STDDEV(length_meters) AS sd_length_m,
        AVG(number_of_points) AS m_points_no,
        APPROX_PERCENTILE(number_of_points, 0.5) AS mdn_points_no,
        STDDEV(number_of_points) AS sd_points_no
    FROM calculated_data
    GROUP BY day_of_week, start_geohash3, end_geohash3
    HAVING COUNT(*) > 9
    """
)

pe_tj_df3

In [None]:
# Aggregated rows for Thursdays.
pe_tj_df3.loc[pe_tj_df3['day_of_week'].eq('Thursday')]

In [None]:
# Aggregated rows for Tuesdays.
pe_tj_df3.loc[pe_tj_df3['day_of_week'].eq('Tuesday')]

In [None]:
# Aggregated rows for Fridays.
pe_tj_df3.loc[pe_tj_df3['day_of_week'].eq('Friday')]

In [None]:
# Reload the exported, unfiltered 2020 GH5 table for one country.
country_code = 'MX'
file = '/home/jovyan/Data/Week/combined/od_week_gh5_{}_2020_all.csv'.format(country_code.lower())
df = pd.read_csv(filepath_or_buffer=file)
df

In [None]:
# Keep only OD/week rows with more than 9 trips.
filter_df = df.loc[df['trip_count'].gt(9)]
filter_df

In [None]:
# Write the filtered table to the cleaned directory.
filter_df.to_csv(
    path_or_buf=f'/home/jovyan/Data/Week/cleaned/od_week_gh5_{country_code.lower()}_2020.csv',
    index=False,
)

In [None]:
# Total number of trips in the unfiltered table.
df.loc[:, 'trip_count'].sum()

In [None]:
# Inspect the cleaned 2020 GH5 table for ID.
file = '/home/jovyan/Data/Week/cleaned/od_week_gh5_id_2020.csv'
df2 = pd.read_csv(filepath_or_buffer=file)
df2

In [None]:
# Inspect the raw weekly 2020 GH5 export for CO.
file = '/home/jovyan/Data/Week/od_co_2020_week_gh5.csv'
df3 = pd.read_csv(filepath_or_buffer=file)
df3

In [None]:
# Weekly OD aggregation on H3 resolution-9 cells for a three-day sample
# (event_date 2019-11-01 .. 2019-11-03); written under /home/jovyan/CO/.
country_code = 'CO'

pe_tj_df7 = sql_engine.read_sql(
    f"""
    SELECT 
        start_h3_9, 
        end_h3_9, 
        week_number,
        COUNT(*) AS trip_count,
        AVG(duration_minutes) AS m_duration_min,
        STDDEV(duration_minutes) AS sd_duration_min,
        approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
        AVG(length_meters) AS m_length_m,
        STDDEV(length_meters) AS sd_length_m,
        approx_percentile(length_meters, 0.5) AS mdn_length_m,
        AVG(number_of_points) AS m_points_no,
        STDDEV(number_of_points) AS sd_points_no,
        approx_percentile(number_of_points, 0.5) AS mdn_points_no
    FROM 
    (
        SELECT 
            cuebiq_id,
            TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s')) AS event_datetime_local,
            h3_encode(start_lat, start_lng, 9) AS start_h3_9, 
            h3_encode(end_lat, end_lng, 9) AS end_h3_9, 
            duration_minutes,
            length_meters,
            number_of_points, 
            WEEK(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS week_number
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE 
            event_date BETWEEN 20191101 AND 20191103
            AND end_country = '{country_code}' 
            AND start_country = '{country_code}'
            AND start_lat <> 0
            AND end_lat <> 0
    ) subquery
    GROUP BY start_h3_9, end_h3_9, week_number
    """
)

pe_tj_df7.to_csv(f'/home/jovyan/CO/{country_code.lower()}_2019_h39.csv', index=False)