In [None]:
# (Re)load the `sql` IPython extension, then set its SqlMagic options.
%reload_ext sql
# Do not auto-commit statements issued through %sql.
%config SqlMagic.autocommit = False
# autolimit = 0: per the ipython-sql docs, 0 means no automatic row limit on result sets.
%config SqlMagic.autolimit = 0
# Return %sql results as pandas DataFrames.
%config SqlMagic.autopandas = True
# Cap the number of rows rendered when a result set is displayed.
%config SqlMagic.displaylimit = 200

In [None]:
# Open the %sql connection to Trino at localhost:9090 (same URL as TrinoEngine's SQLAlchemy engine).
# NOTE(review): no cell visible in this notebook issues a %sql query; the
# queries go through TrinoEngine instead. Confirm this connection is still needed.
%sql trino://localhost:9090/cuebiq/

In [None]:
import os
import time
import logging
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """
    Small wrapper around the two Trino access paths used in this notebook.

    It holds a DB-API connection/cursor (``trino.dbapi.connect``) for
    ``execute_statement`` and a SQLAlchemy engine for ``read_sql``.

    Parameters:
    - host (str): Trino host name. Defaults to "localhost".
    - port (int): Trino port. Defaults to 9090.
    - catalog (str): Default catalog. Defaults to "cuebiq".

    The defaults are the values that used to be hard-coded, so
    ``TrinoEngine()`` connects exactly as before.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance (it used to be a discarded
        # local) so that it can be closed explicitly via close().
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Build the SQLAlchemy URL from the same parameters so the cursor and
        # the engine can never point at different servers/catalogs.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Run a statement on the DB-API cursor and return ``cursor.fetchall()``.

        Intended by the original author for CREATE and DROP statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run a query through the SQLAlchemy engine and return the result as a
        DataFrame (``pandas.read_sql``).

        Intended by the original author for SELECT and INSERT INTO operations.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """Close the DB-API connection and dispose of the SQLAlchemy engine."""
        self.conn.close()
        self.engine.dispose()

# Shared Trino client; the query cells and export_weekly_data() below call sql_engine.read_sql().
sql_engine = TrinoEngine()

In [None]:
# Fully-qualified schema per dataset key, and the trajectory table inside the 'cda' schema.
# NOTE(review): the queries visible in this notebook spell the table name out
# literally instead of using pe_tj_table.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_tj_table = '.'.join([schema_name['cda'], 'trajectory_uplevelled'])

In [None]:
# Configure the root logger: INFO and above, timestamp - level - message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# H3 resolution 7 (H37)

In [None]:
def export_monthly_h3_data(country_code, year, half, output_dir='/home/jovyan/Data/Month'):
    """
    Export monthly H3 (resolution 7) start/end-cell trip aggregates for one
    country and one half-year, and save them as a CSV file.

    For every (start_h3_7, end_h3_7, month_number) group the query returns the
    trip count plus mean, standard deviation and approximate median
    (approx_percentile 0.5) of duration_minutes, length_meters and
    number_of_points. Only trips that start and end in `country_code` and
    whose start_lat/end_lat are non-zero are included.

    Parameters:
    - country_code (str): The country code (e.g., 'IN', 'CO', 'MX', etc.).
      It is interpolated directly into the SQL text, so pass trusted values only.
    - year (int): Calendar year of the event_date filter (e.g., 2020).
    - half (int): 1 for event_date Jan 1 - Jun 30, 2 for Jul 1 - Dec 31.
    - output_dir (str): Directory the CSV is written to.

    Returns:
    - pd.DataFrame: The aggregated data that was written to
      `{output_dir}/od_{country}_{year}_month{half}_h37.csv`.

    Raises:
    - ValueError: If `half` is not 1 or 2.

    NOTE(review): rows are selected by event_date but month_number is taken
    from the local start timestamp (start_zoned_datetime). Trips near a
    range boundary can therefore carry a month outside the requested
    half-year, and unparseable timestamps yield a NULL month. Confirm the
    semantics of event_date before relying on month boundaries.
    """
    half_year_bounds = {1: ('0101', '0630'), 2: ('0701', '1231')}
    if half not in half_year_bounds:
        raise ValueError(f"half must be 1 or 2, got {half!r}")
    first_suffix, last_suffix = half_year_bounds[half]
    first_day = f'{year}{first_suffix}'
    last_day = f'{year}{last_suffix}'

    # The inner query only projects the columns the aggregation needs.
    od_df = sql_engine.read_sql(
        f"""
        SELECT
            start_h3_7,
            end_h3_7,
            month_number,
            COUNT(*) AS trip_count,
            AVG(duration_minutes) AS m_duration_min,
            STDDEV(duration_minutes) AS sd_duration_min,
            approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
            AVG(length_meters) AS m_length_m,
            STDDEV(length_meters) AS sd_length_m,
            approx_percentile(length_meters, 0.5) AS mdn_length_m,
            AVG(number_of_points) AS m_points_no,
            STDDEV(number_of_points) AS sd_points_no,
            approx_percentile(number_of_points, 0.5) AS mdn_points_no
        FROM
        (
            SELECT
                h3_encode(start_lat, start_lng, 7) AS start_h3_7,
                h3_encode(end_lat, end_lng, 7) AS end_h3_7,
                duration_minutes,
                length_meters,
                number_of_points,
                MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE
                event_date BETWEEN {first_day} AND {last_day}
                AND end_country = '{country_code}'
                AND start_country = '{country_code}'
                AND start_lat <> 0
                AND end_lat <> 0
        ) subquery
        GROUP BY start_h3_7, end_h3_7, month_number
        """
    )

    file_path = f'{output_dir}/od_{country_code.lower()}_{year}_month{half}_h37.csv'
    od_df.to_csv(file_path, index=False)
    print(f"Data for {country_code} saved to {file_path}")
    return od_df


# (country_code, year, half) exports to run, in the order of the original cells.
# Other countries that were toggled by hand before ('CO', 'ID') can be added here.
H3_EXPORT_JOBS = [
    ('IN', 2020, 1),
    ('IN', 2020, 2),
    ('MX', 2020, 1),
    ('MX', 2020, 2),
    ('MX', 2019, 2),
]

# country_code and pe_tj_df7 are kept as globals holding the last export,
# exactly as the original copy-pasted cells left them.
for country_code, h3_year, h3_half in H3_EXPORT_JOBS:
    pe_tj_df7 = export_monthly_h3_data(country_code, h3_year, h3_half)

# Geohash precision 5 (GH5)

In [None]:
def export_weekly_data(country_code, year=2020):
    """
    Export monthly aggregated data for a specific country and save it as a CSV file.

    Despite its name (kept so existing calls keep working), the aggregation is
    per month, not per week: trips are grouped by start geohash-5 cell, end
    geohash-5 cell and month_number. Each group gets the trip count plus
    mean, standard deviation and approximate median (approx_percentile 0.5)
    of duration_minutes, length_meters and number_of_points.

    Parameters:
    - country_code (str): The country code (e.g., 'IN', 'CO', 'MX', etc.).
      It is interpolated directly into the SQL text, so pass trusted values only.
    - year (int): Calendar year; event_date is filtered to Jan 1 - Dec 31 of
      this year and the year appears in the file name. Defaults to 2020, the
      value that used to be hard-coded.

    Returns:
    - None. The result is written to
      /home/jovyan/Data/Month/od_{country}_{year}_month_gh5.csv.

    NOTE(review): rows are selected by event_date but month_number comes from
    the local start timestamp (start_zoned_datetime). A trip whose local
    start falls in an adjacent year would be folded into the same month
    number, and unparseable timestamps yield a NULL month. Confirm the
    semantics of event_date.
    """

    # Run the aggregation; the inner query only projects the columns it needs.
    od_gh5_df = sql_engine.read_sql(
        f"""
        SELECT
            start_geohash5,
            end_geohash5,
            month_number,
            COUNT(*) AS trip_count,
            AVG(duration_minutes) AS m_duration_min,
            STDDEV(duration_minutes) AS sd_duration_min,
            approx_percentile(duration_minutes, 0.5) AS mdn_duration_min,
            AVG(length_meters) AS m_length_m,
            STDDEV(length_meters) AS sd_length_m,
            approx_percentile(length_meters, 0.5) AS mdn_length_m,
            AVG(number_of_points) AS m_points_no,
            STDDEV(number_of_points) AS sd_points_no,
            approx_percentile(number_of_points, 0.5) AS mdn_points_no
        FROM
        (
            SELECT
                geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                duration_minutes,
                length_meters,
                number_of_points,
                MONTH(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS month_number
            FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
            WHERE
                event_date BETWEEN {year}0101 AND {year}1231
                AND end_country = '{country_code}'
                AND start_country = '{country_code}'
                AND start_lat <> 0
                AND end_lat <> 0
        ) subquery
        GROUP BY start_geohash5, end_geohash5, month_number
        """
    )

    # Define the file path using the country code and year
    file_path = f'/home/jovyan/Data/Month/od_{country_code.lower()}_{year}_month_gh5.csv'

    # Save the data to a CSV file
    od_gh5_df.to_csv(file_path, index=False)
    print(f"Data for {country_code} saved to {file_path}")


In [None]:
# Run the GH5 export once per country, in the same order as before.
GH5_COUNTRIES = ('CO', 'ID', 'IN', 'MX')
for gh5_country in GH5_COUNTRIES:
    export_weekly_data(gh5_country)

# Check the exported files

## H3 resolution 7 (H37)

In [None]:
# Country whose H3-7 half-year exports are checked in the cells below; keep exactly one line active.
# NOTE(review): the H3 export code in this notebook, as written, only runs for
# 'IN' and 'MX' ('CO' and 'ID' are commented out), so the 'CO' files read below
# must already exist on disk from an earlier run.
country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN' 
# country_code = 'MX' 

In [None]:
# file1 = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month1_h37.csv'
# Load the first half-year H3-7 export (the "..._2020_month1_h37.csv" file) for the selected country.
file1 = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2020_month1_h37.csv'
df1 = pd.read_csv(file1)
df1

In [None]:
# Distinct month numbers present in the first half-year file (sanity check).
sorted(df1['month_number'].unique())

In [None]:
# file2 = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month2_h37.csv'
# Load the second half-year H3-7 export (the "..._2020_month2_h37.csv" file) for the selected country.
file2 = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2020_month2_h37.csv'
df2 = pd.read_csv(file2)
df2

In [None]:
# Distinct month numbers present in the second half-year file (sanity check).
sorted(df2['month_number'].unique())

In [None]:
# df2 = df2[df2['month_number'] != 1]
# df2

In [None]:
# combined_df = df1 # for co 2019
# Stack the two half-year frames into one, renumbering the index from 0.
combined_df = pd.concat([df1, df2], ignore_index=True)
combined_df

In [None]:
# Remove rows that are identical in every column.
combined_df = combined_df.drop_duplicates()

# drop_duplicates() does not catch the same (start cell, end cell, month) key
# appearing once in each half-year file with different aggregates. Those rows
# stay separate and would be counted as two OD pairs downstream, so report
# them instead of letting them pass silently.
od_key_columns = ['start_h3_7', 'end_h3_7', 'month_number']
repeated_key_rows = int(combined_df.duplicated(subset=od_key_columns).sum())
if repeated_key_rows:
    logging.warning(
        "%d rows repeat a (start_h3_7, end_h3_7, month_number) key after "
        "combining the half-year files; their aggregates are not merged.",
        repeated_key_rows,
    )

combined_df

In [None]:
# Plot (1): how many rows (OD pairs) each month contributes
rows_per_month = combined_df.groupby('month_number').size()
fig, ax = plt.subplots(figsize=(8, 6))
rows_per_month.plot.bar(ax=ax)
ax.set_title('Number of Rows per Month')
ax.set_xlabel('Month Number')
ax.set_ylabel('Number of Rows')
plt.show()

In [None]:
# Plot (2): total trip_count summed over all OD pairs, per month
trip_count_per_month = combined_df.groupby('month_number')['trip_count'].sum()
fig, ax = plt.subplots(figsize=(8, 6))
trip_count_per_month.plot.bar(ax=ax)
ax.set_title('Number of Trip Counts per Month')
ax.set_xlabel('Month Number')
ax.set_ylabel('Trip Count')
plt.show()

In [None]:
# Save the full (unfiltered) combined table; the "combined" directory is assumed to exist already.
combined_df.to_csv(f"/home/jovyan/Data/Month/combined/od_month_h37_{country_code.lower()}_2020_all.csv", index=False)
# combined_df.to_csv(f"/home/jovyan/Data/Month/combined/od_month_h37_{country_code.lower()}_2019_all.csv", index=False)

In [None]:
# Keep only OD rows with more than 9 trips (i.e. trip_count of 10 or more).
filtered = combined_df.loc[combined_df['trip_count'].gt(9)]
filtered

In [None]:
# Save the filtered table; the "cleaned" directory is assumed to exist already.
filtered.to_csv(f"/home/jovyan/Data/Month/cleaned/od_month_h37_{country_code.lower()}_2020.csv", index=False)
# filtered.to_csv(f"/home/jovyan/Data/Month/cleaned/od_month_h37_{country_code.lower()}_2019.csv", index=False)

## Geohash precision 5 (GH5)

In [None]:
# Country whose GH5 export is checked in the cells below; keep exactly one line active.
country_code = 'CO' 
# country_code = 'ID' 
# country_code = 'IN' 
# country_code = 'MX' 

In [None]:
# file = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2019_month.csv'
# Read the GH5 export for the selected country. If the file carries a 'month'
# column it is renamed to 'month_number'; otherwise the rename is a no-op.
file = f'/home/jovyan/Data/Month/od_{country_code.lower()}_2020_month_gh5.csv'
df = pd.read_csv(file).rename(columns={'month': 'month_number'})
df

In [None]:
# Distinct month numbers present in the GH5 file (sanity check).
sorted(df['month_number'].unique())

In [None]:
# Remove rows that are identical in every column (rebinds df, so re-running this cell is harmless).
df = df.drop_duplicates()
df

In [None]:
# Save the full (unfiltered) GH5 table; the "combined" directory is assumed to exist already.
df.to_csv(f"/home/jovyan/Data/Month/combined/od_month_gh5_{country_code.lower()}_2020_all.csv", index=False)
# df.to_csv(f"/home/jovyan/Data/Month/combined/od_month_gh5_{country_code.lower()}_2019_all.csv", index=False)

In [None]:
# Keep only OD rows with more than 9 trips (i.e. trip_count of 10 or more).
filtered = df.loc[df['trip_count'].gt(9)]
filtered

In [None]:
# Save the filtered GH5 table; the "cleaned" directory is assumed to exist already.
filtered.to_csv(f"/home/jovyan/Data/Month/cleaned/od_month_gh5_{country_code.lower()}_2020.csv", index=False)
# filtered.to_csv(f"/home/jovyan/Data/Month/cleaned/od_month_gh5_{country_code.lower()}_2019.csv", index=False)