In [None]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200

In [None]:
%sql trino://localhost:9090/cuebiq/

In [None]:
# import geohash
import os
import logging
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """
    Small convenience wrapper around a Trino DB-API cursor and a SQLAlchemy engine.

    ``self.cur`` (DB-API cursor) backs ``execute_statement`` and ``self.engine``
    (SQLAlchemy engine) backs ``read_sql``. Both are built from the same host,
    port and catalog.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection and create the matching SQLAlchemy engine.

        Parameters:
        - host (str): Trino host name. Defaults to "localhost".
        - port (int): Trino port. Defaults to 9090.
        - catalog (str): Catalog to connect to. Defaults to "cuebiq".

        Calling ``TrinoEngine()`` with no arguments uses the same connection
        settings as before.
        """
        # Keep the connection on the instance (not just its cursor) so that
        # callers can reach it later, e.g. to close it explicitly.
        self.conn = connect(host=host, port=port, catalog=catalog)
        self.cur = self.conn.cursor()
        # One source of truth for the connection settings: the engine URL is
        # derived from the same host/port/catalog as the DB-API connection.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query: str) -> list:
        """
        Run a statement on the DB-API cursor and return all rows it produced.

        Intended for statements such as CREATE / DROP, where only the
        completion of the statement matters.

        Parameters:
        - query (str): The SQL text to execute.

        Returns:
        - list: Whatever ``cursor.fetchall()`` returns for the statement.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query: str) -> pd.DataFrame:
        """
        Run a query through the SQLAlchemy engine and return the result as a DataFrame.

        Parameters:
        - query (str): The SQL text to execute.

        Returns:
        - pd.DataFrame: The result set, as built by ``pandas.read_sql``.
        """
        return pd.read_sql(query, self.engine)

# Shared engine instance: the query functions and cells below call
# `sql_engine.read_sql(...)` through this module-level name.
sql_engine = TrinoEngine()

In [None]:
# Fully qualified schema and table name of the trajectory table.
# NOTE(review): `pe_tj_table` is not referenced by any query visible in this
# notebook; the queries spell out the table name literally instead.
schema_name = {'cda': 'cuebiq.paas_cda_pe_v3'}
pe_tj_table = f"{schema_name['cda']}.trajectory_uplevelled"

In [None]:
# Set up logging: INFO level, each record formatted as
# "<timestamp> - <level> - <message>". The export functions below report
# their per-day progress and errors through the `logging` module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def _aggregate_od_pairs(trips, level):
    """Summarise trips per (start, end) geohash pair at geohash precision ``level``."""
    group_cols = [f'start_geohash{level}', f'end_geohash{level}']
    # Named aggregation produces the final column names directly, so there is
    # no positional rename that could silently drift out of sync.
    return trips.groupby(group_cols).agg(
        trip_count=('cuebiq_id', 'count'),
        m_duration_min=('duration_minutes', 'mean'),
        mdn_duration_min=('duration_minutes', 'median'),
        sd_duration_min=('duration_minutes', 'std'),
        m_length_m=('length_meters', 'mean'),
        mdn_length_m=('length_meters', 'median'),
        sd_length_m=('length_meters', 'std'),
        m_points_no=('number_of_points', 'mean'),
        mdn_points_no=('number_of_points', 'median'),
        sd_points_no=('number_of_points', 'std'),
        local_date=('local_date', 'first'),
    ).reset_index()


def _append_to_csv(frame, csv_path):
    """Append ``frame`` to ``csv_path``; the header is written only when the file is new."""
    is_new_file = not os.path.isfile(csv_path)
    frame.to_csv(csv_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)


def process_daily_data(start_date, end_date, country_code, export_path):
    """
    Process and export daily aggregated data for a specified date range and country.

    For every day from start_date to end_date (both inclusive) the trips whose
    start and end country both equal country_code are read from the trajectory
    table, aggregated per origin-destination geohash pair at precision 3 and 5,
    and appended to ``od_<country>_agg3_daily.csv`` and
    ``od_<country>_agg5_daily.csv`` inside export_path.

    Parameters:
    - start_date (str): The start date in the format 'YYYY-MM-DD'.
    - end_date (str): The end date in the format 'YYYY-MM-DD'.
    - country_code (str): The country code (e.g., 'CO' for Colombia).
    - export_path (str): The directory where the output files will be saved.
      It is created if it does not exist yet.

    Notes:
    - A failure on one day is logged (with traceback) and the loop moves on to
      the next day; it does not abort the run.
    - Days for which the query returns no rows are logged as a warning and
      nothing is written for them.
    - Rows are appended, so processing the same day twice writes its rows twice.
    - NOTE(review): each output row carries the *first* local_date seen for its
      geohash pair; if one event_date partition contains trips from more than
      one local date they are merged into a single row. Confirm this is intended.
    """

    # Convert the start and end dates to datetime objects
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')

    # Fail early on an unusable export directory instead of after every query.
    if export_path:
        os.makedirs(export_path, exist_ok=True)

    # The output files do not depend on the day, so build their paths once.
    csv_paths = {
        level: os.path.join(export_path, f"od_{country_code.lower()}_agg{level}_daily.csv")
        for level in (3, 5)
    }

    # Iterate through each day in the date range
    current_date = first_day
    while current_date <= last_day:
        event_date = current_date.strftime('%Y%m%d')

        try:
            # Log before running the query so the message marks its start.
            logging.info(f"Executing SQL query for date {event_date}")
            # Read data from the SQL table for the specified day and country
            pe_tj_df = sql_engine.read_sql(
                f"""
                SELECT
                    cuebiq_id,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
                    duration_minutes,
                    length_meters,
                    number_of_points
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE
                    event_date = {event_date}
                    AND end_country = '{country_code}'
                    AND start_country = '{country_code}'
                """
            )

            if pe_tj_df.empty:
                # Make gaps in the output traceable in the log.
                logging.warning(f"No trips returned for date {event_date}; nothing exported")
            else:
                # Aggregate both precisions before writing either file, so an
                # aggregation error cannot leave only one of the two updated.
                aggregated = {level: _aggregate_od_pairs(pe_tj_df, level) for level in csv_paths}
                for level, frame in aggregated.items():
                    logging.info(f"Exporting aggregated data (geohash{level}) for date: {event_date}")
                    _append_to_csv(frame, csv_paths[level])

        except Exception as e:
            # Best effort per day: record the failure with traceback and continue.
            logging.exception(f"Error processing data for date {event_date}: {e}")

        # Move to the next day
        current_date += timedelta(days=1)


In [None]:
# Example usage: export the daily geohash-3 / geohash-5 aggregates for 'CO',
# 2020-09-20 through 2020-12-31 (both ends inclusive).
# NOTE(review): this run starts on 2020-09-20 while the other countries start
# on 2020-01-01 — presumably resuming an earlier partial run; confirm.
process_daily_data('2020-09-20', '2020-12-31', 'CO', '/home/jovyan/Data/2020OD/daily/')

In [None]:
# Full-year 2020 export for country code 'ID'.
process_daily_data('2020-01-01', '2020-12-31', 'ID', '/home/jovyan/Data/2020OD/daily/')

In [None]:
# Full-year 2020 export for country code 'IN'.
process_daily_data('2020-01-01', '2020-12-31', 'IN', '/home/jovyan/Data/2020OD/daily/')

In [None]:
# Full-year 2020 export for country code 'MX'.
process_daily_data('2020-01-01', '2020-12-31', 'MX', '/home/jovyan/Data/2020OD/daily/')

# Fill up missing dates:

In [None]:
from datetime import datetime, timedelta
import os
import logging

def _aggregate_od_pairs(trips, level):
    """Summarise trips per (start, end) geohash pair at geohash precision ``level``."""
    group_cols = [f'start_geohash{level}', f'end_geohash{level}']
    # Named aggregation produces the final column names directly, so there is
    # no positional rename that could silently drift out of sync.
    return trips.groupby(group_cols).agg(
        trip_count=('cuebiq_id', 'count'),
        m_duration_min=('duration_minutes', 'mean'),
        mdn_duration_min=('duration_minutes', 'median'),
        sd_duration_min=('duration_minutes', 'std'),
        m_length_m=('length_meters', 'mean'),
        mdn_length_m=('length_meters', 'median'),
        sd_length_m=('length_meters', 'std'),
        m_points_no=('number_of_points', 'mean'),
        mdn_points_no=('number_of_points', 'median'),
        sd_points_no=('number_of_points', 'std'),
        local_date=('local_date', 'first'),
    ).reset_index()


def _append_to_csv(frame, csv_path):
    """Append ``frame`` to ``csv_path``; the header is written only when the file is new."""
    is_new_file = not os.path.isfile(csv_path)
    frame.to_csv(csv_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)


def process_daily_data(dates, country_code, export_path):
    """
    Process and export daily aggregated data for a specified list of dates and country.

    For every date in dates the trips whose start and end country both equal
    country_code are read from the trajectory table, aggregated per
    origin-destination geohash pair at precision 3 and 5, and appended to
    ``od_<country>_agg3_daily.csv`` and ``od_<country>_agg5_daily.csv`` inside
    export_path.

    NOTE: this definition reuses the name of the date-range variant defined
    earlier in the notebook and replaces it once this cell has run; the two
    take different arguments.

    Parameters:
    - dates (list): A list of dates in the format 'YYYYMMDD'.
    - country_code (str): The country code (e.g., 'CO' for Colombia).
    - export_path (str): The directory where the output files will be saved.
      It is created if it does not exist yet.

    Raises:
    - ValueError: If an entry of dates does not match 'YYYYMMDD'.

    Notes:
    - A query/aggregation/export failure on one date is logged (with
      traceback) and processing continues with the next date.
    - Dates for which the query returns no rows are logged as a warning and
      nothing is written for them.
    - Rows are appended, so processing the same date twice writes its rows
      twice, and appended dates are not in chronological order in the file.
    """

    # Fail early on an unusable export directory instead of after every query.
    if export_path:
        os.makedirs(export_path, exist_ok=True)

    # The output files do not depend on the date, so build their paths once.
    csv_paths = {
        level: os.path.join(export_path, f"od_{country_code.lower()}_agg{level}_daily.csv")
        for level in (3, 5)
    }

    for date_str in dates:
        # Round-trip through datetime to validate the 'YYYYMMDD' format.
        event_date = datetime.strptime(date_str, '%Y%m%d').strftime('%Y%m%d')

        try:
            # Log before running the query so the message marks its start.
            logging.info(f"Executing SQL query for date {event_date}")
            # Read data from the SQL table for the specified day and country
            pe_tj_df = sql_engine.read_sql(
                f"""
                SELECT
                    cuebiq_id,
                    geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
                    geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
                    geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
                    geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
                    DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
                    duration_minutes,
                    length_meters,
                    number_of_points
                FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
                WHERE
                    event_date = {event_date}
                    AND end_country = '{country_code}'
                    AND start_country = '{country_code}'
                """
            )

            if pe_tj_df.empty:
                # Make gaps in the output traceable in the log.
                logging.warning(f"No trips returned for date {event_date}; nothing exported")
                continue

            # Aggregate both precisions before writing either file, so an
            # aggregation error cannot leave only one of the two updated.
            aggregated = {level: _aggregate_od_pairs(pe_tj_df, level) for level in csv_paths}
            for level, frame in aggregated.items():
                logging.info(f"Exporting aggregated data (geohash{level}) for date: {event_date}")
                _append_to_csv(frame, csv_paths[level])

        except Exception as e:
            # Best effort per date: record the failure with traceback and continue.
            logging.exception(f"Error processing data for date {event_date}: {e}")


In [None]:
# Re-fetch individual days for 'CO'; the rows are appended to the same
# per-country CSV files that the range runs write to.
# dates = ['20200501', '20200918', '20200919']
dates = ['20200501']
process_daily_data(dates, 'CO', '/home/jovyan/Data/2020OD/daily/')

In [None]:
# Re-fetch individual days for 'IN'; the rows are appended to the same
# per-country CSV files that the range runs write to.
dates = ['20200501']
process_daily_data(dates, 'IN', '/home/jovyan/Data/2020OD/daily/')

# Only GH3 for CO 2020

In [None]:
def fetch_and_save_aggregated_data(start_date, end_date, country_code, output_csv):
    """
    Fetches aggregated trajectory data from the SQL database and saves the output to a CSV file.

    The aggregation runs inside the database: trips are grouped by
    (start_geohash3, end_geohash3, local_date) and summarised with count,
    mean, median and standard deviation of duration, length and number of
    points, each rounded to 6 decimals. The median is computed with
    APPROX_PERCENTILE, i.e. it is an approximation and may differ from an
    exact median computed in pandas.

    Parameters:
    - start_date (int): The start date for filtering data (e.g., 20200701), inclusive.
    - end_date (int): The end date for filtering data (e.g., 20201231), inclusive.
    - country_code (str): The country code to filter the data (e.g., 'CO').
      Both the start and the end country of a trip must match.
    - output_csv (str): The file path where the CSV should be saved.
      An existing file at this path is overwritten.

    Returns:
    - pd.DataFrame: The aggregated rows that were written to output_csv.
    """

    query = f"""
    SELECT
        local_date,
        start_geohash3,
        end_geohash3,
        COUNT(cuebiq_id) AS trip_count,
        ROUND(AVG(duration_minutes), 6) AS m_duration_min,
        ROUND(APPROX_PERCENTILE(duration_minutes, 0.5), 6) AS mdn_duration_min,
        ROUND(STDDEV(duration_minutes), 6) AS sd_duration_min,
        ROUND(AVG(length_meters), 6) AS m_length_m,
        ROUND(APPROX_PERCENTILE(length_meters, 0.5), 6) AS mdn_length_m,
        ROUND(STDDEV(length_meters), 6) AS sd_length_m,
        ROUND(AVG(number_of_points), 6) AS m_points_no,
        ROUND(APPROX_PERCENTILE(number_of_points, 0.5), 6) AS mdn_points_no,
        ROUND(STDDEV(number_of_points), 6) AS sd_points_no
    FROM (
        SELECT
            cuebiq_id,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {start_date} AND {end_date}
            AND end_country = '{country_code}'
            AND start_country = '{country_code}'
    ) AS subquery
    GROUP BY
        start_geohash3, end_geohash3, local_date
    """

    # Execute the SQL query and read the result into a DataFrame
    df = sql_engine.read_sql(query)

    # Persist the result: without this the query output was simply discarded
    # and `output_csv` was never used.
    df.to_csv(output_csv, index=False)
    logging.info(f"Data successfully saved to {output_csv}")

    # Also hand the frame back so the caller can inspect it without re-reading the file.
    return df

In [None]:
# Database-side geohash-3 daily aggregation for 'IN', January 2020.
# NOTE(review): the target file name says "agg5_3h" and lives under "del_3h",
# while the function aggregates per day at geohash precision 3 — confirm the
# output path before relying on the written file.
fetch_and_save_aggregated_data(20200101, 20200131, 'IN', 
                               '/home/jovyan/Data/2020OD/del_3h/od_in_agg5_3h_1.csv')

# Check

In [None]:
import io
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# Country whose daily export is being checked; switch by (un)commenting.
country_code = 'CO'
# country_code = 'ID'
# country_code = 'IN'
# country_code = 'MX'

# Daily export file to inspect. The geohash-3 file is active; the geohash-5
# alternative is kept commented out below.
# NOTE(review): the later checking cells reference the geohash-3 column names,
# so they need the same switch when the agg5 file is loaded.
file = f'/home/jovyan/Data/2020OD/daily/od_{country_code.lower()}_agg3_daily.csv'
# file = f'/home/jovyan/Data/2020OD/daily/od_{country_code.lower()}_agg5_daily.csv'

df = pd.read_csv(file)
df

In [None]:
# Spot check: rows of a single day (local_date is compared as a 'YYYY-MM-DD' string here).
df[df['local_date'] == '2020-12-29']

## Check duplicates

In [None]:
# Normalise dtypes and float precision so that rows written by repeated runs
# compare equal in the duplicate check that follows.
# For the geohash-5 file, use 'start_geohash5' / 'end_geohash5' here instead.
for geohash_col in ('start_geohash3', 'end_geohash3'):
    df[geohash_col] = df[geohash_col].astype(str)

df['trip_count'] = df['trip_count'].astype(int)

# Statistic columns: cast to float, then round to 6 decimal places
float_columns = ['m_duration_min', 'mdn_duration_min', 'sd_duration_min',
                 'm_length_m', 'mdn_length_m', 'sd_length_m',
                 'm_points_no', 'mdn_points_no', 'sd_points_no']

for stat_col in float_columns:
    df[stat_col] = df[stat_col].astype(float).round(6)
df

In [None]:
# To check the ones to remove
# original_df = df.copy()

# # Drop duplicates
# df = df.drop_duplicates()

# # Find the rows that were removed
# removed_rows = original_df[~original_df.index.isin(df.index)]
# removed_rows

In [None]:
# Remove fully identical rows (e.g. from a day that was appended more than
# once) and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop= True)
df

In [None]:
# Spot check after de-duplication: rows of a single day.
df[df['local_date'] == '2020-10-31']

## Check missing dates

In [None]:
# Every calendar day of 2020 as 'YYYY-MM-DD' strings, matching the format of `local_date`.
# (For integer dates use .strftime('%Y%m%d').astype(int) on the range instead.)
all_dates = np.array(pd.date_range(start='2020-01-01', end='2020-12-31').strftime('%Y-%m-%d'))

# Dates that actually occur in the export (despite the name, these are the
# unique values themselves, not a count).
unique_values_count = df["local_date"].unique()

# Calendar days with no rows at all in the export
missing_dates = np.setdiff1d(all_dates, unique_values_count)
missing_dates

### Check with figure

In [None]:
# Group by local_date and calculate the total number of rows and total trip_count
daily_summary = df.groupby('local_date').agg(row_count=('trip_count', 'size'), total_trip_count=('trip_count', 'sum')).reset_index()

# Plot against real datetimes: plotting the raw date strings makes matplotlib
# treat every day as a separate category and draw one tick label per day.
# Going through str first also handles integer YYYYMMDD values correctly.
plot_dates = pd.to_datetime(daily_summary['local_date'].astype(str))

# Plotting: two panels side by side
fig, (ax_rows, ax_trips) = plt.subplots(1, 2, figsize=(12, 6))

# Plot for row count (number of origin-destination pairs per day)
ax_rows.plot(plot_dates, daily_summary['row_count'], linestyle='-')
ax_rows.set_title('Row Count per Day')
ax_rows.set_xlabel('Date')
ax_rows.set_ylabel('Row Count')
ax_rows.tick_params(axis='x', labelrotation=45)

# Plot for total trip count
ax_trips.plot(plot_dates, daily_summary['total_trip_count'], color='orange', linestyle='-')
ax_trips.set_title('Total Trip Count per Day')
ax_trips.set_xlabel('Date')
ax_trips.set_ylabel('Total Trip Count')
ax_trips.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

## Clean up and Export

In [None]:
# Minimum number of trips an origin-destination row must have to be kept in
# the cleaned export (rows with fewer trips are dropped).
MIN_TRIP_COUNT = 10

# Convert local_date to an integer YYYYMMDD.
# Parsing goes through str so the cell is safe to re-run: feeding the already
# converted integers straight to pd.to_datetime would interpret them as
# nanoseconds since the epoch and silently turn every date into 19700101.
df['local_date'] = pd.to_datetime(df['local_date'].astype(str)).dt.strftime('%Y%m%d').astype(int)
filter_df = df[df['trip_count'] >= MIN_TRIP_COUNT]
filter_df

In [None]:
# Write the complete (unfiltered) frame to the "combined" folder
com_path = '/home/jovyan/Data/2020OD/daily/combined/'

combined_name = f'od_daily_gh3_{country_code.lower()}_2020_all.csv'
# combined_name = f'od_daily_gh5_{country_code.lower()}_2020_all.csv'
df.to_csv(com_path + combined_name, index=False)

In [None]:
# Write the trip-count-filtered frame to the "cleaned" folder
folder_path = '/home/jovyan/Data/2020OD/daily/cleaned/'

cleaned_name = f'od_daily_gh3_{country_code.lower()}_2020.csv'
# cleaned_name = f'od_daily_gh5_{country_code.lower()}_2020.csv'
filter_df.to_csv(folder_path + cleaned_name, index=False)

Error handling: some days are missing from the 3-hourly (3h) data; re-fetch them here.

In [None]:
# Day / country to re-fetch
event_date = 20200229
country_code = 'MX'
year = 2020

# Trip-level pull for that single day (no aggregation), including the hour of
# the trip start so the rows can be binned into 3-hour slots afterwards.
query = f"""
    SELECT 
        cuebiq_id,
        geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
        geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
        geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
        geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
        EXTRACT(HOUR FROM TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS event_hour,
        DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
        duration_minutes,
        length_meters,
        number_of_points
    FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
    WHERE 
        event_date = {event_date}
        AND end_country = '{country_code}' 
        AND start_country = '{country_code}' 
    """
pe_tj_df = sql_engine.read_sql(query)
pe_tj_df

In [None]:
# Derive the time columns in one pass; each entry may use the ones before it.
pe_tj_df = pe_tj_df.assign(
    # parse the 'YYYY-MM-DD' date so the .dt accessors below work
    local_date=lambda frame: pd.to_datetime(frame['local_date'], format='%Y-%m-%d'),
    # index of the 3-hour slot of the day (event_hour integer-divided by 3)
    grt=lambda frame: (frame['event_hour'] // 3).astype(int),
    day=lambda frame: frame['local_date'].dt.day,
    month=lambda frame: frame['local_date'].dt.month,
)
pe_tj_df

In [None]:
# Name the file after the day that was actually fetched. The name used to be
# built from `year` plus a hard-coded "0229", which would mislabel the file as
# soon as `event_date` is changed. For event_date = 20200229 the resulting
# path is the same as before.
pe_tj_df.to_csv('/home/jovyan/Data/0801/' + f'od_{event_date}_{country_code.lower()}.csv', index=False)