In [None]:
# (Re)load the `sql` IPython extension, which provides the %sql magic and the SqlMagic options below.
%reload_ext sql
# SqlMagic options (see the ipython-sql documentation for exact semantics):
# do not auto-commit statements
%config SqlMagic.autocommit=False
# 0 = no automatic row limit on result sets
%config SqlMagic.autolimit=0
# return query results as pandas DataFrames
%config SqlMagic.autopandas=True
# limit the number of rows rendered in cell output to 200
%config SqlMagic.displaylimit=200

In [None]:
# Connect the %sql magic to the `cuebiq` catalog on the local Trino endpoint (localhost:9090).
%sql trino://localhost:9090/cuebiq/

In [None]:
# import geohash
import os
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [None]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine
import pandas as pd
import time

class TrinoEngine():
    """
    Small wrapper that bundles a Trino DB-API cursor with a SQLAlchemy engine.

    The cursor (`self.cur`) is used for statements executed through
    `execute_statement`; the engine (`self.engine`) is handed to pandas by
    `read_sql`. Both point at the same host, port and catalog.

    Parameters:
    - host (str): Trino host name. Defaults to "localhost".
    - port (int): Trino port. Defaults to 9090.
    - catalog (str): Trino catalog. Defaults to "cuebiq".
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        self.host = host
        self.port = port
        self.catalog = catalog
        # Keep the connection itself so callers can still reach it (e.g. to close it).
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Build the URL from the same settings as the DB-API connection so the two cannot drift apart.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Runs `query` on the DB-API cursor and returns everything `fetchall()` yields.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query:str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs `query` through pandas on the SQLAlchemy engine and returns the result as a DataFrame.
        """
        return pd.read_sql(query, self.engine)

# Notebook-wide engine instance; the fetch_and_save_* helpers call sql_engine.read_sql.
sql_engine = TrinoEngine()

In [None]:
# Fully-qualified schema per dataset key, and the trajectory table inside the 'cda' schema.
schema_name = dict(cda='cuebiq.paas_cda_pe_v3')
pe_tj_table = "{}.trajectory_uplevelled".format(schema_name['cda'])

In [None]:
# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# for GH5
def fetch_and_save_aggregated_data(start_date, end_date, country_code, output_csv):
    """
    Fetches aggregated trajectory data from the SQL database and saves the output to a CSV file.

    Trips are grouped by start/end geohash (precision 5), local start date and
    `grt`, the local start hour divided by 3 (a 3-hour bucket index; downstream
    cells treat it as 0-7). Each group carries the trip count plus mean,
    approximate median and standard deviation of duration, length and number of
    points, rounded to 6 decimals. Only trips that start and end in
    `country_code` are included.

    This is the geohash-5 counterpart of `fetch_and_save_aggregated3_data`.

    Parameters:
    - start_date (int): The start date for filtering data (e.g., 20200701).
    - end_date (int): The end date for filtering data (e.g., 20201231).
    - country_code (str): The country code to filter the data (e.g., 'CO').
    - output_csv (str): The file path where the CSV should be saved. Missing
      parent folders are created.

    Returns:
    - None. The result is written to `output_csv`.

    Raises:
    - ValueError / TypeError: if `start_date` or `end_date` cannot be converted to int.
    """
    # int() guarantees that only numeric literals are interpolated into the SQL text.
    start_date = int(start_date)
    end_date = int(end_date)
    # Double any embedded single quote so the value cannot break out of its SQL string literal.
    country_literal = str(country_code).replace("'", "''")

    # Create the target folder before running the (potentially long) query, so a
    # missing directory cannot cause the result to be lost at write time.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): local_date wraps date_parse in TRY but grt does not, so a row with an
    # unparseable start_zoned_datetime would still fail the query - confirm which is intended.
    query = f"""
    SELECT
        local_date,
        start_geohash5,
        end_geohash5,
        grt,
        COUNT(cuebiq_id) AS trip_count,
        ROUND(AVG(duration_minutes), 6) AS m_duration_min,
        ROUND(APPROX_PERCENTILE(duration_minutes, 0.5), 6) AS mdn_duration_min,
        ROUND(STDDEV(duration_minutes), 6) AS sd_duration_min,
        ROUND(AVG(length_meters), 6) AS m_length_m,
        ROUND(APPROX_PERCENTILE(length_meters, 0.5), 6) AS mdn_length_m,
        ROUND(STDDEV(length_meters), 6) AS sd_length_m,
        ROUND(AVG(number_of_points), 6) AS m_points_no,
        ROUND(APPROX_PERCENTILE(number_of_points, 0.5), 6) AS mdn_points_no,
        ROUND(STDDEV(number_of_points), 6) AS sd_points_no
    FROM (
        SELECT
            cuebiq_id,
            geohash_encode(start_lat, start_lng, 5) AS start_geohash5,
            geohash_encode(end_lat, end_lng, 5) AS end_geohash5,
            DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
            extract(HOUR FROM date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt,
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {start_date} AND {end_date}
            AND end_country = '{country_literal}'
            AND start_country = '{country_literal}'
    ) AS subquery
    GROUP BY
        start_geohash5, end_geohash5, local_date, grt
    """

    # Execute the SQL query and read the result into a DataFrame
    df = sql_engine.read_sql(query)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"Data successfully saved to {output_csv}")


In [None]:
# for GH3
def fetch_and_save_aggregated3_data(start_date, end_date, country_code, output_csv):
    """
    Fetches aggregated trajectory data from the SQL database and saves the output to a CSV file.

    Trips are grouped by start/end geohash (precision 3), local start date and
    `grt`, the local start hour divided by 3 (a 3-hour bucket index; downstream
    cells treat it as 0-7). Each group carries the trip count plus mean,
    approximate median and standard deviation of duration, length and number of
    points, rounded to 6 decimals. Only trips that start and end in
    `country_code` are included.

    This is the geohash-3 counterpart of `fetch_and_save_aggregated_data`.

    Parameters:
    - start_date (int): The start date for filtering data (e.g., 20200701).
    - end_date (int): The end date for filtering data (e.g., 20201231).
    - country_code (str): The country code to filter the data (e.g., 'CO').
    - output_csv (str): The file path where the CSV should be saved. Missing
      parent folders are created.

    Returns:
    - None. The result is written to `output_csv`.

    Raises:
    - ValueError / TypeError: if `start_date` or `end_date` cannot be converted to int.
    """
    # int() guarantees that only numeric literals are interpolated into the SQL text.
    start_date = int(start_date)
    end_date = int(end_date)
    # Double any embedded single quote so the value cannot break out of its SQL string literal.
    country_literal = str(country_code).replace("'", "''")

    # Create the target folder before running the (potentially long) query, so a
    # missing directory cannot cause the result to be lost at write time.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): local_date wraps date_parse in TRY but grt does not, so a row with an
    # unparseable start_zoned_datetime would still fail the query - confirm which is intended.
    query = f"""
    SELECT
        local_date,
        start_geohash3,
        end_geohash3,
        grt,
        COUNT(cuebiq_id) AS trip_count,
        ROUND(AVG(duration_minutes), 6) AS m_duration_min,
        ROUND(APPROX_PERCENTILE(duration_minutes, 0.5), 6) AS mdn_duration_min,
        ROUND(STDDEV(duration_minutes), 6) AS sd_duration_min,
        ROUND(AVG(length_meters), 6) AS m_length_m,
        ROUND(APPROX_PERCENTILE(length_meters, 0.5), 6) AS mdn_length_m,
        ROUND(STDDEV(length_meters), 6) AS sd_length_m,
        ROUND(AVG(number_of_points), 6) AS m_points_no,
        ROUND(APPROX_PERCENTILE(number_of_points, 0.5), 6) AS mdn_points_no,
        ROUND(STDDEV(number_of_points), 6) AS sd_points_no
    FROM (
        SELECT
            cuebiq_id,
            geohash_encode(start_lat, start_lng, 3) AS start_geohash3,
            geohash_encode(end_lat, end_lng, 3) AS end_geohash3,
            DATE(TRY(date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))) AS local_date,
            extract(HOUR FROM date_parse(substr(start_zoned_datetime, 1, 19), '%Y-%m-%dT%H:%i:%s'))/3  grt,
            duration_minutes,
            length_meters,
            number_of_points
        FROM cuebiq.paas_cda_pe_v3.trajectory_uplevelled
        WHERE
            event_date BETWEEN {start_date} AND {end_date}
            AND end_country = '{country_literal}'
            AND start_country = '{country_literal}'
    ) AS subquery
    GROUP BY
        start_geohash3, end_geohash3, local_date, grt
    """

    # Execute the SQL query and read the result into a DataFrame
    df = sql_engine.read_sql(query)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)

    print(f"Data successfully saved to {output_csv}")


In [None]:
# Export one GH5 origin-destination CSV per calendar month of 2020 for country code 'IN'.
# Replaces twelve copy-pasted cells; date ranges and file names are unchanged
# (month N -> od_in_agg5_3h_N.csv, covering the first through the last day of the month).
import calendar

EXPORT_YEAR = 2020
for month in range(1, 13):
    # monthrange returns (weekday of the 1st, number of days); it accounts for leap-year February.
    last_day = calendar.monthrange(EXPORT_YEAR, month)[1]
    month_prefix = EXPORT_YEAR * 10000 + month * 100  # e.g. 20200300 for March
    fetch_and_save_aggregated_data(
        month_prefix + 1,
        month_prefix + last_day,
        'IN',
        f'/home/jovyan/Data/2020OD/del_3h/od_in_agg5_3h_{month}.csv',
    )

In [None]:
# One-off GH3 export: a single day (2020-08-31) for country code 'ID'.
fetch_and_save_aggregated3_data(20200831, 20200831, 'ID', 
                               '/home/jovyan/Data/2020OD/del_3h/od3_id_3h_8.csv')

In [None]:
# One-off GH5 export: a single day (2019-08-31) for country code 'IN'.
fetch_and_save_aggregated_data(20190831, 20190831, 'IN', 
                               '/home/jovyan/Data/2020OD/del_3h/od5_in_3h_19831.csv')

In [None]:
# Load the CSV at the path the single-day 2019-08-31 'IN' export writes to, and display it.
od2 = '/home/jovyan/Data/2020OD/del_3h/od5_in_3h_19831.csv'
df_od2 = pd.read_csv(od2)
df_od2

In [None]:
# Load a second export and display it.
# NOTE(review): no call visible in this notebook writes 'od5_id_3h_8.csv'; the GH3 export
# for 'ID' writes 'od3_id_3h_8.csv' instead - confirm which file is intended here.
od8 = '/home/jovyan/Data/2020OD/del_3h/od5_id_3h_8.csv'
df_od8 = pd.read_csv(od8)
df_od8

In [None]:
# Stack the two loaded exports row-wise with a fresh 0..n-1 index.
final_df = pd.concat([df_od2, df_od8], ignore_index=True)
# Alternative when only the first export should be used:
# final_df = df_od2
final_df

In [None]:
# Normalise the date to a YYYYMMDD string and replace the 3-hour bucket index (`grt`)
# with a readable "<date> <start> - <end>" label in `local_time`.
final_df['local_date'] = pd.to_datetime(final_df['local_date']).dt.strftime('%Y%m%d')
time_intervals = {
    0: "00:00:00 - 03:00:00", 1: "03:00:00 - 06:00:00",
    2: "06:00:00 - 09:00:00", 3: "09:00:00 - 12:00:00",
    4: "12:00:00 - 15:00:00", 5: "15:00:00 - 18:00:00",
    6: "18:00:00 - 21:00:00", 7: "21:00:00 - 24:00:00"}

# Vectorised lookup instead of a Python-level row-by-row apply(axis=1).
interval_labels = final_df['grt'].map(time_intervals)
if interval_labels.isna().any():
    # The row-wise dict lookup raised KeyError on an unknown bucket; keep failing loudly.
    unknown_buckets = final_df.loc[interval_labels.isna(), 'grt'].unique().tolist()
    raise KeyError(f"Unknown 3-hour bucket value(s) in 'grt': {unknown_buckets}")
final_df['local_time'] = (final_df['local_date'].astype(str) + ' ' + interval_labels).astype(str)
final_df = final_df.drop(['grt'], axis=1)

final_df

In [None]:
# Write the merged, relabelled frame to CSV (no index column).
# NOTE(review): this is the same path that df_od2 is loaded from, so the raw single-day export
# is overwritten by a frame without the `grt` column; re-running the load/transform cells
# afterwards will not reproduce this result. Consider a distinct output file name.
final_df.to_csv('/home/jovyan/Data/2020OD/del_3h/od5_in_3h_19831.csv', index=False)

In [None]:
# Load the aggregated OD file 'od_mx_agg5_3h.csv' and display it.
# The cells below compare `local_date` with integers such as 20200229, so the column is
# presumably stored as YYYYMMDD in this file - verify against the file.
pd_file = '/home/jovyan/Data/2020OD/del_3h/od_mx_agg5_3h.csv'
df_od = pd.read_csv(pd_file)
df_od

In [None]:
# Total trips recorded on 2020-02-29
df_od.loc[df_od['local_date'] == 20200229, 'trip_count'].sum()

In [None]:
# Total trips recorded on 2020-02-28
df_od.loc[df_od['local_date'] == 20200228, 'trip_count'].sum()

In [None]:
# pip install seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Line chart of total trips per day, with one x-axis tick per month.

# Parse the YYYYMMDD values in 'local_date' into timestamps
df_od['local_date'] = pd.to_datetime(df_od['local_date'], format='%Y%m%d')

# One row per day with the summed trip count
df_daily = df_od.groupby('local_date')['trip_count'].sum().reset_index()

# Figure first, then the larger default font (applies to text created from here on)
plt.figure(figsize=(14, 8))
plt.rcParams.update({'font.size': 16})

# seaborn draws on (and returns) the current axes
axis = sns.lineplot(data=df_daily, x='local_date', y='trip_count')

# Monthly major ticks labelled as YYYY-MM
mdates = plt.matplotlib.dates
axis.xaxis.set_major_locator(mdates.MonthLocator())
axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Labels, title, rotated tick labels and grid
axis.set_xlabel('Local Date')
axis.set_ylabel('Trip Count')
axis.set_title('Trip Count per Day')
plt.xticks(rotation=45)
axis.grid(True)

plt.show()


# Check and Processing

In [None]:
folder_path = '/home/jovyan/Data/2020OD/del_3h/IN'

# Only CSV files directly inside the folder (subdirectories are skipped).
# Sorted because os.listdir returns entries in arbitrary order, which would make the
# row order of the combined frame differ from run to run.
file_list = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith('.csv') and os.path.isfile(os.path.join(folder_path, f))
)
if not file_list:
    # Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# Read each file into its own DataFrame
df_list = [pd.read_csv(os.path.join(folder_path, file_name)) for file_name in file_list]

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)
combined_df

In [None]:
# Re-express 'local_date' as a compact YYYYMMDD string
parsed_dates = pd.to_datetime(combined_df['local_date'])
combined_df['local_date'] = parsed_dates.dt.strftime('%Y%m%d')
combined_df

In [None]:
# Map each 3-hour bucket index (`grt`) to its clock interval and build the
# "<date> <start> - <end>" label in `local_time`.
time_intervals = {
    0: "00:00:00 - 03:00:00", 1: "03:00:00 - 06:00:00",
    2: "06:00:00 - 09:00:00", 3: "09:00:00 - 12:00:00",
    4: "12:00:00 - 15:00:00", 5: "15:00:00 - 18:00:00",
    6: "18:00:00 - 21:00:00", 7: "21:00:00 - 24:00:00"}

# Vectorised lookup instead of a Python-level row-by-row apply(axis=1).
interval_labels = combined_df['grt'].map(time_intervals)
if interval_labels.isna().any():
    # The row-wise dict lookup raised KeyError on an unknown bucket; keep failing loudly.
    unknown_buckets = combined_df.loc[interval_labels.isna(), 'grt'].unique().tolist()
    raise KeyError(f"Unknown 3-hour bucket value(s) in 'grt': {unknown_buckets}")
combined_df['local_time'] = (combined_df['local_date'].astype(str) + ' ' + interval_labels).astype(str)
combined_df

## Check duplicates

In [None]:
# Give every column an explicit dtype, round the statistics, then drop exact duplicate rows.
# (For GH3 data the geohash columns are start_geohash3 / end_geohash3 instead.)
float_columns = ['m_duration_min', 'mdn_duration_min', 'sd_duration_min',
                 'm_length_m', 'mdn_length_m', 'sd_length_m',
                 'm_points_no', 'mdn_points_no', 'sd_points_no']

column_types = {
    'start_geohash5': str,
    'end_geohash5': str,
    'local_date': int,
    'trip_count': int,
    **{column: float for column in float_columns},
    'local_time': str,
}
combined_df = combined_df.astype(column_types)

# Statistics are kept at 6 decimal places
combined_df[float_columns] = combined_df[float_columns].round(6)

# Remove fully identical rows and order the result by time label
combined_df = (
    combined_df
    .drop_duplicates()
    .sort_values('local_time')
    .reset_index(drop=True)
)
combined_df

## Check missing dates

In [None]:
# Dates present in the data (YYYYMMDD integers)
unique_values_count = combined_df["local_date"].unique()

# Every calendar day of 2020 in the same integer form
all_dates = np.array(
    pd.date_range(start='2020-01-01', end='2020-12-31').strftime('%Y%m%d').astype(int)
)

# Days of 2020 that have no rows at all
missing_dates = np.setdiff1d(all_dates, unique_values_count)

missing_dates

## Check that every day has all eight 3-hour intervals

In [None]:
# Convert local_date from YYYYMMDD values to datetime objects
# (the export cell converts it back to integers)
combined_df['local_date'] = pd.to_datetime(combined_df['local_date'], format='%Y%m%d')

# Function to parse the local_time column and extract the start hour
def parse_start_hour(time_str):
    """
    Return the start hour of a "<YYYYMMDD HH:MM:SS> - <HH:MM:SS>" interval label.

    Returns None when the label does not consist of exactly two parts separated
    by ' - ', or when the first part is not a valid '%Y%m%d %H:%M:%S' timestamp.
    """
    pieces = time_str.split(' - ')
    if len(pieces) != 2:
        return None
    try:
        return datetime.strptime(pieces[0], '%Y%m%d %H:%M:%S').hour
    except ValueError:
        return None

# Start hour of the 3-hour bucket for every row
combined_df['start_hour'] = combined_df['local_time'].map(parse_start_hour)

# Start hours of the eight buckets a complete day must contain
required_hours = set(range(0, 24, 3))

# date -> sorted list of bucket start hours that have no rows on that date
missing_hours = {}

for date, group in combined_df.groupby('local_date'):
    absent = required_hours.difference(group['start_hour'])
    if absent:
        missing_hours[date] = sorted(absent)

# Tabular view of the gaps: one row per date with missing buckets
missing_hours_df = pd.DataFrame(list(missing_hours.items()), columns=['Date', 'Missing Hours'])

# Report
print("Missing hours for each date:")
print(missing_hours_df)

## Visual check: number of rows per day

In [None]:
# Number of rows per calendar day, in date order
daily_counts = combined_df['local_date'].value_counts().sort_index()

# Dates to label: the first date and every 7th day after it
start_date = daily_counts.index.min()
end_date = daily_counts.index.max()
custom_xticks = pd.date_range(start=start_date, end=end_date, freq='7D')

# Bar positions (0..n-1) of the dates to label, and their labels. Ticks and labels are set
# together so each label is bound to its own bar, rather than labelling all bars first and
# then shrinking the tick set.
tick_positions = [i for i, date in enumerate(daily_counts.index) if date in custom_xticks]
tick_labels = daily_counts.index[tick_positions].strftime('%Y-%m-%d')

# Plot the results on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
daily_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Rows for Each Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Rows')
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=90)
fig.tight_layout()
plt.show()

## Export

In [None]:
# Inspect the combined frame before preparing it for export.
combined_df

In [None]:
# Final shape for export: drop the helper column, store the date as a YYYYMMDD integer again
# and order rows by time label. The `grt` column is kept.
combined_df = (
    combined_df
    .drop(columns=['start_hour'])
    .assign(local_date=lambda frame: frame['local_date'].dt.strftime('%Y%m%d').astype(int))
    .sort_values('local_time')
    .reset_index(drop=True)
)
combined_df

In [None]:
# Write the full combined table (no index column). The target sits in the 'combined'
# subfolder of the input folder; the loader above only reads plain files in that folder,
# so this output is not picked up on a later run.
combined_df.to_csv('/home/jovyan/Data/2020OD/del_3h/IN/combined/od_3h_gh5_in_2020_all.csv', index=False)

In [None]:
# Keep only origin-destination rows with more than 9 trips
filterdf = combined_df.loc[combined_df['trip_count'].gt(9)]
filterdf

In [None]:
# Write the filtered table (rows with trip_count > 9) next to the unfiltered "_all" export.
filterdf.to_csv('/home/jovyan/Data/2020OD/del_3h/IN/combined/od_3h_gh5_in_2020.csv', index=False)

In [None]:
# Reload the unfiltered combined export from disk into combined_df.
combined_f = '/home/jovyan/Data/2020OD/del_3h/IN/combined/od_3h_gh5_in_2020_all.csv'
combined_df = pd.read_csv(combined_f)