In [None]:
# Set up the environment
%env DB_HOST=mongodb://localhost/openpath_stage

# Import necessary libraries and modules
import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.builtin_timeseries as estb
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.trip_queries as esdt
import emission.storage.timeseries.timequery as estt
import pandas as pd
from datetime import datetime, timedelta
import pytz
import pprint
import os

In [14]:
def get_all_user_uuids():
    """Collect the ``uuid`` field of every document in the UUID collection.

    The collection returned by ``edb.get_uuid_db()`` is queried with an empty
    filter and a projection that keeps only ``uuid``.

    This is best-effort: any exception raised while querying or reading the
    documents is reported on stdout and an empty list is returned instead.

    Returns:
        list: The ``uuid`` value of each document, in cursor order; ``[]`` on
        any error.
    """
    only_uuid = {"uuid": 1, "_id": 0}
    try:
        collected = []
        for record in edb.get_uuid_db().find({}, only_uuid):
            collected.append(record['uuid'])
        print(f"Retrieved {len(collected)} user UUIDs.")
    except Exception as err:
        print(f"Error retrieving user UUIDs: {err}")
        return []
    return collected



# Open question: is the main issue that `get_data_df` does not return any of the new pipeline stats?

In [15]:
def fetch_pipeline_time_data(uuid_list):
    """Fetch the 'stats/pipeline_time' DataFrame of each user.

    For every UUID a time series is obtained through
    ``esta.TimeSeries.get_time_series`` and asked for the
    ``"stats/pipeline_time"`` key with no time query. Progress, the unique
    ``name`` values of each non-empty frame, and any per-user error are
    printed to stdout.

    Args:
        uuid_list: Sized iterable of user UUIDs.

    Returns:
        list: The non-empty DataFrames, in the order the users were visited.
    """
    user_count = len(uuid_list)
    collected_frames = []
    position = 0
    for current_uuid in uuid_list:
        position += 1
        try:
            series = esta.TimeSeries.get_time_series(current_uuid)
            stats_df = series.get_data_df("stats/pipeline_time", time_query=None)
            if stats_df.empty:
                print(f"[{position}/{user_count}] No 'stats/pipeline_time' data for user {current_uuid}.")
                continue
            # The frame is stored before the prints below, so an error raised
            # while printing is reported but does not discard the frame.
            collected_frames.append(stats_df)
            print(f"[{position}/{user_count}] Fetched data for user {current_uuid}.")
            print(stats_df['name'].unique())
        except Exception as err:
            # One failing user must not abort the whole sweep.
            print(f"[{position}/{user_count}] Error fetching data for user {current_uuid}: {err}")
    return collected_frames




In [16]:
def analyze_usercache(combined_df, start_date_str='2024-11-08'):
    """Print how many 'USERCACHE' entries were written in each hour.

    Rows of ``combined_df`` whose ``name`` equals ``"USERCACHE"`` are kept,
    their ``metawrite_ts`` is converted with ``pd.to_datetime(..., unit='s')``,
    rows earlier than ``start_date_str`` are dropped, and the remaining rows
    are counted per hour.

    Args:
        combined_df: DataFrame with ``name`` and ``metawrite_ts`` columns.
        start_date_str: Earliest timestamp to include; parsed by
            ``pd.Timestamp``.

    Returns:
        None. All results are printed to stdout.
    """
    print("\nAnalyzing 'USERCACHE' executions...")

    usercache_df = combined_df[combined_df['name'] == "USERCACHE"]
    if usercache_df.empty:
        print("No 'USERCACHE' entries found.")
        return

    # .assign builds a new frame, so the caller's DataFrame is never modified.
    usercache_df = usercache_df.assign(
        datetime=pd.to_datetime(usercache_df['metawrite_ts'], unit='s')
    )

    start_date = pd.Timestamp(start_date_str)
    usercache_df = usercache_df[usercache_df['datetime'] >= start_date]

    # Lowercase 'h': the uppercase 'H' alias is deprecated since pandas 2.2,
    # while 'h' is accepted by older releases as well.
    hour_bucket = usercache_df['datetime'].dt.floor('h')
    hourly_execution_counts = usercache_df.groupby(hour_bucket).size()

    if hourly_execution_counts.empty:
        print(f"No executions of 'USERCACHE' since {start_date_str}.")
    else:
        print(f"Hourly execution counts since {start_date_str}:")
        print(hourly_execution_counts)


In [17]:
def process_function_level_data(combined_df, exclude_names, base_dir=None):
    """Aggregate function-level readings per name and write them to CSV files.

    A row is function-level when its ``name`` contains a ``/``. Rows whose
    name is listed in ``exclude_names`` are dropped, ``reading`` is coerced to
    a number (rows that cannot be coerced are discarded), and the readings are
    summed and averaged per name. Each aggregate is split at its 80th
    percentile into a "top 20%" and a "bottom 80%" table, both sorted in
    descending order.

    Six files are written to ``base_dir``:
    ``aggregated_sum_function_level.csv``,
    ``top20_function_level_sum_sorted.csv``,
    ``bottom80_function_level_sum_sorted.csv`` and the three ``mean``
    counterparts. A summary and the first rows of each split are printed.

    Args:
        combined_df: DataFrame with ``name`` and ``reading`` columns.
        exclude_names: Names to leave out of the aggregation.
        base_dir: Output directory. ``None`` (the default) means the working
            directory at call time.

    Returns:
        None. Returns early, after printing a message, when no usable rows
        remain or when a file cannot be written.
    """
    print("\nProcessing function-level ..")

    # Resolve the default per call; a default of os.getcwd() in the signature
    # would be frozen at definition time and ignore a later os.chdir().
    if base_dir is None:
        base_dir = os.getcwd()

    names = combined_df['name']
    # na=False: a missing name is simply not function-level (otherwise the
    # mask contains NaN and cannot be used for indexing).
    # regex=False: '/' is a literal character, not a pattern.
    has_slash = names.str.contains('/', regex=False, na=False)
    function_level_df = combined_df[has_slash & ~names.isin(exclude_names)]

    if function_level_df.empty:
        print("No function-level data after filtering.")
        return

    # Store the coerced values rather than only filtering on them, so that
    # numeric strings are aggregated as numbers.
    selected_columns = (
        function_level_df[['reading', 'name']]
        .assign(reading=lambda frame: pd.to_numeric(frame['reading'], errors='coerce'))
        .dropna(subset=['reading', 'name'])
    )

    if selected_columns.empty:
        print("No valid 'reading' after cleaning.")
        return

    readings_by_name = selected_columns.groupby('name', as_index=False)['reading']
    aggregated_sum = readings_by_name.sum().rename(columns={'reading': 'total_reading'})
    aggregated_mean = readings_by_name.mean().rename(columns={'reading': 'average_reading'})

    top20_sum, bottom80_sum = _split_at_quantile(aggregated_sum, 'total_reading')
    top20_mean, bottom80_mean = _split_at_quantile(aggregated_mean, 'average_reading')

    # (message prefix, table, file name) in the order they are saved/reported.
    outputs = [
        ("Aggregated Sum Function-Level Data", aggregated_sum,
         'aggregated_sum_function_level.csv'),
        ("Top 20% (Sum) function-level data", top20_sum,
         'top20_function_level_sum_sorted.csv'),
        ("Bottom 80% (Sum) function-level data", bottom80_sum,
         'bottom80_function_level_sum_sorted.csv'),
        ("\nAggregated Mean Function-Level Data", aggregated_mean,
         'aggregated_mean_function_level.csv'),
        ("Top 20% (Mean) function-level data", top20_mean,
         'top20_function_level_mean_sorted.csv'),
        ("Bottom 80% (Mean) function-level data", bottom80_mean,
         'bottom80_function_level_mean_sorted.csv'),
    ]

    try:
        # Write everything first so that success messages are only printed
        # when all six files were saved.
        for _, table, file_name in outputs:
            table.to_csv(os.path.join(base_dir, file_name), index=False)
        for label, _, file_name in outputs:
            print(f"{label} saved to {os.path.join(base_dir, file_name)}")
    except Exception as e:
        print(f"Error saving aggregated data to CSV: {e}")
        return

    print(f"\nSum Aggregation - Top 20% row count: {len(top20_sum)}")
    print(f"Sum Aggregation - Bottom 80% row count: {len(bottom80_sum)}")

    print(f"\nMean Aggregation - Top 20% row count: {len(top20_mean)}")
    print(f"Mean Aggregation - Bottom 80% row count: {len(bottom80_mean)}")

    samples = [
        ("\nSample Top 20% Sum Aggregation Entries:", top20_sum),
        ("\nSample Bottom 80% Sum Aggregation Entries:", bottom80_sum),
        ("\nSample Top 20% Mean Aggregation Entries:", top20_mean),
        ("\nSample Bottom 80% Mean Aggregation Entries:", bottom80_mean),
    ]
    for heading, table in samples:
        print(heading)
        print(table.head())


def _split_at_quantile(aggregated, column, quantile=0.80):
    """Split ``aggregated`` at ``quantile`` of ``column``; both parts sorted descending."""
    threshold = aggregated[column].quantile(quantile)
    at_or_above = aggregated[column] >= threshold
    top = aggregated[at_or_above].sort_values(by=column, ascending=False)
    bottom = aggregated[~at_or_above].sort_values(by=column, ascending=False)
    return top, bottom

In [None]:
def main():
    """Run the pipeline-timing analysis end to end.

    Retrieves all user UUIDs, fetches each user's 'stats/pipeline_time' data,
    concatenates the frames, prints a description of the combined frame and
    its unique ``name`` values, then runs the 'USERCACHE' analysis and the
    function-level aggregation. Returns early, after printing a message, when
    there are no users or no data.

    Returns:
        None.
    """
    user_uuid_list = get_all_user_uuids()
    # For a quick smoke test, restrict the run to a few users here,
    # e.g. user_uuid_list = user_uuid_list[1:3]
    if not user_uuid_list:
        print("No user UUIDs retrieved. Exiting script.")
        return

    all_users_pipeline_dfs = fetch_pipeline_time_data(user_uuid_list)
    if not all_users_pipeline_dfs:
        print("No pipeline data fetched for any user.")
        return

    combined_pipeline_df = pd.concat(all_users_pipeline_dfs, ignore_index=True)
    print(f"\nCombined Pipeline Data Shape: {combined_pipeline_df.shape}")

    print("\nCombined Pipeline Data Description:")
    print(combined_pipeline_df.describe())

    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    combined_pipeline_df.info()

    unique_names = combined_pipeline_df['name'].unique()
    print("\nUnique 'name' entries:")
    print(unique_names)

    analyze_usercache(combined_pipeline_df)

    # Names left out of the function-level aggregation.
    exclude_data_names = [
        'TRIP_SEGMENTATION/segment_into_trips',
        'TRIP_SEGMENTATION/segment_into_trips_dist/loop'
    ]

    process_function_level_data(combined_pipeline_df, exclude_data_names)

# Run the full analysis when this cell is executed.
main()