# Artificial Intelligence
## AI Ready Data - 006
### Process profile data for each loaded dataset using various techniques



<center>
<table align="center">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/christophergarthwood/jbooks/blob/main/STEM-006_AIReadyData-Speed-Tests-003.ipynb">
      <img width="32px" src="https://colab.research.google.com/img/colab_favicon_256px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/notebooks?referrer=search&hl=en&project=usfs-ai-bootcamp">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Link to Colab Enterprise
    </a>
  </td>   
  <td style="text-align: center">
    <a href="https://github.com/christophergarthwood/jbooks/blob/main/STEM-006_AIReadyData-Speed-Tests-003.ipynb">
      <img width=32 src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/instances?referrer=search&hl=en&project=usfs-ai-bootcamp">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Link to Vertex AI Workbench
    </a>
  </td>
</table>
</center>
<br><br><br>

| | |
|-|-|
|Author(s) | [Christopher G Wood](https://github.com/christophergarthwood)  |

# Overview

Using output from 002, read the pickled profile results and gather some metrics for additional analysis.

In [None]:
# Project-wide settings (information holders) shared by the cells below.
# These are ordinary module-level assignments, so no ``global`` declaration is required.
PROJECT_ID = "ai-bootcamp"
LOCATION = "us-central1"
BUCKET_NAME = "ai-bootcamp-vertex-colab"

# ANSI escape sequences that switch bold terminal text on and off.
BOLD_START, BOLD_END = "\033[1m", "\033[0m"

## Environment Check

In [None]:
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# - Google Colab Check
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import datetime

# get_ipython() is only injected inside an IPython/Jupyter kernel; fall back to None
# so this cell also works when the notebook is exported and run as a plain script.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None

# The runtime is treated as Colab when the shell's string form mentions "google.colab".
RunningInCOLAB = "google.colab" in str(_ipython_shell)
current_time = datetime.datetime.now()

if RunningInCOLAB:
    print(
        f"You are running this notebook in Google Colab at {current_time} in the {BOLD_START}{PROJECT_ID}{BOLD_END} lab."
    )
else:
    print(
        f"You are likely running this notebook with Jupyter iPython runtime at {current_time} in the {PROJECT_ID} lab."
    )

## Library Management
### Load Libraries necessary for this operation via pip install

In [None]:
# Import key libraries necessary to support dynamic installation of additional libraries
import sys

# Use subprocess to support running operating system commands from the program, using the "bang" (!)
# symbology is supported, however that does not translate to an actual python script, this is a more
# agnostic approach.
import subprocess
import importlib.util

In [None]:
# Identify the libraries you'd like to add to this Runtime environment.
# Commented out as this adds time but is critical for initial run.
#
# The installer below is disabled by wrapping it in a bare triple-quoted string:
# Python evaluates the string and discards it, so nothing is installed.
# To run it on a fresh runtime, remove the opening and closing triple quotes.
#
# NOTE(review): the entries are pip distribution names (e.g. "python-dotenv",
# "polars[all]", "cudf-cu12"), while importlib.util.find_spec() is given them as
# module names, so the "already installed" test may not match as intended —
# confirm before re-enabling.
"""
libraries = [
    "backoff",
    "python-dotenv",
    "seaborn",
    "piexif",
    "unidecode",
    "icecream",
    "watermark",
    "watermark[GPU]",
    "rich",
    "rich[jupyter]",
    "numpy",
    "pydot",
    "polars[all]",
    "dask[complete]",
    "xarray",
    "pandas",
    "pystac",
    "pystac[jinja2]",
    "pystac[orjson]",
    "pystac[validation]",
    "fastparquet",
    "zarr",
    "gdown",
    "wget",
]

# Loop through each library and test for existence, if not present install quietly
for library in libraries:
    if library == "Pillow":
        spec = importlib.util.find_spec("PIL")
    else:
        spec = importlib.util.find_spec(library)
    if spec is None:
        print("Installing library " + library)
        subprocess.run(["pip", "install", library, "--quiet"], check=True)
    else:
        print("Library " + library + " already installed.")

# Specialized install for GPU enabled capability with CUDF
# pip install --extra-index-url=https://pypi.nvidia.com "cudf-cu12==25.2.*" "dask-cudf-cu12==25.2.*" "cuml-cu12==25.2.*" "cugraph-cu12==25.2.*" "nx-cugraph-cu12==25.2.*" "cuspatial-cu12==25.2.*"     "cuproj-cu12==25.2.*" "cuxfilter-cu12==25.2.*" "cucim-cu12==25.2.*"
try:
    library="cudf-cu12"
    spec = importlib.util.find_spec(library)
    if spec is None:
        subprocess.run(
            [
                "pip",
                "install",
                "--extra-index-url=https://pypi.nvidia.com",
                library,
                "--quiet",
            ],
            check=True,
        )
    else:
        print("Library " + library + " already installed.")

    library="dask-cudf-cu12"
    spec = importlib.util.find_spec(library)
    if spec is None:
        subprocess.run(
            [
                "pip",
                "install",
                "--extra-index-url=https://pypi.nvidia.com",
                library,
                "--quiet",
            ],
            check=True,
        )
    else:
        print("Library " + library + " already installed.")

except (subprocess.CalledProcessError, RuntimeError, Exception) as e:
    print(repr(e))
"""

### Library Import

In [None]:
# - Import additional libraries that add value to the project related to NLP

# - Set of libraries that perhaps should always be in Python source
import backoff
import datetime
from dotenv import load_dotenv
import gc
import getopt
import glob
import inspect
import io
import itertools
import json
import math
import os
from pathlib import Path
import pickle
import platform
import random
import re
import shutil
import string
from io import StringIO
import subprocess
import socket
import sys
import textwrap
import tqdm
import traceback
import warnings
import time
import uuid

#- Datastructures
from dataclasses import dataclass, fields, field
from typing import List

#- Profiling
from time import perf_counter
import gc
import io
import tracemalloc
import psutil
import cProfile
import pstats
from pstats import SortKey

#- Text formatting
from rich import print as rprint
from rich.console import Console
from rich.traceback import install
from tabulate import tabulate
import locale

# - Displays system info
from watermark import watermark as the_watermark
from py3nvml import py3nvml

# - Additional libraries for this work
import math
from base64 import b64decode
from IPython.display import Image, Markdown
import pandas, IPython.display as display, io, jinja2, base64
from IPython.display import clear_output  # used to support real-time plotting
import requests
import unidecode
import pydot
import wget

# - Data Science Libraries
import pandas as pd
import numpy as np
import polars as pl
import dask as da
import dask.dataframe as dd
import dask.bag as db
import xarray as xr
import cupy_xarray  # never actually invoked in source itself use ds=ds.cupy.as_cupy()
import pystac as pys
import pystac
from pystac.utils import datetime_to_str

# - Statistics
import statistics

# from stacframes import df_from
import fastparquet as fq
import zarr
from zarr import Group
import netCDF4 as nc
from netCDF4 import Dataset

try:
    import cudf
except Exception as e:
    pass

try:
    import cupy
except Exception as e:
    pass

# Tensorflow and related AI libraries
import tensorflow as tf
from tensorflow import data as tf_data

# Torch
import torch

# - Graphics
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.cbook import get_sample_data
from matplotlib.offsetbox import AnnotationBbox, DrawingArea, OffsetImage, TextArea
from matplotlib.pyplot import imshow
from matplotlib.patches import Circle
from PIL import Image as PIL_Image
import PIL.ImageOps
import matplotlib.image as mpimg
from imageio import imread
import seaborn as sns

from mpl_toolkits.basemap import Basemap
from pylab import *

# - Image meta-data for Section 508 compliance
import piexif
from piexif.helper import UserComment

# - Progress bar
from tqdm import tqdm
from tqdm.notebook import trange, tqdm


## DataClasses

In [None]:
## Dataclass used to represent each metric used during execution
#
@dataclass
class aggregate_metrics:
    """Per-pattern collection of profiling samples.

    Every metric field is a list holding one float per profiled run. The
    helper methods reduce a chosen list to summary statistics and ``__str__``
    renders a caret-delimited summary line.

    Attributes:
        id: Label of the pattern (dataset/technique) these samples belong to.
    """

    id: str

    runtime: List[float] = field(default_factory=list)

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_time: List[float] = field(default_factory=list)

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_time: List[float] = field(default_factory=list)

    # number of read operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_count: List[float] = field(default_factory=list)

    # number of write operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_count: List[float] = field(default_factory=list)

    # bytes read [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_throughput: List[float] = field(default_factory=list)

    # bytes written [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_throughput: List[float] = field(default_factory=list)

    # number of read operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_read_count: List[float] = field(default_factory=list)

    # number of write operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_write_count: List[float] = field(default_factory=list)

    # bytes read [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_read_throughput: List[float] = field(default_factory=list)

    # bytes written [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_write_throughput: List[float] = field(default_factory=list)

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_read_time: List[float] = field(default_factory=list)

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_write_time: List[float] = field(default_factory=list)

    # calculated in MBs, reference: https://www.geeksforgeeks.org/monitoring-memory-usage-of-a-running-python-program/
    mem_current: List[float] = field(default_factory=list)

    # calculated in MBs, reference: https://www.geeksforgeeks.org/monitoring-memory-usage-of-a-running-python-program/
    mem_peak: List[float] = field(default_factory=list)

    def mean_excluding_zero(self, data):
        """Return the mean of ``data`` ignoring zero entries.

        Args:
            data: Iterable of numbers.

        Returns:
            The arithmetic mean of the non-zero values, or ``0`` when there
            are no non-zero values.
        """
        filtered_data = [x for x in data if x != 0]
        if not filtered_data:
            return 0
        return statistics.mean(filtered_data)

    def calculate_stats(self, field_name) -> str:
        """Summarise one metric list as a caret-delimited string.

        Args:
            field_name: Name of the list attribute on this instance to summarise.

        Returns:
            ``"mean^median^mode^stdev^variance"`` where the mean ignores zero
            samples. Statistics that are undefined for the available sample
            count are reported as ``0.0`` instead of raising
            ``statistics.StatisticsError``: median and mode need at least one
            sample, stdev and variance need at least two.
        """
        samples = [float(x) for x in getattr(self, field_name)]

        my_stats = [self.mean_excluding_zero(samples)]

        # median/mode are undefined for an empty list
        if samples:
            my_stats.append(statistics.median(samples))
            my_stats.append(statistics.mode(samples))
        else:
            my_stats.extend([0.0, 0.0])

        # sample stdev/variance are undefined for fewer than two data points
        if len(samples) > 1:
            my_stats.append(statistics.stdev(samples))
            my_stats.append(statistics.variance(samples))
        else:
            my_stats.extend([0.0, 0.0])

        return "^".join(str(element) for element in my_stats)

    def __str__(self):
        # Caret-delimited columns: id, then the five statistics of runtime,
        # mem_current and mem_peak. The I/O metrics are collected but not rendered here.
        return f"""
             {self.id}^{self.calculate_stats("runtime")}^{self.calculate_stats("mem_current")}^{self.calculate_stats("mem_peak")} 
             """

    #def __strs__(self):
    # return f"""
    #         Id---------------------------------------------
    #                               Id: {self.id}
    #         Runtime----------------------------------------
    #               Runtime Stats:    {self.calculate_stats("runtime")} milliseconds
#
#             I/O Counts-------------------------------------
#                   Targeted disk read: {self.calculate_stats("io_disk_read_count")} counts
#                  Targeted disk write: {self.calculate_stats("io_disk_write_count")} counts
#                    General disk read: {self.calculate_stats("io_os_read_count")} counts
#                   General disk write: {self.calculate_stats("io_os_write_count")} counts
#
#             I/O Time---------------------------------------
#              Targeted disk read time: {self.calculate_stats("io_disk_read_time")} milliseconds
#             Targeted disk write time: {self.calculate_stats("io_disk_write_time")} milliseconds
#               General disk read time: {self.calculate_stats("io_os_read_time")} milliseconds
#              General disk write time: {self.calculate_stats("io_os_write_time")} milliseconds
#
#             Memory------------------------------------------
#                              Current: {self.calculate_stats("mem_current")} MB
#                                 Peak: {self.calculate_stats("mem_peak")} MB
#
#             """         

In [None]:
@dataclass
class runtime_metrics:
    """Metrics captured for a single profiled run.

    All numeric fields default to ``0.0``. ``profile_data`` is excluded from
    ``__init__`` and has no default, so it only exists once a caller assigns it.

    Attributes:
        id: Label identifying the profiled run.
    """

    id: str

    runtime: float = field(default=0.0)

    # reference: https://docs.python.org/3/library/profile.html
    profile_data: cProfile.Profile = field(init=False)

    # reference: https://www.geeksforgeeks.org/how-to-get-file-size-in-python/
    file_size: float = field(
        default=0.0,
    )

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_time: float = field(
        default=0.0,
    )

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_time: float = field(
        default=0.0,
    )

    # number of read operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_count: float = field(
        default=0.0,
    )

    # number of write operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_count: float = field(
        default=0.0,
    )

    # bytes read [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_read_throughput: float = field(
        default=0.0,
    )

    # bytes written [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_disk_write_throughput: float = field(
        default=0.0,
    )

    # number of read operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_read_count: float = field(
        default=0.0,
    )

    # number of write operations [end-begin], reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_write_count: float = field(
        default=0.0,
    )

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_read_time: float = field(
        default=0.0,
    )

    # milliseconds, reference: https://stackoverflow.com/questions/24723092/using-python-to-measure-in-situ-read-write-speed-for-files
    io_os_write_time: float = field(
        default=0.0,
    )

    # calculated in MBs, reference: https://www.geeksforgeeks.org/monitoring-memory-usage-of-a-running-python-program/
    mem_current: float = field(
        default=0.0,
    )

    # calculated in MBs, reference: https://www.geeksforgeeks.org/monitoring-memory-usage-of-a-running-python-program/
    mem_peak: float = field(
        default=0.0,
    )

    # The two OS-level throughput fields are read by __str__ (and exist on
    # aggregate_metrics) but were never declared here. They are appended after
    # the existing fields so the positional order of __init__ stays unchanged.

    # bytes read [end-begin]
    io_os_read_throughput: float = field(
        default=0.0,
    )

    # bytes written [end-begin]
    io_os_write_throughput: float = field(
        default=0.0,
    )

    def __str__(self):
        """Render every numeric metric as an aligned, human-readable report."""
        return f"""
                Id---------------------------------------------
                                      Id: {self.id}
                Runtime----------------------------------------
                                 Runtime: {self.runtime:,.8f} milliseconds

                I/O Size---------------------------------------
                               File Size: {self.file_size:,.8f} bytes

                I/O Counts-------------------------------------
                      Targeted disk read: {self.io_disk_read_count:,.8f} counts
                     Targeted disk write: {self.io_disk_write_count:,.8f} counts
                       General disk read: {self.io_os_read_count:,.8f} counts
                      General disk write: {self.io_os_write_count:,.8f} counts

                I/O Throughput----------------------------------
                 Targeted disk read bytes: {self.io_disk_read_throughput:,.8f} bytes
                Targeted disk write bytes: {self.io_disk_write_throughput:,.8f} bytes
                  General disk read bytes: {self.io_os_read_throughput:,.8f} bytes
                 General disk write bytes: {self.io_os_write_throughput:,.8f} bytes                      

                I/O Time---------------------------------------
                 Targeted disk read time: {self.io_disk_read_time:,.8f} milliseconds
                Targeted disk write time: {self.io_disk_write_time:,.8f} milliseconds
                  General disk read time: {self.io_os_read_time:,.8f} milliseconds
                 General disk write time: {self.io_os_write_time:,.8f} milliseconds

                Memory------------------------------------------
                                 Current: {self.mem_current:,.8f} MB
                                    Peak: {self.mem_peak:,.8f} MB

                """
    #def __repr__(self):
    #    return f'{self.__class__.__name__}(name={self.name!r}, unit_price={self.unit_price!r}, quantity={self.quantity_on_hand!r})'

    # TODO - CGW
    # def __post_init__(self):
    #    self.id = f'{self.phrase}_{self.word_type.name.lower()}'

    # worthy consideration - https://www.geeksforgeeks.org/psutil-module-in-python/

## Function Declaration

#### Lib Diagnostics

In [None]:
def lib_diagnostics() -> None:
    """Print a notebook watermark followed by the versions of key libraries.

    Output goes to stdout only. Reads the module-level names ``AUTHOR_NAME``,
    ``GITHUB_USERNAME``, ``AUTHOR_EMAIL``, ``BOLD_START`` and ``BOLD_END``.
    The TensorFlow, Torch and OpenAI sections are best-effort: a failure in one
    of them is reported (or skipped) without aborting the diagnostics.
    """
    # importlib.metadata is the standard-library replacement for the deprecated pkg_resources
    from importlib import metadata as importlib_metadata

    # Show notebook details
    #%watermark?
    #%watermark --github_username christophergwood --email christopher.g.wood@gmail.com --date --time --iso8601 --updated --python --conda --hostname --machine --githash --gitrepo --gitbranch --iversions --gpu
    # Watermark
    print(
        the_watermark(
            author=f"{AUTHOR_NAME}",
            # interpolate the variable; previously the literal text "GITHUB_USERNAME" was sent
            github_username=f"{GITHUB_USERNAME}",
            email=f"{AUTHOR_EMAIL}",
            iso8601=True,
            datename=True,
            current_time=True,
            python=True,
            updated=True,
            hostname=True,
            machine=True,
            gitrepo=True,
            gitbranch=True,
            githash=True,
        )
    )

    print(f"{BOLD_START}Packages:{BOLD_END}")
    print("")
    # Distributions whose installed version should be reported
    the_packages = [
        "nltk",
        "numpy",
        "os",
        "pandas",
        "keras",
        "seaborn",
        "fastparquet",
        "zarr",
        "dask",
        "pystac",
        "polars",
        "xarray",
    ]

    for package_name in the_packages:
        try:
            installed_version = importlib_metadata.version(package_name)
        except importlib_metadata.PackageNotFoundError:
            # not an installed distribution (e.g. "os" is a stdlib module): nothing to report
            continue
        print(f"{package_name:<40}#: {installed_version:<20}")

    try:
        print(f"{'TensorFlow version':<40}#: {str(tf.__version__):<20}")
        print(
            f"{'     gpu.count:':<40}#: {str(len(tf.config.experimental.list_physical_devices('GPU')))}"
        )
        print(
            f"{'     cpu.count:':<40}#: {str(len(tf.config.experimental.list_physical_devices('CPU')))}"
        )
    except Exception as e:
        # best effort: report why the section is missing instead of hiding it
        print(f"{'TensorFlow version':<40}#: unavailable ({e!r})")

    try:
        print(f"{'Torch version':<40}#: {str(torch.__version__):<20}")
        if torch.cuda.is_available():
            print(f"{'     GPUs available?':<40}#: {torch.cuda.is_available()}")
            print(f"{'     count':<40}#: {torch.cuda.device_count()}")
            print(f"{'     current':<40}#: {torch.cuda.get_device_name(0)}")
        else:
            print("No GPU available, using CPU.")
    except Exception as e:
        # best effort: report why the section is missing instead of hiding it
        print(f"{'Torch version':<40}#: unavailable ({e!r})")

    try:
        print(f"{'OpenAI Azure Version':<40}#: {str(the_openai_version):<20}")
    except NameError:
        # the_openai_version is optional and only exists when defined elsewhere
        pass

    return

#### Library Configuration

In [None]:
def set_library_configuration() -> None:
    """Apply notebook-wide display options for pandas and NumPy.

    Sets the pandas column display width to 10 characters, formats pandas
    floats with thousands separators and four decimals, and sets NumPy's
    print precision to four digits.
    """
    ############################################
    # - JUPYTER NOTEBOOK OUTPUT CONTROL / FORMATTING
    ############################################
    # ``debug`` is never imported or defined in this notebook, so the status
    # message is emitted with print() instead of debug.msg_info().
    print("Setting Pandas and Numpy library options.")

    # Use None instead of 10 to view full-width values (e.g. a whole JSON blob)
    # in the printed dataframe.
    pd.set_option("display.max_colwidth", 10)

    # Limit floating point output to 4 decimal places so wide values stay readable.
    pd.options.display.float_format = "{:,.4f}".format
    np.set_printoptions(precision=4)

#### Custom Exception Display

In [None]:
# this function displays the stack trace on errors from a central location making adjustments to the display on an error easier to manage
# functions perform useful solutions for highly repetitive code
def process_exception(inc_exception: Exception) -> None:
    """Report an exception from one central place.

    Intended to be called from inside an ``except`` block. When the
    module-level ``DEBUG_STACKTRACE`` flag equals 1 the full traceback is
    printed, followed by a rich-formatted traceback that includes local
    variables; otherwise only the exception's ``repr`` is printed.

    Args:
        inc_exception: The exception being handled.
    """
    if DEBUG_STACKTRACE == 1:
        traceback.print_exc()
        # No shared ``console`` object exists in this notebook (only the Console
        # class is imported), so build one here for the rich traceback.
        Console().print_exception(show_locals=True)
    else:
        rprint(repr(inc_exception))

#### Check your resources from a CPU/GPU perspective

In [None]:
def get_hardware_stats() -> None:
    """Print the compute devices TensorFlow can see and pin it to the first GPU.

    Three sections are printed: the local device list, the physical GPU/CPU
    counts, and the result of restricting TensorFlow to the first physical GPU
    when one exists. A ``RuntimeError`` in any section is printed and the
    function continues with the next section.
    """
    rprint(f"Entering {__name__} {inspect.stack()[0][3]}")
    print(
        f"{BOLD_START}List Devices{BOLD_END} #########################################"
    )
    try:
        from tensorflow.python.client import device_lib

        rprint(device_lib.list_local_devices())
        print("")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        rprint(str(repr(e)))

    print(
        f"{BOLD_START}Devices Counts{BOLD_END} ########################################"
    )
    try:
        rprint(
            f"Num GPUs Available: {str(len(tf.config.experimental.list_physical_devices('GPU')))}"
        )
        rprint(
            f"Num CPUs Available: {str(len(tf.config.experimental.list_physical_devices('CPU')))}"
        )
        print("")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        rprint(str(repr(e)))

    print(
        f"{BOLD_START}Optional Enablement{BOLD_END} ####################################"
    )
    # Start from "no GPUs" so the check below cannot hit an unbound name
    # when the device query itself raises.
    gpus = []
    try:
        gpus = tf.config.experimental.list_physical_devices("GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        rprint(str(repr(e)))

    if gpus:
        # Restrict TensorFlow to only use the first GPU
        try:
            tf.config.experimental.set_visible_devices(gpus[0], "GPU")
            logical_gpus = tf.config.experimental.list_logical_devices("GPU")
            rprint(
                str(
                    str(len(gpus))
                    + " Physical GPUs,"
                    + str(len(logical_gpus))
                    + " Logical GPU"
                )
            )
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            rprint(str(repr(e)))
        print("")
    rprint(f"Exiting {__name__} {inspect.stack()[0][3]}")

## Input Sources

#### Read Profiles

In [None]:
def read_profiles(inc_source_filenames: List[str]) -> List:
    """Load every pickled profile in ``inc_source_filenames``.

    Files that cannot be read or unpickled are reported through
    ``process_exception`` and skipped; they do not stop the loop.

    Args:
        inc_source_filenames: Paths of the pickle files to load.

    Returns:
        The unpickled objects, in input order, for the files that loaded
        successfully.
    """
    rprint(f"Entering {__name__} {inspect.stack()[0][3]}")
    the_list = []
    failed_read = []

    rprint(f"...reading pickled profile data from list of {len(inc_source_filenames)} files:")
    for target_filename in inc_source_filenames:
        try:
            rprint(f"......reading profile ({target_filename})")
            # SECURITY: pickle.load can execute arbitrary code; only read profile
            # files that were produced by a trusted run of this project.
            with open(target_filename, "rb") as openfile:
                the_list.append(pickle.load(openfile))
        except Exception as e:
            process_exception(e)
            print("...ERROR, investigate this failed read.")
            failed_read.append(target_filename)

    print(f"......{len(the_list)}  of {len(inc_source_filenames)} files successfully read in.")
    print(f"......{len(failed_read)}  of {len(inc_source_filenames)} files failed to read in.")
    rprint(f"Exiting {__name__} {inspect.stack()[0][3]}")
    return the_list

#### Analysis

In [None]:
#def add_dataclass_values(a: aggregate_metrics, b: runtime_metrics) -> aggregate_metrics:
def add_dataclass_values(a: "aggregate_metrics", b: "runtime_metrics"):
    """Append each metric of one profiled run to the aggregate's sample lists.

    For every dataclass field of ``a`` other than ``id``, the attribute of the
    same name on ``b`` is converted to ``float`` and appended to ``a``'s list.
    Fields that ``b`` does not carry are skipped, so a profile recorded
    without a given metric neither raises nor contributes a made-up sample.

    Args:
        a: Aggregate whose list fields are extended in place.
        b: Single-run record supplying one value per metric.

    Returns:
        ``a``, the same object that was passed in.
    """
    for metric in fields(a):
        # exact comparison: `name not in "id"` was a substring test
        if metric.name == "id":
            continue
        if not hasattr(b, metric.name):
            continue
        getattr(a, metric.name).append(float(getattr(b, metric.name)))
    return a

In [None]:
def stats_dataclass_values(a: "aggregate_metrics", b: "runtime_metrics") -> "runtime_metrics":
    """Store the mean of every aggregated metric on a single-run record.

    For each dataclass field of ``a`` except ``id`` and ``profile_data``, the
    arithmetic mean of ``a``'s sample list is written to the attribute of the
    same name on ``b``. A metric without samples is stored as ``0.0``.

    Args:
        a: Aggregate holding one list of samples per metric.
        b: Record that receives the per-metric means (modified in place).

    Returns:
        ``b``, now holding the means.
    """
    for metric in fields(a):
        if metric.name in ("id", "profile_data"):
            continue
        samples = getattr(a, metric.name)
        # statistics.mean raises on an empty sequence
        setattr(b, metric.name, statistics.mean(samples) if samples else 0.0)
    return b

In [None]:
def analyze_outputs(inc_files: List[str], inc_pattern: str) -> "aggregate_metrics":
    """Aggregate the pickled profiles whose filename contains ``inc_pattern``.

    Args:
        inc_files: Candidate pickle file paths.
        inc_pattern: Text that must occur in a filename for the file to be
            included. It is matched literally, so characters such as ``.`` or
            ``+`` in a filename fragment are not interpreted as regex syntax.

    Returns:
        An ``aggregate_metrics`` whose ``id`` is ``inc_pattern`` and whose
        lists hold one sample per matching file.
    """
    matching_files = [filename for filename in inc_files if inc_pattern in filename]

    dataset_aggregated = aggregate_metrics(id=inc_pattern)
    for profiler_data in matching_files:
        # SECURITY: pickle.load can execute arbitrary code; only read profile
        # files that were produced by a trusted run of this project.
        with open(profiler_data, "rb") as file:
            single_profile = pickle.load(file)
        # add_dataclass_values extends dataset_aggregated in place
        add_dataclass_values(dataset_aggregated, single_profile)

    return dataset_aggregated

#### Pattern Capture

In [None]:
def get_unique_patterns(inc_files: List[str]) -> List[str]:
    """Derive the distinct pattern labels embedded in a list of filenames.

    Each name is split on ``_``; the first two tokens and the last token are
    dropped and the remaining tokens are re-joined with ``_``.

    Args:
        inc_files: Filenames (or paths) to inspect.

    Returns:
        The unique labels in sorted order, so repeated runs list patterns in
        the same sequence. A name with fewer than four tokens yields the
        empty string as its label.
    """
    # NOTE(review): the split is applied to the whole string as given; if callers
    # pass paths whose directories contain "_" the token positions shift — confirm.
    delimiter = "_"
    split_index = 2
    patterns = {
        delimiter.join(filename.split(delimiter)[split_index:-1])
        for filename in inc_files
    }
    return sorted(patterns)

## Process

In [None]:
## Main routine that executes the analysis: for the "write" and "read" profile
#  sets it finds the pickled profiles, groups them by pattern and prints one
#  aggregated summary line per pattern.
#
#  @param inc_input_directory directory that holds the pickled profile files
#  @return dict mapping "write"/"read" to the list of aggregated metrics
def process(inc_input_directory: str, ) -> dict:
    """Aggregate and print profile statistics for write and read runs.

    Args:
        inc_input_directory: Directory containing the pickled profile files.
            A file is considered when its extension contains the module-level
            ``OUTPUT_PICKLE_EXT`` and its name contains ``_write_`` or
            ``_read_``.

    Returns:
        ``{"write": [...], "read": [...]}`` where each list holds the
        aggregate returned by ``analyze_outputs`` for every pattern found.

    Raises:
        SystemError: If ``inc_input_directory`` is not an existing directory.
    """
    rprint(f"Entering {__name__} {inspect.stack()[0][3]}")

    target_directory = f"{inc_input_directory}{os.sep}"
    if not os.path.isdir(target_directory):
        message = f"Target directory ({target_directory}) does not exist, cannot continue execution.  Check your paths."
        print(message)
        raise SystemError(message)

    results = {}

    # identify target files
    for operation in ("write", "read"):
        # iterate through each data profile saved and gather metrics
        # create a list of unique profiles (per type) and process them
        print(f"...marshaling {operation} data files:")
        source_filenames_list = []
        for file in os.listdir(target_directory):
            filename, file_extension = os.path.splitext(file)
            if OUTPUT_PICKLE_EXT.lower() not in file_extension.lower():
                continue
            if f"_{operation}_" in filename:
                source_filenames_list.append(os.path.join(target_directory, file))

        source_filenames_list.sort()
        print(f"Found {len(source_filenames_list)} potential target files.")
        unique_patterns = get_unique_patterns(source_filenames_list)
        print("    Patterns found are:")

        stats = []
        # a distinct loop name keeps the outer "write"/"read" label intact
        for pattern in unique_patterns:
            stats.append(analyze_outputs(source_filenames_list, pattern))
            print(f"    {stats[-1]}")
        results[operation] = stats

    rprint(f"Exiting {__name__} {inspect.stack()[0][3]}")
    return results

# Main Routine (call all other routines)

In [None]:
if __name__ == "__main__":

    # note that this design now deviates from previous methods.
    # Implementation will assume a single execution of a single PIID folder, scanning results and
    # appending metrics to a single ASCII file as the code proceeds thus ensuring multi-processor, *nix driven execution.

    start_t = perf_counter()
    print("BEGIN PROGRAM")

    ############################################
    # CONSTANTS
    ############################################

    # Semantic Versioning
    VERSION_NAME = "MLDATAREADY_ANALYSIS"
    VERSION_MAJOR = 0
    VERSION_MINOR = 0
    VERSION_RELEASE = 2

    DATA_VERSION_RELEASE = "-".join(
        [
            str(VERSION_NAME),
            str(VERSION_MAJOR),
            str(VERSION_MINOR),
            str(VERSION_RELEASE),
        ]
    )

    # OUTPUT EXTENSIONS
    OUTPUT_PICKLE_EXT = "pkl"
    OUTPUT_PANDAS_EXT = "pkl"
    OUTPUT_NUMPY_EXT = "npy"
    OUTPUT_TORCH_EXT = "pt"
    OUTPUT_XARRAY_EXT = "xr"
    OUTPUT_ZARR_EXT = "zarr"
    OUTPUT_PARQUET_EXT = "parquet"
    OUTPUT_TENSORFLOW_EXT = "tf"
    OUTPUT_PYSTAC_EXT = "psc"
    OUTPUT_DASK_EXT = "dask"
    # location of our working files
    # WORKING_FOLDER="/content/folderOnColab"
    WORKING_FOLDER = "./folderOnColab/ANALYSIS3/test"
    input_directory = "./folderOnColab/ANALYSIS3/test/"
    output_directory = "./folderOnColab/ANALYSIS3/test/"

    # Notebook Author details
    AUTHOR_NAME = "Christopher G Wood"
    GITHUB_USERNAME = "christophergarthwood"
    AUTHOR_EMAIL = "christopher.g.wood@gmail.com"

    # GEOSPATIAL NAMES
    LAT_LNAME = "latitude"
    LAT_SNAME = "lat"
    LONG_LNAME = "longitude"
    LONG_SNAME = "lon"
    #PRODUCT_LNAME = "chlor_a"
    #PRODUCT_SNAME = "chlor_a"
    PRODUCT_LNAME = "cld_amt"
    PRODUCT_SNAME = "cld_amt"

    # PRODUCT_LNAME="salinity"
    # PRODUCT_SNAME="salinity"

    # Encoding
    ENCODING = "utf-8"
    os.environ["PYTHONIOENCODING"] = ENCODING

    BOLD_START = "\033[1m"
    BOLD_END = "\033[0;0m"
    TEXT_WIDTH = 77

    # You can also adjust the verbosity by changing the value of TF_CPP_MIN_LOG_LEVEL:
    #
    # 0 = all messages are logged (default behavior)
    # 1 = INFO messages are not printed
    # 2 = INFO and WARNING messages are not printed
    # 3 = INFO, WARNING, and ERROR messages are not printed
    TF_CPP_MIN_LOG_LEVEL_SETTING = 0

    # Set the Seed for the experiment (ask me why?)
    # seed the pseudorandom number generator
    # THIS IS ESSENTIAL FOR CONSISTENT MODEL OUTPUT, remember these are random in nature.
    # SEED_INIT = 7
    # random.seed(SEED_INIT)
    # tf.random.set_seed(SEED_INIT)
    # np.random.seed(SEED_INIT)

    DEBUG_STACKTRACE = 0
    DEBUG_USING_GPU = 0   #no gpu utilization on 0, 1 is gpu utilization
    NUM_PROCESSORS = 10
    ITERATIONS = 20

    # make comparisons lower case and include wild card character at the end of each to catch anomalous file extensions like xlsx, etc.
    EXTENSIONS = [".nc"]
    LOWER_EXTENSIONS = [x.lower() for x in EXTENSIONS]

    THE_DEVICE_NAME = "/job:localhost/replica:0/task:0/device:CPU:0"
    if DEBUG_USING_GPU == 1:
        THE_DEVICE_NAME = "/job:localhost/replica:0/task:0/device:GPU:0"

    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    # GPU Setup (for multiple GPU devices)
    # Only query the current CUDA device when CUDA is usable, so the analysis
    # can also run on CPU-only machines; device is None in that case.
    device = torch.cuda.current_device() if torch.cuda.is_available() else None

    # software watermark
    lib_diagnostics()

    # hardware specs
    get_hardware_stats()

    # - Core workhorse routine
    process(input_directory)

    # - Save the results
    # save_output()

    end_t = perf_counter()
    print("END PROGRAM")
    print(f"Elapsed time: {end_t - start_t}")

In [None]:
#{self.id}^{self.calculate_stats("runtime")}^{self.calculate_stats("io_disk_read_count")}^{self.calculate_stats("io_disk_write_count")}^{self.calculate_stats("io_os_read_count")}^{self.calculate_stats("io_os_write_count")}^{self.calculate_stats("io_disk_read_time")}^{self.calculate_stats("io_disk_write_time")}^{self.calculate_stats("io_os_read_time")}^{self.calculate_stats("io_os_write_time")}^{self.calculate_stats("mem_current")}^{self.calculate_stats("mem_peak")} 