# Benchmarking

In this notebook, we automatically collect memory-usage data for every seismic attribute, fit a model for each attribute from that data, and then validate the model against a reference dataset.

## Bootstrapping

In [45]:
import json
import os
import sys
import warnings
from datetime import datetime

import dowser
import numpy as np
import pandas as pd
from dasf_seismic.datasets import F3
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split

# Ignore FutureWarning messages so they do not clutter the notebook output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Make the `memory_usage` directory importable: it is resolved as a sibling of
# the current working directory (parent of cwd, then `memory_usage`).
notebook_path = os.getcwd()
model_path = os.path.dirname(notebook_path)
memory_usage_path = os.path.join(model_path, 'memory_usage')
# Append only when missing, so re-running this cell does not add duplicates.
if memory_usage_path not in sys.path:
    sys.path.append(memory_usage_path)

from seismic.data.synthetic import generate_and_save_synthetic_data


def load_session(session_id):
    """Load the collected data saved for a session.

    Reads ``../output/<session_id>/collected_data.json`` (relative to the
    current working directory) and returns the parsed JSON content.

    Raises whatever ``open`` / ``json.load`` raise when the file is missing or
    is not valid JSON.
    """
    # A context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open(f"../output/{session_id}/collected_data.json") as session_file:
        return json.load(session_file)


def extract_features(df):
    """Add derived feature columns to a frame of dataset shapes.

    A ``volume`` column is set to ``inlines * crosslines * samples``. The
    frame is modified in place and the same object is returned.
    """
    inlines, crosslines, samples = df["inlines"], df["crosslines"], df["samples"]
    df["volume"] = inlines * crosslines * samples
    return df


def load_data(session_id):
    """Build the training DataFrame for a session.

    Each entry of the session's ``peaks`` mapping becomes one row: the key is
    split on ``-`` into ``inlines``, ``crosslines`` and ``samples``, the value
    is stored as ``memory_usage_kb``, every column is converted to numeric and
    the derived features from ``extract_features`` are appended.
    """
    peaks = load_session(session_id)['peaks']
    frame = pd.DataFrame(peaks.items(), columns=['keys', 'memory_usage_kb'])

    # Expand the "<inlines>-<crosslines>-<samples>" key into three columns
    shape_parts = frame['keys'].str.split('-', expand=True)
    frame[['inlines', 'crosslines', 'samples']] = shape_parts

    numeric_frame = frame.drop(columns=['keys']).apply(pd.to_numeric)
    return extract_features(numeric_frame)


def calculate_accuracy(y_true, y_pred, tolerance=0.1):
    """Return the fraction of predictions within a relative tolerance.

    A prediction counts as accurate when ``|y_true - y_pred| / |y_true|`` is
    at most ``tolerance`` (0.1 by default, i.e. 10%).
    """
    relative_error = np.abs((y_true - y_pred) / y_true)
    within_tolerance = relative_error <= tolerance
    return np.mean(within_tolerance)


def evaluate_model(model, X_train, y_train, cv):
    """Fit ``model`` on the training data and return its mean cross-validated RMSE.

    ``model`` is fitted in place; ``cv`` is forwarded to ``cross_val_score``.
    """
    model.fit(X_train, y_train)
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_mean_squared_error',
    )
    # Scores are negated MSE values, so flip the sign before the square root
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()


def train_model(df, cv=5, augmentation=3):
    """Train a linear regression that predicts ``memory_usage_kb``.

    The frame is stacked ``augmentation`` times, split 90/10 into train and
    test sets, evaluated with ``cv``-fold cross-validation on the train set
    (RMSE is printed) and scored on the test set (accuracy is printed).

    Returns the fitted ``LinearRegression`` instance.
    """
    # NOTE(review): rows are duplicated before the split, so copies of the
    # same row can land in both the train and the test set — confirm this is
    # intended when reading the printed accuracy.
    replicated = pd.concat([df] * augmentation, ignore_index=True)
    features = replicated.drop(columns=["memory_usage_kb"])
    target = replicated["memory_usage_kb"]

    # random_state=None keeps the split non-deterministic between runs
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.1,
        random_state=None,
    )

    regressor = LinearRegression()
    rmse = evaluate_model(regressor, X_train, y_train, cv=cv)
    print(f"RMSE: {rmse}")

    predictions = regressor.predict(X_test)
    accuracy = calculate_accuracy(y_test, predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    return regressor


def get_peak_from_profile(profile) -> float:
    """Return the largest ``kernel_memory_usage`` among ``profile["experiment"]`` entries."""
    return max(entry["kernel_memory_usage"] for entry in profile["experiment"])


def validate_model(model, attribute, session_id, dataset_path, dataset_shape):
    """Compare the model's prediction with a measured peak for one dataset.

    Args:
        model: fitted estimator exposing ``predict``.
        attribute: callable handed to ``dowser.profile``.
        session_id: session identifier; selects the
            ``../output/<session_id>/validation`` output directory.
        dataset_path: path of the dataset handed to ``dowser.profile``.
        dataset_shape: ``(inlines, crosslines, samples)`` of that dataset.

    Prints the predicted usage, the measured peak and their absolute (MB) and
    relative (%) difference. Returns nothing.
    """
    target_df = pd.DataFrame([dataset_shape], columns=['inlines', 'crosslines', 'samples'])
    target_df = extract_features(target_df)
    expected_memory_usage = model.predict(target_df)[0]

    print(f"The expected memory usage is {expected_memory_usage:.2f} KB for the target shape {dataset_shape}")

    output_dir = f"../output/{session_id}/validation"

    # The original dict had a stray `""` literal before "profiler" that was
    # silently merged into the key by implicit string concatenation; the key
    # is unchanged, the leftover literal is gone.
    dowser.load_config(
        {
            "output_dir": output_dir,
            "profiler": {
                "session_id": session_id,
                "precision": 1,
            },
        }
    )

    dowser.profile(attribute, dataset_path)

    validation_profile = dowser.profiler.load_profile(f"{output_dir}/{session_id}.prof")
    validation_peak_memory_usage = get_peak_from_profile(validation_profile)

    print(f"The actual peak memory usage of the validation dataset is {validation_peak_memory_usage} KB")

    validation_difference = abs(validation_peak_memory_usage - expected_memory_usage)
    # Values are reported as KB above, so dividing by 1024 yields MB
    validation_difference_in_mb = validation_difference / 1024
    # The relative difference is measured against the predicted value
    validation_difference_in_pct = (validation_difference / expected_memory_usage) * 100

    print(f"The difference between the expected and actual peak memory usage is {validation_difference_in_mb:.2f} MB")
    print(f"This is a {validation_difference_in_pct:.2f}% difference")


def get_validation_dataset(small_validation, session_id):
    """Return ``(dataset_path, dataset_shape)`` of the dataset used for validation.

    When ``small_validation`` is truthy, a synthetic ``(325, 475, 231)``
    dataset is generated under ``../output/<session_id>/validation/experiment``
    and its path is returned. Otherwise the F3 dataset's root file and shape
    are returned.
    """
    if small_validation:
        output_dir = f"../output/{session_id}/validation"
        small_shape = (325, 475, 231)
        small_dataset_path = generate_and_save_synthetic_data(*small_shape, output_dir=f"{output_dir}/experiment")

        return small_dataset_path, small_shape

    # Build the F3 wrapper once and read both attributes from the same
    # instance instead of constructing it twice.
    f3_dataset = F3()
    return f3_dataset._root_file, f3_dataset.shape


def train_and_validate_session(session_id, attribute, small_validation):
    """Train a model from a session's collected data and validate it.

    Loads the session data, trains the model, picks the validation dataset
    (``small_validation`` selects which one) and runs ``validate_model`` on it.

    Returns the trained model.
    """
    model = train_model(load_data(session_id))

    validation_path, validation_shape = get_validation_dataset(small_validation, session_id)
    validate_model(model, attribute, session_id, validation_path, validation_shape)

    return model


def generate_cube_shapes(max, min):
    """Return the eight 3-tuples that combine ``min`` and ``max`` on each axis.

    The order is fixed: all-``min`` first, then each single-``max`` shape,
    then the double-``max`` shapes, and all-``max`` last.
    """
    # Index 0 selects `min`, index 1 selects `max`
    extents = (min, max)
    corner_patterns = (
        (0, 0, 0),
        (0, 0, 1),
        (0, 1, 0),
        (1, 0, 0),
        (1, 1, 0),
        (1, 0, 1),
        (0, 1, 1),
        (1, 1, 1),
    )
    return [(extents[i], extents[j], extents[k]) for i, j, k in corner_patterns]


def create_datasets(output_dir, max, min):
    """Generate one synthetic dataset per cube shape and return the results.

    Datasets are generated with ``output_dir=<output_dir>/experiment``; the
    returned list holds whatever ``generate_and_save_synthetic_data`` returns
    for each shape, in the order given by ``generate_cube_shapes``.
    """
    dataset_dir = f"{output_dir}/experiment"
    generated = []
    for shape in generate_cube_shapes(max, min):
        generated.append(generate_and_save_synthetic_data(*shape, output_dir=dataset_dir))
    return generated


def dataset_shape_from_path(dataset_path: str) -> str:
    """Return the file name of ``dataset_path`` up to its first dot.

    Only ``/`` is treated as a path separator.
    """
    filename = dataset_path.rpartition("/")[2]
    return filename.partition(".")[0]


def group_output_to_folder(folder: str) -> str:
    """Return the dowser output directory nested under ``folder``.

    If the configured output directory already ends with ``folder`` it is
    returned unchanged (as the configured object); otherwise the string
    ``"<output_dir>/<folder>"`` is returned.
    """
    configured_dir = dowser.context.config.output_dir

    # Avoid nesting `<folder>/<folder>` when the config already points there
    if configured_dir.as_posix().endswith(folder):
        return configured_dir

    return f"{configured_dir}/{folder}"


def profile_dataset(attribute, dataset_path):
    """Profile ``attribute`` against one dataset and return the profile path.

    The session id is derived from the dataset file name and the output goes
    to the ``profiles`` folder of the current dowser output directory.

    Returns the string ``"<output_dir>/<session_id>.prof"``.
    """
    session_id = dataset_shape_from_path(dataset_path)
    output_dir = group_output_to_folder("profiles")

    profiler_settings = {"session_id": session_id}
    dowser.load_config({"output_dir": output_dir, "profiler": profiler_settings})
    dowser.profile(attribute, dataset_path)

    return f"{output_dir}/{session_id}.prof"


def read_profiles(profile_paths):
    """Load every profile in ``profile_paths`` and return them as a list, in order."""
    loaded_profiles = []
    for path in profile_paths:
        loaded_profiles.append(dowser.profiler.load_profile(path))
    return loaded_profiles


def get_peak_from_profile(profile):
    """Return the highest ``kernel_memory_usage`` recorded in ``profile["experiment"]``."""
    usages = [sample["kernel_memory_usage"] for sample in profile["experiment"]]
    return max(usages)


def save_collected_data(profiles, output_dir):
    """Summarise profile peaks into ``<output_dir>/collected_data.json``.

    Args:
        profiles: paths of the profiles to read (passed to ``read_profiles``).
        output_dir: directory that receives ``collected_data.json``.

    The JSON document has two keys: ``memory_unit`` (taken from the first
    profile's metadata, ``None`` when there are no profiles) and ``peaks``, a
    mapping from the dataset shape string to the peak memory usage.
    """
    loaded_profiles = read_profiles(profiles)
    data = {
        "memory_unit": None,
        "peaks": {},
    }

    for profile in loaded_profiles:
        metadata = profile["metadata"]

        # Only the first profile decides the unit stored in the summary
        if data["memory_unit"] is None:
            data["memory_unit"] = metadata["kernel_memory_usage_unit"]

        shape = dataset_shape_from_path(metadata["entrypoint_segy_filepath"])
        data["peaks"][shape] = get_peak_from_profile(profile)

    output_path = f"{output_dir}/collected_data.json"
    # The context manager flushes and closes the file before returning, so a
    # reader that opens it right afterwards sees the full document.
    with open(output_path, "w") as output_file:
        json.dump(data, output_file, indent=4, sort_keys=True)


def create_model(attribute_module, max=300, min=100, small_validation=False):
    """Run the full pipeline for one attribute module and return its model.

    Steps: configure dowser for a new timestamped session, generate the
    synthetic datasets for the ``min``/``max`` cube shapes, profile
    ``attribute_module.run`` on each one, save the collected peaks, then train
    and validate the model for the session.

    Args:
        attribute_module: module exposing a ``run`` callable; the last
            component of its ``__name__`` names the session.
        max: larger extent used for the synthetic shapes.
        min: smaller extent used for the synthetic shapes.
        small_validation: forwarded to ``train_and_validate_session``.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    attribute_name = attribute_module.__name__.rpartition('.')[2]
    session_id = f"{attribute_name}-{timestamp}"
    output_dir = f"../output/{session_id}"

    profiler_settings = {"session_id": session_id, "precision": 1}
    dowser.load_config({"output_dir": output_dir, "profiler": profiler_settings})

    profile_paths = []
    for dataset_path in create_datasets(output_dir, max=max, min=min):
        profile_paths.append(profile_dataset(attribute_module.run, dataset_path))
    save_collected_data(profile_paths, output_dir)

    return train_and_validate_session(session_id, attribute_module.run, small_validation=small_validation)



## Validating that it works

In [38]:
from seismic.attributes import envelope

# Smoke test: run the whole create_model pipeline for a single attribute
# (envelope) with the default min/max shapes and the default validation dataset.
envelope_model = create_model(envelope)


[32m2024-06-14 16:18:39.144[0m | [1mINFO    [0m | [36mseismic.data.synthetic[0m:[36mgenerate_and_save_synthetic_data[0m:[36m130[0m - [1mGenerating synthetic data for shape (50, 50, 50)[0m
[32m2024-06-14 16:18:39.406[0m | [1mINFO    [0m | [36mseismic.data.synthetic[0m:[36mgenerate_and_save_synthetic_data[0m:[36m130[0m - [1mGenerating synthetic data for shape (50, 50, 250)[0m
[32m2024-06-14 16:18:39.776[0m | [1mINFO    [0m | [36mseismic.data.synthetic[0m:[36mgenerate_and_save_synthetic_data[0m:[36m130[0m - [1mGenerating synthetic data for shape (50, 250, 50)[0m
[32m2024-06-14 16:18:40.821[0m | [1mINFO    [0m | [36mseismic.data.synthetic[0m:[36mgenerate_and_save_synthetic_data[0m:[36m130[0m - [1mGenerating synthetic data for shape (250, 50, 50)[0m
[32m2024-06-14 16:18:42.085[0m | [1mINFO    [0m | [36mseismic.data.synthetic[0m:[36mgenerate_and_save_synthetic_data[0m:[36m130[0m - [1mGenerating synthetic data for shape (250, 250, 50)

## Testing for all attributes

In [55]:
import importlib
from pathlib import Path

# Set dowser's logger level to ERROR for the batch run below.
dowser.load_config(
    {
        "logger": {
            "level": "ERROR"
        },
    }
)

attributes_path = Path('../../../tools/seismic/seismic/attributes')
# Discover attribute modules explicitly: Python source files whose name does
# not start with an underscore (which skips `__init__.py`). This replaces the
# previous "sort and drop the first entry" approach, which silently depended
# on the package initialiser sorting first among all files in the directory.
attribute_names = sorted(
    f.name.split('.')[0]
    for f in attributes_path.iterdir()
    if f.is_file() and f.suffix == '.py' and not f.name.startswith('_')
)
# Per-attribute keyword overrides forwarded to create_model
custom_kwargs = {
    "gst_3d_dip": {
        "small_validation": True
    },
}

attributes_hashmap = [
    {"attribute_name": attribute_name, "kwargs": custom_kwargs.get(attribute_name, {})} for attribute_name in
    attribute_names
]

for data in attributes_hashmap:
    module = importlib.import_module(f"seismic.attributes.{data['attribute_name']}")
    kwargs = data["kwargs"]

    print(f"Modeling for attribute {module.__name__}")
    create_model(module, **kwargs)
    print("---")

Modeling for attribute seismic.attributes.apparent_polarity
RMSE: 9515.437453395853
Accuracy: 100.00%
The expected memory usage is 6773667.93 KB for the target shape (651, 951, 462)
The actual peak memory usage of the validation dataset is 8403088.0 KB
The difference between the expected and actual peak memory usage is 1591.23 MB
This is a 24.06% difference
---
Modeling for attribute seismic.attributes.chaos
RMSE: 18490.283497960107
Accuracy: 100.00%
The expected memory usage is 45077765.73 KB for the target shape (651, 951, 462)
[32m2024-06-14 16:36:02.190[0m | [31m[1mERROR   [0m | [36mdowser.common.synchronization[0m:[36mdo_many[0m:[36m67[0m - [31m[1mError executing function: list index out of range[0m
[32m2024-06-14 16:36:02.428[0m | [31m[1mERROR   [0m | [36mdowser.common.synchronization[0m:[36mdo_many[0m:[36m67[0m - [31m[1mError executing function: list index out of range[0m
[32m2024-06-14 16:36:02.533[0m | [31m[1mERROR   [0m | [36mdowser.common.sy

Process Process-324:1:
Traceback (most recent call last):
  File "/home/delucca/.pyenv/versions/3.8.10/lib/python3.8/multiprocessing/managers.py", line 1104, in is_set
    return self._callmethod('is_set')
  File "/home/delucca/.pyenv/versions/3.8.10/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/delucca/.pyenv/versions/3.8.10/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/dowser/dowser/profiler/tracer.py", line 172, in sampler
    while not self.__sampler_finished.is_set():
  File "/home/delucca/.pyenv/versions/3.8.10/lib/python3.8/multiprocessing/managers.py", line 834, in _callmethod
    conn.send((self._id, methodname, args, kwds))
  File "/home/delucca/.pyenv/versions/3.8.10/lib/python3.8/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/home/delucc

The actual peak memory usage of the validation dataset is 15220100.0 KB
The difference between the expected and actual peak memory usage is 29157.88 MB
This is a 66.24% difference
---
Modeling for attribute seismic.attributes.convolution
RMSE: 10986.734335842153
Accuracy: 100.00%
The expected memory usage is 3133879.05 KB for the target shape (651, 951, 462)
The actual peak memory usage of the validation dataset is 3080800.0 KB
The difference between the expected and actual peak memory usage is 51.84 MB
This is a 1.69% difference
---
Modeling for attribute seismic.attributes.cosine_instantaneous_phase
RMSE: 42252.899042516394
Accuracy: 33.33%
The expected memory usage is 9832613.67 KB for the target shape (651, 951, 462)
The actual peak memory usage of the validation dataset is 8297668.0 KB
The difference between the expected and actual peak memory usage is 1498.97 MB
This is a 15.61% difference
---
Modeling for attribute seismic.attributes.dominant_frequency
RMSE: 10814.996284306693
A

  result = env_prime / env
  result = env_prime / env


The actual peak memory usage of the validation dataset is 9492164.0 KB
The difference between the expected and actual peak memory usage is 358.84 MB
This is a 4.03% difference
---
Modeling for attribute seismic.attributes.eig_complex
