# Comparing Attributes

In this notebook we are going to compare some attributes to check if our model is generic

## Bootstraping

In [9]:
import warnings
import sys
import os
import json
import dowser
import pandas as pd
import numpy as np
from dasf_seismic.datasets import F3
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

warnings.filterwarnings("ignore", category=FutureWarning)

notebook_path = os.getcwd()
model_path = os.path.dirname(notebook_path)
memory_usage_path = os.path.join(model_path, 'memory_usage')
if memory_usage_path not in sys.path:
    sys.path.append(memory_usage_path)

def load_session(session_id):
    return json.load(open(f"../output/{session_id}/collected_data.json"))

def extract_features(df):
    # Interaction
    #df["inline_crossline"] = df["inlines"] * df["crosslines"]
    #df["inline_sample"] = df["inlines"] * df["samples"]
    #df["crossline_sample"] = df["crosslines"] * df["samples"]
    df["volume"] = df["inlines"] * df["crosslines"] * df["samples"]
    
    # Logarithmic and Exponential Transformations
    #df['log_inlines'] = np.log1p(df['inlines'])
    #df['log_crosslines'] = np.log1p(df['crosslines'])
    #df['log_samples'] = np.log1p(df['samples'])

    # Ratios
    #df['inline_to_crossline'] = df['inlines'] / (df['crosslines'] + 1)
    #df['inline_to_sample'] = df['inlines'] / (df['samples'] + 1)
    #df['crossline_to_sample'] = df['crosslines'] / (df['samples'] + 1)
    
    # Statistical Aggregates
    #df['mean_inline_crossline'] = df[['inlines', 'crosslines']].mean(axis=1)
    #df['std_inline_crossline'] = df[['inlines', 'crosslines']].std(axis=1)

    return df

def load_data(session_id):
    collected_data = load_session(session_id)
    df = pd.DataFrame(collected_data['peaks'].items(), columns=['keys', 'memory_usage_kb'])
    df[['inlines', 'crosslines', 'samples']] = df['keys'].str.split('-', expand=True)
    df = df.drop(columns=['keys'])
    df = df.apply(pd.to_numeric)
    df = extract_features(df)

    return df

def calculate_accuracy(y_true, y_pred, tolerance=0.1):
    return np.mean(np.abs((y_true - y_pred) / y_true) <= tolerance)

def evaluate_model(model, X_train, y_train, cv):
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-scores)
    return rmse_scores.mean()

def train_model(df, cv=5, augmentation=3):
    df_augmented = pd.concat([df] * augmentation, ignore_index=True)
    X = df_augmented.drop(columns=["memory_usage_kb"])
    y = df_augmented["memory_usage_kb"]

    random_state = None # 42
    test_size = 0.1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    lin_reg = LinearRegression()
    lin_reg_rmse = evaluate_model(lin_reg, X_train, y_train, cv=cv)

    print(f"RMSE: {lin_reg_rmse}")
    
    y_pred = lin_reg.predict(X_test)

    lin_reg_acc = calculate_accuracy(y_test, y_pred)
    print(f"Accuracy: {lin_reg_acc * 100:.2f}%")
    
    lin_reg_r2 = r2_score(y_test, y_pred)
    print(f"R2 Score: {lin_reg_r2}")
    
    return lin_reg

def get_peak_from_profile(profile) -> float:
    return max(profile["data"], key=lambda x: x["kernel_memory_usage"])[
        "kernel_memory_usage"
    ]

def validate_model(model, attribute, session_id, dataset_path=F3()._root_file, dataset_shape=F3().shape):
    target_df = pd.DataFrame([dataset_shape], columns=['inlines', 'crosslines', 'samples'])
    target_df = extract_features(target_df)
    expected_memory_usage = model.predict(target_df)[0]

    print(f"The expected memory usage is {expected_memory_usage:.2f} KB for the target shape {dataset_shape}")

    output_dir = f"../output/{session_id}/validation"

    dowser.load_config(
        {
            "output_dir": output_dir,
            ""
            "profiler": {
                "session_id": session_id,
                "precision": 1,
            },
        }
    )

    dowser.profile(attribute, dataset_path)


    validation_profile = dowser.profiler.load_profile(f"{output_dir}/{session_id}.prof")
    validation_peak_memory_usage = get_peak_from_profile(validation_profile)

    print(f"The actual peak memory usage of the validation dataset is {validation_peak_memory_usage} KB")

    validation_difference = abs(validation_peak_memory_usage - expected_memory_usage)
    validation_difference_in_mb = validation_difference / 1024
    validation_difference_in_pct = (validation_difference / expected_memory_usage) * 100

    print(f"The difference between the expected and actual peak memory usage is {validation_difference_in_mb:.2f} MB")
    print(f"This is a {validation_difference_in_pct:.2f}% difference")

def train_and_validate_session(session_id, attribute):
    df_session = load_data(session_id)
    model_session = train_model(df_session)
    validate_model(model_session, attribute, session_id)
    
    return model_session

## Envelope

In [12]:
from seismic.attributes import envelope

session_id_envelope = 'envelope-20240609-191813'
model_envelope = train_and_validate_session(session_id_envelope, envelope.run)

RMSE: 28763.757245178225
Accuracy: 100.00%
R2 Score: 0.02230812777393465
The expected memory usage is 8276342.76 KB for the target shape (651, 951, 462)
[32m2024-06-14 16:07:47.610[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-14 16:07:47.610[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m
[32m2024-06-14 16:07:47.628[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py"[0m
[32m2024-06-14 16:07:47.641[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attribute

In [16]:
from seismic.attributes import envelope

session_id_envelope = 'envelope-20240608-120422'
model_envelope = train_and_validate_session(session_id_envelope, envelope.run)

RMSE: 42569.89757375153
Accuracy: 100.00%
R2 Score: 0.9999900102903184
The expected memory usage is 8374093.09 KB for the target shape (651, 951, 462)
[32m2024-06-14 15:19:27.332[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-14 15:19:27.334[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m


[32m2024-06-14 15:19:27.359[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py"[0m
[32m2024-06-14 15:19:27.378[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py" with entrypoint set to: "run"[0m
[32m2024-06-14 15:19:27.383[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m38[0m - [1mExecuting file: /home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py[0m
[32m2024-06-14 15:19:27.385[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m42[0m - [1mCompiling code[0m
[32m2024-06-14 15:19:27.386[0m | [

## First Derivative

In [17]:
from seismic.attributes import first_derivative

session_id_first_derivative = 'first_derivative-20240609-192724'
model_first_derivative = train_and_validate_session(session_id_first_derivative, first_derivative.run)

RMSE: 24844.81156185551
Accuracy: 100.00%
R2 Score: 0.931730217727686
The expected memory usage is 5397474.98 KB for the target shape (651, 951, 462)
[32m2024-06-14 15:19:38.303[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-14 15:19:38.304[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m
[32m2024-06-14 15:19:38.322[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/first_derivative.py"[0m
[32m2024-06-14 15:19:38.335[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attr

## GST3D

In [19]:
from seismic.attributes import gst_3d_dip
from seismic.data.synthetic import generate_and_save_synthetic_data

session_id_gst3d = 'gst_3d_dip-20240609-193855'
df_gst3d = load_data(session_id_gst3d)
model_gst3d = train_model(df_gst3d)

target_shape = (450,300,200)
target_df = pd.DataFrame([target_shape], columns=['inlines', 'crosslines', 'samples'])
target_df = extract_features(target_df)
expected_memory_usage = model_gst3d.predict(target_df)[0]

output_dir = f"../output/{session_id_gst3d}/validation"
validation_dataset_path = generate_and_save_synthetic_data(*target_shape,output_dir=f"{output_dir}/data")

validate_model(model_gst3d, gst_3d_dip.run, session_id_gst3d, validation_dataset_path, target_shape)

RMSE: 19459.659728982057
Accuracy: 100.00%
R2 Score: 0.9989739819851616
The expected memory usage is 6746558.41 KB for the target shape (450, 300, 200)
[32m2024-06-14 15:20:09.462[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-14 15:20:09.463[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m
[32m2024-06-14 15:20:09.483[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/gst_3d_dip.py"[0m
[32m2024-06-14 15:20:09.497[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attribut