# Minimum Training Size

This notebook explores the minimum training-set size needed to train a model that still predicts memory usage with good accuracy.

## Data Loading and Preprocessing

In [1]:
import os
import sys

# Make the sibling `memory_usage` directory (next to the notebook's working
# directory) importable, registering it on sys.path at most once.
notebook_path = os.getcwd()
model_path = os.path.split(notebook_path)[0]
memory_usage_path = os.path.join(model_path, 'memory_usage')
already_importable = memory_usage_path in sys.path
if not already_importable:
    sys.path.append(memory_usage_path)

In [2]:
import dowser
import json
import shutil
from data_collection import collect_data

# Experiment configuration. The size/count values below are forwarded
# unchanged to `collect_data`; `precision` goes to the dowser profiler config.
attribute = "envelope"
session_id = f"{attribute}-20240608-120422"
log_level = "DEBUG"
max_inlines = 400
max_crosslines = 400
max_samples = 200
amount_of_datasets = 30
precision = 1

output_dir = f"../output/{session_id}"
# Single source of truth for the cached results file, so the existence check
# and the load below can never drift apart.
collected_data_path = f"{output_dir}/collected_data.json"

# Only run the collection when this session has no cached results yet.
if not os.path.exists(collected_data_path):
    print(f"Session {session_id} not found, collecting experiment")

    # Discard any partial output left behind by an earlier, incomplete run.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    dowser.load_config(
        {
            "output_dir": output_dir,
            "logger": {
                "level": log_level,
                "enabled_transports": ["CONSOLE", "FILE"],
            },
            "profiler": {
                "precision": precision,
            }
        }
    )

    collect_data(
        attribute=attribute,
        max_inlines=max_inlines,
        max_crosslines=max_crosslines,
        max_samples=max_samples,
        amount_of_datasets=amount_of_datasets,
    )

# Use a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open(collected_data_path) as collected_data_file:
    collected_data = json.load(collected_data_file)
collected_data

{'memory_unit': 'kb',
 'peaks': {'100-100-100': 617312.0,
  '100-100-240': 647684.0,
  '100-100-380': 647160.0,
  '100-100-520': 724484.0,
  '100-100-660': 696936.0,
  '100-100-800': 750444.0,
  '100-1240-100': 940884.0,
  '100-1240-240': 1363520.0,
  '100-1240-380': 1798372.0,
  '100-1240-520': 2380752.0,
  '100-1240-660': 2848336.0,
  '100-1240-800': 3252508.0,
  '100-1620-100': 1028988.0,
  '100-1620-240': 1670216.0,
  '100-1620-380': 2295100.0,
  '100-1620-520': 2897420.0,
  '100-1620-660': 3395288.0,
  '100-1620-800': 4138952.0,
  '100-2000-100': 1125196.0,
  '100-2000-240': 1831528.0,
  '100-2000-380': 2580784.0,
  '100-2000-520': 3385392.0,
  '100-2000-660': 4081444.0,
  '100-2000-800': 4912688.0,
  '100-480-100': 641060.0,
  '100-480-240': 907968.0,
  '100-480-380': 1007920.0,
  '100-480-520': 1298376.0,
  '100-480-660': 1447348.0,
  '100-480-800': 1653844.0,
  '100-860-100': 842540.0,
  '100-860-240': 1149060.0,
  '100-860-380': 1469736.0,
  '100-860-520': 1778064.0,
  '100-86

In [3]:
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=FutureWarning)

# One row per "<inlines>-<crosslines>-<samples>" key of the collected peaks.
peaks_frame = pd.DataFrame(collected_data['peaks'].items(), columns=['keys', 'memory_usage_kb'])

# Break the composite key into its three dimension columns.
shape_parts = peaks_frame['keys'].str.split('-', expand=True)
shape_parts.columns = ['inlines', 'crosslines', 'samples']
for dimension in shape_parts.columns:
    peaks_frame[dimension] = shape_parts[dimension]

# The split produces strings; convert every remaining column to a numeric dtype.
df = peaks_frame.drop(columns=['keys']).apply(pd.to_numeric)


def extract_features(df):
    """Derive the regression features from the raw volume dimensions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``inlines``, ``crosslines`` and ``samples``
        columns. Any other columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the derived
        ones, in a fixed order. The input frame is left untouched, so the
        function can be re-run on the same input without side effects.
    """
    # Work on a copy: adding columns to the caller's frame makes notebook
    # cells non-idempotent and can trigger SettingWithCopyWarning on slices.
    features = df.copy()

    inlines = features["inlines"]
    crosslines = features["crosslines"]
    samples = features["samples"]

    # Interaction terms (pairwise and three-way products of the dimensions).
    features["inline_crossline"] = inlines * crosslines
    features["inline_sample"] = inlines * samples
    features["crossline_sample"] = crosslines * samples
    features["inline_crossline_sample"] = inlines * crosslines * samples

    # Logarithmic transformations (log(1 + x)).
    features["log_inlines"] = np.log1p(inlines)
    features["log_crosslines"] = np.log1p(crosslines)
    features["log_samples"] = np.log1p(samples)

    # Ratios; the +1 in the denominator guards against division by zero.
    features["inline_to_crossline"] = inlines / (crosslines + 1)
    features["inline_to_sample"] = inlines / (samples + 1)
    features["crossline_to_sample"] = crosslines / (samples + 1)

    # Row-wise aggregates over the two horizontal dimensions.
    features["mean_inline_crossline"] = features[["inlines", "crosslines"]].mean(axis=1)
    features["std_inline_crossline"] = features[["inlines", "crosslines"]].std(axis=1)

    return features


# Add the derived feature columns to the collected measurements, then display the frame.
df = extract_features(df)
df

Unnamed: 0,memory_usage_kb,inlines,crosslines,samples,inline_crossline,inline_sample,crossline_sample,inline_crossline_sample,log_inlines,log_crosslines,log_samples,inline_to_crossline,inline_to_sample,crossline_to_sample,mean_inline_crossline,std_inline_crossline
0,617312.0,100,100,100,10000,10000,10000,1000000,4.615121,4.615121,4.615121,0.990099,0.990099,0.990099,100.0,0.0
1,647684.0,100,100,240,10000,24000,24000,2400000,4.615121,4.615121,5.484797,0.990099,0.414938,0.414938,100.0,0.0
2,647160.0,100,100,380,10000,38000,38000,3800000,4.615121,4.615121,5.942799,0.990099,0.262467,0.262467,100.0,0.0
3,724484.0,100,100,520,10000,52000,52000,5200000,4.615121,4.615121,6.255750,0.990099,0.191939,0.191939,100.0,0.0
4,696936.0,100,100,660,10000,66000,66000,6600000,4.615121,4.615121,6.493754,0.990099,0.151286,0.151286,100.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,5440140.0,860,860,240,739600,206400,206400,177504000,6.758095,6.758095,5.484797,0.998839,3.568465,3.568465,860.0,0.0
196,8247768.0,860,860,380,739600,326800,326800,281048000,6.758095,6.758095,5.942799,0.998839,2.257218,2.257218,860.0,0.0
197,11069828.0,860,860,520,739600,447200,447200,384592000,6.758095,6.758095,6.255750,0.998839,1.650672,1.650672,860.0,0.0
198,13895896.0,860,860,660,739600,567600,567600,488136000,6.758095,6.758095,6.493754,0.998839,1.301059,1.301059,860.0,0.0


## Splitting the data

In [29]:
import pandas as pd

def _rows_within_limit(frame, limit):
    """Keep the rows whose inlines, crosslines and samples are all <= ``limit``."""
    inlines_ok = frame["inlines"] <= limit
    crosslines_ok = frame["crosslines"] <= limit
    samples_ok = frame["samples"] <= limit
    return frame[inlines_ok & crosslines_ok & samples_ok]


# Progressively larger training subsets, capped per dimension.
df_600 = _rows_within_limit(df, 600)
df_800 = _rows_within_limit(df, 800)
df_1000 = _rows_within_limit(df, 1000)


def get_min_max(df, column):
    """Return the rows holding the smallest and the largest value of ``column``.

    The result is a ``(rows_at_min, rows_at_max)`` tuple of DataFrames; each
    one contains every row tied for that extreme, in original row order.
    """
    values = df[column]
    rows_at_min = df[values == values.min()]
    rows_at_max = df[values == values.max()]
    return rows_at_min, rows_at_max


# Rows sitting at the extremes of each dimension.
df_min_inline, df_max_inline = get_min_max(df, "inlines")
df_min_crossline, df_max_crossline = get_min_max(df, "crosslines")
df_min_sample, df_max_sample = get_min_max(df, "samples")

extreme_frames = [
    df_min_inline,
    df_max_inline,
    df_min_crossline,
    df_max_crossline,
    df_min_sample,
    df_max_sample,
]
# A row can be extreme in several dimensions at once; keep it only once.
df_min_max = pd.concat(extreme_frames)
df_min_max = df_min_max.drop_duplicates()


def get_two_largest(df, column):
    """Return the two rows of ``df`` with the largest values in ``column``."""
    top_two = df.nlargest(n=2, columns=column)
    return top_two


# Two top rows per dimension.
df_two_largest_inlines = get_two_largest(df, 'inlines')
df_two_largest_crosslines = get_two_largest(df, 'crosslines')
df_two_largest_samples = get_two_largest(df, 'samples')

top_two_frames = [
    df_two_largest_inlines,
    df_two_largest_crosslines,
    df_two_largest_samples,
]
# Merge the per-dimension picks, keeping each distinct row once.
df_two_largest = pd.concat(top_two_frames).drop_duplicates()

df_two_largest

Unnamed: 0,memory_usage_kb,inlines,crosslines,samples,inline_crossline,inline_sample,crossline_sample,inline_crossline_sample,log_inlines,log_crosslines,log_samples,inline_to_crossline,inline_to_sample,crossline_to_sample,mean_inline_crossline,std_inline_crossline
108,1053044.0,2000,100,100,200000,200000,10000,20000000,7.601402,4.615121,4.615121,19.80198,19.80198,0.990099,1050.0,1343.502884
109,1835268.0,2000,100,240,200000,480000,24000,48000000,7.601402,4.615121,5.484797,19.80198,8.298755,0.414938,1050.0,1343.502884
18,1125196.0,100,2000,100,200000,10000,200000,20000000,4.615121,7.601402,4.615121,0.049975,0.990099,19.80198,1050.0,1343.502884
19,1831528.0,100,2000,240,200000,24000,480000,48000000,4.615121,7.601402,5.484797,0.049975,0.414938,8.298755,1050.0,1343.502884
5,750444.0,100,100,800,10000,80000,80000,8000000,4.615121,4.615121,6.685861,0.990099,0.124844,0.124844,100.0,0.0
11,3252508.0,100,1240,800,124000,80000,992000,99200000,4.615121,7.123673,6.685861,0.08058,0.124844,1.548065,670.0,806.101731


## Training the Model

In [45]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression


def calculate_accuracy(y_true, y_pred, tolerance=0.1):
    """Fraction of predictions whose relative error is within ``tolerance``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Entries equal to zero yield an infinite or NaN
        relative error and therefore never count as accurate.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    tolerance : float, default 0.1
        Maximum accepted ``|y_true - y_pred| / |y_true|``.

    Returns
    -------
    float
        Share of samples (0.0 to 1.0) within the tolerance.
    """
    # Coerce to plain arrays so the comparison is strictly positional:
    # pandas Series would otherwise be aligned on their index (yielding NaN
    # for mismatched labels), and Python lists do not support subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    relative_error = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_error <= tolerance)


def evaluate_model(model, X_train, y_train, cv):
    """Fit ``model`` on the training data and return its mean cross-validated RMSE.

    ``cv`` is passed straight to ``cross_val_score``. The estimator is fitted
    on the full training set first, so the caller can use it for prediction
    afterwards.
    """
    model.fit(X_train, y_train)

    # The scorer reports negated MSE per fold; flip the sign before the root.
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()


def train_model(X, y, cv=5, test_size=0.1, random_state=None):
    """Train a linear regression and report its RMSE and hold-out accuracy.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values (peak memory usage).
    cv : int, default 5
        Cross-validation setting forwarded to ``evaluate_model``.
    test_size : float, default 0.1
        Share of the data held out for the accuracy check.
    random_state : int or None, default None
        Seed for the train/test split. ``None`` keeps the previous behaviour
        (a different split on every call); pass a fixed integer such as 42
        to make the reported numbers reproducible.

    Returns
    -------
    LinearRegression
        The estimator fitted on the training split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    lin_reg = LinearRegression()
    lin_reg_rmse = evaluate_model(lin_reg, X_train, y_train, cv=cv)

    print(f"Linear Regression RMSE: {lin_reg_rmse}")

    # Accuracy is measured on the held-out split only.
    lin_reg_acc = calculate_accuracy(y_test, lin_reg.predict(X_test))
    print(f"Linear Regression Accuracy: {lin_reg_acc * 100:.2f}%")

    return lin_reg

In [34]:
def _split_features_target(frame):
    """Return ``(X, y)``: every column except ``memory_usage_kb``, and that column."""
    return frame.drop(columns=["memory_usage_kb"]), frame["memory_usage_kb"]


# Train one model per candidate training subset, from largest to smallest.
print("Testing with all experiment:")
X, y = _split_features_target(df)
model = train_model(X, y)

print("\nTesting with experiment up to 600:")
X_600, y_600 = _split_features_target(df_600)
model_600 = train_model(X_600, y_600)

print("\nTesting with experiment up to 800:")
X_800, y_800 = _split_features_target(df_800)
model_800 = train_model(X_800, y_800)

print("\nTesting with experiment up to 1000:")
X_1000, y_1000 = _split_features_target(df_1000)
model_1000 = train_model(X_1000, y_1000)

print("\nTesting with min and max values:")
X_min_max, y_min_max = _split_features_target(df_min_max)
model_min_max = train_model(X_min_max, y_min_max)

print("\nTesting with two largest values:")
X_two_largest, y_two_largest = _split_features_target(df_two_largest)
model_two_largest = train_model(X_two_largest, y_two_largest)


Testing with all data:
Linear Regression RMSE: 44098.11366967696
Linear Regression Accuracy: 95.00%

Testing with data up to 600:
Linear Regression RMSE: 838986.4676395891
Linear Regression Accuracy: 100.00%

Testing with data up to 800:
Linear Regression RMSE: 84720.98957777364
Linear Regression Accuracy: 100.00%

Testing with data up to 1000:
Linear Regression RMSE: 47296.34073037178
Linear Regression Accuracy: 100.00%

Testing with min and max values:
Linear Regression RMSE: 44964.18349980065
Linear Regression Accuracy: 100.00%

Testing with two largest values:
Linear Regression RMSE: 137609.83449641848
Linear Regression Accuracy: 100.00%


## Testing with real world data

In [37]:
import dowser
from dasf_seismic.datasets import F3
from seismic.attributes import envelope

# Model under validation: the one trained on the <=600 subset.
final_model = model_600
dataset = F3()
dataset_path = dataset._root_file

# Build a single-row feature frame from the real dataset's shape.
shape_columns = ['inlines', 'crosslines', 'samples']
target_df = extract_features(pd.DataFrame([dataset.shape], columns=shape_columns))
expected_memory_usage = final_model.predict(target_df)[0]

print(f"The expected memory usage is {expected_memory_usage:.2f} KB for the target shape {dataset.shape}")

# Profile the real run into a dedicated sub-directory of this session.
output_dir = f"../output/{session_id}/validation"
validation_config = {
    "output_dir": output_dir,
    "profiler": {
        "session_id": session_id,
        "precision": 1,
    },
}
dowser.load_config(validation_config)

dowser.profile(envelope.run, dataset_path)


def get_peak_from_profile(profile) -> float:
    """Return the highest ``kernel_memory_usage`` among ``profile["experiment"]`` entries."""
    return max(entry["kernel_memory_usage"] for entry in profile["experiment"])


# Load the profile written by the run above and extract its memory peak.
validation_profile_path = f"{output_dir}/{session_id}.prof"
validation_profile = dowser.profiler.load_profile(validation_profile_path)
validation_peak_memory_usage = get_peak_from_profile(validation_profile)

print(f"The actual peak memory usage of the validation dataset is {validation_peak_memory_usage} KB")

# Absolute gap between measurement and prediction, in KB.
validation_difference = abs(validation_peak_memory_usage - expected_memory_usage)
validation_difference_in_mb = validation_difference / 1024
# NOTE(review): the percentage is relative to the *predicted* value, whereas
# calculate_accuracy divides by the true value - confirm this is intended.
validation_difference_in_pct = (validation_difference / expected_memory_usage) * 100

print(
    f"The difference between the expected and actual peak memory usage is {validation_difference_in_mb:.2f} MB",
    f"This is a {validation_difference_in_pct:.2f}% difference",
    sep="\n",
)

The expected memory usage is 8129753.75 KB for the target shape (651, 951, 462)
[32m2024-06-09 17:56:31.601[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-09 17:56:31.602[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m


[32m2024-06-09 17:56:31.623[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py"[0m
[32m2024-06-09 17:56:31.638[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py" with entrypoint set to: "run"[0m
[32m2024-06-09 17:56:31.642[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m38[0m - [1mExecuting file: /home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py[0m
[32m2024-06-09 17:56:31.643[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m42[0m - [1mCompiling code[0m
[32m2024-06-09 17:56:31.645[0m | [

In [38]:
# Display the subset with every dimension <= 600 (the frame `model_600` was built from).
df_600

Unnamed: 0,memory_usage_kb,inlines,crosslines,samples,inline_crossline,inline_sample,crossline_sample,inline_crossline_sample,log_inlines,log_crosslines,log_samples,inline_to_crossline,inline_to_sample,crossline_to_sample,mean_inline_crossline,std_inline_crossline
0,617312.0,100,100,100,10000,10000,10000,1000000,4.615121,4.615121,4.615121,0.990099,0.990099,0.990099,100.0,0.0
1,647684.0,100,100,240,10000,24000,24000,2400000,4.615121,4.615121,5.484797,0.990099,0.414938,0.414938,100.0,0.0
2,647160.0,100,100,380,10000,38000,38000,3800000,4.615121,4.615121,5.942799,0.990099,0.262467,0.262467,100.0,0.0
3,724484.0,100,100,520,10000,52000,52000,5200000,4.615121,4.615121,6.25575,0.990099,0.191939,0.191939,100.0,0.0
24,641060.0,100,480,100,48000,10000,48000,4800000,4.615121,6.175867,4.615121,0.2079,0.990099,4.752475,290.0,268.700577
25,907968.0,100,480,240,48000,24000,115200,11520000,4.615121,6.175867,5.484797,0.2079,0.414938,1.991701,290.0,268.700577
26,1007920.0,100,480,380,48000,38000,182400,18240000,4.615121,6.175867,5.942799,0.2079,0.262467,1.259843,290.0,268.700577
27,1298376.0,100,480,520,48000,52000,249600,24960000,4.615121,6.175867,6.25575,0.2079,0.191939,0.921305,290.0,268.700577
128,731812.0,480,100,100,48000,48000,10000,4800000,6.175867,4.615121,4.615121,4.752475,4.752475,0.990099,290.0,268.700577
129,879188.0,480,100,240,48000,115200,24000,11520000,6.175867,4.615121,5.484797,4.752475,1.991701,0.414938,290.0,268.700577


## Hand-picking

In [89]:
n = 3
handpicked_rows = [0, 3, 24, 128, 155]
# Replicate the hand-picked rows `n` times so there are enough samples for
# the split and the CV folds. NOTE(review): copies of the same row can land
# on both sides of the train/test split, so the reported accuracy and RMSE
# are optimistic.
df_handpicked = pd.concat([df.iloc[handpicked_rows]] * n, ignore_index=True)

X = df_handpicked.drop(columns=["memory_usage_kb"])
y = df_handpicked["memory_usage_kb"]
model_handpicked = train_model(X, y, cv=4)

dataset = F3()
dataset_path = dataset._root_file

# Build a single-row feature frame from the real dataset's shape.
shape_columns = ['inlines', 'crosslines', 'samples']
target_df = extract_features(pd.DataFrame([dataset.shape], columns=shape_columns))
expected_memory_usage = model_handpicked.predict(target_df)[0]

print(f"The expected memory usage is {expected_memory_usage:.2f} KB for the target shape {dataset.shape}")

# Profile the real run into a dedicated sub-directory of this session.
output_dir = f"../output/{session_id}/validation"
validation_config = {
    "output_dir": output_dir,
    "profiler": {
        "session_id": session_id,
        "precision": 1,
    },
}
dowser.load_config(validation_config)

dowser.profile(envelope.run, dataset_path)


def get_peak_from_profile(profile) -> float:
    """Return the highest ``kernel_memory_usage`` among ``profile["experiment"]`` entries."""
    return max(entry["kernel_memory_usage"] for entry in profile["experiment"])


# Load the profile written by the run above and extract its memory peak.
validation_profile_path = f"{output_dir}/{session_id}.prof"
validation_profile = dowser.profiler.load_profile(validation_profile_path)
validation_peak_memory_usage = get_peak_from_profile(validation_profile)

print(f"The actual peak memory usage of the validation dataset is {validation_peak_memory_usage} KB")

# Absolute gap between measurement and prediction, in KB.
validation_difference = abs(validation_peak_memory_usage - expected_memory_usage)
validation_difference_in_mb = validation_difference / 1024
# NOTE(review): the percentage is relative to the *predicted* value, whereas
# calculate_accuracy divides by the true value - confirm this is intended.
validation_difference_in_pct = (validation_difference / expected_memory_usage) * 100

print(
    f"The difference between the expected and actual peak memory usage is {validation_difference_in_mb:.2f} MB",
    f"This is a {validation_difference_in_pct:.2f}% difference",
    sep="\n",
)

Linear Regression RMSE: 3764.230165425532
Linear Regression Accuracy: 100.00%
The expected memory usage is 8105994.68 KB for the target shape (651, 951, 462)
[32m2024-06-09 18:09:55.678[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m15[0m - [1mStarting profiler[0m
[32m2024-06-09 18:09:55.680[0m | [1mINFO    [0m | [36mdowser.profiler.metrics.memory_usage.builders[0m:[36mbuild_trace_hooks[0m:[36m12[0m - [1mEnabled memory usage backends: "[<MemoryUsageBackend.KERNEL: 'KERNEL'>]"[0m


[32m2024-06-09 18:09:55.699[0m | [1mINFO    [0m | [36mdowser.profiler.main[0m:[36mrun_profiler[0m:[36m46[0m - [1mStarting profiler execution for "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py"[0m
[32m2024-06-09 18:09:55.712[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m19[0m - [1mStarting new profiler session for file "/home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py" with entrypoint set to: "run"[0m
[32m2024-06-09 18:09:55.715[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m38[0m - [1mExecuting file: /home/delucca/src/msc/seismic-attributes-memory-profile/tools/seismic/seismic/attributes/envelope.py[0m
[32m2024-06-09 18:09:55.717[0m | [1mINFO    [0m | [36mdowser.profiler.handlers[0m:[36mexecute_file[0m:[36m42[0m - [1mCompiling code[0m
[32m2024-06-09 18:09:55.718[0m | [