In [73]:
import time
import numpy as np
import pandas as pd
import torch
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from scipy.interpolate import griddata
from sklearn.metrics import mean_squared_error
import tracemalloc  # For memory usage
import subprocess
import numpy as np
import random
from torch.utils.data import DataLoader
from ivyspt.input_processing import split_surfaces, IVSurfaceDataset

# Reproducibility: a single seed is pushed into every RNG this notebook uses
# (PyTorch, NumPy's legacy global generator, and the stdlib `random` module).
RANDOM_STATE = 0
# NOTE(review): N_JOBS is not referenced by any cell visible in this notebook;
# presumably a parallelism knob for code elsewhere -- confirm before removing.
N_JOBS = 8
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(RANDOM_STATE)

In [74]:
# Settings that are forwarded to IVSurfaceDataset / DataLoader when the test
# datasets are built.
_input_preprocessing = {
    'Mask Proportions': [0.1, 0.3, 0.5, 0.7],
    'Number of Query Points': 1,
    'Batch Size': 1,
}
hyperparameters = {'Input Preprocessing': _input_preprocessing}

In [75]:
# Both CSVs are read the same way: the first two columns form the index and
# dates are parsed as ISO 8601.
_CSV_READ_OPTIONS = dict(parse_dates=True, index_col=[0, 1], date_format="ISO8601")

pre_train_data = pd.read_csv('data/pre_train_data.csv', **_CSV_READ_OPTIONS)
fine_tune_data = pd.read_csv('data/fine_tune_data.csv', **_CSV_READ_OPTIONS)

# split_surfaces yields three collections per dataset (named train / validation /
# test here). For a quick smoke run it also appears to accept
# toy_sample=True, max_points=50, max_surfaces=100 -- TODO confirm against
# ivyspt.input_processing before relying on those arguments.
(
    pre_train_surfaces_train,
    pre_train_surfaces_validation,
    pre_train_surfaces_test,
) = split_surfaces(pre_train_data, random_state=RANDOM_STATE)

(
    fine_tune_surfaces_train,
    fine_tune_surfaces_validation,
    fine_tune_surfaces_test,
) = split_surfaces(fine_tune_data, random_state=RANDOM_STATE)

In [76]:
def _build_test_dataset_and_loader(surfaces):
    """Wrap ``surfaces`` in an IVSurfaceDataset and a DataLoader over it.

    Mask proportions, number of query points and batch size are taken from
    ``hyperparameters['Input Preprocessing']``; the dataset is seeded with
    ``RANDOM_STATE``. Returns the ``(dataset, data_loader)`` pair.
    """
    preprocessing = hyperparameters['Input Preprocessing']
    dataset = IVSurfaceDataset(
        surfaces,
        preprocessing['Mask Proportions'],
        RANDOM_STATE,
        preprocessing['Number of Query Points'],
    )
    loader = DataLoader(
        dataset,
        batch_size=preprocessing['Batch Size'],
        shuffle=True,
        num_workers=0,
        collate_fn=IVSurfaceDataset.collate_fn,
    )
    return dataset, loader


# Test-split dataset and loader for each of the two data sources.
pre_train_dataset_test, pre_train_data_loader_test = _build_test_dataset_and_loader(
    pre_train_surfaces_test
)
fine_tune_dataset_test, fine_tune_data_loader_test = _build_test_dataset_and_loader(
    fine_tune_surfaces_test
)

In [77]:
# Function to get CPU information
def get_cpu_info():
    """Return a dict with the host CPU's model name under the key ``"Model name"``.

    The value is read from the output of the ``lscpu`` command. When ``lscpu``
    cannot be launched (for example on hosts that do not ship it), the function
    falls back to ``platform.processor()`` instead of raising. The value is
    ``None`` when no model name can be determined.
    """
    # Local import: only needed for the fallback path, and keeps this cell
    # self-contained.
    import platform

    try:
        # Run the lscpu command and capture its standard output
        result = subprocess.run(['lscpu'], stdout=subprocess.PIPE)
    except OSError:
        # Covers FileNotFoundError / PermissionError: lscpu is not usable here.
        return {"Model name": platform.processor() or None}

    # Decode tolerantly so an unexpected byte in the output cannot abort the cell
    lscpu_output = result.stdout.decode('utf-8', errors='replace')

    # Parse "key: value" lines; split on the first colon only, because values
    # themselves may contain colons. Lines without a colon are ignored.
    cpu_info = {}
    for line in lscpu_output.splitlines():
        key, separator, value = line.partition(':')
        if separator:
            cpu_info[key.strip()] = value.strip()

    # Extract useful information
    return {
        "Model name": cpu_info.get("Model name"),
    }

def format_cpu_info(cpu_info):
    """Render the CPU info dict as a one-line, newline-terminated report.

    ``cpu_info`` must contain the key ``'Model name'``; its value is
    interpolated as-is.
    """
    return f"Model Name: {cpu_info['Model name']}\n"

# Query the host CPU once, format it, and print the report so the benchmark
# timings below are recorded together with the hardware that produced them.
cpu_info = get_cpu_info()
cpu_info_report = format_cpu_info(cpu_info)
print("\nCPU Information:\n", cpu_info_report)


CPU Information:
 Model Name: Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz



In [103]:
_SUPPORTED_MODEL_TYPES = ('mlp', 'gpr', 'quadratic', 'cubic_spline')


def _tensor_to_numpy(tensor):
    """Return ``tensor`` as a NumPy array, detaching it from autograd first."""
    return tensor.detach().cpu().numpy()


def _fit_and_predict(model_type, X_train, y_train, X_test, random_state):
    """Fit the baseline named by ``model_type`` on one surface and predict at ``X_test``."""
    if model_type == 'mlp':
        model = MLPRegressor(max_iter=500, random_state=random_state)
        model.fit(X_train, y_train)
        return model.predict(X_test)

    if model_type == 'gpr':
        model = GaussianProcessRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        return model.predict(X_test)

    if model_type == 'quadratic':
        # Ordinary least squares on degree-2 polynomial features
        poly = PolynomialFeatures(degree=2)
        model = LinearRegression()
        model.fit(poly.fit_transform(X_train), y_train)
        return model.predict(poly.transform(X_test))

    if model_type == 'cubic_spline':
        # SciPy's scattered-data interpolation with method='cubic'. Per the
        # SciPy docs it returns NaN (the default fill_value) for query points
        # outside the convex hull of the training points.
        return griddata(X_train, y_train, X_test, method='cubic')

    raise ValueError("Unsupported model type.")


def benchmark_models(
    data_loader, 
    model_type='mlp',
    random_state=0
):
    """Benchmark a per-surface baseline model over every batch of ``data_loader``.

    For each batch, a fresh model is fitted on the batch's input surface
    (log moneyness, time to maturity -> total variance) and evaluated at the
    batch's query points. Only element ``[0]`` of each batched field is used,
    i.e. one surface per batch.
    NOTE(review): this matches the batch size of 1 configured in this
    notebook; with larger batches the remaining samples would be ignored.

    Parameters
    ----------
    data_loader : iterable of dict
        Each item must provide ``'Input Surface'`` and ``'Query Points'``
        mappings with ``'Log Moneyness'``, ``'Time to Maturity'`` and
        ``'Total Variance'`` entries whose first element is a torch tensor.
    model_type : {'mlp', 'gpr', 'quadratic', 'cubic_spline'}
        Which baseline to fit.
    random_state : int
        Seed forwarded to the MLP and Gaussian-process regressors.

    Returns
    -------
    tuple
        ``(avg_mse, total_time, max_memory_usage)``: the mean of the per-batch
        MSEs (NaN if no batch produced a usable prediction), the summed
        fit+predict wall time in seconds, and the largest per-batch peak of
        traced memory in bytes as reported by ``tracemalloc``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. This is checked
        before any data is read or memory tracing is started.

    Notes
    -----
    Batches whose prediction contains NaN are left out of the MSE average;
    their number is printed so the average is not silently biased. Timing is
    taken while ``tracemalloc`` is active, which adds overhead to every model
    alike.
    """
    # Fail fast: previously an invalid name was only noticed inside the loop,
    # after tracemalloc had been started (and never with an empty loader).
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError("Unsupported model type.")

    mse_list = []
    skipped_batches = 0
    total_time = 0
    max_memory_usage = 0

    for batch in data_loader:
        # Extract the data from the batch
        input_surface = batch['Input Surface']
        query_points = batch['Query Points']

        X_train = np.column_stack((
            _tensor_to_numpy(input_surface['Log Moneyness'][0]),
            _tensor_to_numpy(input_surface['Time to Maturity'][0])
        ))
        y_train = _tensor_to_numpy(input_surface['Total Variance'][0])

        X_test = np.column_stack((
            _tensor_to_numpy(query_points['Log Moneyness'][0]),
            _tensor_to_numpy(query_points['Time to Maturity'][0])
        ))
        y_test = _tensor_to_numpy(query_points['Total Variance'][0])

        # Track memory and time for model construction, fit and prediction.
        tracemalloc.start()
        try:
            # perf_counter is monotonic, unlike time.time()
            start_time = time.perf_counter()
            y_pred = _fit_and_predict(model_type, X_train, y_train, X_test, random_state)
            elapsed_time = time.perf_counter() - start_time
            _, peak_memory = tracemalloc.get_traced_memory()
        finally:
            # Always switch tracing off again, even if fitting raised
            tracemalloc.stop()

        total_time += elapsed_time
        max_memory_usage = max(max_memory_usage, peak_memory)

        # Calculate the MSE (only when every prediction is a number)
        if np.isnan(y_pred).any():
            skipped_batches += 1
        else:
            mse_list.append(mean_squared_error(y_test, y_pred))

    # Aggregate results; avoid np.mean([]) and its RuntimeWarning
    avg_mse = np.mean(mse_list) if mse_list else float('nan')
    print(f"Model: {model_type.upper()}")
    print(f"Average MSE: {avg_mse:.6f}")
    print(f"Total Computation Time: {total_time:.2f} seconds")
    print(f"Max Memory Usage: {max_memory_usage / 1024:.2f} KB")
    if skipped_batches:
        print(f"Batches skipped (NaN predictions): {skipped_batches}")

    return avg_mse, total_time, max_memory_usage

In [104]:
# MLP baseline, evaluated on the pre-train test surfaces
benchmark_models(pre_train_data_loader_test, model_type='mlp')

Model: MLP
Average MSE: 0.000358
Total Computation Time: 4.41 seconds
Max Memory Usage: 279.11 KB


(0.00035765415, 4.4053614139556885, 285804)

In [105]:
# Gaussian Process Regression baseline, evaluated on the pre-train test surfaces
benchmark_models(pre_train_data_loader_test, model_type='gpr')

Model: GPR
Average MSE: 0.000898
Total Computation Time: 4.62 seconds
Max Memory Usage: 222933.65 KB


(0.0008980387841225338, 4.615170001983643, 228284061)

In [106]:
# Quadratic (degree-2 polynomial) regression baseline, evaluated on the pre-train test surfaces
benchmark_models(pre_train_data_loader_test, model_type='quadratic')

Model: QUADRATIC
Average MSE: 0.000190
Total Computation Time: 0.17 seconds
Max Memory Usage: 229.72 KB


(0.00018954626, 0.16718626022338867, 235236)

In [107]:
# Cubic interpolation baseline (scipy griddata, method='cubic'), evaluated on the pre-train test surfaces
benchmark_models(pre_train_data_loader_test, model_type='cubic_spline')

Model: CUBIC_SPLINE
Average MSE: 0.003109
Total Computation Time: 0.47 seconds
Max Memory Usage: 529.04 KB


(0.0031086418592921835, 0.468597412109375, 541732)

In [108]:
# MLP baseline, evaluated on the fine-tune test surfaces
benchmark_models(fine_tune_data_loader_test, model_type='mlp')



Model: MLP
Average MSE: 0.276772
Total Computation Time: 2.20 seconds
Max Memory Usage: 181.21 KB


(0.27677184, 2.198162078857422, 185560)

In [109]:
# Gaussian Process Regression baseline, evaluated on the fine-tune test surfaces
benchmark_models(fine_tune_data_loader_test, model_type='gpr')

Model: GPR
Average MSE: 0.074846
Total Computation Time: 0.23 seconds
Max Memory Usage: 150.05 KB


(0.07484571583656385, 0.22784686088562012, 153653)

In [110]:
# Quadratic (degree-2 polynomial) regression baseline, evaluated on the fine-tune test surfaces
benchmark_models(fine_tune_data_loader_test, model_type='quadratic')

Model: QUADRATIC
Average MSE: 0.597583
Total Computation Time: 0.14 seconds
Max Memory Usage: 18.37 KB


(0.5975827, 0.13836002349853516, 18808)

In [111]:
# Cubic interpolation baseline (scipy griddata, method='cubic'), evaluated on the fine-tune test surfaces
benchmark_models(fine_tune_data_loader_test, model_type='cubic_spline')

Model: CUBIC_SPLINE
Average MSE: 0.025799
Total Computation Time: 0.06 seconds
Max Memory Usage: 20.78 KB


(0.025798614190301904, 0.0581212043762207, 21276)