In [25]:
# Import libraries
from pyspark.sql import *
from pyspark.sql.functions import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import psutil
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time
import json

# Linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Random forest
from sklearn.ensemble import RandomForestRegressor

# Gaussian process
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood
import torch
import gpytorch

# Neural network
from sklearn.neural_network import MLPRegressor

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor

# Import own functions
import helperFuncs as hf
import metrics
import methods

In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [32]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell runs so edits to module files are picked up
# without restarting the kernel.
# NOTE(review): presumably enabled for the local helperFuncs / metrics / methods
# modules imported in the first cell -- confirm.
%load_ext autoreload
%autoreload 2

In [2]:
# Create (or reuse) the Spark session and keep a handle on its context
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Report the total physical memory of the host machine.
# psutil measures the operating system, not Spark, so this figure is the
# machine's RAM and NOT what the Spark runtime has been allocated.
mem_info = psutil.virtual_memory()
total_memory_gb = mem_info.total / (1024 * 1024 * 1024)
print("Total System Memory:", int(total_memory_gb), "GB")

# What Spark itself may use is governed by its configuration; show the driver
# memory setting if one was supplied explicitly.
driver_memory = sc.getConf().get("spark.driver.memory", "not set (Spark default applies)")
print("Spark driver memory (spark.driver.memory):", driver_memory)

24/05/28 11:27:42 WARN Utils: Your hostname, CC-M133A-EU.local resolves to a loopback address: 127.0.0.1; using 10.19.209.184 instead (on interface en0)
24/05/28 11:27:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 11:27:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Total Memory Allocated to Spark Runtime: 32 GB


In [3]:
# Location of the September 2023 Citi Bike trip-data CSV on the local disk
folder = '/Users/ekinokos2/Downloads/2023-citibike-tripdata/9_September/'
name = '202309-citibike-tripdata_4.csv'
path = f"{folder}{name}"

# Read the file into a Spark DataFrame: comma separated, first row holds the
# column names, and column types are inferred from the data.
sep23_data = (
    spark.read
    .option("sep", ',')
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

                                                                                

In [5]:
# Load the same CSV into pandas.
# The station-id columns are identifiers, not numbers, so they are read as
# strings. This also avoids pandas' chunk-by-chunk type guessing on them,
# which is presumably what triggered the warning this cell used to emit.
# NOTE(review): sep23_data_pd is not referenced by any other visible cell --
# confirm it is still needed, since it holds a second full copy of the file.
sep23_data_pd = pd.read_csv(
    path,
    dtype={"start_station_id": str, "end_station_id": str},
)

  sep23_data_pd = pd.read_csv(path)


In [4]:
# Select the dataset the rest of the notebook operates on
data = sep23_data

# Report how many rows it holds, then preview it
num_data_points = data.count()
print(f"NUM DATA POINTS = {num_data_points}")
data.show()

NUM DATA POINTS = 575162
+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+------------------+------------------+-----------------+------------------+-------------+
|         ride_id|rideable_type|         started_at|           ended_at|  start_station_name|start_station_id|    end_station_name|end_station_id|         start_lat|         start_lng|          end_lat|           end_lng|member_casual|
+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+------------------+------------------+-----------------+------------------+-------------+
|E7CE7CE80519EB26| classic_bike|2023-09-16 17:15:28|2023-09-16 17:19:00|Halsey St & Ralph...|         4334.02|Broadway & Madiso...|       4483.10|          40.68494|         -73.92299|         40.68822|         -73.91966|       member|
|56F4C65B769CF128| classic_bike

In [5]:
# Keep only the trip timestamps and the start/end coordinates
relevant_columns = ["started_at", "ended_at", "start_lat", "start_lng", "end_lat", "end_lng"]
projected_data = data.select(*relevant_columns)

# Work on a small sample: the first N rows
# NOTE(review): limit() is applied without an explicit ordering -- confirm that
# "first N rows" is stable enough for the comparisons made later.
N = 1000
small_data = projected_data.select("*").limit(N)

# Smoke-test the helper: the displayed result carries the four coordinate
# columns plus start_seconds / end_seconds in place of the two timestamps.
test = hf.extract_times_to_seconds(small_data)
test.show()

+------------------+------------------+-----------------+------------------+-------------+-----------+
|         start_lat|         start_lng|          end_lat|           end_lng|start_seconds|end_seconds|
+------------------+------------------+-----------------+------------------+-------------+-----------+
|          40.68494|         -73.92299|         40.68822|         -73.91966|        62128|      62340|
|       40.73240447|     -73.998393893|        40.722055|        -73.989111|        78329|      78813|
| 40.78232260730881|-73.95987510681152|40.78866499127884|-73.96680057048798|        43198|      44582|
|40.750097682094825|-73.91349703073502|        40.756913|        -73.921631|        32947|      33267|
| 40.78232260730881|-73.95987510681152|40.78866499127884|-73.96680057048798|        74065|      74516|
| 40.78232260730881|-73.95987510681152|40.78866499127884|-73.96680057048798|        76446|      77789|
| 40.78232260730881|-73.95987510681152|40.78866499127884|-73.966800570487

In [6]:
# Prepare the input for the lambda / gamma search:
# pull the working sample out of Spark and into a NumPy array on the driver
small_time_and_space = hf.extract_times_to_seconds(small_data)
small_time_and_space_arr = np.array(small_time_and_space.collect())

# Standardize every column; the fitted scaler is kept in `scaler`
scaler = StandardScaler()
small_time_and_space_arr = scaler.fit_transform(small_time_and_space_arr)

In [28]:
# Grid of hyper-parameters to evaluate
lam_search = [0.9, 0.7, 0.5, 0.3, 0.1]
gam_search = [0.1, 0.5, 1.0, 1.5, 3.0]

# Set k, the number of clusters for K-means (the same for every run)
k = 5

# Set to True to plot the cost curve of every individual run
plot_costs = False

# (lam, gam, score) for every run, plus the best combination seen so far.
# The best score is tracked with explicit comparisons rather than max():
# `from pyspark.sql.functions import *` in the import cell shadows that builtin.
search_results = []
best_score, best_lam, best_gam = None, None, None

for lam in lam_search:
    for gam in gam_search:
        # Pick the first k points as a "random" initialization of centroids.
        # A copy is passed so every run starts from the same centroids and the
        # data array cannot be modified through the slice.
        random_centroids = small_time_and_space_arr[:k].copy()

        # Run k-means time & space algorithm on small_space_and_time data
        # lam (a hyper-parameter between 0 and 1) controls how much we weight space
        # vs. time in our calculation of the cost. The smaller lam is, the more space
        # is weighted.
        costs, centroids, clustering = hf.K_means_time_and_space(small_time_and_space_arr,
                                                            random_centroids,
                                                            max_iter = 10, norm = 2,
                                                            lam = lam,
                                                            gam = gam)

        # Optionally plot the cost vs. iteration
        if plot_costs:
            plt.plot(costs)
            plt.xlabel("Iteration")
            plt.ylabel("Cost")
            plt.title(f"Cost vs. Iteration for Lambda = {lam}, Gamma = {gam}")
            plt.show()

        # Score the clustering with the same custom distance used to build it
        overall_score, sample_scores = hf.silhouette_score_custom(small_time_and_space_arr, clustering, hf.custom_distance_metric, lam, gam)
        print(f"Lambda = {lam}, Gamma = {gam}, Overall Silhouette  Score {overall_score}")

        search_results.append((lam, gam, overall_score))
        if best_score is None or overall_score > best_score:
            best_score, best_lam, best_gam = overall_score, lam, gam

# Report the winning combination so later cells need not hard-code it
print()
print(f"Best: Lambda = {best_lam}, Gamma = {best_gam}, Overall Silhouette  Score {best_score}")

Lambda = 0.9, Gamma = 0.1, Overall Silhouette  Score 0.3729527852828081
Lambda = 0.9, Gamma = 0.5, Overall Silhouette  Score 0.37269928906311334
Lambda = 0.9, Gamma = 1.0, Overall Silhouette  Score 0.3723830184413008
Lambda = 0.9, Gamma = 1.5, Overall Silhouette  Score 0.37206741191487336
Lambda = 0.9, Gamma = 3.0, Overall Silhouette  Score 0.37112455366888436
Lambda = 0.7, Gamma = 0.1, Overall Silhouette  Score 0.38196284643434814
Lambda = 0.7, Gamma = 0.5, Overall Silhouette  Score 0.381694921300087
Lambda = 0.7, Gamma = 1.0, Overall Silhouette  Score 0.38136066916303785
Lambda = 0.7, Gamma = 1.5, Overall Silhouette  Score 0.3810271415335652
Lambda = 0.7, Gamma = 3.0, Overall Silhouette  Score 0.3800308794232987
Lambda = 0.5, Gamma = 0.1, Overall Silhouette  Score 0.38382632728703164
Lambda = 0.5, Gamma = 0.5, Overall Silhouette  Score 0.3835553559276871
Lambda = 0.5, Gamma = 1.0, Overall Silhouette  Score 0.3832173078205451
Lambda = 0.5, Gamma = 1.5, Overall Silhouette  Score 0.3828

## Outline of the remaining work
* Implement models to predict the arrival time using
    * Linear regression
    * Random forest
    * Gradient boosting
    * Neural network
    * Gaussian process
* Compare the performance of the models when run with the entire set of data and when run with the subset of data belonging to the specific cluster
* Compare our clustering approach with Euclidean distance clustering

In [7]:
# Keep only the trip timestamps and the start/end coordinates
relevant_columns = ["started_at", "ended_at", "start_lat", "start_lng", "end_lat", "end_lng"]
projected_data = data.select(*relevant_columns)

# Work on a larger sample this time: the first N rows
N = 10000
small_data = projected_data.select("*").limit(N)

# Pull the working sample out of Spark and into a NumPy array on the driver
small_time_and_space = hf.extract_times_to_seconds(small_data)
small_time_and_space_arr = np.array(small_time_and_space.collect())

# Standardize every column; the fitted scaler is kept in `scaler`
# NOTE(review): the scaler is fit on all rows and all columns before any
# train/test split is made in later cells -- confirm that is acceptable.
scaler = StandardScaler()
small_time_and_space_arr = scaler.fit_transform(small_time_and_space_arr)

# Clustering configuration: number of clusters and the two hyper-parameters
k = 5
lam = 0.5
gam = 1.0

# The first k points serve as the "random" initial centroids
random_centroids = small_time_and_space_arr[:k]

# Run the time & space k-means on the standardized sample.
# lam (a hyper-parameter between 0 and 1) sets the balance between space and
# time in the cost: the smaller lam is, the more space is weighted.
costs, centroids, clustering = hf.K_means_time_and_space(
    small_time_and_space_arr,
    random_centroids,
    max_iter = 10,
    norm = 2,
    lam = lam,
    gam = gam,
)

In [8]:
# Wrap the standardized array in a labelled DataFrame and attach the cluster
# assigned to each row by the k-means run
feature_names = ["start_lat", "start_lng", "end_lat", "end_lng", "start_time", "end_time"]
small_time_and_space_arr_pd = pd.DataFrame(small_time_and_space_arr, columns=feature_names)
small_time_and_space_arr_pd["cluster"] = clustering

# Per-cluster splits: later cells index the result by cluster and unpack each
# entry as (train_x, test_x, train_y, test_y)
cluster_train_test_splits = hf.cluster_train_test_split(small_time_and_space_arr_pd)

# Regular split over the whole sample: end_time is the target, and the cluster
# label is left out of the features
y = small_time_and_space_arr_pd["end_time"]
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Number of rows assigned to each cluster
small_time_and_space_arr_pd.loc[:, "cluster"].value_counts()

1.0    2906
0.0    2783
2.0    1966
3.0    1176
4.0    1169
Name: cluster, dtype: int64

In [10]:
# Train and evaluate a linear regression model for each cluster
linear_regression_models = {}
lr_cluster_times = {}
linear_regression_mse = {}
lr_cluster_results = {}
for cluster in cluster_train_test_splits:
    train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
    model = LinearRegression()

    # Time only the fit call, so the figure is comparable with the
    # whole-dataset timing below. perf_counter is used because
    # these intervals are very short.
    start = time.perf_counter()
    model.fit(train_x, train_y)
    end = time.perf_counter()
    lr_cluster_times[cluster] = end - start
    linear_regression_models[cluster] = model

    # Score the model on its own cluster's held-out rows
    predictions = model.predict(test_x)
    mse = mean_squared_error(test_y, predictions)
    linear_regression_mse[cluster] = mse
    results = metrics.evaluate(test_y, predictions)
    lr_cluster_results[cluster] = results

# Also do a regular train/test split (the loop above overwrote train_x etc.)
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
y = small_time_and_space_arr_pd["end_time"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model for the entire dataset
model = LinearRegression()
start = time.perf_counter()
model.fit(train_x, train_y)
end = time.perf_counter()
lr_time = end - start
predictions = model.predict(test_x)
lr_mse = mean_squared_error(test_y, predictions)
lr_entire_results = metrics.evaluate(test_y, predictions)

print(f"Training time for entire dataset: {lr_time}")
print()
print(f"Linear Regression MSE for Entire Dataset: {lr_mse}")
print()
print("Linear Regression Training Times for Each Cluster:")
for cluster in lr_cluster_times:
    print(f"Cluster {cluster}: {lr_cluster_times[cluster]}")
print()
print("Linear Regression MSE for Each Cluster:")
for cluster in linear_regression_mse:
    print(f"Cluster {cluster}: {linear_regression_mse[cluster]}")

# Print the average of all entries in lr_cluster_results
# NOTE(review): this is an unweighted mean over clusters of different sizes --
# confirm that is the intended comparison against lr_entire_results.
print()
print("Average of all entries in lr_cluster_results:")
lr_cluster_results_pd = pd.DataFrame(lr_cluster_results)
print(lr_cluster_results_pd.mean(axis=1))

Training time for entire dataset: 0.0012881755828857422

Linear Regression MSE for Entire Dataset: 0.12155805688854653

Linear Regression Training Times for Each Cluster:
Cluster 0.0: 0.0018339157104492188
Cluster 1.0: 0.0020568370819091797
Cluster 2.0: 0.0009860992431640625
Cluster 3.0: 0.0012049674987792969
Cluster 4.0: 0.0007288455963134766

Linear Regression MSE for Each Cluster:
Cluster 0.0: 0.039213024809092306
Cluster 1.0: 0.14505166299762787
Cluster 2.0: 0.003318368760180141
Cluster 3.0: 0.01796160488738441
Cluster 4.0: 0.1765149867168799

Average of all entries in lr_cluster_results:
MAE        0.069936
RMSE       0.238128
MAD        0.045427
MAPE       0.252623
MAXAPE    33.926156
TAPE      97.099360
dtype: float64


In [11]:
# Show the metrics returned by metrics.evaluate for the linear regression
# model trained on the entire dataset
lr_entire_results

{'MAE': 0.0711493164406142,
 'RMSE': 0.34865177023578486,
 'MAD': 0.037222118158097006,
 'MAPE': 0.17124203540145422,
 'MAXAPE': 18.954042224448255,
 'TAPE': 342.48407080290843}

In [14]:
# Train and evaluate a random forest model for each cluster
random_forest_models = {}
rf_cluster_times = {}
random_forest_mse = {}
for cluster in cluster_train_test_splits:
    train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]

    # The forest is randomized; fix the seed so a rerun reproduces the same
    # numbers (42 matches the seed used for the train/test split).
    model = RandomForestRegressor(random_state=42)

    # Time only the fit call
    start = time.perf_counter()
    model.fit(train_x, train_y)
    end = time.perf_counter()
    rf_cluster_times[cluster] = end - start
    random_forest_models[cluster] = model

    # Score the model on its own cluster's held-out rows
    predictions = model.predict(test_x)
    mse = mean_squared_error(test_y, predictions)
    random_forest_mse[cluster] = mse

# Also do a regular train/test split (the loop above overwrote train_x etc.)
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
y = small_time_and_space_arr_pd["end_time"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest model for the entire dataset
model = RandomForestRegressor(random_state=42)
start = time.perf_counter()
model.fit(train_x, train_y)
end = time.perf_counter()
rf_time = end - start
predictions = model.predict(test_x)
mse = mean_squared_error(test_y, predictions)

print(f"Training Time for Entire Dataset: {rf_time}")
print()
print(f"Random Forest MSE for Entire Dataset: {mse}")
print()
print("Random Forest Training Times for Each Cluster:")
for cluster in rf_cluster_times:
    print(f"Cluster {cluster}: {rf_cluster_times[cluster]}")
print()
print("Random Forest MSE for Each Cluster:")
for cluster in random_forest_mse:
    print(f"Cluster {cluster}: {random_forest_mse[cluster]}")

Training Time for Entire Dataset: 2.2465171813964844

Random Forest MSE for Entire Dataset: 0.043384756654168626

Random Forest Training Times for Each Cluster:
Cluster 0.0: 0.5286421775817871
Cluster 1.0: 0.5370738506317139
Cluster 2.0: 0.35738706588745117
Cluster 3.0: 0.21899890899658203
Cluster 4.0: 0.21520614624023438

Random Forest MSE for Each Cluster:
Cluster 0.0: 0.02144629111257377
Cluster 1.0: 0.03772802743462243
Cluster 2.0: 0.005291408981845475
Cluster 3.0: 0.08192402372832287
Cluster 4.0: 0.15909367869171734


In [15]:
# Train and evaluate a neural network model for each cluster
neural_network_models = {}
nn_cluster_times = {}
neural_network_mse = {}
for cluster in cluster_train_test_splits:
    train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]

    # Weight initialization and batch shuffling are random; fix the seed so a
    # rerun reproduces the same numbers (42 matches the train/test split seed).
    model = MLPRegressor(random_state=42)

    # Time only the fit call
    start = time.perf_counter()
    model.fit(train_x, train_y)
    end = time.perf_counter()
    nn_cluster_times[cluster] = end - start
    neural_network_models[cluster] = model

    # Score the model on its own cluster's held-out rows
    predictions = model.predict(test_x)
    mse = mean_squared_error(test_y, predictions)
    neural_network_mse[cluster] = mse

# Also do a regular train/test split (the loop above overwrote train_x etc.)
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
y = small_time_and_space_arr_pd["end_time"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a neural network model for the entire dataset
model = MLPRegressor(random_state=42)
start = time.perf_counter()
model.fit(train_x, train_y)
end = time.perf_counter()
nn_time = end - start
predictions = model.predict(test_x)
mse = mean_squared_error(test_y, predictions)

print(f"NN Training Time for Entire Dataset: {nn_time}")
print()
print(f"NN MSE for Entire Dataset: {mse}")
print()
print("NN Training Times for Each Cluster:")
for cluster in nn_cluster_times:
    print(f"Cluster {cluster}: {nn_cluster_times[cluster]}")
print()
print("NN MSE for Each Cluster:")
for cluster in neural_network_mse:
    print(f"Cluster {cluster}: {neural_network_mse[cluster]}")



NN Training Time for Entire Dataset: 2.6895737648010254

NN MSE for Entire Dataset: 0.08650308140077245

NN Training Times for Each Cluster:
Cluster 0.0: 0.7068591117858887
Cluster 1.0: 0.7510719299316406
Cluster 2.0: 0.1628129482269287
Cluster 3.0: 0.3173229694366455
Cluster 4.0: 0.09429192543029785

NN MSE for Each Cluster:
Cluster 0.0: 0.036605677170569006
Cluster 1.0: 0.1173815393100648
Cluster 2.0: 0.007457718034959324
Cluster 3.0: 0.026199237086368052
Cluster 4.0: 0.17337422702407582




In [17]:
# Optimization settings shared by every Gaussian process fit in this cell
num_epochs = 25
gp_lr = 0.1


def _train_gp(train_x, train_y, epochs, lr):
    """Fit a methods.GPModel with a Gaussian likelihood using Adam.

    Args:
        train_x: float tensor of training features.
        train_y: float tensor of training targets.
        epochs: number of optimizer steps to take.
        lr: Adam learning rate.

    Returns:
        (model, likelihood, seconds), where seconds covers only the
        optimization loop, not the construction of the model.
    """
    likelihood = GaussianLikelihood()
    model = methods.GPModel(train_x, train_y, likelihood)
    model.train()
    likelihood.train()
    mll = ExactMarginalLogLikelihood(likelihood, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    start = time.perf_counter()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(train_x)
        # Minimize the negative marginal log likelihood
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
    elapsed = time.perf_counter() - start
    return model, likelihood, elapsed


def _gp_test_mse(model, likelihood, test_x, test_y):
    """Return the MSE between test_y and the model's predictive mean at test_x."""
    # Put this model AND its own likelihood into evaluation mode
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        predictions = model(test_x)
        return mean_squared_error(test_y, predictions.mean)


# Train a Gaussian process model for each cluster.
# Each model's likelihood is stored next to it, so evaluation below uses the
# matching likelihood rather than whichever one was created last.
gaussian_process_models = {}
gaussian_process_likelihoods = {}
gp_cluster_times = {}
for cluster in cluster_train_test_splits:
    train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
    train_x = torch.tensor(train_x.values).float()
    train_y = torch.tensor(train_y.values).float()
    model, likelihood, elapsed = _train_gp(train_x, train_y, num_epochs, gp_lr)
    gp_cluster_times[cluster] = elapsed
    gaussian_process_models[cluster] = model
    gaussian_process_likelihoods[cluster] = likelihood

# Evaluate the Gaussian process models
gaussian_process_mse = {}
for cluster in gaussian_process_models:
    test_x, test_y = cluster_train_test_splits[cluster][1], cluster_train_test_splits[cluster][3]
    test_x = torch.tensor(test_x.values).float()
    test_y = torch.tensor(test_y.values).float()
    gaussian_process_mse[cluster] = _gp_test_mse(
        gaussian_process_models[cluster],
        gaussian_process_likelihoods[cluster],
        test_x,
        test_y,
    )

# Also do a regular train/test split (the loops above overwrote train_x etc.)
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
y = small_time_and_space_arr_pd["end_time"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gaussian process model for the entire dataset
train_x = torch.tensor(train_x.values).float()
train_y = torch.tensor(train_y.values).float()
model, likelihood, gp_time = _train_gp(train_x, train_y, num_epochs, gp_lr)
test_x = torch.tensor(test_x.values).float()
test_y = torch.tensor(test_y.values).float()
mse = _gp_test_mse(model, likelihood, test_x, test_y)

print(f"GP Training Time for Entire Dataset: {gp_time}")
print()
print(f"GP MSE for Entire Dataset: {mse}")
print()
print("GP Training Times for Each Cluster:")
for cluster in gp_cluster_times:
    print(f"Cluster {cluster}: {gp_cluster_times[cluster]}")
print()
print("GP MSE for Each Cluster:")
for cluster in gaussian_process_mse:
    print(f"Cluster {cluster}: {gaussian_process_mse[cluster]}")


GP Training Time for Entire Dataset: 12.097362041473389

GP MSE for Entire Dataset: 0.06379331648349762

GP Training Times for Each Cluster:
Cluster 0.0: 1.7533986568450928
Cluster 1.0: 1.734299898147583
Cluster 2.0: 0.6195230484008789
Cluster 3.0: 0.3039207458496094
Cluster 4.0: 0.30571484565734863

GP MSE for Each Cluster:
Cluster 0.0: 0.03782009333372116
Cluster 1.0: 0.06882806122303009
Cluster 2.0: 0.009221695363521576
Cluster 3.0: 0.043596670031547546
Cluster 4.0: 0.1631726324558258


In [18]:
# Train and evaluate a gradient boosting model for each cluster
gradient_boosting_models = {}
gb_cluster_times = {}
gradient_boosting_mse = {}
for cluster in cluster_train_test_splits:
    train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]

    # Fix the estimator's seed so a rerun reproduces the same numbers
    # (42 matches the seed used for the train/test split).
    model = GradientBoostingRegressor(random_state=42)

    # Time only the fit call
    start = time.perf_counter()
    model.fit(train_x, train_y)
    end = time.perf_counter()
    gb_cluster_times[cluster] = end - start
    gradient_boosting_models[cluster] = model

    # Score the model on its own cluster's held-out rows
    predictions = model.predict(test_x)
    mse = mean_squared_error(test_y, predictions)
    gradient_boosting_mse[cluster] = mse

# Also do a regular train/test split (the loop above overwrote train_x etc.)
X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
y = small_time_and_space_arr_pd["end_time"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a gradient boosting model for the entire dataset
model = GradientBoostingRegressor(random_state=42)
start = time.perf_counter()
model.fit(train_x, train_y)
end = time.perf_counter()
gb_time = end - start
predictions = model.predict(test_x)
mse = mean_squared_error(test_y, predictions)

print(f"Training time for entire dataset: {gb_time}")
print()
print(f"Gradient Boosting MSE for Entire Dataset: {mse}")
print()
print("Gradient Boosting Training Times for Each Cluster:")
for cluster in gb_cluster_times:
    print(f"Cluster {cluster}: {gb_cluster_times[cluster]}")
print()
print("Gradient Boosting MSE for Each Cluster:")
for cluster in gradient_boosting_mse:
    print(f"Cluster {cluster}: {gradient_boosting_mse[cluster]}")

Training time for entire dataset: 0.8310761451721191

Gradient Boosting MSE for Entire Dataset: 0.03814455236191411

Gradient Boosting Training Times for Each Cluster:
Cluster 0.0: 0.22678589820861816
Cluster 1.0: 0.2177867889404297
Cluster 2.0: 0.15129613876342773
Cluster 3.0: 0.09425926208496094
Cluster 4.0: 0.09214210510253906

Gradient Boosting MSE for Each Cluster:
Cluster 0.0: 0.026964058798025135
Cluster 1.0: 0.037025286744018095
Cluster 2.0: 0.006013536929567091
Cluster 3.0: 0.09805893359151696
Cluster 4.0: 0.1755063411688478


In [65]:
# List of all parameters for sensitivity analysis
# [k, lam, gam, N, norm]
N = [1000, 10000, 100000]
norms = [1, 2]
lams = [0.1, 0.5, 0.9]
gams = [0.5, 1.0, 2.0]
K = [3, 6, 9]
test_size = 0.2
random_state = 42
max_iters = 10
num_epochs = 25
gp_lr = 0.1

verbose = True

# Run sensitivity analysis
for n in N:
    for norm in norms:
        for lam in lams:
            for gam in gams:
                for k in K:
                    print(f"N = {n}, Norm = {norm}, Lambda = {lam}, Gamma = {gam}, K = {k}")
                    print()
                    
                    # Project only relevant columns
                    projected_data = data.select("started_at", "ended_at", "start_lat", "start_lng",
                                    "end_lat", "end_lng")
                    
                    # Select a small sample (first N) of rows
                    small_data = projected_data.select("*").limit(n)

                    # Collect working dataset into an array
                    small_time_and_space = hf.extract_times_to_seconds(small_data)
                    small_time_and_space_arr = np.array(small_time_and_space.collect())

                    # Normalize the data
                    scaler = StandardScaler()
                    scaler.fit(small_time_and_space_arr)
                    small_time_and_space_arr = scaler.transform(small_time_and_space_arr)

                    # Pick the first k points as a "random" initialization of centroids
                    random_centroids = small_time_and_space_arr[:k]

                    # Run k-means time & space algorithm on small_space_and_time data
                    # lam (a hyper-parameter between 0 and 1) controls how much we weight space
                    # vs. time in our calculation of the cost. The smaller lam is, the more space
                    # is weighted.
                    costs, centroids, clustering = hf.K_means_time_and_space(small_time_and_space_arr,
                                                                        random_centroids,
                                                                        max_iter = max_iters, norm = norm,
                                                                        lam = lam,
                                                                        gam = gam)
                    
                    # Add clustering to the dataframe
                    small_time_and_space_arr_pd = pd.DataFrame(small_time_and_space_arr)
                    small_time_and_space_arr_pd.columns = ["start_lat", "start_lng", "end_lat", "end_lng", "start_time", "end_time"]
                    small_time_and_space_arr_pd["cluster"] = clustering

                    cluster_train_test_splits = hf.cluster_train_test_split(small_time_and_space_arr_pd)

                    # Linear regression
                    linear_regression_models = {}
                    lr_cluster_times = {}
                    for cluster in cluster_train_test_splits:
                        start = time.time()
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        model = LinearRegression()
                        model.fit(train_x, train_y)
                        linear_regression_models[cluster] = model
                        end = time.time()
                        lr_cluster_times[cluster] = end - start

                    # Evaluate the linear regression models
                    linear_regression_mse = {}
                    lr_cluster_results = {}
                    for cluster in linear_regression_models:
                        model = linear_regression_models[cluster]
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        predictions = model.predict(test_x)
                        lr_cluster_mse = mean_squared_error(test_y, predictions)
                        linear_regression_mse[cluster] = lr_cluster_mse
                        results = metrics.evaluate(test_y, predictions)
                        lr_cluster_results[cluster] = results

                    # Also do a regular train/test split
                    X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
                    y = small_time_and_space_arr_pd["end_time"]
                    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)

                    # Train a linear regression model for the entire dataset
                    model = LinearRegression()
                    start = time.time()
                    model.fit(train_x, train_y)
                    end = time.time()
                    lr_time = end - start
                    predictions = model.predict(test_x)
                    lr_entire_mse = mean_squared_error(test_y, predictions)
                    lr_entire_results = metrics.evaluate(test_y, predictions)

                    if verbose:
                        print(f"Training time for entire dataset: {lr_time}")
                        print()
                        print(f"Linear Regression MSE for Entire Dataset: {lr_entire_mse}")
                        print()
                        print("Linear Regression Training Times for Each Cluster:")
                        for cluster in lr_cluster_times:
                            print(f"Cluster {cluster}: {lr_cluster_times[cluster]}")
                        print()
                        print("Linear Regression MSE for Each Cluster:")
                        for cluster in linear_regression_mse:
                            print(f"Cluster {cluster}: {linear_regression_mse[cluster]}")

                        # Print the average of all entries in lr_cluster_results
                        print()
                        print("Average of all entries in lr_cluster_results:")
                        lr_cluster_results_pd = pd.DataFrame(lr_cluster_results)
                        print(lr_cluster_results_pd.mean(axis=1))
                        print()
                        print(f"Linear Regression Entire Dataset: {lr_entire_results}")

                    # Random forest
                    random_forest_models = {}
                    rf_cluster_times = {}
                    for cluster in cluster_train_test_splits:
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        model = RandomForestRegressor()
                        start = time.time()
                        model.fit(train_x, train_y)
                        end = time.time()
                        rf_cluster_times[cluster] = end - start
                        random_forest_models[cluster] = model

                    # Score every per-cluster random forest on that cluster's held-out split.
                    # metrics.evaluate returns a dict that is indexed by 'MAE', 'RMSE' and
                    # 'MAD' further down when the results row is assembled.
                    random_forest_mse = {}
                    rf_cluster_results = {}
                    for cluster, model in random_forest_models.items():
                        split = cluster_train_test_splits[cluster]
                        test_x, test_y = split[1], split[3]
                        predictions = model.predict(test_x)
                        random_forest_mse[cluster] = mean_squared_error(test_y, predictions)
                        rf_cluster_results[cluster] = metrics.evaluate(test_y, predictions)

                    # Baseline: one forest fitted on the whole sample, with the cluster
                    # label removed from the features.
                    X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
                    y = small_time_and_space_arr_pd["end_time"]
                    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)

                    model = RandomForestRegressor()
                    start = time.time()
                    model.fit(train_x, train_y)
                    end = time.time()
                    rf_time = end - start  # wall-clock seconds spent in fit() only
                    predictions = model.predict(test_x)
                    rf_entire_mse = mean_squared_error(test_y, predictions)
                    rf_entire_results = metrics.evaluate(test_y, predictions)

                    # Built unconditionally: the results export reads this frame even when
                    # verbose is False (metrics are rows, clusters are columns).
                    rf_cluster_results_pd = pd.DataFrame(rf_cluster_results)

                    if verbose:
                        print(f"Training Time for Entire Dataset: {rf_time}")
                        print()
                        print(f"Random Forest MSE for Entire Dataset: {rf_entire_mse}")
                        print()
                        print("Random Forest Training Times for Each Cluster:")
                        for cluster in rf_cluster_times:
                            print(f"Cluster {cluster}: {rf_cluster_times[cluster]}")
                        print()
                        print("Random Forest MSE for Each Cluster:")
                        for cluster in random_forest_mse:
                            print(f"Cluster {cluster}: {random_forest_mse[cluster]}")

                        # Print the per-metric average across clusters
                        print()
                        print("Average of all entries in rf_cluster_results:")
                        print(rf_cluster_results_pd.mean(axis=1))
                        print()
                        print(f"Random Forest Entire Dataset: {rf_entire_results}")

                    # Neural network
                    # Fit one default MLPRegressor per cluster and time each fit() call.
                    neural_network_models = {}
                    nn_cluster_times = {}
                    for cluster in cluster_train_test_splits:
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        model = MLPRegressor()
                        start = time.time()
                        model.fit(train_x, train_y)
                        end = time.time()
                        nn_cluster_times[cluster] = end - start
                        neural_network_models[cluster] = model

                    # Score every per-cluster network on that cluster's held-out split.
                    neural_network_mse = {}
                    nn_cluster_results = {}
                    for cluster, model in neural_network_models.items():
                        split = cluster_train_test_splits[cluster]
                        test_x, test_y = split[1], split[3]
                        predictions = model.predict(test_x)
                        neural_network_mse[cluster] = mean_squared_error(test_y, predictions)
                        nn_cluster_results[cluster] = metrics.evaluate(test_y, predictions)

                    # Baseline: one network fitted on the whole sample, with the cluster
                    # label removed from the features.
                    X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
                    y = small_time_and_space_arr_pd["end_time"]
                    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)

                    model = MLPRegressor()
                    start = time.time()
                    model.fit(train_x, train_y)
                    end = time.time()
                    nn_time = end - start  # wall-clock seconds spent in fit() only
                    predictions = model.predict(test_x)
                    nn_mse = mean_squared_error(test_y, predictions)
                    nn_entire_results = metrics.evaluate(test_y, predictions)

                    # Built unconditionally: the results export reads this frame even when
                    # verbose is False (metrics are rows, clusters are columns).
                    nn_cluster_results_pd = pd.DataFrame(nn_cluster_results)

                    if verbose:
                        print(f"NN Training Time for Entire Dataset: {nn_time}")
                        print()
                        print(f"NN MSE for Entire Dataset: {nn_mse}")
                        print()
                        print("NN Training Times for Each Cluster:")
                        for cluster in nn_cluster_times:
                            print(f"Cluster {cluster}: {nn_cluster_times[cluster]}")
                        print()
                        print("NN MSE for Each Cluster:")
                        for cluster in neural_network_mse:
                            print(f"Cluster {cluster}: {neural_network_mse[cluster]}")

                        # Print the per-metric average across clusters
                        print()
                        print("Average of all entries in nn_cluster_results:")
                        print(nn_cluster_results_pd.mean(axis=1))
                        print()
                        print(f"NN Entire Dataset: {nn_entire_results}")

                    # Gaussian process
                    # Fit one GP per cluster: num_epochs full-batch Adam steps on the
                    # negative exact marginal log-likelihood. Only the optimisation loop
                    # is timed; tensor conversion and model construction are excluded.
                    gaussian_process_models = {}
                    gaussian_process_likelihoods = {}
                    gp_cluster_times = {}
                    for cluster in cluster_train_test_splits:
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        train_x = torch.tensor(train_x.values).float()
                        train_y = torch.tensor(train_y.values).float()
                        likelihood = GaussianLikelihood()
                        model = methods.GPModel(train_x, train_y, likelihood)
                        model.train()
                        likelihood.train()
                        mll = ExactMarginalLogLikelihood(likelihood, model)
                        optimizer = torch.optim.Adam(model.parameters(), lr=gp_lr)
                        start = time.time()
                        for i in range(num_epochs):
                            optimizer.zero_grad()
                            output = model(train_x)
                            loss = -mll(output, train_y)
                            loss.backward()
                            optimizer.step()
                        end = time.time()
                        gp_cluster_times[cluster] = end - start
                        gaussian_process_models[cluster] = model
                        # Remember the likelihood paired with this model; the bare name
                        # `likelihood` is overwritten on every iteration.
                        gaussian_process_likelihoods[cluster] = likelihood

                    # Score every per-cluster GP on that cluster's held-out split using
                    # the posterior mean of model(test_x).
                    gaussian_process_mse = {}
                    gp_cluster_results = {}
                    for cluster in gaussian_process_models:
                        model = gaussian_process_models[cluster]
                        likelihood = gaussian_process_likelihoods[cluster]
                        test_x, test_y = cluster_train_test_splits[cluster][1], cluster_train_test_splits[cluster][3]
                        test_x = torch.tensor(test_x.values).float()
                        test_y = torch.tensor(test_y.values).float()
                        # Switch this cluster's own model/likelihood pair to eval mode.
                        model.eval()
                        likelihood.eval()
                        with torch.no_grad(), gpytorch.settings.fast_pred_var():
                            predictions = model(test_x)
                            gp_cluster_mse = mean_squared_error(test_y, predictions.mean)
                            gaussian_process_mse[cluster] = gp_cluster_mse
                            results = metrics.evaluate(test_y, predictions.mean)
                            gp_cluster_results[cluster] = results

                    # Baseline: one GP fitted on the whole sample, with the cluster label
                    # removed from the features.
                    X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
                    y = small_time_and_space_arr_pd["end_time"]
                    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)

                    train_x = torch.tensor(train_x.values).float()
                    train_y = torch.tensor(train_y.values).float()
                    likelihood = GaussianLikelihood()
                    model = methods.GPModel(train_x, train_y, likelihood)
                    model.train()
                    likelihood.train()
                    mll = ExactMarginalLogLikelihood(likelihood, model)
                    optimizer = torch.optim.Adam(model.parameters(), lr=gp_lr)
                    start = time.time()
                    for i in range(num_epochs):
                        optimizer.zero_grad()
                        output = model(train_x)
                        loss = -mll(output, train_y)
                        loss.backward()
                        optimizer.step()
                    end = time.time()
                    gp_time = end - start  # wall-clock seconds spent in the optimisation loop
                    test_x = torch.tensor(test_x.values).float()
                    test_y = torch.tensor(test_y.values).float()
                    model.eval()
                    likelihood.eval()
                    with torch.no_grad(), gpytorch.settings.fast_pred_var():
                        predictions = model(test_x)
                        gp_entire_mse = mean_squared_error(test_y, predictions.mean)
                        gp_entire_results = metrics.evaluate(test_y, predictions.mean)

                    # Built unconditionally: the results export reads this frame even when
                    # verbose is False (metrics are rows, clusters are columns).
                    gp_cluster_results_pd = pd.DataFrame(gp_cluster_results)

                    if verbose:
                        print(f"GP Training Time for Entire Dataset: {gp_time}")
                        print()
                        print(f"GP MSE for Entire Dataset: {gp_entire_mse}")
                        print()
                        print("GP Training Times for Each Cluster:")
                        for cluster in gp_cluster_times:
                            print(f"Cluster {cluster}: {gp_cluster_times[cluster]}")
                        print()
                        print("GP MSE for Each Cluster:")
                        for cluster in gaussian_process_mse:
                            print(f"Cluster {cluster}: {gaussian_process_mse[cluster]}")

                        # Print the per-metric average across clusters
                        print()
                        print("Average of all entries in gp_cluster_results:")
                        print(gp_cluster_results_pd.mean(axis=1))
                        print()
                        print(f"GP Entire Dataset: {gp_entire_results}")

                    # Gradient boosting
                    # Fit one default GradientBoostingRegressor per cluster and time each fit() call.
                    gradient_boosting_models = {}
                    gb_cluster_times = {}
                    for cluster in cluster_train_test_splits:
                        train_x, test_x, train_y, test_y = cluster_train_test_splits[cluster]
                        model = GradientBoostingRegressor()
                        start = time.time()
                        model.fit(train_x, train_y)
                        end = time.time()
                        gb_cluster_times[cluster] = end - start
                        gradient_boosting_models[cluster] = model

                    # Score every per-cluster model on that cluster's held-out split.
                    gradient_boosting_mse = {}
                    gb_cluster_results = {}
                    for cluster, model in gradient_boosting_models.items():
                        split = cluster_train_test_splits[cluster]
                        test_x, test_y = split[1], split[3]
                        predictions = model.predict(test_x)
                        gradient_boosting_mse[cluster] = mean_squared_error(test_y, predictions)
                        gb_cluster_results[cluster] = metrics.evaluate(test_y, predictions)

                    # Baseline: one model fitted on the whole sample, with the cluster
                    # label removed from the features.
                    X = small_time_and_space_arr_pd.drop(columns=["cluster", "end_time"])
                    y = small_time_and_space_arr_pd["end_time"]
                    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)

                    model = GradientBoostingRegressor()
                    start = time.time()
                    model.fit(train_x, train_y)
                    end = time.time()
                    gb_time = end - start  # wall-clock seconds spent in fit() only
                    predictions = model.predict(test_x)
                    gb_entire_mse = mean_squared_error(test_y, predictions)
                    gb_entire_results = metrics.evaluate(test_y, predictions)

                    # Built unconditionally: the results export reads this frame even when
                    # verbose is False (metrics are rows, clusters are columns).
                    gb_cluster_results_pd = pd.DataFrame(gb_cluster_results)

                    if verbose:
                        print(f"Training time for entire dataset: {gb_time}")
                        print()
                        print(f"Gradient Boosting MSE for Entire Dataset: {gb_entire_mse}")
                        print()
                        print("Gradient Boosting Training Times for Each Cluster:")
                        for cluster in gb_cluster_times:
                            print(f"Cluster {cluster}: {gb_cluster_times[cluster]}")
                        print()
                        print("Gradient Boosting MSE for Each Cluster:")
                        for cluster in gradient_boosting_mse:
                            print(f"Cluster {cluster}: {gradient_boosting_mse[cluster]}")

                        # Print the per-metric average across clusters
                        print()
                        print("Average of all entries in gb_cluster_results:")
                        print(gb_cluster_results_pd.mean(axis=1))
                        print()
                        print(f"Gradient Boosting Entire Dataset: {gb_entire_results}")

                    # Assemble one summary row for this (n, norm, lam, gam, k) configuration
                    # and write it to its own CSV file.
                    #
                    # Each tuple is: column prefix, whole-dataset fit time, whole-dataset
                    # metrics dict, per-cluster fit times, per-cluster metrics frame
                    # (metrics as rows, clusters as columns).
                    summary_inputs = [
                        ("LR", lr_time, lr_entire_results, lr_cluster_times, lr_cluster_results_pd),
                        ("RF", rf_time, rf_entire_results, rf_cluster_times, rf_cluster_results_pd),
                        ("NN", nn_time, nn_entire_results, nn_cluster_times, nn_cluster_results_pd),
                        ("GP", gp_time, gp_entire_results, gp_cluster_times, gp_cluster_results_pd),
                        ("GB", gb_time, gb_entire_results, gb_cluster_times, gb_cluster_results_pd),
                    ]
                    summary_metric_names = ("MAE", "RMSE", "MAD")

                    # Insertion order of this dict fixes the CSV column order.
                    summary_row = {"N": n, "Norm": norm, "Lambda": lam, "Gamma": gam, "K": k}
                    for tag, fit_time, entire_metrics, times_by_cluster, metrics_by_cluster in summary_inputs:
                        summary_row[f"{tag}_Entire_Time"] = fit_time
                        for metric_name in summary_metric_names:
                            summary_row[f"{tag}_Entire_{metric_name}"] = entire_metrics[metric_name]

                        summary_row[f"{tag}_Cluster_Time_Avg"] = pd.Series(times_by_cluster).mean()

                        # Transpose once so every metric is a column over clusters.
                        per_cluster = metrics_by_cluster.T
                        for metric_name in summary_metric_names:
                            summary_row[f"{tag}_Cluster_{metric_name}_Avg"] = per_cluster[metric_name].mean()
                        for metric_name in summary_metric_names:
                            summary_row[f"{tag}_Cluster_{metric_name}_Median"] = per_cluster[metric_name].median()

                    # Save the results as a single-row DataFrame
                    results_df = pd.DataFrame(summary_row, index=[0])

                    # Save the results to a CSV file
                    results_df.to_csv(f"results_{n}_{norm}_{lam}_{gam}_{k}.csv", index = False)

                    print()
                                        


N = 1000, Norm = 1, Lambda = 0.1, Gamma = 0.5, K = 3

Training time for entire dataset: 0.0009908676147460938

Linear Regression MSE for Entire Dataset: 0.10090998209080261

Linear Regression Training Times for Each Cluster:
Cluster 0.0: 0.000993967056274414
Cluster 1.0: 0.0007119178771972656
Cluster 2.0: 0.0006711483001708984

Linear Regression MSE for Each Cluster:
Cluster 0.0: 0.012722107247752275
Cluster 1.0: 0.0018078346424578835
Cluster 2.0: 0.2917196343294968

Average of all entries in lr_cluster_results:
MAE     0.085784
RMSE    0.231807
MAD     0.056430
dtype: float64

Linear Regression Entire Dataset: {'MAE': 0.07572404595198262, 'RMSE': 0.3176633156201745, 'MAD': 0.045674406410488144}
Training Time for Entire Dataset: 0.1434791088104248

Random Forest MSE for Entire Dataset: 0.0009692028865520469

Random Forest Training Times for Each Cluster:
Cluster 0.0: 0.06168985366821289
Cluster 1.0: 0.06725287437438965
Cluster 2.0: 0.06749677658081055

Random Forest MSE for Each Cluste



NN Training Time for Entire Dataset: 0.10111594200134277

NN MSE for Entire Dataset: 0.09507812664001813

NN Training Times for Each Cluster:
Cluster 0.0: 0.06457805633544922
Cluster 1.0: 0.03092503547668457
Cluster 2.0: 0.051236867904663086

NN MSE for Each Cluster:
Cluster 0.0: 0.02055573186710895
Cluster 1.0: 0.007782743037333077
Cluster 2.0: 0.2790088465472565

Average of all entries in nn_cluster_results:
MAE     0.089187
RMSE    0.253268
MAD     0.045270
dtype: float64

NN Entire Dataset: {'MAE': 0.08746079921278742, 'RMSE': 0.3083474122479677, 'MAD': 0.038064733558556885}
GP Training Time for Entire Dataset: 0.25675010681152344

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06463909149169922
Cluster 1.0: 0.07325410842895508
Cluster 2.0: 0.06986808776855469

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
M



NN Training Time for Entire Dataset: 0.13249778747558594

NN MSE for Entire Dataset: 0.09538576776716731

NN Training Times for Each Cluster:
Cluster 0.0: 0.030894041061401367
Cluster 1.0: 0.03580880165100098
Cluster 2.0: 0.012879133224487305
Cluster 3.0: 0.009495973587036133
Cluster 4.0: 0.02063894271850586
Cluster 5.0: 0.026911020278930664

NN MSE for Each Cluster:
Cluster 0.0: 0.004788359096111033
Cluster 1.0: 0.01083499392500426
Cluster 2.0: 0.0061710333110069765
Cluster 3.0: 1.2313614408890652
Cluster 4.0: 0.01911571446891896
Cluster 5.0: 0.3745712618357043

Average of all entries in nn_cluster_results:
MAE     0.128237
RMSE    0.351966
MAD     0.049290
dtype: float64

NN Entire Dataset: {'MAE': 0.09771067465772702, 'RMSE': 0.30884586409270126, 'MAD': 0.044649708875513186}
GP Training Time for Entire Dataset: 0.24770832061767578

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.07380080223083496
Cluster 1.0: 0.10318183898925781
Clu



NN Training Time for Entire Dataset: 0.14024758338928223

NN MSE for Entire Dataset: 0.09645486112726853

NN Training Times for Each Cluster:
Cluster 0.0: 0.010924816131591797
Cluster 1.0: 0.04203987121582031
Cluster 2.0: 0.019251108169555664
Cluster 3.0: 0.009165048599243164
Cluster 4.0: 0.02024078369140625
Cluster 5.0: 0.03474926948547363
Cluster 6.0: 0.00839996337890625
Cluster 7.0: 0.03597617149353027
Cluster 8.0: 0.02791309356689453

NN MSE for Each Cluster:
Cluster 0.0: 0.01371061762984429
Cluster 1.0: 0.009200887130304876
Cluster 2.0: 0.09909178729367205
Cluster 3.0: 0.004682491083819573
Cluster 4.0: 0.5689034674821435
Cluster 5.0: 0.6585483023640688
Cluster 6.0: 0.01434979356462245
Cluster 7.0: 0.011662169930382568
Cluster 8.0: 0.011627281213264519

Average of all entries in nn_cluster_results:
MAE     0.140658
RMSE    0.277512
MAD     0.077533
dtype: float64

NN Entire Dataset: {'MAE': 0.09432644998792272, 'RMSE': 0.3105718292557593, 'MAD': 0.041991279974161744}
GP Training Ti



NN Training Time for Entire Dataset: 0.12220168113708496

NN MSE for Entire Dataset: 0.09702448442619982

NN Training Times for Each Cluster:
Cluster 0.0: 0.12065410614013672
Cluster 1.0: 0.044615983963012695
Cluster 2.0: 0.055680036544799805

NN MSE for Each Cluster:
Cluster 0.0: 0.018307368274308438
Cluster 1.0: 0.006819539981743898
Cluster 2.0: 0.2896544837796431

Average of all entries in nn_cluster_results:
MAE     0.088495
RMSE    0.252027
MAD     0.036312
dtype: float64

NN Entire Dataset: {'MAE': 0.09617950974666396, 'RMSE': 0.31148753494514003, 'MAD': 0.04002149764801613}
GP Training Time for Entire Dataset: 0.24641084671020508

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.07154202461242676
Cluster 1.0: 0.07797503471374512
Cluster 2.0: 0.06970071792602539

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:



NN Training Time for Entire Dataset: 0.14743614196777344

NN MSE for Entire Dataset: 0.09539039037477913

NN Training Times for Each Cluster:
Cluster 0.0: 0.021550655364990234
Cluster 1.0: 0.02684783935546875
Cluster 2.0: 0.01067495346069336
Cluster 3.0: 0.008893013000488281
Cluster 4.0: 0.020778894424438477
Cluster 5.0: 0.028881072998046875

NN MSE for Each Cluster:
Cluster 0.0: 0.015350699802457816
Cluster 1.0: 0.01298995300126797
Cluster 2.0: 0.007410747796212416
Cluster 3.0: 1.0609900519563737
Cluster 4.0: 0.024170599136850563
Cluster 5.0: 0.37286329461926176

Average of all entries in nn_cluster_results:
MAE     0.132618
RMSE    0.353349
MAD     0.052931
dtype: float64

NN Entire Dataset: {'MAE': 0.09532444327979532, 'RMSE': 0.30885334768264877, 'MAD': 0.03998890755327411}
GP Training Time for Entire Dataset: 0.24046611785888672

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.08098888397216797
Cluster 1.0: 0.09092187881469727
Clu



NN Training Time for Entire Dataset: 0.22225475311279297

NN MSE for Entire Dataset: 0.09172188241104687

NN Training Times for Each Cluster:
Cluster 0.0: 0.008317232131958008
Cluster 1.0: 0.04311490058898926
Cluster 2.0: 0.01896500587463379
Cluster 3.0: 0.013158798217773438
Cluster 4.0: 0.02020716667175293
Cluster 5.0: 0.02371811866760254
Cluster 6.0: 0.011011123657226562
Cluster 7.0: 0.034256935119628906
Cluster 8.0: 0.030458927154541016

NN MSE for Each Cluster:
Cluster 0.0: 0.015326694479430997
Cluster 1.0: 0.0048436762734940875
Cluster 2.0: 0.0943941583684811
Cluster 3.0: 0.005150841373669431
Cluster 4.0: 0.6330311250742141
Cluster 5.0: 0.6799744451324433
Cluster 6.0: 0.017554222686122754
Cluster 7.0: 0.009339009175656816
Cluster 8.0: 0.012943246356032341

Average of all entries in nn_cluster_results:
MAE     0.139170
RMSE    0.281727
MAD     0.069719
dtype: float64

NN Entire Dataset: {'MAE': 0.08958003060864822, 'RMSE': 0.3028562074831006, 'MAD': 0.031282046908067795}
GP Trainin



NN Training Time for Entire Dataset: 0.07393407821655273

NN MSE for Entire Dataset: 0.09691049982920477

NN Training Times for Each Cluster:
Cluster 0.0: 0.06391310691833496
Cluster 1.0: 0.040743112564086914
Cluster 2.0: 0.05921196937561035

NN MSE for Each Cluster:
Cluster 0.0: 0.0250111342229896
Cluster 1.0: 0.004342301068116833
Cluster 2.0: 0.3027614235901201

Average of all entries in nn_cluster_results:
MAE     0.083221
RMSE    0.258094
MAD     0.039210
dtype: float64

NN Entire Dataset: {'MAE': 0.08873674466575071, 'RMSE': 0.3113045130241526, 'MAD': 0.03707176960618045}
GP Training Time for Entire Dataset: 0.2531089782714844

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.07181692123413086
Cluster 1.0: 0.07801222801208496
Cluster 2.0: 0.06896615028381348

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MAE 



NN Training Time for Entire Dataset: 0.2754018306732178

NN MSE for Entire Dataset: 0.09513931952260596

NN Training Times for Each Cluster:
Cluster 0.0: 0.01919698715209961
Cluster 1.0: 0.030755043029785156
Cluster 2.0: 0.011284112930297852
Cluster 3.0: 0.009516716003417969
Cluster 4.0: 0.022439002990722656
Cluster 5.0: 0.025516986846923828

NN MSE for Each Cluster:
Cluster 0.0: 0.014716492043535732
Cluster 1.0: 0.011835274810925658
Cluster 2.0: 0.01022946137922301
Cluster 3.0: 1.0976372746694611
Cluster 4.0: 0.012994438549968866
Cluster 5.0: 0.3730617811368736

Average of all entries in nn_cluster_results:
MAE     0.136397
RMSE    0.350618
MAD     0.065199
dtype: float64

NN Entire Dataset: {'MAE': 0.09841145431277401, 'RMSE': 0.3084466234579428, 'MAD': 0.03906889860977696}
GP Training Time for Entire Dataset: 0.2584848403930664

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.0519711971282959
Cluster 1.0: 0.07095718383789062
Cluster



NN Training Time for Entire Dataset: 0.16259288787841797

NN MSE for Entire Dataset: 0.09095230727683845

NN Training Times for Each Cluster:
Cluster 0.0: 0.009136199951171875
Cluster 1.0: 0.0932931900024414
Cluster 2.0: 0.018726110458374023
Cluster 3.0: 0.015086650848388672
Cluster 4.0: 0.023807048797607422
Cluster 5.0: 0.03981804847717285
Cluster 6.0: 0.011390209197998047
Cluster 7.0: 0.04231095314025879
Cluster 8.0: 0.02641892433166504

NN MSE for Each Cluster:
Cluster 0.0: 0.007944988953923359
Cluster 1.0: 0.006030975114938269
Cluster 2.0: 0.4345487004768939
Cluster 3.0: 0.00921287166994005
Cluster 4.0: 0.4970555219076733
Cluster 5.0: 0.6627396252056496
Cluster 6.0: 0.019830435539299564
Cluster 7.0: 0.006876273442932135
Cluster 8.0: 0.009780050968291402

Average of all entries in nn_cluster_results:
MAE     0.167132
RMSE    0.307081
MAD     0.083849
dtype: float64

NN Entire Dataset: {'MAE': 0.09145689630910842, 'RMSE': 0.3015830023009229, 'MAD': 0.04071778093971934}
GP Training Ti



NN Training Time for Entire Dataset: 0.23798012733459473

NN MSE for Entire Dataset: 0.0926759717223085

NN Training Times for Each Cluster:
Cluster 0.0: 0.1060171127319336
Cluster 1.0: 0.062124013900756836
Cluster 2.0: 0.058319807052612305

NN MSE for Each Cluster:
Cluster 0.0: 0.01747633578182839
Cluster 1.0: 0.007429571997527209
Cluster 2.0: 0.2824625092412581

Average of all entries in nn_cluster_results:
MAE     0.088232
RMSE    0.249955
MAD     0.037710
dtype: float64

NN Entire Dataset: {'MAE': 0.09439274186425532, 'RMSE': 0.30442728478621706, 'MAD': 0.04094313872763783}
GP Training Time for Entire Dataset: 0.25222015380859375

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06631708145141602
Cluster 1.0: 0.08059382438659668
Cluster 2.0: 0.08364510536193848

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.15776395797729492

NN MSE for Entire Dataset: 0.09123998594939063

NN Training Times for Each Cluster:
Cluster 0.0: 0.030949115753173828
Cluster 1.0: 0.022443056106567383
Cluster 2.0: 0.012820959091186523
Cluster 3.0: 0.009185791015625
Cluster 4.0: 0.020315885543823242
Cluster 5.0: 0.029835939407348633

NN MSE for Each Cluster:
Cluster 0.0: 0.010917702885763355
Cluster 1.0: 0.29369428358477717
Cluster 2.0: 0.008326564861382352
Cluster 3.0: 1.2122617987885256
Cluster 4.0: 0.04049784063858708
Cluster 5.0: 0.3990075659342371

Average of all entries in nn_cluster_results:
MAE     0.143986
RMSE    0.445269
MAD     0.050767
dtype: float64

NN Entire Dataset: {'MAE': 0.09384065108904943, 'RMSE': 0.3020595735105753, 'MAD': 0.04081276846822257}
GP Training Time for Entire Dataset: 0.24008703231811523

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05661201477050781
Cluster 1.0: 0.06977987289428711
Cluster



NN Training Time for Entire Dataset: 0.23577594757080078

NN MSE for Entire Dataset: 0.09312895542115449

NN Training Times for Each Cluster:
Cluster 0.0: 0.011466026306152344
Cluster 1.0: 0.026371240615844727
Cluster 2.0: 0.018704891204833984
Cluster 3.0: 0.008033990859985352
Cluster 4.0: 0.0200803279876709
Cluster 5.0: 0.018603086471557617
Cluster 6.0: 0.009559154510498047
Cluster 7.0: 0.01697516441345215
Cluster 8.0: 0.02523326873779297

NN MSE for Each Cluster:
Cluster 0.0: 0.02966122160296063
Cluster 1.0: 0.009164056602696357
Cluster 2.0: 0.08239280624543739
Cluster 3.0: 0.019384261366313292
Cluster 4.0: 0.5787986085203086
Cluster 5.0: 0.6571059684993105
Cluster 6.0: 0.05708908585124548
Cluster 7.0: 0.013485503808431385
Cluster 8.0: 0.005472929937775837

Average of all entries in nn_cluster_results:
MAE     0.157795
RMSE    0.299408
MAD     0.091450
dtype: float64

NN Entire Dataset: {'MAE': 0.09264711363866075, 'RMSE': 0.3051703711390647, 'MAD': 0.03311328072952153}
GP Training T



NN Training Time for Entire Dataset: 0.23718786239624023

NN MSE for Entire Dataset: 0.09695685758962855

NN Training Times for Each Cluster:
Cluster 0.0: 0.07065391540527344
Cluster 1.0: 0.043440818786621094
Cluster 2.0: 0.08652281761169434

NN MSE for Each Cluster:
Cluster 0.0: 0.02167081784996997
Cluster 1.0: 0.0038136931318683573
Cluster 2.0: 0.2968188722063802

Average of all entries in nn_cluster_results:
MAE     0.089696
RMSE    0.251259
MAD     0.037408
dtype: float64

NN Entire Dataset: {'MAE': 0.1000524966840906, 'RMSE': 0.31137896137926296, 'MAD': 0.050695242012155986}
GP Training Time for Entire Dataset: 0.26158595085144043

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06433892250061035
Cluster 1.0: 0.07015085220336914
Cluster 2.0: 0.0669560432434082

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
M



NN Training Time for Entire Dataset: 0.17229008674621582

NN MSE for Entire Dataset: 0.09114196482548145

NN Training Times for Each Cluster:
Cluster 0.0: 0.023739099502563477
Cluster 1.0: 0.029009103775024414
Cluster 2.0: 0.007715940475463867
Cluster 3.0: 0.010164976119995117
Cluster 4.0: 0.020812034606933594
Cluster 5.0: 0.021615028381347656

NN MSE for Each Cluster:
Cluster 0.0: 0.013183797231935239
Cluster 1.0: 0.2759654255309927
Cluster 2.0: 0.005941460722428418
Cluster 3.0: 1.2483130139608403
Cluster 4.0: 0.03704563782796745
Cluster 5.0: 0.3985657167623211

Average of all entries in nn_cluster_results:
MAE     0.152360
RMSE    0.443050
MAD     0.056701
dtype: float64

NN Entire Dataset: {'MAE': 0.09052519860248089, 'RMSE': 0.30189727528661375, 'MAD': 0.032162259464815005}
GP Training Time for Entire Dataset: 0.2292919158935547

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.09012198448181152
Cluster 1.0: 0.07467198371887207
Clus



NN Training Time for Entire Dataset: 0.22494220733642578

NN MSE for Entire Dataset: 0.09294464051577524

NN Training Times for Each Cluster:
Cluster 0.0: 0.009564876556396484
Cluster 1.0: 0.03167414665222168
Cluster 2.0: 0.0188448429107666
Cluster 3.0: 0.008373737335205078
Cluster 4.0: 0.01995062828063965
Cluster 5.0: 0.01956772804260254
Cluster 6.0: 0.008222103118896484
Cluster 7.0: 0.020460844039916992
Cluster 8.0: 0.025556325912475586

NN MSE for Each Cluster:
Cluster 0.0: 0.010921434954787238
Cluster 1.0: 0.016888218072447458
Cluster 2.0: 0.06630857441579015
Cluster 3.0: 0.005184672938289151
Cluster 4.0: 0.5648380668396158
Cluster 5.0: 0.6507126260484305
Cluster 6.0: 0.00528990351389827
Cluster 7.0: 0.010727437635304976
Cluster 8.0: 0.011385646690284667

Average of all entries in nn_cluster_results:
MAE     0.137819
RMSE    0.267245
MAD     0.067962
dtype: float64

NN Entire Dataset: {'MAE': 0.09111067606497521, 'RMSE': 0.30486823467815605, 'MAD': 0.035533623529808475}
GP Training



NN Training Time for Entire Dataset: 0.12349128723144531

NN MSE for Entire Dataset: 0.09430974316276497

NN Training Times for Each Cluster:
Cluster 0.0: 0.1046750545501709
Cluster 1.0: 0.06666326522827148
Cluster 2.0: 0.06778287887573242

NN MSE for Each Cluster:
Cluster 0.0: 0.023572057403656834
Cluster 1.0: 0.0060227598788655315
Cluster 2.0: 0.2792176177670742

Average of all entries in nn_cluster_results:
MAE     0.084499
RMSE    0.253183
MAD     0.033019
dtype: float64

NN Entire Dataset: {'MAE': 0.09659049807282034, 'RMSE': 0.3070989142976005, 'MAD': 0.0459791861173256}
GP Training Time for Entire Dataset: 0.24309992790222168

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.07285022735595703
Cluster 1.0: 0.07894277572631836
Cluster 2.0: 0.07194304466247559

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MAE



NN Training Time for Entire Dataset: 0.26569604873657227

NN MSE for Entire Dataset: 0.09255071702306852

NN Training Times for Each Cluster:
Cluster 0.0: 0.031168699264526367
Cluster 1.0: 0.027862071990966797
Cluster 2.0: 0.009418010711669922
Cluster 3.0: 0.011132955551147461
Cluster 4.0: 0.02117466926574707
Cluster 5.0: 0.04352903366088867

NN MSE for Each Cluster:
Cluster 0.0: 0.008592229206661568
Cluster 1.0: 0.29194350080160963
Cluster 2.0: 0.007075336414476531
Cluster 3.0: 1.062040138315654
Cluster 4.0: 0.01365508956545128
Cluster 5.0: 0.3708513566585691

Average of all entries in nn_cluster_results:
MAE     0.140459
RMSE    0.412252
MAD     0.050383
dtype: float64

NN Entire Dataset: {'MAE': 0.0908612928726971, 'RMSE': 0.3042214933614463, 'MAD': 0.03412753906602557}
GP Training Time for Entire Dataset: 0.27297425270080566

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06304621696472168
Cluster 1.0: 0.09145307540893555
Cluster 



NN Training Time for Entire Dataset: 0.14749813079833984

NN MSE for Entire Dataset: 0.09695309611751074

NN Training Times for Each Cluster:
Cluster 0.0: 0.012533903121948242
Cluster 1.0: 0.02609086036682129
Cluster 2.0: 0.01865410804748535
Cluster 3.0: 0.008189916610717773
Cluster 4.0: 0.020143985748291016
Cluster 5.0: 0.023840904235839844
Cluster 6.0: 0.006239175796508789
Cluster 7.0: 0.023868083953857422
Cluster 8.0: 0.026107072830200195

NN MSE for Each Cluster:
Cluster 0.0: 0.011621608856578642
Cluster 1.0: 0.009201522784787508
Cluster 2.0: 0.3794518878784849
Cluster 3.0: 0.005880207768516963
Cluster 4.0: 0.6503719870884435
Cluster 5.0: 0.5803714234030438
Cluster 6.0: 0.04356210654183216
Cluster 7.0: 0.010043263585197394
Cluster 8.0: 0.008534701839810031

Average of all entries in nn_cluster_results:
MAE     0.174104
RMSE    0.318444
MAD     0.099450
dtype: float64

NN Entire Dataset: {'MAE': 0.09499789285742749, 'RMSE': 0.31137292129777555, 'MAD': 0.04125720383582665}
GP Trainin



NN Training Time for Entire Dataset: 0.2456669807434082

NN MSE for Entire Dataset: 0.09791244318013945

NN Training Times for Each Cluster:
Cluster 0.0: 0.05671501159667969
Cluster 1.0: 0.04053902626037598
Cluster 2.0: 0.06778216361999512

NN MSE for Each Cluster:
Cluster 0.0: 0.019946924282797627
Cluster 1.0: 0.00718772831563882
Cluster 2.0: 0.29186468190408565

Average of all entries in nn_cluster_results:
MAE     0.094048
RMSE    0.255420
MAD     0.048477
dtype: float64

NN Entire Dataset: {'MAE': 0.10050102532050456, 'RMSE': 0.31290964059954984, 'MAD': 0.04920668588963477}
GP Training Time for Entire Dataset: 0.22387075424194336

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06584692001342773
Cluster 1.0: 0.07386898994445801
Cluster 2.0: 0.07677817344665527

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.19789910316467285

NN MSE for Entire Dataset: 0.09540522154495278

NN Training Times for Each Cluster:
Cluster 0.0: 0.026304960250854492
Cluster 1.0: 0.01815199851989746
Cluster 2.0: 0.008796930313110352
Cluster 3.0: 0.011415243148803711
Cluster 4.0: 0.02086186408996582
Cluster 5.0: 0.03996109962463379

NN MSE for Each Cluster:
Cluster 0.0: 0.014818713448294756
Cluster 1.0: 0.2991735340308277
Cluster 2.0: 0.004328805789584908
Cluster 3.0: 1.2527828484362877
Cluster 4.0: 0.021889689422932034
Cluster 5.0: 0.020611738288449512

Average of all entries in nn_cluster_results:
MAE     0.134240
RMSE    0.357548
MAD     0.054700
dtype: float64

NN Entire Dataset: {'MAE': 0.09468793171014206, 'RMSE': 0.30887735680193973, 'MAD': 0.03897060738665081}
GP Training Time for Entire Dataset: 0.19573092460632324

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05082297325134277
Cluster 1.0: 0.07078886032104492
Clus



NN Training Time for Entire Dataset: 0.155181884765625

NN MSE for Entire Dataset: 0.09494571873137173

NN Training Times for Each Cluster:
Cluster 0.0: 0.02529120445251465
Cluster 1.0: 0.024192094802856445
Cluster 2.0: 0.011890888214111328
Cluster 3.0: 0.011756181716918945
Cluster 4.0: 0.019488096237182617
Cluster 5.0: 0.03987622261047363
Cluster 6.0: 0.016708850860595703
Cluster 7.0: 0.014924049377441406
Cluster 8.0: 0.009205102920532227

NN MSE for Each Cluster:
Cluster 0.0: 0.006326668580232788
Cluster 1.0: 0.017626263904754896
Cluster 2.0: 0.01283084048849172
Cluster 3.0: 0.035837835034588775
Cluster 4.0: 0.4649448854666403
Cluster 5.0: 0.3435983659969384
Cluster 6.0: 0.16171872527963196
Cluster 7.0: 0.007927730697721798
Cluster 8.0: 0.028278265943780734

Average of all entries in nn_cluster_results:
MAE     0.172974
RMSE    0.271363
MAD     0.103466
dtype: float64

NN Entire Dataset: {'MAE': 0.0972473049214668, 'RMSE': 0.3081326317211011, 'MAD': 0.04638955536341771}
GP Training T



NN Training Time for Entire Dataset: 0.09873318672180176

NN MSE for Entire Dataset: 0.09853120206129803

NN Training Times for Each Cluster:
Cluster 0.0: 0.023964881896972656
Cluster 1.0: 0.023762941360473633
Cluster 2.0: 0.010887861251831055
Cluster 3.0: 0.009510040283203125
Cluster 4.0: 0.02079010009765625
Cluster 5.0: 0.04452919960021973

NN MSE for Each Cluster:
Cluster 0.0: 0.014460478356274405
Cluster 1.0: 0.28644260564538293
Cluster 2.0: 0.0014075269383374619
Cluster 3.0: 1.1274902185079372
Cluster 4.0: 0.02171887277141933
Cluster 5.0: 0.01645376530310539

Average of all entries in nn_cluster_results:
MAE     0.122315
RMSE    0.338408
MAD     0.052076
dtype: float64

NN Entire Dataset: {'MAE': 0.09258456529171002, 'RMSE': 0.3138968016104943, 'MAD': 0.036714827195840745}
GP Training Time for Entire Dataset: 0.20350217819213867

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.04922962188720703
Cluster 1.0: 0.07972192764282227
Clu



NN Training Time for Entire Dataset: 0.13743376731872559

NN MSE for Entire Dataset: 0.0992043566796303

NN Training Times for Each Cluster:
Cluster 0.0: 0.015712976455688477
Cluster 1.0: 0.02982783317565918
Cluster 2.0: 0.012295007705688477
Cluster 3.0: 0.009382963180541992
Cluster 4.0: 0.019659042358398438
Cluster 5.0: 0.03726387023925781
Cluster 6.0: 0.01662421226501465
Cluster 7.0: 0.017993927001953125
Cluster 8.0: 0.007844924926757812

NN MSE for Each Cluster:
Cluster 0.0: 0.007988831057328144
Cluster 1.0: 0.02043938832585011
Cluster 2.0: 0.011292524793884295
Cluster 3.0: 0.09789376251820424
Cluster 4.0: 0.3228887478913019
Cluster 5.0: 0.3924397533196273
Cluster 6.0: 0.06322464893730106
Cluster 7.0: 0.010797622862901004
Cluster 8.0: 0.012298786737757222

Average of all entries in nn_cluster_results:
MAE     0.153199
RMSE    0.256937
MAD     0.094365
dtype: float64

NN Entire Dataset: {'MAE': 0.09453031753831541, 'RMSE': 0.3149672311203664, 'MAD': 0.04065849086120987}
GP Training T



NN Training Time for Entire Dataset: 0.16663503646850586

NN MSE for Entire Dataset: 0.09538635586031628

NN Training Times for Each Cluster:
Cluster 0.0: 0.0733339786529541
Cluster 1.0: 0.036628007888793945
Cluster 2.0: 0.07255411148071289

NN MSE for Each Cluster:
Cluster 0.0: 0.025134033302823964
Cluster 1.0: 0.007214382264890843
Cluster 2.0: 0.27174285514680496

Average of all entries in nn_cluster_results:
MAE     0.089584
RMSE    0.254921
MAD     0.038652
dtype: float64

NN Entire Dataset: {'MAE': 0.09220057929351476, 'RMSE': 0.3088468161731901, 'MAD': 0.03690609062147582}
GP Training Time for Entire Dataset: 0.23601222038269043

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.062157630920410156
Cluster 1.0: 0.07194709777832031
Cluster 2.0: 0.06727766990661621

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:




NN Training Time for Entire Dataset: 0.11048698425292969

NN MSE for Entire Dataset: 0.10205451650596398

NN Training Times for Each Cluster:
Cluster 0.0: 0.028688907623291016
Cluster 1.0: 0.020090103149414062
Cluster 2.0: 0.015505790710449219
Cluster 3.0: 0.011775970458984375
Cluster 4.0: 0.02083301544189453
Cluster 5.0: 0.043854713439941406

NN MSE for Each Cluster:
Cluster 0.0: 0.01039943965361611
Cluster 1.0: 0.2915376136265408
Cluster 2.0: 0.0017077840950515352
Cluster 3.0: 1.13136358127817
Cluster 4.0: 0.022640229017908797
Cluster 5.0: 0.018758589698063786

Average of all entries in nn_cluster_results:
MAE     0.137002
RMSE    0.339055
MAD     0.065710
dtype: float64

NN Entire Dataset: {'MAE': 0.09861043708461086, 'RMSE': 0.31945972595299704, 'MAD': 0.04592435196747724}
GP Training Time for Entire Dataset: 0.1982288360595703

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.04765200614929199
Cluster 1.0: 0.07211017608642578
Clust



NN Training Time for Entire Dataset: 0.21100687980651855

NN MSE for Entire Dataset: 0.09297064005154834

NN Training Times for Each Cluster:
Cluster 0.0: 0.024618148803710938
Cluster 1.0: 0.02751898765563965
Cluster 2.0: 0.010920047760009766
Cluster 3.0: 0.01005101203918457
Cluster 4.0: 0.020387649536132812
Cluster 5.0: 0.02450418472290039
Cluster 6.0: 0.01947784423828125
Cluster 7.0: 0.0216672420501709
Cluster 8.0: 0.008580923080444336

NN MSE for Each Cluster:
Cluster 0.0: 0.011265253671109754
Cluster 1.0: 0.021327995960886824
Cluster 2.0: 0.006692178922435406
Cluster 3.0: 0.028587491372546
Cluster 4.0: 0.21987891473859988
Cluster 5.0: 0.43857379617566955
Cluster 6.0: 0.09620715262969022
Cluster 7.0: 0.02061615228371356
Cluster 8.0: 0.0053194756474721355

Average of all entries in nn_cluster_results:
MAE     0.141084
RMSE    0.240102
MAD     0.079427
dtype: float64

NN Entire Dataset: {'MAE': 0.09637618021708412, 'RMSE': 0.3049108723078735, 'MAD': 0.04102270387594997}
GP Training Ti



NN Training Time for Entire Dataset: 0.10898995399475098

NN MSE for Entire Dataset: 0.09964515754691063

NN Training Times for Each Cluster:
Cluster 0.0: 0.06440615653991699
Cluster 1.0: 0.035192251205444336
Cluster 2.0: 0.012080192565917969
Cluster 3.0: 0.01667618751525879
Cluster 4.0: 0.020434856414794922
Cluster 5.0: 0.028649091720581055

NN MSE for Each Cluster:
Cluster 0.0: 0.023130228552588655
Cluster 1.0: 0.010943452528658658
Cluster 2.0: 0.00917497590148576
Cluster 3.0: 1.1287903625488482
Cluster 4.0: 0.03897883216113829
Cluster 5.0: 0.37010467226965077

Average of all entries in nn_cluster_results:
MAE     0.135375
RMSE    0.370120
MAD     0.043593
dtype: float64

NN Entire Dataset: {'MAE': 0.09656263751325418, 'RMSE': 0.3156662122351878, 'MAD': 0.040629816844956146}
GP Training Time for Entire Dataset: 0.24378132820129395

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05586504936218262
Cluster 1.0: 0.08214592933654785
Clus



NN Training Time for Entire Dataset: 0.20354509353637695

NN MSE for Entire Dataset: 0.09460379606688278

NN Training Times for Each Cluster:
Cluster 0.0: 0.008986711502075195
Cluster 1.0: 0.015069007873535156
Cluster 2.0: 0.01852583885192871
Cluster 3.0: 0.007447004318237305
Cluster 4.0: 0.019684791564941406
Cluster 5.0: 0.010161876678466797
Cluster 6.0: 0.00632786750793457
Cluster 7.0: 0.019385814666748047
Cluster 8.0: 0.025532960891723633

NN MSE for Each Cluster:
Cluster 0.0: 0.01445843818508396
Cluster 1.0: 0.013897652592197928
Cluster 2.0: 0.0805350665517875
Cluster 3.0: 0.006311574507422701
Cluster 4.0: 0.49426125967103435
Cluster 5.0: 0.6442343852902873
Cluster 6.0: 0.030101589910802244
Cluster 7.0: 0.01987375984144766
Cluster 8.0: 0.00999258914459346

Average of all entries in nn_cluster_results:
MAE     0.147026
RMSE    0.280164
MAD     0.080639
dtype: float64

NN Entire Dataset: {'MAE': 0.09189910555946494, 'RMSE': 0.3075773009616977, 'MAD': 0.04058673080927119}
GP Training 



NN Training Time for Entire Dataset: 0.24817800521850586

NN MSE for Entire Dataset: 0.09252442917604486

NN Training Times for Each Cluster:
Cluster 0.0: 0.07464194297790527
Cluster 1.0: 0.04179215431213379
Cluster 2.0: 0.03255105018615723

NN MSE for Each Cluster:
Cluster 0.0: 0.029508015439759807
Cluster 1.0: 0.005820199136753279
Cluster 2.0: 0.2785701393741098

Average of all entries in nn_cluster_results:
MAE     0.087806
RMSE    0.258622
MAD     0.038176
dtype: float64

NN Entire Dataset: {'MAE': 0.08992996316562563, 'RMSE': 0.30417828518164286, 'MAD': 0.03734362112191214}
GP Training Time for Entire Dataset: 0.2976357936859131

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06701397895812988
Cluster 1.0: 0.07907819747924805
Cluster 2.0: 0.07127618789672852

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.09779882431030273

NN MSE for Entire Dataset: 0.090673255427808

NN Training Times for Each Cluster:
Cluster 0.0: 0.03889298439025879
Cluster 1.0: 0.02607107162475586
Cluster 2.0: 0.010348081588745117
Cluster 3.0: 0.011230945587158203
Cluster 4.0: 0.02059793472290039
Cluster 5.0: 0.02447366714477539

NN MSE for Each Cluster:
Cluster 0.0: 0.0144972645927126
Cluster 1.0: 0.010953261692545626
Cluster 2.0: 0.007688366833954135
Cluster 3.0: 1.1512729710695335
Cluster 4.0: 0.035701673984432344
Cluster 5.0: 0.389184035533228

Average of all entries in nn_cluster_results:
MAE     0.140097
RMSE    0.366419
MAD     0.052598
dtype: float64

NN Entire Dataset: {'MAE': 0.0906840254263502, 'RMSE': 0.3011200017066419, 'MAD': 0.03507099651271284}
GP Training Time for Entire Dataset: 0.2731921672821045

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05631899833679199
Cluster 1.0: 0.07635307312011719
Cluster 2.0: 



NN Training Time for Entire Dataset: 0.13769793510437012

NN MSE for Entire Dataset: 0.09708078170548338

NN Training Times for Each Cluster:
Cluster 0.0: 0.010278940200805664
Cluster 1.0: 0.030811071395874023
Cluster 2.0: 0.018800020217895508
Cluster 3.0: 0.008821964263916016
Cluster 4.0: 0.020892858505249023
Cluster 5.0: 0.019566059112548828
Cluster 6.0: 0.010452985763549805
Cluster 7.0: 0.030890941619873047
Cluster 8.0: 0.028116941452026367

NN MSE for Each Cluster:
Cluster 0.0: 0.022902620314304057
Cluster 1.0: 0.00849370727582446
Cluster 2.0: 0.09404642865286755
Cluster 3.0: 0.0038624166803544633
Cluster 4.0: 0.5441598681366338
Cluster 5.0: 0.6678304528055365
Cluster 6.0: 0.0022163345968221684
Cluster 7.0: 0.006061314531031366
Cluster 8.0: 0.01047844582343493

Average of all entries in nn_cluster_results:
MAE     0.132204
RMSE    0.266055
MAD     0.066048
dtype: float64

NN Entire Dataset: {'MAE': 0.09451963695011098, 'RMSE': 0.31157789027060856, 'MAD': 0.04206172682101186}
GP Tra



NN Training Time for Entire Dataset: 0.11107492446899414

NN MSE for Entire Dataset: 0.09197138168595886

NN Training Times for Each Cluster:
Cluster 0.0: 0.07444190979003906
Cluster 1.0: 0.04569196701049805
Cluster 2.0: 0.07887005805969238

NN MSE for Each Cluster:
Cluster 0.0: 0.0169673432443433
Cluster 1.0: 0.0038332631080592416
Cluster 2.0: 0.30363312111607105

Average of all entries in nn_cluster_results:
MAE     0.081555
RMSE    0.247734
MAD     0.034588
dtype: float64

NN Entire Dataset: {'MAE': 0.09379977439556722, 'RMSE': 0.3032678381991056, 'MAD': 0.04431973766277757}
GP Training Time for Entire Dataset: 0.2933230400085449

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.07521891593933105
Cluster 1.0: 0.07739686965942383
Cluster 2.0: 0.06939005851745605

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MAE



NN Training Time for Entire Dataset: 0.13793492317199707

NN MSE for Entire Dataset: 0.09295925668008807

NN Training Times for Each Cluster:
Cluster 0.0: 0.0215761661529541
Cluster 1.0: 0.02561473846435547
Cluster 2.0: 0.014972209930419922
Cluster 3.0: 0.011586904525756836
Cluster 4.0: 0.02098679542541504
Cluster 5.0: 0.047621965408325195

NN MSE for Each Cluster:
Cluster 0.0: 0.01708350856117452
Cluster 1.0: 0.00990994713038725
Cluster 2.0: 0.0013776585268983096
Cluster 3.0: 1.2075618353285114
Cluster 4.0: 0.05867947435058715
Cluster 5.0: 0.3625712983878716

Average of all entries in nn_cluster_results:
MAE     0.142256
RMSE    0.368440
MAD     0.055884
dtype: float64

NN Entire Dataset: {'MAE': 0.09621603751386541, 'RMSE': 0.3048922050169339, 'MAD': 0.044802846856764424}
GP Training Time for Entire Dataset: 0.252061128616333

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.0507969856262207
Cluster 1.0: 0.0799100399017334
Cluster 2.0



NN Training Time for Entire Dataset: 0.14725804328918457

NN MSE for Entire Dataset: 0.09326148434220506

NN Training Times for Each Cluster:
Cluster 0.0: 0.008327960968017578
Cluster 1.0: 0.038471221923828125
Cluster 2.0: 0.02011585235595703
Cluster 3.0: 0.009088993072509766
Cluster 4.0: 0.019979000091552734
Cluster 5.0: 0.013897180557250977
Cluster 6.0: 0.013927936553955078
Cluster 7.0: 0.022003889083862305
Cluster 8.0: 0.02620220184326172

NN MSE for Each Cluster:
Cluster 0.0: 0.00718193723939361
Cluster 1.0: 0.007762951116344265
Cluster 2.0: 0.46265928989662886
Cluster 3.0: 0.031306424043340254
Cluster 4.0: 0.5791546055418675
Cluster 5.0: 0.6583145685049414
Cluster 6.0: 0.024970693141051947
Cluster 7.0: 0.008134244464326475
Cluster 8.0: 0.012040933023217869

Average of all entries in nn_cluster_results:
MAE     0.179139
RMSE    0.328923
MAD     0.104320
dtype: float64

NN Entire Dataset: {'MAE': 0.09103455436182306, 'RMSE': 0.30538743317662087, 'MAD': 0.03842907593553346}
GP Traini



NN Training Time for Entire Dataset: 0.19089102745056152

NN MSE for Entire Dataset: 0.0925006852728826

NN Training Times for Each Cluster:
Cluster 0.0: 0.08053302764892578
Cluster 1.0: 0.050987958908081055
Cluster 2.0: 0.05262899398803711

NN MSE for Each Cluster:
Cluster 0.0: 0.033482876301445696
Cluster 1.0: 0.006782061004887079
Cluster 2.0: 0.27554269893552424

Average of all entries in nn_cluster_results:
MAE     0.090522
RMSE    0.263419
MAD     0.033924
dtype: float64

NN Entire Dataset: {'MAE': 0.08722265144472573, 'RMSE': 0.30413925309450374, 'MAD': 0.031914092918574}
GP Training Time for Entire Dataset: 0.25373268127441406

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06295299530029297
Cluster 1.0: 0.07624197006225586
Cluster 2.0: 0.06653523445129395

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.1713259220123291

NN MSE for Entire Dataset: 0.0938724848904963

NN Training Times for Each Cluster:
Cluster 0.0: 0.028168916702270508
Cluster 1.0: 0.02257990837097168
Cluster 2.0: 0.012719869613647461
Cluster 3.0: 0.009774208068847656
Cluster 4.0: 0.02078390121459961
Cluster 5.0: 0.036820173263549805

NN MSE for Each Cluster:
Cluster 0.0: 0.008651122579965923
Cluster 1.0: 0.2882984164029927
Cluster 2.0: 0.005677676329487344
Cluster 3.0: 1.2456414809044047
Cluster 4.0: 0.021978916422265276
Cluster 5.0: 0.3453548104650564

Average of all entries in nn_cluster_results:
MAE     0.137585
RMSE    0.426217
MAD     0.047150
dtype: float64

NN Entire Dataset: {'MAE': 0.09674781026742867, 'RMSE': 0.3063861695483272, 'MAD': 0.04305947738895316}
GP Training Time for Entire Dataset: 0.2581818103790283

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05013012886047363
Cluster 1.0: 0.07184982299804688
Cluster 2



NN Training Time for Entire Dataset: 0.16064691543579102

NN MSE for Entire Dataset: 0.09615747201501464

NN Training Times for Each Cluster:
Cluster 0.0: 0.010468006134033203
Cluster 1.0: 0.030318737030029297
Cluster 2.0: 0.01893901824951172
Cluster 3.0: 0.011083126068115234
Cluster 4.0: 0.019548892974853516
Cluster 5.0: 0.014314889907836914
Cluster 6.0: 0.011955022811889648
Cluster 7.0: 0.02093195915222168
Cluster 8.0: 0.0253298282623291

NN MSE for Each Cluster:
Cluster 0.0: 0.006744393311377195
Cluster 1.0: 0.014977170079119773
Cluster 2.0: 0.07320620179730648
Cluster 3.0: 0.009513554055416136
Cluster 4.0: 0.5904875622029327
Cluster 5.0: 0.6676750344415533
Cluster 6.0: 0.017661531643940208
Cluster 7.0: 0.016397389349312886
Cluster 8.0: 0.01944249948405841

Average of all entries in nn_cluster_results:
MAE     0.144771
RMSE    0.284282
MAD     0.073914
dtype: float64

NN Entire Dataset: {'MAE': 0.09230802614441606, 'RMSE': 0.31009268294336556, 'MAD': 0.034860574020549984}
GP Trainin



NN Training Time for Entire Dataset: 0.2082669734954834

NN MSE for Entire Dataset: 0.09247413199129512

NN Training Times for Each Cluster:
Cluster 0.0: 0.06994175910949707
Cluster 1.0: 0.05002117156982422
Cluster 2.0: 0.08613705635070801

NN MSE for Each Cluster:
Cluster 0.0: 0.01577946832763754
Cluster 1.0: 0.006080795478546501
Cluster 2.0: 0.2970937517168951

Average of all entries in nn_cluster_results:
MAE     0.086260
RMSE    0.249553
MAD     0.040906
dtype: float64

NN Entire Dataset: {'MAE': 0.09375795525672022, 'RMSE': 0.3040955967969532, 'MAD': 0.03521673506614789}
GP Training Time for Entire Dataset: 0.27335572242736816

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06944394111633301
Cluster 1.0: 0.08002114295959473
Cluster 2.0: 0.07303285598754883

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MAE 



CodeCache: size=131072Kb used=45747Kb max_used=45763Kb free=85324Kb
 bounds [0x00000001099d8000, 0x000000010c6d8000, 0x00000001119d8000]
 total_blobs=15284 nmethods=14349 adapters=845
 compilation: disabled (not enough contiguous free space left)
Training time for entire dataset: 0.0007300376892089844

Linear Regression MSE for Entire Dataset: 0.10090998209080261

Linear Regression Training Times for Each Cluster:
Cluster 0.0: 0.0009911060333251953
Cluster 1.0: 0.0007328987121582031
Cluster 2.0: 0.0006253719329833984
Cluster 3.0: 0.0005941390991210938
Cluster 4.0: 0.0005891323089599609
Cluster 5.0: 0.0006158351898193359

Linear Regression MSE for Each Cluster:
Cluster 0.0: 0.001081374571046037
Cluster 1.0: 0.2899474065731582
Cluster 2.0: 0.00046725242156208416
Cluster 3.0: 1.2581623446505168
Cluster 4.0: 0.12072767446665389
Cluster 5.0: 0.4872585540951948

Average of all entries in lr_cluster_results:
MAE     0.143322
RMSE    0.460024
MAD     0.053070
dtype: float64

Linear Regression 



NN Training Time for Entire Dataset: 0.08148407936096191

NN MSE for Entire Dataset: 0.09593327405499581

NN Training Times for Each Cluster:
Cluster 0.0: 0.016382932662963867
Cluster 1.0: 0.02994704246520996
Cluster 2.0: 0.008491039276123047
Cluster 3.0: 0.008610963821411133
Cluster 4.0: 0.020972013473510742
Cluster 5.0: 0.04097104072570801

NN MSE for Each Cluster:
Cluster 0.0: 0.022364072025669986
Cluster 1.0: 0.286031411568918
Cluster 2.0: 0.0077113917246961275
Cluster 3.0: 1.1415964572647634
Cluster 4.0: 0.009366930863886316
Cluster 5.0: 0.354027244141391

Average of all entries in nn_cluster_results:
MAE     0.136489
RMSE    0.422070
MAD     0.042089
dtype: float64

NN Entire Dataset: {'MAE': 0.0973800393938474, 'RMSE': 0.30973097044854236, 'MAD': 0.04469803253877949}
GP Training Time for Entire Dataset: 0.24933505058288574

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05143308639526367
Cluster 1.0: 0.07558417320251465
Cluster



NN Training Time for Entire Dataset: 0.19357705116271973

NN MSE for Entire Dataset: 0.09569831641896887

NN Training Times for Each Cluster:
Cluster 0.0: 0.010727882385253906
Cluster 1.0: 0.027541160583496094
Cluster 2.0: 0.018546104431152344
Cluster 3.0: 0.011859893798828125
Cluster 4.0: 0.019928932189941406
Cluster 5.0: 0.013987064361572266
Cluster 6.0: 0.010319709777832031
Cluster 7.0: 0.014829635620117188
Cluster 8.0: 0.026280879974365234

NN MSE for Each Cluster:
Cluster 0.0: 0.009874875750957994
Cluster 1.0: 0.006249663211595774
Cluster 2.0: 0.06275925504799738
Cluster 3.0: 0.0026315523322476167
Cluster 4.0: 0.5878957227869809
Cluster 5.0: 0.6547789412753555
Cluster 6.0: 0.019692067559340463
Cluster 7.0: 0.011766289274735853
Cluster 8.0: 0.008804021744166064

Average of all entries in nn_cluster_results:
MAE     0.134767
RMSE    0.266534
MAD     0.068375
dtype: float64

NN Entire Dataset: {'MAE': 0.09698489892137716, 'RMSE': 0.30935144483090565, 'MAD': 0.04111300335290215}
GP Tr



NN Training Time for Entire Dataset: 0.19662189483642578

NN MSE for Entire Dataset: 0.09467716649413421

NN Training Times for Each Cluster:
Cluster 0.0: 0.06606674194335938
Cluster 1.0: 0.049043893814086914
Cluster 2.0: 0.060459136962890625

NN MSE for Each Cluster:
Cluster 0.0: 0.01976576315531197
Cluster 1.0: 0.00778612383621449
Cluster 2.0: 0.2728288123884529

Average of all entries in nn_cluster_results:
MAE     0.085788
RMSE    0.250387
MAD     0.037789
dtype: float64

NN Entire Dataset: {'MAE': 0.09680981012546871, 'RMSE': 0.3076965493698852, 'MAD': 0.04412228856666625}
GP Training Time for Entire Dataset: 0.23952484130859375

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06836199760437012
Cluster 1.0: 0.07457804679870605
Cluster 2.0: 0.07001399993896484

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.1695570945739746

NN MSE for Entire Dataset: 0.09556623429268535

NN Training Times for Each Cluster:
Cluster 0.0: 0.024075984954833984
Cluster 1.0: 0.028390884399414062
Cluster 2.0: 0.008023977279663086
Cluster 3.0: 0.009727001190185547
Cluster 4.0: 0.020483970642089844
Cluster 5.0: 0.023192167282104492

NN MSE for Each Cluster:
Cluster 0.0: 0.008609897877673444
Cluster 1.0: 0.2993543529638848
Cluster 2.0: 0.005678088541739518
Cluster 3.0: 1.1471179434577274
Cluster 4.0: 0.018696170172175466
Cluster 5.0: 0.33973374155905856

Average of all entries in nn_cluster_results:
MAE     0.141565
RMSE    0.417652
MAD     0.056653
dtype: float64

NN Entire Dataset: {'MAE': 0.09223142977856778, 'RMSE': 0.30913788880155946, 'MAD': 0.0358721551783602}
GP Training Time for Entire Dataset: 0.22802519798278809

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.054653167724609375
Cluster 1.0: 0.06860589981079102
Clu



NN Training Time for Entire Dataset: 0.1443650722503662

NN MSE for Entire Dataset: 0.09238062136521624

NN Training Times for Each Cluster:
Cluster 0.0: 0.008772850036621094
Cluster 1.0: 0.05883526802062988
Cluster 2.0: 0.01646709442138672
Cluster 3.0: 0.008629798889160156
Cluster 4.0: 0.0202329158782959
Cluster 5.0: 0.02403569221496582
Cluster 6.0: 0.01149606704711914
Cluster 7.0: 0.02952718734741211
Cluster 8.0: 0.027128219604492188

NN MSE for Each Cluster:
Cluster 0.0: 0.009690115073595988
Cluster 1.0: 0.004253733618653885
Cluster 2.0: 0.44303888857908447
Cluster 3.0: 0.006894276697265464
Cluster 4.0: 0.5306066224255701
Cluster 5.0: 0.6374430374341539
Cluster 6.0: 0.0029042176810215006
Cluster 7.0: 0.010860287966144494
Cluster 8.0: 0.008659736725125142

Average of all entries in nn_cluster_results:
MAE     0.162829
RMSE    0.298921
MAD     0.086338
dtype: float64

NN Entire Dataset: {'MAE': 0.08573106486129578, 'RMSE': 0.30394180588595615, 'MAD': 0.030584703355865273}
GP Training 



NN Training Time for Entire Dataset: 0.2575571537017822

NN MSE for Entire Dataset: 0.09307466712521556

NN Training Times for Each Cluster:
Cluster 0.0: 0.06769084930419922
Cluster 1.0: 0.04693770408630371
Cluster 2.0: 0.04757189750671387

NN MSE for Each Cluster:
Cluster 0.0: 0.027909468773800048
Cluster 1.0: 0.005458834519119369
Cluster 2.0: 0.2721033246257131

Average of all entries in nn_cluster_results:
MAE     0.089938
RMSE    0.254193
MAD     0.036693
dtype: float64

NN Entire Dataset: {'MAE': 0.09637266112261612, 'RMSE': 0.3050814106516743, 'MAD': 0.040467285975480285}
GP Training Time for Entire Dataset: 0.21960783004760742

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06360793113708496
Cluster 1.0: 0.07480788230895996
Cluster 2.0: 0.06962323188781738

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.2166903018951416

NN MSE for Entire Dataset: 0.09261710665249928

NN Training Times for Each Cluster:
Cluster 0.0: 0.016836881637573242
Cluster 1.0: 0.02224898338317871
Cluster 2.0: 0.01632523536682129
Cluster 3.0: 0.007864713668823242
Cluster 4.0: 0.020617961883544922
Cluster 5.0: 0.05298495292663574

NN MSE for Each Cluster:
Cluster 0.0: 0.01002265036595691
Cluster 1.0: 0.2949728364314378
Cluster 2.0: 0.004642056688248701
Cluster 3.0: 1.263561825881984
Cluster 4.0: 0.02319783897389114
Cluster 5.0: 0.03256926343689613

Average of all entries in nn_cluster_results:
MAE     0.138937
RMSE    0.361370
MAD     0.056871
dtype: float64

NN Entire Dataset: {'MAE': 0.0904465882797642, 'RMSE': 0.3043305877701078, 'MAD': 0.029432378007715823}
GP Training Time for Entire Dataset: 0.23535990715026855

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05173802375793457
Cluster 1.0: 0.07815814018249512
Cluster 2.



NN Training Time for Entire Dataset: 0.10970187187194824

NN MSE for Entire Dataset: 0.09526241857000643

NN Training Times for Each Cluster:
Cluster 0.0: 0.024091005325317383
Cluster 1.0: 0.029221057891845703
Cluster 2.0: 0.013184070587158203
Cluster 3.0: 0.013222932815551758
Cluster 4.0: 0.02070903778076172
Cluster 5.0: 0.03732895851135254
Cluster 6.0: 0.01645803451538086
Cluster 7.0: 0.012768030166625977
Cluster 8.0: 0.007903814315795898

NN MSE for Each Cluster:
Cluster 0.0: 0.007084709050479241
Cluster 1.0: 0.017870549675205617
Cluster 2.0: 0.006730074151583901
Cluster 3.0: 0.06376681617918618
Cluster 4.0: 0.5223876881992702
Cluster 5.0: 0.4209857964189352
Cluster 6.0: 0.07171497277126779
Cluster 7.0: 0.005587890462549501
Cluster 8.0: 0.1018563035429269

Average of all entries in nn_cluster_results:
MAE     0.176334
RMSE    0.287301
MAD     0.104243
dtype: float64

NN Entire Dataset: {'MAE': 0.09759332419728395, 'RMSE': 0.3086461057100939, 'MAD': 0.04181085109912828}
GP Training T



NN Training Time for Entire Dataset: 0.14316105842590332

NN MSE for Entire Dataset: 0.09591161380628531

NN Training Times for Each Cluster:
Cluster 0.0: 0.0627589225769043
Cluster 1.0: 0.03210282325744629
Cluster 2.0: 0.03321027755737305

NN MSE for Each Cluster:
Cluster 0.0: 0.020808997233970648
Cluster 1.0: 0.004441810745588661
Cluster 2.0: 0.2741034901871675

Average of all entries in nn_cluster_results:
MAE     0.088042
RMSE    0.244816
MAD     0.039086
dtype: float64

NN Entire Dataset: {'MAE': 0.10072429562779442, 'RMSE': 0.3096960022445968, 'MAD': 0.046094390758509574}
GP Training Time for Entire Dataset: 0.26409006118774414

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.06590104103088379
Cluster 1.0: 0.07212495803833008
Cluster 2.0: 0.06694602966308594

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
MA



NN Training Time for Entire Dataset: 0.17261910438537598

NN MSE for Entire Dataset: 0.09194630276007805

NN Training Times for Each Cluster:
Cluster 0.0: 0.02468395233154297
Cluster 1.0: 0.0235140323638916
Cluster 2.0: 0.008891820907592773
Cluster 3.0: 0.00846409797668457
Cluster 4.0: 0.020900964736938477
Cluster 5.0: 0.04294562339782715

NN MSE for Each Cluster:
Cluster 0.0: 0.021071263691681955
Cluster 1.0: 0.3017373349907112
Cluster 2.0: 0.005948113954946404
Cluster 3.0: 1.1250866044723968
Cluster 4.0: 0.03423862643411491
Cluster 5.0: 0.022967098112073068

Average of all entries in nn_cluster_results:
MAE     0.140756
RMSE    0.361479
MAD     0.062569
dtype: float64

NN Entire Dataset: {'MAE': 0.09116869011401672, 'RMSE': 0.30322648756346804, 'MAD': 0.039550878480087134}
GP Training Time for Entire Dataset: 0.20784497261047363

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.04761910438537598
Cluster 1.0: 0.06751108169555664
Cluste



NN Training Time for Entire Dataset: 0.20005393028259277

NN MSE for Entire Dataset: 0.08897527183126995

NN Training Times for Each Cluster:
Cluster 0.0: 0.028613805770874023
Cluster 1.0: 0.02549290657043457
Cluster 2.0: 0.011478185653686523
Cluster 3.0: 0.010677814483642578
Cluster 4.0: 0.01962113380432129
Cluster 5.0: 0.024445772171020508
Cluster 6.0: 0.016633033752441406
Cluster 7.0: 0.020212888717651367
Cluster 8.0: 0.011778116226196289

NN MSE for Each Cluster:
Cluster 0.0: 0.01013647384633293
Cluster 1.0: 0.016388441131156662
Cluster 2.0: 0.004625847705085985
Cluster 3.0: 0.045462241431993664
Cluster 4.0: 0.2584660655358327
Cluster 5.0: 0.42053521555325046
Cluster 6.0: 0.08735028311109197
Cluster 7.0: 0.007546540151327609
Cluster 8.0: 0.007201129458189866

Average of all entries in nn_cluster_results:
MAE     0.137120
RMSE    0.237121
MAD     0.087204
dtype: float64

NN Entire Dataset: {'MAE': 0.09357149846435109, 'RMSE': 0.2982872304193895, 'MAD': 0.03876646655836827}
GP Traini



NN Training Time for Entire Dataset: 0.10619926452636719

NN MSE for Entire Dataset: 0.09326583318777093

NN Training Times for Each Cluster:
Cluster 0.0: 0.07014584541320801
Cluster 1.0: 0.04541206359863281
Cluster 2.0: 0.05126380920410156

NN MSE for Each Cluster:
Cluster 0.0: 0.013965884757619238
Cluster 1.0: 0.005101279356758137
Cluster 2.0: 0.28055503282671196

Average of all entries in nn_cluster_results:
MAE     0.083836
RMSE    0.239758
MAD     0.039974
dtype: float64

NN Entire Dataset: {'MAE': 0.09771186715637345, 'RMSE': 0.30539455330403475, 'MAD': 0.04721031301016565}
GP Training Time for Entire Dataset: 0.20613598823547363

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.0993809700012207
Cluster 1.0: 0.07301902770996094
Cluster 2.0: 0.06634902954101562

GP MSE for Each Cluster:
Cluster 0.0: 0.09015149623155594
Cluster 1.0: 0.00502082658931613
Cluster 2.0: 0.26126882433891296

Average of all entries in gp_cluster_results:
M



NN Training Time for Entire Dataset: 0.2068619728088379

NN MSE for Entire Dataset: 0.09202789574383008

NN Training Times for Each Cluster:
Cluster 0.0: 0.02764272689819336
Cluster 1.0: 0.02589106559753418
Cluster 2.0: 0.014191865921020508
Cluster 3.0: 0.010689973831176758
Cluster 4.0: 0.020751237869262695
Cluster 5.0: 0.049748897552490234

NN MSE for Each Cluster:
Cluster 0.0: 0.02143884343807723
Cluster 1.0: 0.29365963816789026
Cluster 2.0: 0.002155669891781199
Cluster 3.0: 1.1534517470961598
Cluster 4.0: 0.045407746230423915
Cluster 5.0: 0.019456032513713886

Average of all entries in nn_cluster_results:
MAE     0.135949
RMSE    0.360220
MAD     0.050020
dtype: float64

NN Entire Dataset: {'MAE': 0.09523690003842664, 'RMSE': 0.3033609990487078, 'MAD': 0.040913743323538476}
GP Training Time for Entire Dataset: 0.19430017471313477

GP MSE for Entire Dataset: 0.07901941984891891

GP Training Times for Each Cluster:
Cluster 0.0: 0.05358004570007324
Cluster 1.0: 0.06697988510131836
Clus



NN Training Time for Entire Dataset: 0.2017810344696045

NN MSE for Entire Dataset: 0.09246715941368494

NN Training Times for Each Cluster:
Cluster 0.0: 0.024064064025878906
Cluster 1.0: 0.024164915084838867
Cluster 2.0: 0.008964061737060547
Cluster 3.0: 0.008326053619384766
Cluster 4.0: 0.019514799118041992
Cluster 5.0: 0.035622358322143555
Cluster 6.0: 0.01651620864868164
Cluster 7.0: 0.03130221366882324
Cluster 8.0: 0.007004261016845703

NN MSE for Each Cluster:
Cluster 0.0: 0.021698807220846477
Cluster 1.0: 0.007941364394007475
Cluster 2.0: 0.009397724141012018
Cluster 3.0: 0.0670971696550193
Cluster 4.0: 0.30699184089410775
Cluster 5.0: 0.3872675901809409
Cluster 6.0: 0.055424224295458374
Cluster 7.0: 0.01667754231809662
Cluster 8.0: 0.0034908879451255336

Average of all entries in nn_cluster_results:
MAE     0.141876
RMSE    0.243602
MAD     0.087660
dtype: float64

NN Entire Dataset: {'MAE': 0.09409803861376734, 'RMSE': 0.30408413213070645, 'MAD': 0.04360653214217414}
GP Traini



NN Training Time for Entire Dataset: 2.119266986846924

NN MSE for Entire Dataset: 0.07761708678448369

NN Training Times for Each Cluster:
Cluster 0.0: 0.511699914932251
Cluster 1.0: 0.6151707172393799
Cluster 2.0: 0.7671592235565186

NN MSE for Each Cluster:
Cluster 0.0: 0.013992901293762664
Cluster 1.0: 0.06749415783684175
Cluster 2.0: 0.05793038292377621

Average of all entries in nn_cluster_results:
MAE     0.062712
RMSE    0.206258
MAD     0.024669
dtype: float64

NN Entire Dataset: {'MAE': 0.07446118010743519, 'RMSE': 0.2785984328464245, 'MAD': 0.030485969309576616}
GP Training Time for Entire Dataset: 9.424957990646362

GP MSE for Entire Dataset: 0.06502759456634521

GP Training Times for Each Cluster:
Cluster 0.0: 1.1445198059082031
Cluster 1.0: 2.2439639568328857
Cluster 2.0: 1.864781141281128

GP MSE for Each Cluster:
Cluster 0.0: 0.02290000207722187
Cluster 1.0: 0.05838930979371071
Cluster 2.0: 0.03763936087489128

Average of all entries in gp_cluster_results:
MAE     0.060



NN Training Time for Entire Dataset: 1.8948681354522705

NN MSE for Entire Dataset: 0.08997616063550794

NN Training Times for Each Cluster:
Cluster 0.0: 0.2546720504760742
Cluster 1.0: 0.12215590476989746
Cluster 2.0: 0.271543025970459
Cluster 3.0: 0.2690308094024658
Cluster 4.0: 0.08418989181518555
Cluster 5.0: 0.05862307548522949

NN MSE for Each Cluster:
Cluster 0.0: 0.06510874213516042
Cluster 1.0: 0.22846653200249906
Cluster 2.0: 0.027926060624279443
Cluster 3.0: 0.0679570015233908
Cluster 4.0: 0.12668289984187125
Cluster 5.0: 0.016276510409034262

Average of all entries in nn_cluster_results:
MAE     0.078471
RMSE    0.274075
MAD     0.032228
dtype: float64

NN Entire Dataset: {'MAE': 0.07370750577684347, 'RMSE': 0.2999602650944087, 'MAD': 0.02384816191490488}
GP Training Time for Entire Dataset: 9.233605146408081

GP MSE for Entire Dataset: 0.06504029780626297

GP Training Times for Each Cluster:
Cluster 0.0: 1.3206112384796143
Cluster 1.0: 1.3362438678741455
Cluster 2.0: 0.300

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/ekinokos2/anaconda3/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/ekinokos2/anaconda3/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/ekinokos2/anaconda3/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Training time for entire dataset: 0.5943160057067871

Gradient Boosting MSE for Entire Dataset: 0.03812300038865588

Gradient Boosting Training Times for Each Cluster:
Cluster 0.0: 0.14795589447021484
Cluster 1.0: 0.15158796310424805
Cluster 2.0: 0.07992100715637207
Cluster 3.0: 0.06630969047546387
Cluster 4.0: 0.04640483856201172
Cluster 5.0: 0.06294727325439453

Gradient Boosting MSE for Each Cluster:
Cluster 0.0: 0.06274962066655296
Cluster 1.0: 0.03665499167251396
Cluster 2.0: 0.16071195685049947
Cluster 3.0: 0.023243791335450972
Cluster 4.0: 0.18412811294053802
Cluster 5.0: 0.011363681478177559

Average of all entries in gb_cluster_results:
MAE     0.052038
RMSE    0.255167
MAD     0.019330
dtype: float64

Gradient Boosting Entire Dataset: {'MAE': 0.046026849144372294, 'RMSE': 0.19525112135057224, 'MAD': 0.022916511687969987}

N = 10000, Norm = 1, Lambda = 0.1, Gamma = 0.5, K = 9



KeyboardInterrupt: 

In [58]:
# Average the linear-regression cluster training times: lay the
# lr_cluster_times mapping out as a single row (index 0), then take the mean
# along that row.
pd.DataFrame(data=lr_cluster_times, index=[0]).mean(axis="columns")

0    0.000942
dtype: float64

In [63]:
# Median of the 'MAE' row of the linear-regression cluster results table.
lr_cluster_results_pd.loc['MAE'].median()

0.08853538707805389

In [38]:
# Look up the 'MAE' entry of the linear-regression entire-dataset results.
lr_entire_results['MAE']

0.07572404595198262

In [36]:
# Show lr_entire_results as a one-row table (index 0) whose metric columns
# are renamed with an "LR_Entire_" prefix.
pd.DataFrame(lr_entire_results, index=[0]).rename(
    columns={
        "MAE": "LR_Entire_MAE",
        "RMSE": "LR_Entire_RMSE",
        "MAD": "LR_Entire_MAD",
    }
)

Unnamed: 0,LR_Entire_MAE,LR_Entire_RMSE,LR_Entire_MAD
0,0.075724,0.317663,0.045674


In [None]:
# Collect the settings of one run (N, Norm, Lambda, Gamma, K) together with the
# per-model outputs into a single nested dictionary, then append that record
# to the results file.
#
# This cell only reads names (n, norm, lam, gam, k and the lr_/rf_/nn_/gp_/gb_
# variables); none of them are defined here, so they must already exist in the
# kernel when the cell runs.
#
# The whole cell is kept at top-level indentation: with the statements after
# the dict literal indented deeper than `results = {`, Python rejects the cell
# with "IndentationError: unexpected indent".
results = {
    "N": n,
    "Norm": norm,
    "Lambda": lam,
    "Gamma": gam,
    "K": k,
    "Linear Regression": {
        "Training Time": lr_time,
        "MSE Entire Dataset": lr_entire_mse,
        "MSE Per Cluster": linear_regression_mse,
        "Cluster Training Times": lr_cluster_times,
        "Cluster Results": lr_cluster_results,
        "Entire Dataset Results": lr_entire_results,
    },
    "Random Forest": {
        "Training Time": rf_time,
        "MSE Entire Dataset": rf_entire_mse,
        "MSE Per Cluster": random_forest_mse,
        "Cluster Training Times": rf_cluster_times,
        "Cluster Results": rf_cluster_results,
        "Entire Dataset Results": rf_entire_results,
    },
    "Neural Network": {
        "Training Time": nn_time,
        # NOTE(review): the other models use "<prefix>_entire_mse" for this
        # entry; confirm that `nn_mse` is the intended variable.
        "MSE Entire Dataset": nn_mse,
        "MSE Per Cluster": neural_network_mse,
        "Cluster Training Times": nn_cluster_times,
        "Cluster Results": nn_cluster_results,
        "Entire Dataset Results": nn_entire_results,
    },
    "Gaussian Process": {
        "Training Time": gp_time,
        "MSE Entire Dataset": gp_entire_mse,
        "MSE Per Cluster": gaussian_process_mse,
        "Cluster Training Times": gp_cluster_times,
        "Cluster Results": gp_cluster_results,
        "Entire Dataset Results": gp_entire_results,
    },
    "Gradient Boosting": {
        "Training Time": gb_time,
        "MSE Entire Dataset": gb_entire_mse,
        "MSE Per Cluster": gradient_boosting_mse,
        "Cluster Training Times": gb_cluster_times,
        "Cluster Results": gb_cluster_results,
        "Entire Dataset Results": gb_entire_results,
    },
}

# Save the results to a file: append the dict's text form followed by a blank
# line, so that successive records are separated from each other.
# NOTE(review): str(results) writes a Python repr (single quotes, float keys),
# which is not valid JSON; if the file is meant to be read back with the json
# module, consider writing json.dumps(results) here instead.
with open("sensitivity_results.txt", "a") as f:
    f.write(str(results))
    f.write("\n")
    f.write("\n")

In [27]:
# Display the results dictionary currently held in the kernel.
results

{'N': 1000,
 'Norm': 1,
 'Lambda': 0.1,
 'Gamma': 0.5,
 'K': 3,
 'Linear Regression': {'Training Time': 0.0008699893951416016,
  'MSE Entire Dataset': 0.10090998209080261,
  'MSE Per Cluster': {0.0: 0.012722107247752275,
   1.0: 0.0018078346424578835,
   2.0: 0.2917196343294968},
  'Cluster Training Times': {0.0: 0.0010688304901123047,
   1.0: 0.0011599063873291016,
   2.0: 0.0009622573852539062},
  'Cluster Results': {0.0: {'MAE': 0.08853538707805389,
    'RMSE': 0.1127923190990959,
    'MAD': 0.07357154908074431},
   1.0: {'MAE': 0.03035321666698034,
    'RMSE': 0.04251863876534482,
    'MAD': 0.025996888081637914},
   2.0: {'MAE': 0.1384634792018459,
    'RMSE': 0.5401107611680189,
    'MAD': 0.06972040358351161}},
  'Entire Dataset Results': {'MAE': 0.07572404595198262,
   'RMSE': 0.3176633156201745,
   'MAD': 0.045674406410488144}},
 'Random Forest': {'Training Time': 0.19759082794189453,
  'MSE Entire Dataset': 0.0009520665756600807,
  'MSE Per Cluster': {0.0: 0.00255930852078014

In [22]:
# Load the results as a dictionary
results = {}
with open("sensitivity_results.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line:
            line = line.replace("'", "\"")
            line = line.replace("True", "true")
            line = line.replace("False", "false")
            line = line.replace("None", "null")
            line = line.replace("nan", "null")
            line = line.replace("inf", "null")
            line = line.replace("-inf", "null")
            line = line.replace("array", "null")
            line = line.replace("dtype", "null")
            line = line.replace("float64", "null")
            line = line.replace("int64", "null")
            line = line.replace("object", "null")
            line = line.replace("int32", "null")
            line = line.replace("float32", "null")
            line = line.replace("int16", "null")
            line = line.replace("float16", "null")
            line = line.replace("int8", "null")
            line = line.replace("float8", "null")
            line = line.replace("int4", "null")
            line = line.replace("float4", "null")
            line = line.replace("int2", "null")
            line = line.replace("float2", "null")
            line = line.replace("int1", "null")
            line = line.replace("float1", "null")
            line = line.replace("int0", "null")
            line = line.replace("float0", "null")
            line = line.replace("int", "null")
            line = line.replace("float", "null")
            line = line.replace("null", "null")
            line = line.replace(":", ": ")
            line = line.replace(",", ", ")
            line = line.replace("{", "{ ")
            line = line.replace("}", " }")
            line = line.replace("[", "[ ")
            line = line.replace("]", " ]")
            line = line.replace("  ", " ")

            line = line.replace("Training Time", "\"Training Time\"")
            line = line.replace("MSE Entire Dataset", "\"MSE Entire Dataset\"")
            line = line.replace("MSE Per Cluster", "\"MSE Per Cluster\"



In [24]:
# Display `results` again; the output saved with this cell is an empty dict.
results

{}