In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LeakyReLU
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression, RidgeCV
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
# Import Data
# Read the four pre-split arrays from disk and cast each one to float64.
# NOTE(review): the directory is spelled both "VAE_Data" and "VAE_data" in the
# paths below; on a case-sensitive filesystem only one spelling can exist —
# confirm which is correct and unify.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files from a trusted source.
def _load_float64(path):
    """Load a .npy file (pickled object arrays permitted) and return it as float64."""
    return np.load(path, allow_pickle=True).astype(np.float64)


X_train_np = _load_float64("../VAE_Data/X_train.npy")
X_test_np = _load_float64("../VAE_data/X_test.npy")
y_train_np = _load_float64("../VAE_data/y_train.npy")
y_test_np = _load_float64("../VAE_Data/y_test.npy")

# Last expression of the cell: display the shapes of the loaded arrays.
X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape

((14196, 2640), (1586, 2640), (14196, 1890), (1586, 1890))

In [18]:
# Pool the provided train and test arrays so they can be re-split below.
X_combined = np.vstack((X_train_np, X_test_np))
y_combined = np.vstack((y_train_np, y_test_np))

# Column layout used by the scaling below.
# NOTE(review): columns before ORDINAL_DAY_COL are passed through unscaled,
# presumably because they are already encoded — confirm against the data prep.
ORDINAL_DAY_COL = 748          # min-max scaled on its own
FIRST_STANDARDIZED_COL = 749   # this column and everything after it is standardized

# Split FIRST, scale SECOND.
# Fitting the scalers on the pooled data would let test-set statistics (mean,
# std, min, max) leak into the training features and targets, which makes the
# test metrics optimistic. An integer test_size requests an exact number of test
# rows, so the split sizes match the original arrays without any trimming, and
# the fixed random_state makes the shuffle reproducible across kernel restarts.
n_test = len(X_test_np)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_combined, y_combined, test_size=n_test, shuffle=True, random_state=42
)

# Work on copies so the raw splits stay available and the cell is idempotent.
X_train_scaled = X_train_raw.copy()
X_test_scaled = X_test_raw.copy()

# Standardize the trailing feature columns using training statistics only.
scaler = StandardScaler()
X_train_scaled[:, FIRST_STANDARDIZED_COL:] = scaler.fit_transform(
    X_train_raw[:, FIRST_STANDARDIZED_COL:]
)
X_test_scaled[:, FIRST_STANDARDIZED_COL:] = scaler.transform(
    X_test_raw[:, FIRST_STANDARDIZED_COL:]
)

# Min-max scale the single ordinal-day column (fit on train, apply to test).
# Test values outside the training range will fall outside [0, 1].
ordinal_days_scaler = MinMaxScaler()
X_train_scaled[:, ORDINAL_DAY_COL] = ordinal_days_scaler.fit_transform(
    X_train_raw[:, [ORDINAL_DAY_COL]]
).ravel()
X_test_scaled[:, ORDINAL_DAY_COL] = ordinal_days_scaler.transform(
    X_test_raw[:, [ORDINAL_DAY_COL]]
).ravel()

# Scale y values (again: fit on train, apply to test).
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train_raw)
y_test_scaled = y_scaler.transform(y_test_raw)

# Invariants: the re-split must have exactly the same shapes as the input arrays.
assert X_train_scaled.shape == X_train_np.shape
assert X_test_scaled.shape == X_test_np.shape
assert y_train_scaled.shape == y_train_np.shape
assert y_test_scaled.shape == y_test_np.shape

In [19]:
# Sanity check: display the shapes of the scaled train/test feature and target arrays.
X_train_scaled.shape, X_test_scaled.shape, y_train_scaled.shape, y_test_scaled.shape

((14196, 2640), (1586, 2640), (14196, 1890), (1586, 1890))

In [20]:
# Define the models
# `models` maps a display name to an unfitted estimator; every active entry is
# fitted and scored by the evaluation loop. Commented-out entries are candidates
# that are currently disabled.
# `lasso` and `elastic_net` are base estimators for the (disabled) multi-output
# wrappers below.
lasso = Lasso(max_iter=10000)
elastic_net = ElasticNet(max_iter=10000)
models = {
    #'KNN': KNeighborsRegressor(),
    #'Lasso': MultiOutputRegressor(lasso),
    #'ElasticNet': MultiOutputRegressor(elastic_net),
    #'LinearRegression': LinearRegression(),
    #'RidgeCV': RidgeCV(),
    #'ExtraTrees': ExtraTreesRegressor(),
    # MultiOutputRegressor fits one independent booster per target column.
    'GradientBoosting': MultiOutputRegressor(GradientBoostingRegressor(
        validation_fraction=0.1,
        verbose=1,  # NOTE(review): prints a progress table per fitted booster; consider 0 for shared notebooks
        n_iter_no_change=50,
        subsample=0.9,
        # subsample < 1.0 and the early-stopping validation split are both
        # random; pin the seed so results are reproducible across runs.
        random_state=42
    )),
    #'RandomForest': RandomForestRegressor(),
    #'AdaBoost': MultiOutputRegressor(AdaBoostRegressor()),
    #'DecisionTree': DecisionTreeRegressor()
}

In [21]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test, cv=5):
    """Fit `model`, score it on the held-out set, and cross-validate it on the training set.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_train, y_train)``.
    X_train, y_train : array-like
        Training features and targets.
    X_test, y_test : array-like
        Held-out features and targets used for the MSE / R2 scores.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``. Lower it to cut the cost of
        expensive models; the default keeps the previous 5-fold behaviour.

    Returns
    -------
    tuple of (mse, r2, cv_mse)
        ``mse`` and ``r2`` are computed on the test set; ``cv_mse`` is the mean
        cross-validated MSE on the training set (sign flipped back to positive).
    """
    # Fit on the full training set and predict the held-out set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Held-out metrics.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validated MSE on the training data. The scorer returns *negative*
    # MSE (higher is better), so the mean is negated before returning.
    cv_scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'
    )

    return mse, r2, -cv_scores.mean()

In [None]:
# Run and evaluate all models
# `results` maps each model name to a dict with its 'MSE', 'R2' and 'CV_MSE'.
results = {}

for name in models:
    model = models[name]
    print(f"Evaluating {name}...")
    mse, r2, cv_mse = evaluate_model(
        model,
        X_train=X_train_scaled,
        y_train=y_train_scaled,
        X_test=X_test_scaled,
        y_test=y_test_scaled,
    )
    results[name] = dict(MSE=mse, R2=r2, CV_MSE=cv_mse)
    print(f"{name} - MSE: {mse}, R2: {r2}, CV MSE: {cv_mse}")

Evaluating GradientBoosting...
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9757           0.3053           13.22s
         2           0.8988           1.2047           13.23s
         3           0.7280           0.0001           13.08s
         4           0.1127          -4.8169           12.90s
         5           0.5249           4.8169           12.77s
         6           0.4252           0.0000           12.62s
         7           0.3224          -0.1980           12.49s
         8           0.2790           0.1981           12.34s
         9           0.2013          -0.2208           12.20s
        10           0.1118          -0.4199           12.06s
        20           0.0223           0.0962           14.31s
        30           0.0027           0.0019           14.22s
        40           0.0003           0.0017           12.88s
        50           0.0000           0.0000           10.96s
      Iter       Train Loss      OOB I

In [None]:
# Print summary of results
# One stanza per model: the name, its three metrics, then a blank separator line.
print("\nSummary of Results:")
for name, metrics in results.items():
    report = [
        f"{name}:",
        f"  MSE: {metrics['MSE']}",
        f"  R2: {metrics['R2']}",
        f"  CV MSE: {metrics['CV_MSE']}",
        "",  # trailing empty entry yields the blank line between models
    ]
    print("\n".join(report))

In [None]:
# Find the best model based on CV MSE
# Pick the (name, metrics) pair with the smallest cross-validated MSE and keep its name.
best_model = min(results.items(), key=lambda entry: entry[1]['CV_MSE'])[0]
print(f"Best model based on Cross-Validation MSE: {best_model}")