In [14]:
import sys

sys.path.append('../')

import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

from xgboost import XGBRegressor

from config.regressors import VotingRegressor, StackingRegressor, NNRegressor

from config.models import ConvNN

from config.loss_functions import RMSELoss

import pyriemann
import pyriemann.regression

from config.transformers import TimeDomainTransformer, TimeWindowTransformer, LabelWindowExtractor, WaveletFeatureTransformer
from config.validation import RMSE, NMSE, cross_validate_pipeline, cross_validate_NN

# Models

### Baseline models

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA

# Combine features
# Define your base feature extractors
time_feat = ('time_features', TimeDomainTransformer(sigma_mpr=0.3))
wavelet_feat = ('wavelet_features', WaveletFeatureTransformer())

combined_features = FeatureUnion([
    time_feat,
    wavelet_feat
])

# Wrap with session-wise transformer
sessionwise_combined = SessionwiseTransformer(combined_features)

baseline_guided_kr = Pipeline([
    ('feature_extraction', sessionwise_combined),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=float(0.9), random_state=42)),
    ('regressor', KernelRidge(
        alpha=0.01,
        gamma=0.01,
        kernel='rbf'
    ))
])

baseline_guided_knn = Pipeline([
    ('feature_extraction', sessionwise_combined),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=float(0.9), random_state=42)),
    ('regressor', KNeighborsRegressor(n_neighbors=5))
])

baseline_guided_rf = Pipeline([
    ('feature_extraction', sessionwise_combined),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=float(0.9), random_state=42)),
    ('regressor', RandomForestRegressor(n_estimators=50, max_depth=10))
])

In [7]:
baseline_guided_kr = Pipeline(
    [
        ('feature_extraction', TimeDomainTransformer(sigma_mpr=0.3)),
        ('scaler', StandardScaler()),
        ('regressor', KernelRidge(
            alpha = 0.01,
            gamma = 0.01,
            kernel='laplacian'))
    ]
)

baseline_guided_knn = Pipeline(
    [
        ('feature_extraction', TimeDomainTransformer(sigma_mpr=0.3)),
        ('scaler', StandardScaler()),
        ('regressor', KNeighborsRegressor(
            n_neighbors = 7))
    ]
)

baseline_guided_rf = Pipeline(
    [
        ('feature_extraction', TimeDomainTransformer(sigma_mpr=0.3)),
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(
            n_estimators = 50,
            max_depth = 10))
    ]
)

timedomain_xgboost = Pipeline(
    [
        ('feature_extraction', TimeDomainTransformer(sigma_mpr=0.3)),
        ('scaler', StandardScaler()),
        ('regressor', XGBRegressor(
            n_estimators = 100,
            max_depth = 5,
            learning_rate = 0.1,
            objective='reg:squarederror',
            n_jobs=-1,
            verbosity=1
        ))
    ]
)

### Riemannian models

In [8]:
# Riemannian geometry of covariance matrices
riem1 = Pipeline(
    [
        ('feature_extraction', pyriemann.estimation.Covariances()),
        ('transformation', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('scaler', StandardScaler()),
        ('regressor', KernelRidge(
            alpha = 0.01,
            gamma = 0.01,
            kernel='laplacian'))
    ]
)

riem2 = Pipeline(
    [
        ('feature_extraction', pyriemann.estimation.Covariances()),
        ('transformation', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('scaler', StandardScaler()),
        ('regressor', KNeighborsRegressor(
            n_neighbors = 7))
    ]
)

riem3 = Pipeline(
    [
        ('feature_extraction', pyriemann.estimation.Covariances()),
        ('transformation', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor(
            n_estimators = 50,
            max_depth = 10))
    ]
)

riem4 = Pipeline(
    [
        ('feature_extraction', pyriemann.estimation.Covariances()),
        ('transformation', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('scaler', StandardScaler()),
        ('regressor', XGBRegressor(
            n_estimators = 100,
            max_depth = 5,
            learning_rate = 0.1,
            objective='reg:squarederror',
            n_jobs=-1,
            verbosity=1
        ))
    ]
)

### Ensemble models

In [None]:
voting_estimator = VotingRegressor(
    estimators = [
        baseline_guided_kr,
        baseline_guided_knn,
        baseline_guided_rf,
        timedomain_xgboost,
        riem1,
        riem2,
        riem3,
        riem4
    ]
)

stacking_estimator = StackingRegressor(
    estimators = [
        baseline_guided_kr,
        # baseline_guided_knn,
        # baseline_guided_rf,
        timedomain_xgboost,
        # riem1,
        # riem2,
        # riem3,
        riem4
    ],
    end_estimator = RandomForestRegressor(
        n_estimators = 50,
        max_depth = 10)
)

# Final generalization evaluation

In [11]:
# PATH = f'/Users/marco/PROJECTS/data/'
PATH = r'C:\Users\gianm\Documents\Uni\Big Data\F422\project\data\\'

# model = baseline_guided_kr
step = 250

metric_fns = {'RMSE': RMSE, 'NMSE': NMSE}

In [12]:
tw_extractor = TimeWindowTransformer(size = 500, step = step)
label_extractor = LabelWindowExtractor(size = 500, step = step)

# guided
X_guided = np.load(PATH + f'guided/guided_dataset_X.npy')
Y_guided = np.load(PATH + f'guided/guided_dataset_Y.npy')
X_guided_windows = tw_extractor.transform(X_guided)
Y_guided_labels = label_extractor.transform(Y_guided)

# freemoves
X_freemoves = np.load(PATH + f'freemoves/freemoves_dataset_X.npy')
Y_freemoves = np.load(PATH + f'freemoves/freemoves_dataset_Y.npy')
X_freemoves_windows = tw_extractor.transform(X_freemoves)
Y_freemoves_labels = label_extractor.transform(Y_freemoves)

# # stacked
# X_stacked_windows = np.concatenate([X_guided_windows, X_freemoves_windows], axis=1)
# Y_stacked_labels = np.concatenate([Y_guided_labels, Y_freemoves_labels], axis=1)

In [15]:
# Time-domain features
print("Time-domain feature shape:", X_freemoves_windows.shape)  # (n_windows, td_features)

riem_pipeline = Pipeline([
    ('feature_extraction', pyriemann.estimation.Covariances()),
    ('transformation', pyriemann.tangentspace.TangentSpace(
        metric = 'riemann',
        tsupdate = True))
])

X_riem = riem_pipeline.fit_transform(X_freemoves_windows)  # shape: (n_windows, riem_features)
print("Riemannian feature shape:", X_riem.shape)

# Step 3: Wavelet features
wave = WaveletFeatureTransformer(summary=True)
X_wave = wave.fit_transform(X_freemoves_windows)  # shape: (n_windows, wave_features)
print("Wavelet feature shape:", X_wave.shape)

# Step 4: Total features
total_features = X_freemoves_windows.shape[1] + X_riem.shape[1] + X_wave.shape[1]
print("Total combined feature dimension:", total_features)

Time-domain feature shape: (5, 1079, 8, 500)


ValueError: too many values to unpack (expected 3)

#### RMSE guided

In [None]:
# Define all your models (trained or pipelines)
estimators = [
    baseline_guided_kr,
    baseline_guided_knn,
    baseline_guided_rf,
    riem1,
    riem2,
    riem3
]

# Define weight combinations manually (they must sum to 1)
weight_sets = [
    [1/6] * 6,  # uniform
    [0.2, 0.2, 0.2, 0.2, 0.1, 0.1],
    [0.1, 0.1, 0.4, 0.1, 0.15, 0.15],
    [0.25, 0.25, 0.25, 0.0, 0.15, 0.10],
    [0.4, 0.0, 0.0, 0.2, 0.2, 0.2],
    [0.3, 0.3, 0.0, 0.1, 0.2, 0.1],
    [0.2, 0.2, 0.1, 0.1, 0.2, 0.2],
    [0.15, 0.15, 0.15, 0.15, 0.2, 0.2],
    [0.1, 0.1, 0.1, 0.2, 0.3, 0.2],
    [0.2, 0.0, 0.3, 0.0, 0.3, 0.2],
    [0.5, 0.2, 0.0, 0.0, 0.2, 0.1],
    [0.6, 0.0, 0.0, 0.0, 0.2, 0.2],
    [0.4, 0.3, 0.0, 0.1, 0.1, 0.1],
    [0.3, 0.0, 0.1, 0.1, 0.3, 0.2],
    [0.1, 0.1, 0.2, 0.2, 0.2, 0.2],
    [0.0, 0.3, 0.3, 0.2, 0.1, 0.1],
    [0.0, 0.0, 0.4, 0.2, 0.2, 0.2],
    [0.0, 0.0, 0.0, 0.5, 0.3, 0.2],
    [0.2, 0.2, 0.0, 0.0, 0.3, 0.3],
    [0.1, 0.1, 0.1, 0.1, 0.3, 0.3]
]

# Track results
results = []

# Loop through each weight set and evaluate with cross-validation
for weights in weight_sets:
    ensemble = VotingRegressor(estimators=estimators, weights=weights)
    
    cv_scores = cross_validate_pipeline(
        ensemble, X_guided_windows, Y_guided_labels, metric_fns, n_folds=5, verbose=0
    )
    results.append((weights, cv_scores['avg_val_RMSE']))
    print(f"Weights: {weights} → avg_val_RMSE: {cv_scores['avg_val_RMSE']:.4f}")


ValueError: Shape of weights must be consistent with shape of a along specified axis.

In [19]:
ensemble = VotingRegressor(estimators=estimators, weights=[0.0, 0.3, 0.3, 0.2, 0.1, 0.1])
results_guided = cross_validate_pipeline(ensemble, X_guided_windows, Y_guided_labels, metric_fns, n_folds=5, verbose=1)


Average Scores across folds:
RMSE: train=1.8400, val=4.2512
NMSE: train=0.0167, val=0.0906


#### RMSE freemoves

In [None]:
# Define all your models (trained or pipelines)
estimators = [
    baseline_guided_kr,
    baseline_guided_knn,
    baseline_guided_rf,
    riem1,
    riem2,
    riem3
]

# Define weight combinations manually (they must sum to 1)
weight_sets = [
    [1/6] * 6,  # uniform
    [0.2, 0.2, 0.2, 0.2, 0.1, 0.1],
    [0.1, 0.1, 0.4, 0.1, 0.15, 0.15],
    [0.25, 0.25, 0.25, 0.0, 0.15, 0.10],
    [0.4, 0.0, 0.0, 0.2, 0.2, 0.2],
    [0.3, 0.3, 0.0, 0.1, 0.2, 0.1],
    [0.2, 0.2, 0.1, 0.1, 0.2, 0.2],
    [0.15, 0.15, 0.15, 0.15, 0.2, 0.2],
    [0.1, 0.1, 0.1, 0.2, 0.3, 0.2],
    [0.2, 0.0, 0.0, 0.3, 0.3, 0.2],
    [0.5, 0.2, 0.0, 0.0, 0.2, 0.1],
    [0.6, 0.0, 0.0, 0.0, 0.2, 0.2],
    [0.4, 0.3, 0.0, 0.1, 0.1, 0.1],
    [0.3, 0.0, 0.1, 0.1, 0.3, 0.2],
    [0.1, 0.1, 0.2, 0.2, 0.2, 0.2],
    [0.0, 0.3, 0.3, 0.2, 0.1, 0.1],
    [0.0, 0.0, 0.4, 0.2, 0.2, 0.2],
    [0.0, 0.0, 0.0, 0.5, 0.3, 0.2],
    [0.2, 0.2, 0.0, 0.0, 0.3, 0.3],
    [0.1, 0.1, 0.1, 0.1, 0.3, 0.3]
]

# Track results
results = []

# Loop through each weight set and evaluate with cross-validation
for weights in weight_sets:
    ensemble = VotingRegressor(estimators=estimators, weights=weights)
    
    cv_scores = cross_validate_pipeline(
        ensemble, X_freemoves_windows, Y_freemoves_labels, metric_fns, n_folds=5, verbose=0
    )
    results.append((weights, cv_scores['avg_val_RMSE']))
    print(f"Weights: {weights} → avg_val_RMSE: {cv_scores['avg_val_RMSE']:.4f}")

In [None]:
metric_fns = {'RMSE': RMSE, 'NMSE': NMSE}
models = {
    'Combined features + Kernel Ridge': baseline_guided_kr,
    'Combined features + KNN': baseline_guided_knn,
    'Combined features + RF': baseline_guided_rf,
}

for model_name, model in models.items():
    print(f"\nModel: {model_name}")
    results = cross_validate_pipeline(
        model, 
        X_freemoves_windows, 
        Y_freemoves_labels, 
        metric_fns=metric_fns,
        n_folds=5,
        verbose=1
    )

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

base_models = [
        baseline_guided_kr,
        # baseline_guided_knn,
        # baseline_guided_rf,
        timedomain_xgboost,
        # riem1,
        # riem2,
        # riem3,
        riem4
    ]

meta_models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': MultiOutputRegressor(Lasso()),
    # 'SVR': MultiOutputRegressor(SVR()),
    # 'MLP': MultiOutputRegressor(MLPRegressor(max_iter=500)),
    # 'RF': RandomForestRegressor(n_estimators=50),
    # 'XGB': XGBRegressor(objective='reg:squarederror')
}

for name, meta in meta_models.items():
    model = StackingRegressor(estimators=base_models, end_estimator=meta)
    results = cross_validate_pipeline(model, X_freemoves_windows, Y_freemoves_labels, metric_fns, n_folds=5, verbose=0)
    print(f"{name} meta: avg_val_RMSE = {results['avg_val_RMSE']:.4f}")

In [20]:
ensemble = VotingRegressor(estimators=estimators, weights=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1])
results_freemoves = cross_validate_pipeline(ensemble, X_freemoves_windows, Y_freemoves_labels, metric_fns, n_folds=5, verbose=1)


Average Scores across folds:
RMSE: train=3.9533, val=10.1630
NMSE: train=0.0987, val=0.7275


#### RMSE total

In [21]:
results_combined = \
    ((results_guided['avg_val_RMSE']**2 * X_guided.shape[0] + \
    results_freemoves['avg_val_RMSE']**2 * X_freemoves.shape[0])/(X_guided.shape[0]+X_freemoves.shape[0]))**0.5

results_combined # I think that this should be the result that is most correlated to the score

np.float64(7.789721324122596)

# Submission


### Guided training and predictions

In [26]:
DATASET = 'guided'

# training
X = np.load(PATH + f'{DATASET}/{DATASET}_dataset_X.npy')
Y = np.load(PATH + f'{DATASET}/{DATASET}_dataset_Y.npy')

tw_extractor = TimeWindowTransformer(size = 500, step = 50)
label_extractor = LabelWindowExtractor(size = 500, step = 50)

X_windows = tw_extractor.transform(X)
Y_labels = label_extractor.transform(Y)

X_train = X_windows.reshape(-1, *X_windows.shape[2:])
Y_train = Y_labels.reshape(-1, *Y_labels.shape[2:])

model_guided = VotingRegressor(estimators=estimators)

model_guided.fit(X_train, Y_train)

# predicting
X_test = np.load(PATH + f'{DATASET}/{DATASET}_testset_X.npy')
X_test = X_test.reshape(-1, *X_windows.shape[2:])

Y_guided_pred = model.predict(X_test)

### Freemoves training and predictions

In [27]:
DATASET = 'freemoves'

# training
X = np.load(PATH + f'{DATASET}/{DATASET}_dataset_X.npy')
Y = np.load(PATH + f'{DATASET}/{DATASET}_dataset_Y.npy')

tw_extractor = TimeWindowTransformer(size = 500, step = 50)
label_extractor = LabelWindowExtractor(size = 500, step = 50)

X_windows = tw_extractor.transform(X)
Y_labels = label_extractor.transform(Y)

X_train = X_windows.reshape(-1, *X_windows.shape[2:])
Y_train = Y_labels.reshape(-1, *Y_labels.shape[2:])

model_freemoves = VotingRegressor(estimators=estimators, weights=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1])

model_freemoves.fit(X_train, Y_train)

# predicting
X_test = np.load(PATH + f'{DATASET}/{DATASET}_testset_X.npy')
X_test = X_test.reshape(-1, *X_windows.shape[2:])

Y_freemoves_pred = model.predict(X_test)

### CSV generation

In [28]:
import pandas as pd

fname = 'voting_step_50.csv'

Y_pred = np.vstack([Y_guided_pred, Y_freemoves_pred])
Y_pred_df = pd.DataFrame(Y_pred)
Y_pred_df.to_csv(fname, index=False, header=None)