In [1]:
%load_ext autoreload
%autoreload 2

# Fixed time series
> Use time-based splits rather than random splits

Context: the `neuron_33`, `neuron_46`, and `traffic` datasets were split randomly during the initial paper and rebuttal.
It would be better to use time-based splits, as the data is time series data.

I overwrote the `neuron_33` and `neuron_46` datasets in HuggingFace with transforms based on the first 80% of time 
points, so let's use those:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import manify
from manify.utils.dataloaders import load_hf
from manify.utils.benchmarks import benchmark

In [3]:
MODELS = ["product_rf", "sklearn_rf", "tangent_rf", "knn", "ambient_mlp", "kappa_gcn"]
# MODELS = ["sklearn_rf"]

In [4]:
# Agg
def agg(vals):
    mean = np.mean(vals)
    std = np.std(vals)
    ci = 1.96 * std / np.sqrt(len(vals))
    return f"{mean * 100:.1f} ± {ci * 100:.1f}"

In [None]:
# Neuron 33

features, _, _, labels = load_hf("neuron_33")
print(features.shape, labels.shape, labels.float().mean())

# Take first 80% of samples for training and last 20% for testing
train_size = int(0.8 * len(features))
X_train, y_train = features[:train_size], labels[:train_size]
X_test, y_test = features[train_size:], labels[train_size:]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Manifold
pm = manify.ProductManifold(signature=[(1.0, 1) for _ in range(10)])

res = []
for trial in range(10):
    # Subsample training data
    idx = np.random.choice(len(X_train), size=800, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]

    # Subsample testing data
    idx = np.random.choice(len(X_test), size=200, replace=False)
    X_test_sub = X_test[idx]
    y_test_sub = y_test[idx]

    # Benchmark: Product RF Ambient RF Tangent RF k-Neighbors Ambient MLP κ-GCN
    res.append(
        benchmark(
            X=None,
            y=None,
            X_train=X_train_sub,
            y_train=y_train_sub,
            X_test=X_test_sub,
            y_test=y_test_sub,
            pm=pm,
            models=MODELS,
            device="cuda",
        )
    )
res = pd.DataFrame(res)
res.agg(agg)

torch.Size([401000, 20]) torch.Size([401000]) tensor(0.4978)
torch.Size([320800, 20]) torch.Size([320800]) torch.Size([80200, 20]) torch.Size([80200])


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

sklearn_rf_accuracy         66.3 ± 2.9
sklearn_rf_f1-micro         66.3 ± 2.9
sklearn_rf_f1-macro         65.8 ± 3.0
sklearn_rf_time              3.4 ± 0.0
product_rf_accuracy         64.2 ± 2.2
product_rf_f1-micro         64.2 ± 2.2
product_rf_f1-macro         63.0 ± 2.4
product_rf_time             30.5 ± 1.7
tangent_rf_accuracy         66.5 ± 3.0
tangent_rf_f1-micro         66.5 ± 3.0
tangent_rf_f1-macro         65.3 ± 3.5
tangent_rf_time              3.4 ± 0.0
knn_accuracy                50.7 ± 1.9
knn_f1-micro                50.7 ± 1.9
knn_f1-macro                49.0 ± 2.3
knn_time                     0.5 ± 0.0
ambient_mlp_accuracy        47.3 ± 2.0
ambient_mlp_f1-micro        47.3 ± 2.0
ambient_mlp_f1-macro        43.1 ± 3.9
ambient_mlp_time          3264.0 ± 8.8
kappa_gcn_accuracy          47.3 ± 2.0
kappa_gcn_f1-micro          47.3 ± 2.0
kappa_gcn_f1-macro          41.8 ± 4.3
kappa_gcn_time          32178.1 ± 76.7
dtype: object

In [None]:
# Neuron 46

features, _, _, labels = load_hf("neuron_46")
print(features.shape, labels.shape, labels.float().mean())

# Take first 80% of samples for training and last 20% for testing
train_size = int(0.8 * len(features))
X_train, y_train = features[:train_size], labels[:train_size]
X_test, y_test = features[train_size:], labels[train_size:]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Manifold
pm = manify.ProductManifold(signature=[(1.0, 1) for _ in range(10)])

res = []
for trial in range(10):
    # Subsample training data
    idx = np.random.choice(len(X_train), size=800, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]

    # Subsample testing data
    idx = np.random.choice(len(X_test), size=200, replace=False)
    X_test_sub = X_test[idx]
    y_test_sub = y_test[idx]

    # Benchmark: Product RF Ambient RF Tangent RF k-Neighbors Ambient MLP κ-GCN
    res.append(
        benchmark(
            X=None,
            y=None,
            X_train=X_train_sub,
            y_train=y_train_sub,
            X_test=X_test_sub,
            y_test=y_test_sub,
            pm=pm,
            models=MODELS,
            device="cuda",
        )
    )
res = pd.DataFrame(res)
res.agg(agg)

torch.Size([1401000, 20]) torch.Size([1401000]) tensor(0.4867)
torch.Size([1120800, 20]) torch.Size([1120800]) torch.Size([280200, 20]) torch.Size([280200])


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


sklearn_rf_accuracy           24.4 ± 3.5
sklearn_rf_f1-micro           24.4 ± 3.5
sklearn_rf_f1-macro           21.1 ± 2.3
sklearn_rf_time                3.4 ± 0.0
product_rf_accuracy           24.1 ± 3.7
product_rf_f1-micro           24.1 ± 3.7
product_rf_f1-macro           20.9 ± 2.6
product_rf_time               30.2 ± 0.8
tangent_rf_accuracy           22.0 ± 5.7
tangent_rf_f1-micro           22.0 ± 5.7
tangent_rf_f1-macro           19.3 ± 4.3
tangent_rf_time                3.4 ± 0.0
knn_accuracy                  30.4 ± 4.7
knn_f1-micro                  30.4 ± 4.7
knn_f1-macro                  25.1 ± 3.2
knn_time                       0.5 ± 0.0
ambient_mlp_accuracy           3.1 ± 1.0
ambient_mlp_f1-micro           3.1 ± 1.0
ambient_mlp_f1-macro           3.0 ± 0.9
ambient_mlp_time           3273.2 ± 17.6
kappa_gcn_accuracy           79.1 ± 22.8
kappa_gcn_f1-micro           79.1 ± 22.8
kappa_gcn_f1-macro           45.5 ± 15.6
kappa_gcn_time          19524.8 ± 4528.2
dtype: object

In [9]:
# Traffic

features, _, _, labels = load_hf("traffic")
print(features.shape, labels.shape, labels.float().mean())

# Sort features by time
traffic_df = pd.read_csv("/home/phil/manify/data/traffic/traffic.csv")
idx = np.argsort(traffic_df["DateTime"])
features = features[idx]

# Take first 80% of samples for training and last 20% for testing
train_size = int(0.8 * len(features))
X_train, y_train = features[:train_size], labels[:train_size]
X_test, y_test = features[train_size:], labels[train_size:]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Manifold
pm = manify.ProductManifold(signature=[(0.0, 1), (1.0, 1), (1.0, 1), (1.0, 1), (1.0, 1)])

res = []
for trial in range(10):
    # Subsample training data
    idx = np.random.choice(len(X_train), size=800, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]

    # Subsample testing data
    idx = np.random.choice(len(X_test), size=200, replace=False)
    X_test_sub = X_test[idx]
    y_test_sub = y_test[idx]

    # Benchmark: Product RF Ambient RF Tangent RF k-Neighbors Ambient MLP κ-GCN
    res.append(
        benchmark(
            X=None,
            y=None,
            X_train=X_train_sub,
            y_train=y_train_sub,
            X_test=X_test_sub,
            y_test=y_test_sub,
            pm=pm,
            models=MODELS,
            device="cuda",
            task="regression",
            score=["rmse"],
        )
    )
res = pd.DataFrame(res)
res.agg(agg)

torch.Size([48120, 9]) torch.Size([48120]) tensor(2.7724)
torch.Size([38496, 9]) torch.Size([38496]) torch.Size([9624, 9]) torch.Size([9624])


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

kappa_gcn:   0%|          | 0/4000 [00:00<?, ?it/s]

Loss is NaN, stopping training.


ValueError: Input contains NaN.

In [None]:
# Temperature

features, _, _, labels = load_hf("temperature")
print(features.shape, labels.shape, labels.float().mean())


def _unit_circle_point_to_month(month_x, month_y):
    """Convert unit circle point back to month abbreviation."""

    # Get angle from coordinates
    angle = np.arctan2(month_y, month_x)

    # Convert to month index (handle negative angles)
    index = int(round((angle % (2 * np.pi)) * 12 / (2 * np.pi))) % 12

    return index


# Usage with your feature matrix
def get_months_from_features(features):
    """Extract months from feature matrix."""
    month_x = features[:, 3]  # Month_X column
    month_y = features[:, 4]  # Month_Y column

    months = []
    for mx, my in zip(month_x, month_y):
        months.append(_unit_circle_point_to_month(mx.item(), my.item()))

    return months


months = get_months_from_features(features)

# Validation set is months 10, 11 (November, December)
train_mask = np.isin(months, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])  # January to September
train_size = len(features[train_mask])
X_train, y_train = features[train_mask], labels[train_mask]
X_test, y_test = features[~train_mask], labels[~train_mask]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Manifold
pm = manify.ProductManifold(signature=[(1.0, 2), (1.0, 1)])

res = []
for trial in range(10):
    # Subsample training data
    idx = np.random.choice(len(X_train), size=800, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]

    # Subsample testing data
    idx = np.random.choice(len(X_test), size=200, replace=False)
    X_test_sub = X_test[idx]
    y_test_sub = y_test[idx]

    # Benchmark: Product RF Ambient RF Tangent RF k-Neighbors Ambient MLP κ-GCN
    res.append(
        benchmark(
            X=None,
            y=None,
            X_train=X_train_sub,
            y_train=y_train_sub,
            X_test=X_test_sub,
            y_test=y_test_sub,
            pm=pm,
            models=MODELS,
            device="cuda",
            task="regression",
            score=["rmse"],
        )
    )
res = pd.DataFrame(res)
res.agg(agg)

torch.Size([5352, 5]) torch.Size([5352]) tensor(17.8066)
torch.Size([4460, 5]) torch.Size([4460]) torch.Size([892, 5]) torch.Size([892])


ambient_mlp:   0%|          | 0/4000 [00:00<?, ?it/s]

KeyboardInterrupt: 