In [25]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go

In [2]:
# random change: placeholder cell that just prints five numbered lines
for change_number in range(5):
    print("This is random change number:", change_number)

This is random change number: 0
This is random change number: 1
This is random change number: 2
This is random change number: 3
This is random change number: 4


The following cells fit the AutoGluon model for the resting, active, walking, running classification.  
If not already done, make sure to run the file in lab_data_processing first to generate the data.

In [26]:
root_dir = "/home/elias/2025/sshfs_mounter_2025/data_elias/ECSS_2026/fft_lab"
ag_path_model = './base_ag_model'

# seed for the file-level train/test split, so the split is identical on every run
SPLIT_SEED = 42

### get data from csv files
# os.listdir returns entries in arbitrary order; sorting makes the split below
# reproducible. Sub-directories (e.g. notebook checkpoints) are skipped.
lab_files = sorted(
    f for f in os.listdir(root_dir)
    if os.path.isfile(os.path.join(root_dir, f))
)
df_list = [pd.read_csv(os.path.join(root_dir, f)) for f in lab_files]

# use 20% of dataframes for testing
# The split is done per file (not per row), so all rows of one file end up on
# the same side of the split.
test_size = int(0.2 * len(df_list))
if test_size == 0:
    raise ValueError(
        f"need at least 5 data files in {root_dir} to hold out 20% for testing, "
        f"found {len(df_list)}"
    )

split_rng = np.random.default_rng(SPLIT_SEED)
test_positions = set(
    split_rng.choice(len(df_list), size=test_size, replace=False).tolist()
)
test_list = [df for pos, df in enumerate(df_list) if pos in test_positions]
# df_list keeps only the training files from here on
df_list = [df for pos, df in enumerate(df_list) if pos not in test_positions]

train_data = pd.concat(df_list, ignore_index=True)
test_data = pd.concat(test_list, ignore_index=True)

# set data labels of interest
# acc_1.0, acc_1.2, ..., acc_10.0 (46 feature columns) plus the target column.
# Built from integers because np.arange with a float step can produce an
# unexpected number of elements.
labels = [f"acc_{tenths / 10:.1f}" for tenths in range(10, 102, 2)] + ['label']

# shuffle datasets
train_data = shuffle(train_data, random_state=42).reset_index(drop=True)
test_data = shuffle(test_data, random_state=42).reset_index(drop=True)

# reduce to labels of interest
train_data = train_data[labels]
test_data = test_data[labels]

The following cell loads the field data and uses it as the test set (adding a sample of it to the training set is currently commented out).

Before running this cell, you might want to run ../extra_data_gathering/run_data_selector.ipynb.

In [27]:
def add_lab_data(train_data, include_field = True,
                 field_dir = "/home/elias/2025/sshfs_mounter_2025/data_elias/ECSS_2026/fft_field"):
    '''
    Loads the field data, or passes the lab training data through unchanged.

    Despite its name, this function does not merge anything: when
    include_field is True it returns ONLY the concatenated field data, and
    train_data is not used. Filtering, sub-sampling or concatenating with the
    lab data is left to the caller.

    param: train_data = training data from lab; returned as-is when include_field is False
    param: include_field = boolean to load the field data or not
    param: field_dir = directory holding the field CSV files
    return: concatenated field data if include_field is True, otherwise train_data
    raises: ValueError if include_field is True and no non-empty CSV file is found
    '''
    if not include_field:
        return train_data

    df_l = []
    # sorted() keeps the row order of the result independent of directory order
    for f in sorted(os.listdir(field_dir)):
        file_path = os.path.join(field_dir, f)

        # skip sub-directories, which pd.read_csv cannot open
        if not os.path.isfile(file_path):
            continue

        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # zero-byte file: treat like a file without rows
            continue

        # header-only files contribute nothing
        if len(df) > 0:
            df_l.append(df)

    if not df_l:
        # pd.concat([]) would raise a less helpful ValueError
        raise ValueError(f"no non-empty field data files found in {field_dir}")

    field_data = pd.concat(df_l, ignore_index=True)

    print('field training examples', len(field_data))

    return field_data

# Load the field recordings. NOTE: this rebinds `test_data`, so the lab
# hold-out created in the data-loading cell is no longer reachable under that
# name; every later "test" metric is measured on field data.
test_data = add_lab_data(train_data=train_data, include_field=True)

#train_data = pd.concat([train_data, test_data.sample(n = 16000, random_state=42).reset_index(drop=True)], ignore_index=True)

field training examples 286997


In [28]:
# Environment flags meant to keep Ray out of the way:
#   AUTOGLUON_DISABLE_RAY      - intended to disable Ray (NOTE(review): confirm
#                                that AutoGluon actually reads this variable)
#   RAY_DISABLE_IMPORT_WARNING - intended to silence Ray's import warning
os.environ.update({
    'AUTOGLUON_DISABLE_RAY': '1',
    'RAY_DISABLE_IMPORT_WARNING': '1',
})

# Quick sanity report: library versions and class balance of the training set
print("Testing environment setup...")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print("Labels distribution:")
label_counts = train_data['label'].value_counts().sort_index()
print(label_counts)

# Shape summary; every column except the last one is counted as a feature
n_samples, n_columns = train_data.shape
first_feature_names = train_data.columns[:-1][:5].tolist()
print("\nData info:")
print(f"Features: {n_columns-1}")
print(f"Samples: {n_samples}")
print(f"Feature columns: {first_feature_names}...")  # Show first 5 feature names

Testing environment setup...
NumPy version: 2.1.3
Pandas version: 2.3.1
Labels distribution:
label
locomotion        7209
no locomotion    23796
Name: count, dtype: int64

Data info:
Features: 46
Samples: 31005
Feature columns: ['acc_1.0', 'acc_1.2', 'acc_1.4', 'acc_1.6', 'acc_1.8']...


In [29]:
# Fit settings: medium-quality preset, restricted to the XGB and RF model
# families with their default hyperparameters, one CPU, no dynamic stacking.
fit_options = dict(
    presets='medium_quality',
    hyperparameters={
        'XGB': {},
        'RF': {},
    },
    num_cpus=1,
    dynamic_stacking=False,
    verbosity=2,
)

# The predictor optimizes accuracy on the 'label' column and is saved under ag_path_model
predictor = TabularPredictor(label='label', eval_metric='accuracy', path=ag_path_model)
predictor = predictor.fit(train_data, **fit_options)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #35-Ubuntu SMP PREEMPT_DYNAMIC Mon May 20 15:51:52 UTC 2024
CPU Count:          4
Memory Avail:       26.11 GB / 31.35 GB (83.3%)
Disk Space Avail:   19.98 GB / 96.28 GB (20.8%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "/opt/code/physiological-studies/playground_Elias/ECSS_2026/models/base_ag_model"
Train Data Rows:    31005
Train Data Columns: 46
Label Column:       label
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['locomotion', 'no locomotion']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       binary
Prepro

In [32]:
# Print the leaderboard and keep it as a DataFrame (nothing is written to disk here).
# The leaderboard is scored on train_data, so the 'score_test' column is
# training accuracy, not held-out accuracy.
leader = predictor.leaderboard(data=train_data, silent=False)
leader_df = pd.DataFrame(leader)

                 model  score_test  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         RandomForest    0.999742     0.9968    accuracy        0.161051       0.055261  14.230647                 0.161051                0.055261          14.230647            1       True          1
1              XGBoost    0.999677     0.9972    accuracy        0.135568       0.010314   1.702432                 0.135568                0.010314           1.702432            1       True          2
2  WeightedEnsemble_L2    0.999677     0.9972    accuracy        0.137500       0.011160   1.738023                 0.001932                0.000847           0.035592            2       True          4
3       NeuralNetTorch    0.999258     0.9960    accuracy        0.394089       0.041509  59.136271                 0.394089                0.041509          59.136271            1       T

In [33]:
# load model for evaluation
# Everything below uses this loaded model only, so the cell also works in a
# fresh kernel where the training cell (and its `predictor`) was not run.
classification_model = TabularPredictor.load(ag_path_model)

# classes that get their own accuracy column
class_names = ['locomotion', 'no locomotion']


def _predict_labels(model, data):
    '''
    Returns the most probable class label for every row of data.

    param: model = fitted predictor exposing predict_proba
    param: data = dataframe with the feature columns and a 'label' column
    return: Series of predicted labels with a fresh 0..n-1 index
    '''
    proba = model.predict_proba(data.drop(columns=['label']))
    # the probability columns are named after the classes, so idxmax yields
    # the label directly without an index -> label lookup
    return proba.idxmax(axis=1).reset_index(drop=True)


def _accuracy_row(true_labels, pred_labels, classes):
    '''
    Computes overall accuracy followed by the accuracy within each class.

    param: true_labels = ground-truth labels
    param: pred_labels = predicted labels, same length and order as true_labels
    param: classes = class names to report individually
    return: list [overall accuracy, accuracy for classes[0], ...];
            NaN where there are no samples to average over
    '''
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    hits = true_labels == pred_labels

    row = [hits.mean() if hits.size else np.nan]
    for name in classes:
        mask = true_labels == name
        # accuracy among the samples whose true class is `name`
        row.append(hits[mask].mean() if mask.any() else np.nan)
    return row


# predicted labels for the train and test sets
predictions_train = _predict_labels(classification_model, train_data)
predictions_test = _predict_labels(classification_model, test_data)

# prediction df
# train and test differ in length; the shorter pair of columns is NaN-padded
results_df = pd.DataFrame({
    'train_true': train_data['label'],
    'train_pred': predictions_train,
    'test_true': test_data['label'],
    'test_pred': predictions_test
})

# calculate accuracies
# computed from the unpadded label arrays so that NaN padding in results_df
# can never shift true and predicted labels against each other
accuracy_rows = [
    _accuracy_row(train_data['label'].to_numpy(), predictions_train.to_numpy(), class_names),
    _accuracy_row(test_data['label'].to_numpy(), predictions_test.to_numpy(), class_names),
]

accuracy_df = pd.DataFrame(accuracy_rows, columns=['total_accuracy'] + class_names, index=['train',  'test'])
print("\nAccuracy results:")
print(accuracy_df)


Accuracy results:
       total_accuracy  locomotion  no locomotion
train        0.999677    0.998752       0.999958
test         0.930013    0.919238       0.995256
