In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [37]:
data_directory = '../data/old'

# Get all CSV files in the raw folder.
# glob.glob returns paths in an arbitrary, filesystem-dependent order, so the
# list is sorted: the positional slices taken from `all_files` decide which
# files are used for training and which are held out, and that split has to be
# identical on every run and on every machine.
all_files = sorted(glob.glob(os.path.join(data_directory, 'raw', 'raw_*.csv')))
# Read and concatenate every file except the last three
df_raw = pd.concat((pd.read_csv(f) for f in all_files[:-3]), ignore_index=True)
# Create a new session_id column using integer division: consecutive blocks of
# 256 rows become sessions 1, 2, 3, ...
# NOTE(review): ids are assigned over the concatenated index, so if any file's
# row count is not a multiple of 256 a session will straddle two files —
# confirm every raw file holds whole 256-row sessions.
df_raw['session_id'] = df_raw.index // 256 + 1
# Reset timestamp to go from 1-256 and reset back to 1 for each session
df_raw['timestamp'] = df_raw.groupby('session_id').cumcount() + 1


In [38]:
# Hold-out frame: the last three files, cut into 256-row sessions with a
# 1-based row counter ("timestamp") restarting inside every session.
df_raw_val = (
    pd.concat([pd.read_csv(path) for path in all_files[-3:]], ignore_index=True)
    .assign(session_id=lambda frame: frame.index // 256 + 1)
    .assign(timestamp=lambda frame: frame.groupby('session_id').cumcount() + 1)
)

In [39]:
# Columns removed from both frames before feature extraction
dropped_columns = ['participantName', 'Signal_Quality', 'timestamp']
data = df_raw.drop(dropped_columns, axis='columns')
data_val = df_raw_val.drop(dropped_columns, axis='columns')

# FFT

In [41]:
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

def perform_fft_features(signal, sampling_rate=256):
    """Compute four frequency-domain features of a 1-D signal.

    Only the first ``n // 2`` FFT bins (0 Hz up to, but excluding, the Nyquist
    frequency) are used.

    Parameters
    ----------
    signal : array-like
        Sequence of samples; must contain at least two samples.
    sampling_rate : float, default 256
        Samples per second, used to express FFT bins in Hz.

    Returns
    -------
    tuple
        ``(dominant_freq, avg_freq, total_energy, avg_energy)``:
        frequency of the largest-magnitude bin, magnitude-weighted mean
        frequency, sum of squared magnitudes, and that sum divided by the
        number of samples. For a signal whose spectrum is entirely zero,
        ``avg_freq`` is ``0.0`` rather than NaN.

    Raises
    ------
    ValueError
        If ``signal`` has fewer than two samples (no frequency bins to analyse).
    """
    n = len(signal)
    if n < 2:
        raise ValueError(
            f"signal must contain at least 2 samples to compute FFT features, got {n}"
        )

    fft_result = np.fft.fft(signal)  # Compute FFT
    half = n // 2
    freqs = np.fft.fftfreq(n, d=1/sampling_rate)[:half]  # Only non-negative frequencies
    magnitudes = np.abs(fft_result)[:half]  # Magnitude spectrum

    # NOTE(review): bin 0 (the DC component) is part of the spectrum, so a signal
    # with a large non-zero mean will tend to report dominant_freq == 0 — confirm
    # whether the signals are mean-removed upstream.
    dominant_freq = freqs[np.argmax(magnitudes)]  # Peak frequency

    magnitude_sum = np.sum(magnitudes)
    if magnitude_sum == 0:
        # A flat (all-zero) spectrum has no centroid; 0/0 would produce NaN and
        # poison every downstream consumer of the feature table.
        avg_freq = 0.0
    else:
        avg_freq = np.sum(freqs * magnitudes) / magnitude_sum  # Weighted mean frequency

    total_energy = np.sum(magnitudes ** 2)  # Total power
    avg_energy = total_energy / n  # Normalized energy per sample

    return dominant_freq, avg_freq, total_energy, avg_energy

def process_session(session_data):
    """Turn the rows of one session into a flat dict of spectral features.

    The result carries the session's first ``session_id`` and ``frequency``
    values, followed by four FFT features for every other column, keyed as
    ``<column>_dominant_freq``, ``<column>_avg_freq``, ``<column>_total_energy``
    and ``<column>_avg_energy``.
    """
    meta_columns = ("session_id", "frequency")
    feature_suffixes = ("dominant_freq", "avg_freq", "total_energy", "avg_energy")

    # Identifier and label are taken from the first row of the session
    features = {name: session_data[name].iloc[0] for name in meta_columns}

    # Every remaining column is treated as a signal channel
    signal_columns = [name for name in session_data.columns if name not in meta_columns]
    for channel in signal_columns:
        channel_features = perform_fft_features(session_data[channel].values)
        for suffix, value in zip(feature_suffixes, channel_features):
            features[f"{channel}_{suffix}"] = value

    return features

def extract_features(df, processes=None):
    """Transform each session's rows into a single feature row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``session_id`` column; every group of rows sharing a
        ``session_id`` is passed to ``process_session``.
    processes : int, optional
        Number of worker processes. ``None`` (default) uses ``cpu_count()``,
        matching the previous behaviour. ``1`` computes everything in the
        current process without creating a pool, which is the safe choice
        where worker processes cannot import functions defined in an
        interactive session (see the multiprocessing documentation).

    Returns
    -------
    pandas.DataFrame
        One row per session, in ascending ``session_id`` order.

    Raises
    ------
    ValueError
        If ``processes`` is given and is smaller than 1.
    """
    if processes is not None and processes < 1:
        raise ValueError(f"processes must be at least 1, got {processes}")

    # Split data into groups by session_id
    grouped_data = [group for _, group in df.groupby("session_id")]
    n_workers = cpu_count() if processes is None else processes

    if n_workers == 1:
        # Serial path: same results and progress bar, no process pool involved
        feature_list = [
            process_session(group)
            for group in tqdm(grouped_data, total=len(grouped_data), desc='Extracting features')
        ]
    else:
        # Create a pool of workers
        with Pool(processes=n_workers) as pool:
            # imap preserves input order, so rows stay aligned with the sessions
            feature_list = list(tqdm(pool.imap(process_session, grouped_data), total=len(grouped_data), desc='Extracting features'))

    # Convert to DataFrame
    return pd.DataFrame(feature_list)

# One feature row per session: first for the training frame ...
df_train = extract_features(df=data)

# ... then for the validation frame
df_val = extract_features(df=data_val)


Extracting features: 100%|██████████| 5279/5279 [00:00<00:00, 6075.28it/s]
Extracting features: 100%|██████████| 585/585 [00:00<00:00, 7251.65it/s]


In [42]:
# Columns that are not model inputs: the class label and the session identifier
non_feature_cols = ['frequency', 'session_id']

# Feature matrix and label vector for the training sessions
X = df_train.drop(columns=non_feature_cols)
y = df_train['frequency']

# Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Same feature/label separation for the validation sessions
X_val = df_val.drop(columns=non_feature_cols)
y_val = df_val['frequency']

for split_label, split_X in (("Training", X_train), ("Testing", X_test), ("Validation", X_val)):
    print(f"{split_label} set shape: {split_X.shape}")

Training set shape: (4223, 32)
Testing set shape: (1056, 32)
Validation set shape: (585, 32)


In [32]:
import pickle

# Save the data
pickle_path = '../data/pickles/fft_data.pkl'
# Create the target folder first; opening the file for writing raises
# FileNotFoundError when the directory does not exist yet.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
with open(pickle_path, 'wb') as f:
    pickle.dump({
        'X_train': X_train,
        'X_test': X_test,
        'X_val': X_val,
        'y_train': y_train,
        'y_test': y_test,
        'y_val': y_val
    }, f)

In [33]:
import pickle

# Read the stored splits back from disk
with open('../data/pickles/fft_data.pkl', 'rb') as f:
    dat = pickle.load(f)

# Rebind the split variables from the loaded dictionary
X_train, X_test, X_val = dat['X_train'], dat['X_test'], dat['X_val']
y_train, y_test, y_val = dat['y_train'], dat['y_test'], dat['y_val']

for split_label, split_X in (("Training", X_train), ("Testing", X_test), ("Validation", X_val)):
    print(f"{split_label} set shape: {split_X.shape}")

Training set shape: (1057084, 32)
Testing set shape: (264271, 32)
Validation set shape: (148620, 32)


# EDA

In [34]:
# placeholder

# Models

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 100-tree random forest on the training split (fixed seed, all cores)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict on the test split, then on the validation split
y_pred_test = rf_model.predict(X_test)
y_pred_val = rf_model.predict(X_val)

# Report accuracy and the per-class breakdown for both splits;
# the validation section is preceded by a blank line.
evaluations = (
    ("", "Test", y_test, y_pred_test),
    ("\n", "Validation", y_val, y_pred_val),
)
for leading, split_label, y_true, y_hat in evaluations:
    print(f"{leading}{split_label} Set Performance:")
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print(f"\n{split_label} Set Classification Report:")
    print(classification_report(y_true, y_hat))

Test Set Performance:
Accuracy: 0.6382575757575758

Test Set Classification Report:
              precision    recall  f1-score   support

          10       0.71      0.80      0.75       265
          12       0.58      0.57      0.58       256
          15       0.59      0.54      0.56       267
          20       0.65      0.64      0.65       268

    accuracy                           0.64      1056
   macro avg       0.63      0.64      0.63      1056
weighted avg       0.63      0.64      0.64      1056


Validation Set Performance:
Accuracy: 0.28717948717948716

Validation Set Classification Report:
              precision    recall  f1-score   support

          10       0.26      0.36      0.30       146
          12       0.27      0.20      0.23       145
          15       0.23      0.18      0.20       147
          20       0.37      0.41      0.39       147

    accuracy                           0.29       585
   macro avg       0.28      0.29      0.28       585
wei