See 'swallow classification bandpass.ipynb' for preprocessing.

In [None]:
import os
import pandas as pd
import random
import shutil

# Root folder of the extracted recordings; the feature-extraction loop further
# down treats each entry in it as one class folder.
data_path = r'C:\Users\chloe\OneDrive\Desktop\swallow EMG\data\07_18_25\extracted signals'
# NOTE(review): class_folders is not read anywhere else in the visible code, which
# calls os.listdir(data_path) again instead — confirm whether it is still needed.
class_folders = os.listdir(data_path)

# Feature Extraction
Some of these features and functions are adapted from this Kaggle notebook: https://www.kaggle.com/code/calulamabel/emg-hand-gesture-classification/notebook

In [None]:
import numpy as np

# mean absolute value (MAV)
def mav(data):
    """Return the mean of the absolute sample values, taken along axis 0."""
    rectified = np.abs(data)
    return np.mean(rectified, axis=0)

# root mean square (RMS)
def rms(data):
    """Return the square root of the mean squared sample value along axis 0."""
    mean_square = np.mean(np.square(data), axis=0)
    return np.sqrt(mean_square)
    
# waveform length (often abbreviated "wavelength"): total length of the signal trace
def wavelength(data):
    """Return the sum of absolute differences between consecutive samples.

    Both the differencing and the summation run along axis 0 (the sample axis),
    so a 2-D ``(samples, channels)`` array yields one value per channel, in line
    with the other feature functions. For 1-D input the result is a scalar.
    """
    # axis=0 is explicit: np.diff defaults to the LAST axis, which for 2-D input
    # would difference across channels instead of across time.
    return np.sum(np.abs(np.diff(data, axis=0)), axis=0)
    
# zero crossing rate
def zcr(data):
    """Return the fraction of consecutive sample pairs whose sign differs.

    Signs come from ``np.sign``, so a sample that is exactly zero has sign 0 and
    differs from both a positive and a negative neighbour. The count is taken
    along axis 0 and divided by ``len(data) - 1``.
    """
    sign_changed = np.diff(np.sign(data), axis=0) != 0
    n_pairs = len(data) - 1
    return np.sum(sign_changed, axis=0) / n_pairs

# variance
def var(data):
    """Return ``np.var`` of the samples along axis 0."""
    variance = np.var(data, axis=0)
    return variance

# total absolute change between neighbouring samples
def abs_diffs_signal(data):
    """Return the summed magnitude of the sample-to-sample differences along axis 0."""
    step_sizes = np.abs(np.diff(data, axis=0))
    return np.sum(step_sizes, axis=0)

# mean frequency
def mean_freq(data, fs=500):
    """Return the power-weighted mean frequency of a 1-D signal.

    Parameters
    ----------
    data : 1-D array-like of samples.
    fs : sampling rate used to scale the frequency axis (default 500).

    Returns
    -------
    float
        Sum of ``frequency * power`` divided by the total power of the
        mean-removed signal, or 0.0 when the signal has no power left after
        the mean is removed (e.g. a constant signal).
    """
    samples = np.asarray(data, dtype=float)
    # Remove the DC offset first, as median_freq and peak_freq do; otherwise the
    # 0 Hz bin of an offset (e.g. rectified) signal dominates the average.
    samples = samples - np.mean(samples)
    freqs = np.fft.rfftfreq(len(samples), d=1/fs)
    spectrum = np.abs(np.fft.rfft(samples))**2
    total_power = np.sum(spectrum)
    if total_power == 0:
        # Avoid 0/0 -> nan for a flat signal.
        return 0.0
    return np.sum(freqs * spectrum) / total_power

# median frequency
def median_freq(data, fs=500):
    """Return the frequency at which the cumulative power first reaches half the total.

    The mean is subtracted from ``data`` before the power spectrum is computed;
    ``fs`` is the sampling rate used to scale the frequency axis.
    """
    centered = data - np.mean(data)
    power = np.abs(np.fft.rfft(centered)) ** 2
    running_power = np.cumsum(power)
    half_power = running_power[-1] / 2
    # First bin whose cumulative power is >= half of the total.
    split_bin = np.searchsorted(running_power, half_power)
    bin_freqs = np.fft.rfftfreq(len(centered), d=1/fs)
    return bin_freqs[split_bin]

# peak frequency
def peak_freq(data, fs=500):
    """Return the frequency of the strongest bin in the power spectrum.

    The mean is subtracted from ``data`` before the spectrum is computed;
    ``fs`` is the sampling rate used to scale the frequency axis.
    """
    centered = data - np.mean(data)
    power = np.abs(np.fft.rfft(centered)) ** 2
    bin_freqs = np.fft.rfftfreq(len(centered), d=1/fs)
    return bin_freqs[np.argmax(power)]

In [None]:
# Build the feature table: every recording under <data_path>/<class folder>/ is
# rectified, RMS-smoothed and reduced to one row of per-channel features.
sampling_rate = 500 # in Hz
window_seconds = 0.15 # in seconds
window_size = int(window_seconds * sampling_rate)

class_map = {}       # substance name -> integer label, in order of first appearance
feature_frames = []  # one feature frame per recording, concatenated once at the end

# sorted() fixes the folder order, and with it the integer labels in class_map and
# the row order of df; os.listdir returns entries in arbitrary order.
for class_name in sorted(os.listdir(data_path)):
    if class_name == 'oral prep 0':
        continue

    class_path = os.path.join(data_path, class_name)
    if not os.path.isdir(class_path):
        # Only class folders contain recordings; skip stray files.
        continue

    # Drop a literal '.txt' suffix only. str.rstrip('.txt') strips any trailing run
    # of the characters '.', 't' and 'x', which can eat into the name itself.
    class_str = class_name[:-len('.txt')] if class_name.endswith('.txt') else class_name
    # Folder names read "<substance words> <volume>": the last token is the volume.
    parts = class_str.split()
    volume = parts[-1]
    substance = ' '.join(parts[:-1])

    for sample_name in sorted(os.listdir(class_path)):
        sample_df = pd.read_csv(os.path.join(class_path, sample_name), delimiter = ",", header = None)

        # rectification
        sample_df = sample_df.abs()

        # rms smoothing; the first window_size - 1 rows have no full window and are dropped
        sample_df = sample_df.rolling(window=window_size).apply(rms, raw=True)
        sample_df = sample_df.dropna()

        sample_df.columns = [f"channel {channel + 1}" for channel in range(sample_df.shape[1])]

        # Register the substance on first sight; len(class_map) is the next free label.
        class_map.setdefault(substance, len(class_map))

        # Constant label columns let groupby/agg collapse the recording into one row.
        sample_df['substance'] = substance
        sample_df['volume'] = volume
        sample_df_grouped = sample_df.groupby(['substance', 'volume'])

        features_df = sample_df_grouped.agg(['max', mav, rms, wavelength, var, abs_diffs_signal, mean_freq, median_freq, peak_freq])
        feature_frames.append(features_df)

# A single concat avoids re-copying the growing table on every iteration.
df = pd.concat(feature_frames) if feature_frames else pd.DataFrame()
df.reset_index(inplace=True)
# Show the last processed recording as a quick sanity check.
sample_df

In [None]:
# Display the assembled feature table.
df

In [None]:
# Show which integer label was assigned to each substance.
print(class_map)

In [None]:
# Replace each substance name with its integer label from class_map.
# NOTE(review): running this cell twice sends the already-encoded integers through
# class_map again; pandas `map` yields NaN for values that are not keys — re-run
# the feature-extraction cell first if this needs repeating.
df['substance'] = df['substance'].map(class_map)
df

In [None]:
# Save the feature table to CSV, omitting the row index.
df.to_csv(r'C:\Users\chloe\OneDrive\Desktop\swallow EMG\notebooks\clean_df.csv', index=False)

# Single Label Classification
### Substance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Features are every column except the two labels; the target here is the substance label.
x = df.drop(columns=['volume', 'substance'])
y = df['substance']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Sanity check: both shapes should report the same number of training rows.
print(x_train_scaled.shape)  
print(y_train.shape)  

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Per-model results for the current single-label task. train_and_plot_model
# appends one entry to each list per call; metric values are stored as percentages.
models = []
model_accuracies = []
model_precisions = []
model_recalls = []
model_f1scores = []

def train_and_plot_model(model, model_name):
    """Fit ``model`` on the current split, report test metrics and plot a confusion matrix.

    Reads the module-level ``x_train_scaled``, ``y_train``, ``x_test_scaled`` and
    ``y_test`` at call time, so it uses whichever split was prepared most recently.
    Appends ``model_name`` to ``models`` and the accuracy, weighted precision,
    weighted recall and weighted F1 score (each multiplied by 100) to the
    corresponding module-level lists.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict`` and, once fitted, ``classes_``.
    model_name : str
        Label used in the result lists and in the plot title.
    """
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1score = f1_score(y_test, y_pred, average='weighted')

    # Use one explicit label list for both the matrix and its tick labels. By
    # default confusion_matrix takes its labels from y_test/y_pred, which need not
    # match model.classes_ (the labels seen while fitting); a mismatch would
    # mislabel the axes or make the heatmap fail.
    labels = np.union1d(model.classes_, y_test)
    model_confusion_matrix = confusion_matrix(y_test, y_pred, labels=labels)

    models.append(model_name)
    model_accuracies.append(accuracy * 100)
    model_precisions.append(precision * 100)
    model_recalls.append(recall * 100)
    model_f1scores.append(f1score * 100)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 score: {f1score:.4f}')
    print('----------------------------')

    plt.figure(figsize=(4, 3))
    sns.heatmap(model_confusion_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

### SVM
The lines below are taken from here: https://www.kaggle.com/code/calulamabel/emg-hand-gesture-classification/notebook

In [None]:
# Support vector classifier with a linear kernel.
svm_model = svm.SVC(kernel='linear')
train_and_plot_model(svm_model, 'SVM')

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 3 nearest training samples.
knn_model = KNeighborsClassifier(n_neighbors=3)
train_and_plot_model(knn_model, 'KNN')

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model; the value matches the
# random_state already used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
train_and_plot_model(dt_model, 'Decision Tree')

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
nb_model = GaussianNB()
train_and_plot_model(nb_model, 'Naive Bayes')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings.
regression_model = LogisticRegression()
train_and_plot_model(regression_model, 'Logistic Regression')

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so repeated runs build the same model; the value matches the
# random_state already used for the train/test split.
gb_model = GradientBoostingClassifier(random_state=42)
train_and_plot_model(gb_model, 'Gradient Boosting')

### Model Comparison

In [None]:
# Grouped bar chart: four metric bars per model, centred on the model's tick.
x = np.arange(len(models))
width = 0.2

# (offset in bar widths, values, legend label, colour) for each metric series
metric_series = [
    (-1.5, model_accuracies, 'Accuracy', 'dodgerblue'),
    (-0.5, model_precisions, 'Precision', 'deepskyblue'),
    (0.5, model_recalls, 'Recall', 'lightskyblue'),
    (1.5, model_f1scores, 'F1 Score', 'lightblue'),
]
for bar_offset, bar_values, bar_label, bar_color in metric_series:
    plt.bar(x + bar_offset*width, bar_values, width, label=bar_label, color=bar_color)

# Legend goes outside the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.xticks(x, labels=models, rotation=-45)
plt.xlabel('Model')
plt.ylabel('Percent')

plt.tight_layout()
plt.show()

### Volume

In [None]:
# Reset the result lists so the volume comparison starts empty.
models = []
model_accuracies = []
model_precisions = []
model_recalls = []
model_f1scores = []

# Same features as for the substance task; the target is now the volume label.
x = df.drop(columns=['volume', 'substance'])
y = df['volume']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Sanity check: both shapes should report the same number of training rows.
print(x_train_scaled.shape)  
print(y_train.shape)  

In [None]:
# Re-fit the SVM on the volume split and record its metrics.
train_and_plot_model(svm_model, 'SVM')

In [None]:
# Re-fit the KNN classifier on the volume split and record its metrics.
train_and_plot_model(knn_model, 'KNN')

In [None]:
# Re-fit the decision tree on the volume split and record its metrics.
train_and_plot_model(dt_model, 'Decision Tree')

In [None]:
# Re-fit the naive Bayes classifier on the volume split and record its metrics.
train_and_plot_model(nb_model, 'Naive Bayes')

In [None]:
# Re-fit the logistic regression on the volume split and record its metrics.
train_and_plot_model(regression_model, 'Logistic Regression')

In [None]:
# Re-fit the gradient boosting classifier on the volume split and record its metrics.
train_and_plot_model(gb_model, 'Gradient Boosting')

In [None]:
# Grouped bar chart: four metric bars per model, centred on the model's tick.
x = np.arange(len(models))
width = 0.2

# (offset in bar widths, values, legend label, colour) for each metric series
metric_series = [
    (-1.5, model_accuracies, 'Accuracy', 'dodgerblue'),
    (-0.5, model_precisions, 'Precision', 'deepskyblue'),
    (0.5, model_recalls, 'Recall', 'lightskyblue'),
    (1.5, model_f1scores, 'F1 Score', 'lightblue'),
]
for bar_offset, bar_values, bar_label, bar_color in metric_series:
    plt.bar(x + bar_offset*width, bar_values, width, label=bar_label, color=bar_color)

# Legend goes outside the axes so it does not cover the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.xticks(x, labels=models, rotation=-45)
plt.xlabel('Model')
plt.ylabel('Percent')

plt.tight_layout()
plt.show()

# Multi Label Classification

In [None]:
# Display names used in the per-label confusion-matrix titles.
label_names = ['Substance', 'Volume']

# Results for the multi-label task, filled in by train_and_plot_multioutput_model.
models = []
model_exact_accuracies = [] # exact-match accuracy in percent: both labels predicted right

def train_and_plot_multioutput_model(model, model_name, cm=True):
    """Fit a multi-output ``model`` on the current split and report its accuracies.

    Reads the module-level ``x_train_scaled``, ``y_train``, ``x_test_scaled`` and
    ``y_test`` at call time. Appends ``model_name`` to ``models`` and the
    exact-match accuracy (multiplied by 100) to ``model_exact_accuracies``.

    Parameters
    ----------
    model : estimator whose ``predict`` returns one column per label.
    model_name : str
        Label used in the result lists, the printout and the plot titles.
    cm : bool, default True
        When truthy, plot one confusion matrix per label column of ``y_test``,
        titled with the matching entry of ``label_names``.
    """
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)

    # Exact match accuracy (all labels correct)
    exact_match_accuracy = np.mean(np.all(y_test.values == y_pred, axis=1))

    # Per-label accuracy
    per_label_accuracy = (y_test == y_pred).mean(axis=0)
    average_label_accuracy = per_label_accuracy.mean()

    models.append(model_name)
    model_exact_accuracies.append(exact_match_accuracy * 100)

    print(model_name)
    print(f'Exact Match Accuracy: {exact_match_accuracy:.4f}')
    print(f'Average Per-Label Accuracy: {average_label_accuracy:.4f}')
    print(f'Per-Label Accuracies: {per_label_accuracy.values if hasattr(per_label_accuracy, "values") else per_label_accuracy}')
    print('----------------------------')

    # Confusion matrices for each label
    if cm:
        for label_idx, _ in enumerate(y_test.columns):
            # Named label_matrix so the `cm` flag is not overwritten inside the loop.
            label_matrix = confusion_matrix(y_test.iloc[:, label_idx], y_pred[:, label_idx])
            plt.figure(figsize=(4, 3))
            sns.heatmap(label_matrix, annot=True, fmt='d', cmap='Blues')
            plt.title(f'{model_name} Confusion Matrix, {label_names[label_idx]}')
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.show()

In [None]:
# Same features as before, but with both labels as a two-column target.
x = df.drop(columns=['volume', 'substance'])
y = df[['substance', 'volume']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Cast both label columns to int.
# NOTE(review): assumes every volume value parses as an integer — confirm against the folder names.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Sanity check: both shapes should report the same number of training rows.
print(x_train_scaled.shape)  
print(y_train.shape)  

In [None]:
from sklearn.multioutput import MultiOutputClassifier

# Wrap each base estimator so it predicts substance and volume together, then
# record its exact-match accuracy. Confusion matrices are skipped (cm=False).
multioutput_candidates = [
    ('SVM', svm_model),
    ('KNN', knn_model),
    ('Decision Tree', dt_model),
    ('Naive Bayes', nb_model),
    # regression_model is a LogisticRegression instance, so label it as such
    # (it was previously reported as 'Linear Regression').
    ('Logistic Regression', regression_model),
    ('Gradient Boosting', gb_model),
]

for candidate_name, base_estimator in multioutput_candidates:
    model = MultiOutputClassifier(base_estimator)
    train_and_plot_multioutput_model(model, candidate_name, cm=False)

In [None]:
# One bar per model showing its exact-match accuracy on the multi-label task.
plt.bar(models, model_exact_accuracies, width=0.5)

plt.xticks(rotation=-45)
plt.xlabel('Model')
plt.ylabel('Exact Accuracy (%)')

plt.show()

In [None]:
import csv

# Write the exact-match accuracies as a single CSV row (values only, no model
# names), in the order the models were evaluated.
with open('multioutput_rectified_accuracies.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(model_exact_accuracies)

In [None]:
# Load the first row of the bandpass accuracy file as a plain list.
# NOTE(review): presumably written by 'swallow classification bandpass.ipynb' in the
# same single-row format and model order as the file saved above — confirm.
bandpass_accuracies_df = pd.read_csv('multioutput_bandpass_accuracies.csv', header=None)
bandpass_accuracies = bandpass_accuracies_df.iloc[0].tolist()

x = np.arange(len(models))  # one x position per model
width = 0.35  # width of each bar

# Plot bars side by side; both lists need one value per entry in `models`
plt.bar(x - width/2, bandpass_accuracies, width, label='Bandpass', color='blue')
plt.bar(x + width/2, model_exact_accuracies, width, label='Rectified', color='green')

# Add labels
plt.xticks(x, models, rotation=-45)
plt.ylabel('Exact Accuracy (%)')
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()