In [1]:
import pandas as pd
import numpy as np

In [28]:
from collections import Counter
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight

from scipy import stats

In [64]:
from sklearn.metrics import classification_report

In [3]:
# Load the three pre-split CSV files (train / validation / test) from the
# notebook's working directory into separate DataFrames.
train_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_data.csv', './validation_data.csv', './test_data.csv')
)

In [4]:
def print_row_distribution(df):
    """Print how many rows each ACTIVITY label has, most frequent first.

    df: any mapping/DataFrame whose 'ACTIVITY' entry is an iterable of labels.
    """
    print("Row count per class (descending):")
    # most_common() orders by count descending and keeps first-seen order on ties.
    for activity_label, n_rows in Counter(df['ACTIVITY']).most_common():
        print(f"{activity_label:20} {n_rows}")

In [None]:
# Show the raw per-row class balance of the training set before windowing.
print_row_distribution(train_df)

In [6]:
# BUILD SLIDING WINDOW
def create_windows(dataset, window_size=11, step_size=5, feature_cols=['ACC_X', 'ACC_Y', 'ACC_Z']):
    """Cut each person's rows into fixed-length, overlapping windows.

    Windows never span two people: rows are first filtered per PERSON_ID
    (in order of first appearance) and windows are taken inside that subset
    only. Each window is labelled with the most frequent ACTIVITY value among
    its rows (NaN counts as a value; ties resolve to the first mode pandas
    returns).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'PERSON_ID', 'ACTIVITY' and every column in feature_cols.
    window_size : int, default 11
        Number of consecutive rows per window.
    step_size : int, default 5
        Row offset between the starts of two consecutive windows.
    feature_cols : sequence of str
        Columns copied into each window. The default list is never mutated.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X has shape (n_windows, window_size, len(feature_cols)); y has one
        label per window. With no windows, X is an empty array that still
        carries the (window_size, n_features) trailing dimensions.

    Raises
    ------
    ValueError
        If window_size or step_size is smaller than 1.

    Side effects
    ------------
    Prints the total number of windows produced.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must both be >= 1")

    feature_cols = list(feature_cols)  # private copy; also accepts tuples
    X = []
    y = []

    for person_id in dataset['PERSON_ID'].unique():
        person_data = dataset[dataset['PERSON_ID'] == person_id]
        feature_values = person_data[feature_cols].values
        activity = person_data['ACTIVITY']

        # Last admissible start index. The "+ 1" below keeps the window that
        # ends exactly on the person's final row (previously dropped).
        last_start = len(person_data) - window_size

        for start in range(0, last_start + 1, step_size):
            stop = start + window_size
            X.append(feature_values[start:stop])
            # .iloc makes the slice positional regardless of the frame's index.
            y.append(activity.iloc[start:stop].mode(dropna=False).iloc[0])

    print(len(X))

    if not X:
        # Keep the trailing dimensions so downstream shape lookups still work.
        return np.empty((0, window_size, len(feature_cols))), np.array(y)

    return np.array(X), np.array(y)

In [12]:
# Windowing configuration used for all three splits (overrides the
# create_windows defaults): rows per window, and rows between window starts.
window_size, step_size = 12, 6

In [None]:
# Window every split with the same settings so all X arrays share the
# (window_size, n_features) trailing shape.
windowing_kwargs = {'window_size': window_size, 'step_size': step_size}

X_train, y_train = create_windows(train_df, **windowing_kwargs)
X_val, y_val = create_windows(validation_df, **windowing_kwargs)
X_test, y_test = create_windows(test_df, **windowing_kwargs)

In [14]:
def print_window_distribution(y_labels):
    """Print the number of windows per class, most frequent first.

    Used to inspect the class imbalance among windows. y_labels is any
    iterable of window labels.
    """
    print("Window count per class (descending):")
    # most_common() orders by count descending and keeps first-seen order on ties.
    for window_label, n_windows in Counter(y_labels).most_common():
        print(f"{window_label:20} {n_windows}")

In [None]:
# Class balance of the training windows before any downsampling.
print_window_distribution(y_train)

In [16]:
def downsample_upweight_majority_class(X_train, y_train, downsample_factor, majority_class, sample_weights):
    """Randomly thin out one class and up-weight the samples that survive.

    Keeps `count // downsample_factor` randomly chosen samples of
    `majority_class` (drawn without replacement) and multiplies their weight
    by `downsample_factor`. Samples of every other class are kept untouched.
    The original ordering of the kept samples is preserved.

    Parameters
    ----------
    X_train : array-like, first axis = samples
    y_train : array-like of labels, same length as X_train
    downsample_factor : number >= 1
        If the class has fewer samples than this factor, none are kept.
    majority_class : label value to downsample
    sample_weights : array-like, same length as X_train

    Returns
    -------
    (new_X_train, new_y_train, new_sample_weights) : tuple of numpy.ndarray
        new_X_train keeps X_train's trailing dimensions even when empty.

    Raises
    ------
    ValueError
        If downsample_factor is smaller than 1.

    Side effects
    ------------
    Re-seeds NumPy's global RNG with 42 on every call (so the selection is
    reproducible) and prints the size of the majority class.
    """
    if downsample_factor < 1:
        raise ValueError("downsample_factor must be >= 1")

    np.random.seed(42)

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    sample_weights = np.asarray(sample_weights)

    majority_class_indices = np.where(y_train == majority_class)[0]
    number_of_majority_samples = len(majority_class_indices)

    print(f"total number of rows on X for majority class {majority_class}: {number_of_majority_samples}")
    print(f"total number of rows on y for majority class {majority_class}: {number_of_majority_samples}")

    number_of_samples_to_extract = number_of_majority_samples // downsample_factor

    # Positions *within the majority subset* that survive the downsampling.
    random_chosen_indices = np.random.choice(number_of_majority_samples, number_of_samples_to_extract, replace=False)
    selected_majority_indices = majority_class_indices[random_chosen_indices]

    # Boolean masks replace per-row `in ndarray` tests (one linear scan each).
    is_majority = np.zeros(len(y_train), dtype=bool)
    is_majority[majority_class_indices] = True
    is_selected = np.zeros(len(y_train), dtype=bool)
    is_selected[selected_majority_indices] = True

    keep = ~is_majority | is_selected

    # Surviving majority samples stand in for `downsample_factor` originals.
    weight_multiplier = np.where(is_selected[keep], downsample_factor, 1)
    new_sample_weights = sample_weights[keep] * weight_multiplier

    return X_train[keep], y_train[keep], new_sample_weights


In [17]:
# Start every training window at weight 1; the downsampling step scales
# the weights of the windows it keeps.
sample_weights = np.ones(len(X_train), dtype=int)

In [18]:
# Class label -> downsample factor: each listed class keeps 1/factor of its
# windows. Insertion order is the order in which classes are processed.
class_downsample_factors = dict([
    ('sleep', 20),
    ('sitting', 10),
    ('household-chores', 2),
    ('walking', 2),
])

In [None]:
# Downsample the configured classes one after another; each pass consumes the
# output of the previous one, so weights and data stay aligned.
# NOTE: this cell overwrites X_train / y_train / sample_weights, so running it
# twice downsamples twice -- re-run the windowing and weight-init cells first.
for class_name in class_downsample_factors:
    downsample_factor = class_downsample_factors[class_name]
    new_X_train, new_y_train, sample_weights = downsample_upweight_majority_class(
        X_train, y_train, downsample_factor, class_name, sample_weights
    )
    X_train, y_train = new_X_train, new_y_train

In [20]:
# Point the training arrays at the result of the last downsampling pass.
X_train, y_train = new_X_train, new_y_train

In [None]:
# Class balance of the training windows after downsampling.
print_window_distribution(y_train)

In [22]:
# Z-score standardiser; it is fitted on the training windows only.
scaler = StandardScaler()

# Training tensor layout: (windows, rows per window, features per row).
n_samples, n_timesteps, n_features = X_train.shape

In [23]:
# Merge the window and timestep axes into one row axis, giving 2-D arrays of
# shape (windows * timesteps, features) that the scaler can consume.
X_train_flat = np.reshape(X_train, (-1, X_train.shape[-1]))
X_val_flat = np.reshape(X_val, (-1, X_val.shape[-1]))
X_test_flat = np.reshape(X_test, (-1, X_test.shape[-1]))

In [24]:
# Fit the scaler on training rows only, then apply the same statistics to the
# validation and test rows; finally restore the 3-D window layout.
window_shape = (n_timesteps, n_features)

train_rows_scaled = scaler.fit_transform(X_train_flat)
val_rows_scaled = scaler.transform(X_val_flat)
test_rows_scaled = scaler.transform(X_test_flat)

X_train_scaled = train_rows_scaled.reshape((n_samples, *window_shape))
X_val_scaled = val_rows_scaled.reshape((X_val.shape[0], *window_shape))
X_test_scaled = test_rows_scaled.reshape((X_test.shape[0], *window_shape))

In [None]:
# ENCODE LABELS
# The encoder is fitted on the training labels only; validation/test labels
# are mapped with the same label -> integer table.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)

num_classes = len(le.classes_)  # Number of unique classes seen in training

# Convert labels to one-hot encoding.
# num_classes is passed explicitly: without it the width is inferred from the
# largest index present, so a split missing the last class would get a
# narrower matrix than the model's output layer.
y_train_cat = to_categorical(y_train_enc, num_classes=num_classes)
y_val_cat = to_categorical(y_val_enc, num_classes=num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes=num_classes)

In [None]:
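# NOTE: class weighting is disabled; model.fit below is called with per-sample
# weights (sample_weight=sample_weights) instead. Kept for reference only.
# # Compute class weights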
# class_weights_array = class_weight.compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(y_train_enc),
#     y=y_train_enc
# )

# # Convert to dictionary
# class_weights = dict(enumerate(class_weights_array))

In [80]:
# Build the LSTM classifier layer by layer:
# LSTM over each window (last state only) -> dropout -> dense -> softmax.
window_input_shape = (X_train_scaled.shape[1], X_train_scaled.shape[2])

model = Sequential()
model.add(LSTM(128, input_shape=window_input_shape, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

In [81]:
# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [82]:
# Compile for multi-class classification on one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with the per-sample weights produced by the downsampling step;
# validation data is used for early stopping.
history = model.fit(
    x=X_train_scaled,
    y=y_train_cat,
    sample_weight=sample_weights,
    # class_weight=class_weights
    validation_data=(X_val_scaled, y_val_cat),
    batch_size=128,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Score the trained model on the held-out test windows.
test_metrics = model.evaluate(X_test_scaled, y_test_cat, verbose=0)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")

In [None]:
# Turn class probabilities / one-hot rows back into the original string labels
# and print per-class precision, recall and F1.
y_pred = model.predict(X_test_scaled)

predicted_class_ids = np.argmax(y_pred, axis=1)
true_class_ids = np.argmax(y_test_cat, axis=1)

y_pred_labels = le.inverse_transform(predicted_class_ids)
y_true_labels = le.inverse_transform(true_class_ids)

report = classification_report(y_true_labels, y_pred_labels)
print(report)