In [None]:
import pandas as pd
import numpy as np
import glob

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Bidirectional, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, StandardScaler

from scipy import stats
from collections import Counter

from sklearn.metrics import classification_report

In [2]:
# def handle_empty_activity(activity):
#     if pd.isna(activity):
#         activity = 'NAN'
#     return activity

In [3]:
# # read capture24 dataset
# fp = '/home/franci/licenta/datasets_archives/capture24/'
# capture24_csv_files = glob.glob(fp + 'P*.csv')
# capture24_csv_files.sort()

# print(capture24_csv_files)

In [4]:
# LABEL_SET = 'label:WillettsSpecific2018'
# activity_labels = 'annotation-label-dictionary.csv'
# annotations_label_dict = pd.read_csv(fp + activity_labels, index_col='annotation', dtype='string')

# person_id = 152
# for f in capture24_csv_files:
#     person_df = pd.read_csv(f, header=0, names=['TIMESTAMP', 'ACC_X', 'ACC_Y', 'ACC_Z', 'ANNOTATION'])
#     person_df['ACTIVITY'] = annotations_label_dict[LABEL_SET].reindex(person_df['ANNOTATION']).to_numpy()
#     person_df['ACTIVITY'] = person_df['ACTIVITY'].apply(lambda x: handle_empty_activity(x))
#     person_df = person_df.drop('ANNOTATION', axis=1)
#     person_df = person_df[person_df['ACTIVITY'] != 'NAN']
#     person_df['PERSON_ID'] = person_id
#     cols = person_df.columns.to_list()
#     cols = cols[-1:] + cols[:-1]
#     person_df = person_df[cols]
#     person_df = person_df[::100]

#     if person_id == 152:
#         person_df.to_csv(f, mode='w+', index=False)
#         print("Saved first file with header!\n")
#     else:
#         person_df.to_csv(f, mode='w+', index=False, header=False)

#     print(person_id)
#     person_id = person_id + 1

In [5]:
# Load the combined capture24 CSV, then turn the TIMESTAMP column into pandas
# datetimes (the train/validation/test split happens in a later cell).
capture24 = pd.read_csv('./all_capture24_original_timestamp.csv')
capture24['TIMESTAMP'] = capture24['TIMESTAMP'].pipe(pd.to_datetime)

In [6]:
def split_dataset(df, train_ids=(152, 257), val_ids=(258, 287), test_ids=(288, 302)):
    """Split a dataframe into train / validation / test subsets by participant.

    Splitting on ``PERSON_ID`` (rather than on rows) keeps every row of a given
    participant inside a single subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``PERSON_ID`` column.
    train_ids, val_ids, test_ids : tuple of (low, high)
        Inclusive ``PERSON_ID`` bounds for each subset. The defaults reproduce
        the ranges that were previously hard-coded in this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``. Each one is a copy, so modifying it
        does not affect ``df``.
    """
    def _select(id_range):
        low, high = id_range
        # Series.between is inclusive on both ends by default.
        return df[df['PERSON_ID'].between(low, high)].copy()

    return _select(train_ids), _select(val_ids), _select(test_ids)

In [7]:
# Partition the loaded frame into train / validation / test frames by PERSON_ID range.
train_df, validation_df, test_df = split_dataset(capture24)

In [8]:
# BUILD SLIDING WINDOW
def create_windows(dataset, window_size=11, step_size=5, feature_cols=['ACC_X', 'ACC_Y', 'ACC_Z']):
    """Cut each participant's rows into fixed-length, overlapping windows.

    Rows are grouped by ``PERSON_ID`` first, so a window never mixes rows of
    two participants. Only full windows are produced.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``PERSON_ID``, ``ACTIVITY`` and every column in
        ``feature_cols``.
    window_size : int, default 11
        Number of consecutive rows in one window.
    step_size : int, default 5
        Number of rows between the starts of two consecutive windows.
    feature_cols : list of str
        Columns copied into each window. The default list is never mutated.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_windows, window_size, len(feature_cols))`` when at least one
        window was produced, otherwise an empty array.
    y : numpy.ndarray
        One label per window: the most frequent ``ACTIVITY`` inside it (the
        first entry of ``Series.mode(dropna=False)``).

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is smaller than 1.

    Side effect: prints the total number of windows.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must be >= 1")

    X = []
    y = []

    for person_id in dataset['PERSON_ID'].unique():
        person_data = dataset[dataset['PERSON_ID'] == person_id]
        feature_values = person_data[feature_cols].values
        activity = person_data['ACTIVITY']

        # The last valid start is n_rows - window_size, so the range bound needs
        # "+ 1"; without it the final full window was silently dropped (and a
        # recording of exactly window_size rows produced no window at all).
        last_start = len(person_data) - window_size
        for start in range(0, last_start + 1, step_size):
            stop = start + window_size
            X.append(feature_values[start:stop])
            # .iloc keeps the slice positional regardless of the frame's index.
            y.append(activity.iloc[start:stop].mode(dropna=False).iloc[0])

    print(len(X))

    return np.array(X), np.array(y)

In [9]:
# Rows per window handed to create_windows (replaces its default of 11).
window_size = 180
# Rows between consecutive window starts (replaces the default of 5).
step_size = 15

In [None]:
# Window the default accelerometer columns of every split, in train -> validation -> test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    create_windows(split_df, window_size, step_size)
    for split_df in (train_df, validation_df, test_df)
)

In [None]:
def _time_of_day_windows(split_df):
    """Window the TIMESTAMP column and encode it as a numeric time-of-day feature.

    ``create_windows`` over ``['TIMESTAMP']`` yields an array of shape
    ``(n_windows, window_size, 1)`` holding datetimes. That can be neither
    converted to a tensor nor fed to a flat ``(window_size,)`` model input, so
    each timestamp is mapped to the fraction of the day elapsed
    (seconds since midnight / 86400, in [0, 1)) and the trailing axis is dropped.

    Returns ``(X_time, y_time)`` with ``X_time`` of shape
    ``(n_windows, window_size)`` and dtype float32; ``y_time`` is the label
    array returned by ``create_windows``.
    """
    time_windows, labels = create_windows(split_df, window_size, step_size, ['TIMESTAMP'])
    stamps = pd.to_datetime(time_windows.ravel())
    seconds = stamps.hour * 3600 + stamps.minute * 60 + stamps.second
    day_fraction = np.asarray(seconds, dtype='float32') / np.float32(86400.0)
    # One timestamp column per row, so each window flattens to window_size values.
    return day_fraction.reshape(-1, window_size), labels


X_time_train, y_time_train = _time_of_day_windows(train_df)
X_time_val, y_time_val = _time_of_day_windows(validation_df)
X_time_test, y_time_test = _time_of_day_windows(test_df)

In [12]:
# Show how many windows carry each label, to expose the class imbalance among windows.
def print_window_distribution(y_labels):
    """Print a header, then one ``label count`` line per class, most frequent first.

    Labels are left-aligned in a 20-character column. Classes with equal counts
    keep the order in which they first appear in ``y_labels``.
    """
    print("Window count per class (descending):")
    for label, count in Counter(y_labels).most_common():
        print(f"{label:20} {count}")

In [None]:
# Per-class window counts for the training labels.
print_window_distribution(y_train)

In [14]:
# NORMALIZE DATA
scaler = StandardScaler()

# X_train is indexed as (window, timestep, feature); keep the three sizes so the
# scaled rows can be folded back into windows afterwards.
n_samples, n_timesteps, n_features = X_train.shape

In [15]:
# Collapse the window and timestep axes so every row is one timestep with its
# feature values (a 2-D array), which is the layout the scaler is applied to.
X_train_flat, X_val_flat, X_test_flat = (
    windows.reshape(-1, windows.shape[-1])
    for windows in (X_train, X_val, X_test)
)

In [16]:
# Learn the per-feature mean/std from the training rows only, then apply that
# same transform to every split and fold the rows back into their windows.
scaler.fit(X_train_flat)
X_train_scaled = scaler.transform(X_train_flat).reshape(X_train.shape)
X_val_scaled = scaler.transform(X_val_flat).reshape(X_val.shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape)

In [17]:
# ENCODE LABELS
# The encoder is fitted on the training labels only; le.transform raises if the
# validation or test labels contain a class that never occurs in training.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)

num_classes = len(le.classes_)  # Number of unique classes seen in training

# Convert labels to one-hot encoding.
# num_classes is passed explicitly: without it to_categorical sizes the output
# from the largest index present, so a split that lacks the highest-numbered
# class would get a narrower matrix than the model's output layer.
y_train_cat = to_categorical(y_train_enc, num_classes=num_classes)
y_val_cat = to_categorical(y_val_enc, num_classes=num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes=num_classes)

In [None]:
# Main sequence input: one scaled window per sample, sized from X_train_scaled
# as (timesteps, features).
sequence_input = Input(shape=(X_train_scaled.shape[1], X_train_scaled.shape[2]))
# First GRU returns its output at every timestep so the second GRU can consume a sequence.
x = GRU(128, return_sequences=True)(sequence_input)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
# Second GRU returns only its final output: one 64-value vector per window.
x = GRU(64)(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Second input: time features, one value per position along axis 1 of X_time_train.
# NOTE(review): this expects X_time_* to be numeric arrays of shape
# (n_windows, X_time_train.shape[1]) - verify how they are built upstream.
time_input = Input(shape=(X_time_train.shape[1],))  # flat input

# Combine both: GRU summary vector followed by the raw time features.
combined = Concatenate()([x, time_input])

# Dense layers: two ReLU blocks with dropout, then a softmax over num_classes.
x = Dense(64, activation='relu')(combined)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
output = Dense(num_classes, activation='softmax')(x)

# Define the model with two inputs; fit/evaluate/predict must receive them in
# this order: [sequence windows, time features].
model = Model(inputs=[sequence_input, time_input], outputs=output)

In [19]:
# Monitor val_loss with a patience of 5 epochs and restore the best weights when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

In [20]:
# compile model
# categorical_crossentropy pairs with the one-hot targets (y_*_cat) and the
# softmax output layer; accuracy is tracked alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# train model
# Inputs are passed as [sequence windows, time features], matching the order of
# the model's two Input layers. epochs=100 is an upper bound: the early_stopping
# callback can end training sooner based on the validation loss.
model.fit(
    [X_train_scaled, X_time_train],
    y_train_cat,
    validation_data=([X_val_scaled, X_time_val], y_val_cat), 
    epochs=100,
    batch_size=64,
    verbose=1,
    callbacks=[early_stopping]
)

In [None]:
# evaluate
# Loss and accuracy on the test split, using the same two-input order as in training.
test_loss, test_acc = model.evaluate([X_test_scaled, X_time_test], y_test_cat, verbose=0)
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")

In [None]:
# Predict on the test windows. The model was defined with two inputs, so both
# arrays are required, in the same [sequence, time] order used by fit/evaluate;
# passing only X_test_scaled does not match the model's input signature.
y_pred = model.predict([X_test_scaled, X_time_test])

# Turn the most probable class index (predicted) and the one-hot target index
# (true) back into the original string labels.
y_pred_labels = le.inverse_transform(np.argmax(y_pred, axis=1))
y_true_labels = le.inverse_transform(np.argmax(y_test_cat, axis=1))

# Per-class precision / recall / F1. classification_report is already imported
# in the notebook's first cell, so it is not re-imported here.
print(classification_report(y_true_labels, y_pred_labels))