In [1]:
import os
import numpy as np
import pandas as pd
from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler
import pickle

In [2]:
CHUNK_INTERVAL = 2500  # ms
SAMPLING_RATE = 144
DATASET_DIR = "./dataset/"
BASE_PATH = "~/Downloads/heterogeneity+activity+recognition/Activity recognition exp"

In [3]:
PHONE_ACCELEROMETER_FILE = os.path.join(BASE_PATH, "Phones_accelerometer.csv")
PHONE_GYRO_FILE = os.path.join(BASE_PATH, "Phones_gyroscope.csv")

In [4]:
def load_data():
    """Load accelerometer and gyroscope data."""
    phone_accelerometer = pd.read_csv(PHONE_ACCELEROMETER_FILE)
    phone_gyro = pd.read_csv(PHONE_GYRO_FILE)
    return phone_accelerometer, phone_gyro


def align_data(df, time_column="Arrival_Time"):
    """Align data based on timestamps."""
    return df.sort_values(by=time_column).reset_index(drop=True)


def chunk_data(df, time_column="Arrival_Time", interval_ms=CHUNK_INTERVAL):
    """Chunk data into fixed time intervals."""
    bins = np.arange(df[time_column].min(), df[time_column].max(), interval_ms)
    df["Time_Chunk"] = pd.cut(df[time_column], bins=bins, labels=False)
    return df

In [5]:
def apply_fourier_transform(df, sensor_name, chunk_column="Time_Chunk", feature_columns=None, target_f=SAMPLING_RATE):
    """Apply Fourier Transform to sensor data."""
    if feature_columns is None:
        feature_columns = ["x", "y", "z"]

    chunked_data = []
    errors = 0

    for chunk_id, group in df.groupby(chunk_column):
        try:
            chunk_fft = {f"{sensor_name}_Chunk_ID": chunk_id}
            for col in feature_columns:
                fft_result = fft(group[col].values)
                magnitude = np.abs(fft_result)[:len(fft_result) // 2]
                phase = np.angle(fft_result)[:len(fft_result) // 2]

                if target_f:
                    step = max(1, len(magnitude) // target_f)
                    indices = np.arange(0, len(magnitude), step)[:target_f]
                    magnitude, phase = magnitude[indices], phase[indices]

                chunk_fft[f"{sensor_name}_{col}_fft_magnitude"] = magnitude.tolist()
                chunk_fft[f"{sensor_name}_{col}_fft_phase"] = phase.tolist()

            chunk_fft[f"{sensor_name}_gt"] = group[['gt_acc', 'gt_gyro']].stack().value_counts().idxmax()
            chunked_data.append(chunk_fft)
        except Exception as e:
            errors += 1

    print(f"FFT Processing Errors: {errors}")
    return pd.DataFrame(chunked_data)


def pad_sequence(sequence, target_length):
    """Pad sequences to the target length."""
    return np.pad(sequence, (0, target_length - len(sequence)), mode="constant")


def normalize_and_save(df, axes, output_file, prefix="phone"):
    """Normalize data, combine into tensors, and save to file."""
    scaler = MinMaxScaler()

    # Precompute the maximum sequence length across all axes
    max_length = max(
        max(len(seq) for seq in df[f"{prefix}_{axis}_fft_magnitude"])
        for _, axes_data in axes.items()
        for axis in axes_data
    )

    # Iterate over each sensor to process its axes
    for sensor, axes_data in axes.items():
        padded_magnitudes = []
        for axis in axes_data:
            magnitude_col = f"{prefix}_{axis}_fft_magnitude"
            padded_magnitudes.append(
                np.stack(
                    df[magnitude_col].apply(lambda seq: pad_sequence(seq, max_length)).to_numpy()
                )
            )

        # Stack padded magnitudes into the correct shape: (num_samples, num_axes, max_length)
        combined_magnitudes = np.stack(padded_magnitudes, axis=1)  # Stack along axis=1 for num_axes

        # Normalize magnitudes
        flattened_magnitudes = combined_magnitudes.reshape(-1, combined_magnitudes.shape[-1])  # Flatten for scaler
        normalized_magnitudes = scaler.fit_transform(flattened_magnitudes).reshape(combined_magnitudes.shape)

        # Save normalized data back to the dataframe
        df[f"{sensor}_data"] = list(normalized_magnitudes)

    # Select the sensor data columns and ground truth column for saving
    sensor_data_cols = [f"{s}_data" for s in axes.keys()]
    df = df[sensor_data_cols + [f"{prefix}_gt"]]
    os.makedirs(DATASET_DIR, exist_ok=True)
    pickle.dump(df, open(os.path.join(DATASET_DIR, output_file), "wb"))
    print(f"Data saved to {output_file}")


In [6]:
# Load and preprocess data
phone_accelerometer, phone_gyro = load_data()
phone_data = pd.merge(phone_accelerometer, phone_gyro, on='Arrival_Time', how='inner', suffixes=('_acc', '_gyro'))
filtered_data = phone_data[['x_acc', 'y_acc', 'z_acc', 'gt_acc', 'x_gyro', 'y_gyro', 'z_gyro', 'gt_gyro', "Arrival_Time"]]

In [7]:
aligned_data = align_data(filtered_data)
chunked_data = chunk_data(aligned_data)

In [8]:
# Apply Fourier Transform
phone_acc_fft = apply_fourier_transform(chunked_data, "phone", feature_columns=['x_acc', 'y_acc', 'z_acc', 'x_gyro', 'y_gyro', 'z_gyro'])

FFT Processing Errors: 1056


In [9]:
# Normalize and save
axes = {
    "accelerometer": ["x_acc", "y_acc", "z_acc"],
    "gyro": ["x_gyro", "y_gyro", "z_gyro"]
}
normalize_and_save(phone_acc_fft, axes=axes, output_file="phone_data.pkl")

Data saved to phone_data.pkl


In [10]:
data = pd.read_pickle("./dataset/phone_data.pkl")
print(data.info())
print(f"\ndf shape: {data.shape}")
print(f"accelerometer_data shape: {data['accelerometer_data'].iloc[0].shape}")
print(f"gyro_data shape: {data['gyro_data'].iloc[0].shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6461 entries, 0 to 6460
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   accelerometer_data  6461 non-null   object
 1   gyro_data           6461 non-null   object
 2   phone_gt            6461 non-null   object
dtypes: object(3)
memory usage: 151.6+ KB
None

df shape: (6461, 3)
accelerometer_data shape: (3, 144)
gyro_data shape: (3, 144)
