# Data Cleaning and Preprocessing

This notebook focuses on cleaning the raw sensor data by handling missing values, normalizing the sensor readings, and identifying outliers. These steps ensure the data is ready for further analysis and pattern detection.

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
import matplotlib.pyplot as plt
import numpy as np

In [6]:
# Load the raw data
data_path = '../data/raw/20251223192751.txt'
df = pd.read_csv(data_path, sep='\t')

# Parse 'time' column to datetime
df['time'] = pd.to_datetime(df['time'])

# Set 'time' as index
df.set_index('time', inplace=True)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Fill missing values (if any) using forward fill
df = df.ffill()

# Verify no missing values remain
print("\nMissing values after filling:")
print(df.isnull().sum())

Missing values per column:
DeviceName          0
AccX(g)             0
AccY(g)             0
AccZ(g)             0
AsX(°/s)            0
AsY(°/s)            0
AsZ(°/s)            0
AngleX(°)           0
AngleY(°)           0
AngleZ(°)           0
HX(uT)              0
HY(uT)              0
HZ(uT)              0
Q0()                0
Q1()                0
Q2()                0
Q3()                0
Temperature(°C)     0
Height(m)           0
Pressure(kPa)       0
Version()           0
Battery level(%)    0
dtype: int64

Missing values after filling:
DeviceName          0
AccX(g)             0
AccY(g)             0
AccZ(g)             0
AsX(°/s)            0
AsY(°/s)            0
AsZ(°/s)            0
AngleX(°)           0
AngleY(°)           0
AngleZ(°)           0
HX(uT)              0
HY(uT)              0
HZ(uT)              0
Q0()                0
Q1()                0
Q2()                0
Q3()                0
Temperature(°C)     0
Height(m)           0
Pressure(kPa)       0
Versi

In [7]:
# Lowpass filter
from scipy.signal import butter, filtfilt
def butter_lowpass(data, cutoff, fs, order=4):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low')
    return filtfilt(b, a, data)

def lowpass_filter_series(series, cutoff=2.0, fs=20.0, order=4):
    filtered_data = butter_lowpass(series.values, cutoff, fs, order)
    return pd.Series(filtered_data, index=series.index)

# Apply lowpass filter to accelerometer data
df['accel_x_filtered'] = lowpass_filter_series(df['AccX(g)'])
df['accel_y_filtered'] = lowpass_filter_series(df['AccY(g)'])
df['accel_z_filtered'] = lowpass_filter_series(df['AccZ(g)'])

# Apply lowpass filter to gyroscope data
df['gyro_x_filtered'] = lowpass_filter_series(df['AsX(°/s)'])
df['gyro_y_filtered'] = lowpass_filter_series(df['AsY(°/s)'])
df['gyro_z_filtered'] = lowpass_filter_series(df['AsZ(°/s)'])

# Calculate magnitude of accelerometer data
df['Acc_Magnitude'] = np.sqrt(df['AccX(g)']**2 + df['AccY(g)']**2 + df['AccZ(g)']**2)
df['Acc_Magnitude_filtered'] = np.sqrt(df['accel_x_filtered']**2 + df['accel_y_filtered']**2 + df['accel_z_filtered']**2)

In [8]:
# Normalize sensor readings
columns_to_normalize = [
    'accel_x_filtered',
    'accel_y_filtered',
    'accel_z_filtered',
    'gyro_x_filtered',
    'gyro_y_filtered',
    'gyro_z_filtered',
    'Acc_Magnitude',
    'Acc_Magnitude_filtered',
    'AccX(g)',
    'AccY(g)',
    'AccZ(g)',
    'AsX(°/s)',
    'AsY(°/s)',
    'AsZ(°/s)',
    'AngleX(°)',
    'AngleY(°)',
    'AngleZ(°)'
]

#scaler = MinMaxScaler()
#df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

# Display the first few rows of normalized data
df.head()

Unnamed: 0_level_0,DeviceName,AccX(g),AccY(g),AccZ(g),AsX(°/s),AsY(°/s),AsZ(°/s),AngleX(°),AngleY(°),AngleZ(°),...,Version(),Battery level(%),accel_x_filtered,accel_y_filtered,accel_z_filtered,gyro_x_filtered,gyro_y_filtered,gyro_z_filtered,Acc_Magnitude,Acc_Magnitude_filtered
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-12-23 19:27:51.252,WT901BLE67(D2:B1:E5:85:1B:B2),0.951,0.054,0.448,-8.85,2.38,-5.127,13.98,-66.26,-21.17,...,10080.1.20,10,0.951057,0.054455,0.447113,-8.666166,2.431034,-5.101846,1.052626,1.052324
2025-12-23 19:27:51.341,WT901BLE67(D2:B1:E5:85:1B:B2),0.851,0.06,0.449,-5.615,10.864,-11.23,14.63,-65.4,-22.75,...,10080.1.20,10,0.920744,0.074661,0.452794,-7.306544,6.713592,-14.256866,0.964055,1.028769
2025-12-23 19:27:51.461,WT901BLE67(D2:B1:E5:85:1B:B2),0.898,0.063,0.461,-4.578,6.592,-15.137,16.11,-63.61,-25.08,...,10080.1.20,10,0.89093,0.099818,0.457153,-6.838914,9.620916,-22.187838,1.011382,1.006335
2025-12-23 19:27:51.522,WT901BLE67(D2:B1:E5:85:1B:B2),0.969,0.096,0.447,-4.517,8.667,-18.127,18.24,-62.48,-27.94,...,10080.1.20,10,0.863048,0.132399,0.45894,-7.638695,10.037866,-27.74766,1.071441,0.986411
2025-12-23 19:27:51.641,WT901BLE67(D2:B1:E5:85:1B:B2),0.861,0.134,0.519,2.38,1.343,-39.307,22.73,-61.42,-33.46,...,10080.1.20,10,0.839964,0.170697,0.457428,-9.272914,7.33421,-29.982519,1.014218,0.971554


In [9]:
from pathlib import Path

# Save only meaningful data to a processed CSV file
columns_to_save = ['AccX(g)', 'AccY(g)', 'AccZ(g)', 'AsX(°/s)', 'AsY(°/s)', 'AsZ(°/s)',
                   'AngleX(°)', 'AngleY(°)', 'AngleZ(°)', 'HX(uT)', 'HY(uT)', 'HZ(uT)',
                   'Q0()', 'Q1()', 'Q2()', 'Q3()']

# Generate processed data path based on raw data filename
raw_filename = Path(data_path).stem  # Extract raw filename without extension
processed_data_path = Path("../data/processed") / f"{raw_filename}_cleaned.csv"

df[columns_to_save].to_csv(processed_data_path)
print(f"Saved data with meaningful columns to {processed_data_path}")

Saved data with meaningful columns to ../data/processed/20251223192751_cleaned.csv
