# Data Cleaning and Preprocessing

This notebook focuses on cleaning the raw sensor data by handling missing values, normalizing the sensor readings, and identifying outliers. These steps ensure the data is ready for further analysis and pattern detection.

In [4]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Load the raw tab-separated sensor log
data_path = '../data/raw/20251223151337.txt'
df = pd.read_csv(data_path, sep='\t')

# Parse 'time' column to datetime and use it as the index.
# The result is reassigned instead of using inplace=True so every step in
# this cell is a plain, re-runnable reassignment of `df`.
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time')

# Count missing values per column before any filling
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Fill missing values (if any). Forward fill carries the last valid reading
# forward; the back fill then covers gaps at the very start of the log,
# which a forward fill alone cannot reach because no earlier reading exists.
df = df.ffill().bfill()

# Verify no missing values remain: show the counts and fail loudly if any
# column is still incomplete (only possible when a column is entirely empty).
remaining_missing = df.isnull().sum()
print("\nMissing values after filling:")
print(remaining_missing)
assert remaining_missing.sum() == 0, "Columns still contain missing values after filling"

Missing values per column:
DeviceName          0
AccX(g)             0
AccY(g)             0
AccZ(g)             0
AsX(°/s)            0
AsY(°/s)            0
AsZ(°/s)            0
AngleX(°)           0
AngleY(°)           0
AngleZ(°)           0
HX(uT)              0
HY(uT)              0
HZ(uT)              0
Q0()                0
Q1()                0
Q2()                0
Q3()                0
Temperature(°C)     0
Height(m)           0
Pressure(kPa)       0
Version()           0
Battery level(%)    0
dtype: int64

Missing values after filling:
DeviceName          0
AccX(g)             0
AccY(g)             0
AccZ(g)             0
AsX(°/s)            0
AsY(°/s)            0
AsZ(°/s)            0
AngleX(°)           0
AngleY(°)           0
AngleZ(°)           0
HX(uT)              0
HY(uT)              0
HZ(uT)              0
Q0()                0
Q1()                0
Q2()                0
Q3()                0
Temperature(°C)     0
Height(m)           0
Pressure(kPa)       0
Versi

In [8]:
# Min-max scale the motion channels, one row of names per sensor group:
# acceleration (g), angular rate (°/s) and angle (°).
columns_to_normalize = [
    'AccX(g)', 'AccY(g)', 'AccZ(g)',
    'AsX(°/s)', 'AsY(°/s)', 'AsZ(°/s)',
    'AngleX(°)', 'AngleY(°)', 'AngleZ(°)',
]

# Learn each column's min/max from the data, then rescale those same columns.
# The scaled values overwrite the originals in `df`; `scaler` is kept so the
# fitted ranges remain available afterwards.
scaler = MinMaxScaler()
raw_readings = df[columns_to_normalize]
scaler.fit(raw_readings)
df[columns_to_normalize] = scaler.transform(raw_readings)

# Preview the first rows of the normalized frame
df.head()

Unnamed: 0_level_0,DeviceName,AccX(g),AccY(g),AccZ(g),AsX(°/s),AsY(°/s),AsZ(°/s),AngleX(°),AngleY(°),AngleZ(°),...,HZ(uT),Q0(),Q1(),Q2(),Q3(),Temperature(°C),Height(m),Pressure(kPa),Version(),Battery level(%)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-12-23 15:13:37.349,WT901BLE67(D2:B1:E5:85:1B:B2),0.404977,0.260614,0.09188,0.424839,0.489177,0.549027,0.00953,0.546488,0.518907,...,16.373,0.711,-0.69696,-0.09274,-0.00641,24.7,0.0,0.0,10080.1.20,10
2025-12-23 15:13:37.465,WT901BLE67(D2:B1:E5:85:1B:B2),0.383484,0.257671,0.089031,0.425022,0.493631,0.54574,0.008322,0.545929,0.519187,...,16.373,0.711,-0.69696,-0.09274,-0.00641,24.7,0.0,0.0,10080.1.20,10
2025-12-23 15:13:37.524,WT901BLE67(D2:B1:E5:85:1B:B2),0.424208,0.266078,0.091168,0.423732,0.455659,0.537661,0.006846,0.542296,0.521007,...,16.373,0.711,-0.69696,-0.09274,-0.00641,24.7,0.0,0.0,10080.1.20,10
2025-12-23 15:13:37.643,WT901BLE67(D2:B1:E5:85:1B:B2),0.406674,0.269441,0.096866,0.420238,0.442053,0.522733,0.004027,0.53596,0.527724,...,19.827,0.711,-0.69696,-0.09274,-0.00641,24.7,0.0,0.0,10080.1.20,10
2025-12-23 15:13:37.734,WT901BLE67(D2:B1:E5:85:1B:B2),0.369344,0.2438,0.099715,0.44011,0.416574,0.490687,0.0,0.519005,0.536149,...,19.827,0.711,-0.69696,-0.09274,-0.00641,24.7,0.0,0.0,10080.1.20,10


In [34]:
# Save only meaningful data to a processed CSV file: the motion, magnetometer
# and quaternion columns. Device metadata and the temperature, height,
# pressure, version and battery columns are left out.
columns_to_save = ['AccX(g)', 'AccY(g)', 'AccZ(g)', 'AsX(°/s)', 'AsY(°/s)', 'AsZ(°/s)',
                   'AngleX(°)', 'AngleY(°)', 'AngleZ(°)', 'HX(uT)', 'HY(uT)', 'HZ(uT)',
                   'Q0()', 'Q1()', 'Q2()', 'Q3()']
processed_data_path = '../data/processed/cleaned_data.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise this cell fails on a fresh checkout.
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)

# The frame's index is written as the first CSV column (to_csv default)
df[columns_to_save].to_csv(processed_data_path)
print(f"Saved data with meaningful columns to {processed_data_path}")

Saved data with meaningful columns to ../data/processed/cleaned_data.csv
