In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from collections import defaultdict


#  Import additional libraries for missing value handling strategies
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.interpolate import interp1d
import joblib

In [None]:
class HumanActivityDataset:
    """
    Loader and exploratory-analysis helper for the activity sensor dataset.

    The root folder is expected to contain an 'LS' (learning set) and a 'TS'
    (test set) folder. Each folder holds one text file per sensor (IDs 2 to 32)
    and 'subject_Id.txt'; the learning set also holds 'activity_Id.txt'.
    Each sensor file is loaded as a 2-D array whose rows are samples.
    """

    # Value used in the raw sensor files to flag a missing measurement.
    MISSING_VALUE = -999999.99

    def __init__(self, root_path):
        """
        Initialize the dataset with the root path

        Parameters:
        root_path (str): Root path containing 'LS' and 'TS' folders
        """
        self.root_path = root_path
        self.learning_set_path = os.path.join(root_path, 'LS')
        self.test_set_path = os.path.join(root_path, 'TS')

    def load_dataset(self, dataset_type='learning'):
        """
        Load sensor data, activities, and subject IDs

        Parameters:
        dataset_type (str): 'learning' loads the learning set; any other value
            loads the test set

        Returns:
        tuple: Sensors dictionary (sensor ID -> array), activities array
            (None for test set), subjects array
        """
        is_learning = dataset_type == 'learning'
        path = self.learning_set_path if is_learning else self.test_set_path
        file_prefix = 'LS' if is_learning else 'TS'

        # One file per sensor, IDs 2 to 32 inclusive
        sensors = {
            sensor_id: np.loadtxt(os.path.join(path, f'{file_prefix}_sensor_{sensor_id}.txt'))
            for sensor_id in range(2, 33)
        }

        subjects = np.loadtxt(os.path.join(path, 'subject_Id.txt'))

        # Activity labels only exist for the learning set
        activities = None
        if is_learning:
            activities = np.loadtxt(os.path.join(path, 'activity_Id.txt'))

        return sensors, activities, subjects

    def analyze_missing_values(self, sensors):
        """
        Analyze missing values across all sensors

        Parameters:
        sensors (dict): Dictionary of sensor data (sensor ID -> 2-D array)

        Returns:
        dict: Missing value statistics for each sensor, with keys
            'total_samples', 'completely_missing_samples', 'missing_points'
            and 'missing_percentage'
        """
        missing_stats = {}

        for sensor_id, sensor_data in sensors.items():
            is_missing = sensor_data == self.MISSING_VALUE
            missing_points = np.sum(is_missing)

            missing_stats[sensor_id] = {
                'total_samples': sensor_data.shape[0],
                # A sample (row) is completely missing when every point is flagged
                'completely_missing_samples': np.sum(np.all(is_missing, axis=1)),
                'missing_points': missing_points,
                'missing_percentage': (missing_points / sensor_data.size) * 100
            }

        return missing_stats

    def visualize_sensor_characteristics(self, sensors, activities):
        """
        Draw one box plot per selected sensor, grouped by activity

        For every sample the mean of its non-missing points is computed; the
        box plot of an activity shows the distribution of those per-sample
        means. Samples without any valid point are left out.

        Parameters:
        sensors (dict): Dictionary of sensor data
        activities (array): Activity labels (IDs 1 to 14), one per sample
        """
        # Activity names for reference (index 0 is activity ID 1)
        activity_names = [
            'Lying', 'Sitting', 'Standing', 'Walking very slow', 'Normal walking',
            'Nordic walking', 'Running', 'Ascending stairs', 'Descending stairs',
            'Cycling', 'Ironing', 'Vacuum cleaning', 'Rope jumping', 'Playing soccer'
        ]

        # Select representative sensors from different body locations
        selected_sensors = {
            'Heart Rate': 2,
            'Hand Acceleration': 4,
            'Chest Temperature': 13,
            'Foot Acceleration': 24,
            'Foot Magnetometer': 30
        }

        # Create a multi-panel figure
        fig, axes = plt.subplots(len(selected_sensors), 1, figsize=(15, 20))
        fig.suptitle('Sensor Data Across Different Activities', fontsize=16)

        for i, (sensor_name, sensor_id) in enumerate(selected_sensors.items()):
            # Mask missing values so they do not enter the per-sample means
            masked_data = np.ma.masked_equal(sensors[sensor_id], self.MISSING_VALUE)

            box_data = []
            for activity in range(1, 15):
                sample_means = np.ma.mean(masked_data[activities == activity], axis=1)
                # Drop samples whose mean is masked (no valid point at all), so
                # the plot never falls back to the raw sentinel value
                box_data.append(np.ma.compressed(sample_means))

            axes[i].boxplot(box_data, labels=activity_names)
            axes[i].set_title(f'{sensor_name} (Sensor {sensor_id})')
            axes[i].set_xlabel('Activities')
            axes[i].set_ylabel('Average Sensor Value')
            plt.setp(axes[i].get_xticklabels(), rotation=45, ha='right')

        plt.tight_layout()
        plt.show()

    # Earlier, disabled preprocessing approach (per-sample median imputation
    # followed by standardization), kept for reference.
    # def preprocess_sensors(self, sensors):
    #     """
    #     Preprocess sensor data by handling missing values and standardizing
    #
    #     Parameters:
    #     sensors (dict): Dictionary of sensor data
    #
    #     Returns:
    #     dict: Preprocessed sensor data
    #     """
    #     preprocessed_sensors = {}
    #
    #     # Store scaler for each sensor
    #     scalers = {}
    #     for sensor_id, sensor_data in sensors.items():
    #         # Create a copy of the data
    #         processed_data = sensor_data.copy()
    #
    #         # Replace missing values with median for each sample
    #         for i in range(processed_data.shape[0]):
    #             sample = processed_data[i]
    #             valid_values = sample[sample != -999999.99]
    #
    #             if len(valid_values) > 0:
    #                 # Replace missing values with sample median
    #                 processed_data[i][sample == -999999.99] = np.median(valid_values)
    #             else:
    #                 # If all values are missing, replace with 0
    #                 processed_data[i][sample == -999999.99] = 0
    #
    #         # Standardize each sensor's data
    #         scalers[sensor_id] = StandardScaler()
    #         preprocessed_sensors[sensor_id] = scalers[sensor_id].fit_transform(processed_data)
    #
    #     return preprocessed_sensors, scalers

    def detect_outliers(self, sensors):
        """
        Compute the percentage of IQR outliers for each sensor

        A point is an outlier when it lies more than 1.5 * IQR below the first
        quartile or above the third quartile. Missing points are excluded
        before the quartiles are computed, so the missing-value sentinel can
        neither distort the bounds nor be counted as an outlier.

        Parameters:
        sensors (dict): Dictionary of sensor data

        Returns:
        dict: Sensor ID -> percentage of outliers among the valid points
            (0.0 when a sensor has no valid point)
        """
        outliers_stats = {}

        for sensor_id, data in sensors.items():
            valid = data[data != self.MISSING_VALUE]
            if valid.size == 0:
                outliers_stats[sensor_id] = 0.0
                continue

            q1, q3 = np.percentile(valid, [25, 75])
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            n_outliers = np.count_nonzero((valid < lower_bound) | (valid > upper_bound))
            outliers_stats[sensor_id] = (n_outliers / valid.size) * 100
        return outliers_stats

    def explore_dataset(self):
        """
        Comprehensive exploration of the Human Activity Recognition dataset

        Loads both sets, prints missing-value and outlier statistics for the
        learning set and shows the per-activity box plots.

        Returns:
        dict: Raw sensor data and subjects for both sets, plus the learning
            set activities
        """
        # Load learning dataset
        learning_sensors, activities, learning_subjects = self.load_dataset('learning')

        # Load test dataset
        test_sensors, _, test_subjects = self.load_dataset('test')

        # Analyze missing values for learning set
        print("Missing Value Statistics (Learning Set):")
        learning_missing_stats = self.analyze_missing_values(learning_sensors)
        for sensor_id, stats in learning_missing_stats.items():
            print(f"Sensor {sensor_id}: {stats} missing values")

        # Detect outliers in learning set
        print("\nOutlier Statistics (Learning Set):")
        learning_outliers_stats = self.detect_outliers(learning_sensors)
        for sensor_id, outliers_percentage in learning_outliers_stats.items():
            print(f"Sensor {sensor_id}: {outliers_percentage:.2f}% outliers detected")

        # Visualize sensor characteristics for learning set
        self.visualize_sensor_characteristics(learning_sensors, activities)

        # Return key information for further analysis
        return {
            'learning_raw_sensors': learning_sensors,
            'learning_activities': activities,
            'learning_subjects': learning_subjects,
            'test_raw_sensors': test_sensors,
            'test_subjects': test_subjects
        }

In [None]:
# Build an explorer rooted at the current directory, i.e. the 'LS' and 'TS'
# folders are looked up under './LS' and './TS'.
dataset_explorer = HumanActivityDataset('.')

# Explore the dataset: loads both sets, prints missing-value and outlier
# statistics, shows the box plots and returns the raw data in a dictionary.
dataset = dataset_explorer.explore_dataset()

In [None]:
def analyze_sensors_and_activities(root_path='LS'):
    """
    Plot the activity distribution and print per-sensor summary statistics.

    Reads 'activity_Id.txt' and 'LS_sensor_<id>.txt' (IDs 2 to 32) from
    root_path. Points equal to -999999.99 are treated as missing and excluded
    from the statistics.

    Parameters:
    root_path (str): Folder containing the learning-set files

    Returns:
    None. The results are shown as a bar chart and printed.
    """
    missing_value = -999999.99

    # Load activities as integer IDs so they match the name mapping below
    activities = np.atleast_1d(np.loadtxt(f'{root_path}/activity_Id.txt')).astype(int)

    # Activity names mapping
    activity_names = {
        1: 'Lying',
        2: 'Sitting',
        3: 'Standing',
        4: 'Walking very slow',
        5: 'Normal walking',
        6: 'Nordic walking',
        7: 'Running',
        8: 'Ascending stairs',
        9: 'Descending stairs',
        10: 'Cycling',
        11: 'Ironing',
        12: 'Vacuum cleaning',
        13: 'Rope jumping',
        14: 'Playing soccer'
    }
    activity_ids = list(activity_names)

    # Count every known activity; an activity absent from the labels gets a
    # count of 0 so that the bar chart always has exactly one bar per activity.
    activity_counts = (
        pd.Series(activities).value_counts().reindex(activity_ids, fill_value=0)
    )

    # Create activity distribution plot
    plt.figure(figsize=(15, 6))
    bars = plt.bar(activity_ids, activity_counts)
    plt.title('Distribution of Activities', fontsize=12)
    plt.xlabel('Activity')
    plt.ylabel('Count')
    plt.xticks(activity_ids, [activity_names[i] for i in activity_ids], rotation=45, ha='right')

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # Print activity distribution percentages
    print("\nActivity Distribution Percentages:")
    for act_id, count in activity_counts.items():
        percentage = (count/len(activities))*100
        print(f"{activity_names[act_id]}: {percentage:.2f}%")

    # Compute sensor statistics
    sensor_stats = {}
    for i in range(2, 33):
        sensor_data = np.loadtxt(f'{root_path}/LS_sensor_{i}.txt')

        # Keep only the valid (non-missing) points
        is_missing = sensor_data == missing_value
        valid_data = sensor_data[~is_missing]
        missing_rate = (np.sum(is_missing) / sensor_data.size) * 100

        if valid_data.size:
            q25, q50, q75 = np.percentile(valid_data, [25, 50, 75])
            stats = {
                'mean': np.mean(valid_data),
                'std': np.std(valid_data),
                'min': np.min(valid_data),
                'max': np.max(valid_data),
                'missing_rate': missing_rate,
                '25%': q25,
                '50%': q50,
                '75%': q75
            }
        else:
            # Nothing to summarise: report NaN instead of failing in min/max
            stats = {
                'mean': np.nan,
                'std': np.nan,
                'min': np.nan,
                'max': np.nan,
                'missing_rate': missing_rate,
                '25%': np.nan,
                '50%': np.nan,
                '75%': np.nan
            }
        sensor_stats[i] = stats

    # Create a DataFrame with sensor statistics (one row per sensor ID)
    stats_df = pd.DataFrame.from_dict(sensor_stats, orient='index')

    # Print sensor statistics grouped by location
    print("\nSensor Statistics:")
    print("\nHeart Rate (Sensor 2):")
    print(stats_df.loc[2].to_string())

    print("\nHand Sensors (3-12):")
    print(stats_df.loc[3:12].to_string())

    print("\nChest Sensors (13-22):")
    print(stats_df.loc[13:22].to_string())

    print("\nAnkle Sensors (23-32):")
    print(stats_df.loc[23:32].to_string())

# Run the analysis on the default learning-set folder ('LS'): this reads the
# data files, shows the activity bar chart and prints the statistics.
analyze_sensors_and_activities()

In [None]:
# Create a comprehensive sensor description dictionary.
#
# Sensor 2 is the heart rate. Each IMU (hand, chest, ankle) then occupies ten
# consecutive IDs in the same order: temperature first, followed by the X/Y/Z
# axes of the accelerometer, gyroscope and magnetometer. This matches the
# sensor groups used by the plots and the outlier analysis in this file
# (temperature sensors are 3, 13 and 23).
def _describe_imu(location, first_id, missing_rate):
    """Return the ten sensor descriptions of one IMU, keyed by sensor ID."""
    descriptions = {
        first_id: {
            "name": f"{location} Temperature",
            "location": location,
            "type": "Temperature",
            "unit": "°C",
            "missing_rate": missing_rate,
        }
    }
    sensor_id = first_id + 1
    for sensor_type, unit in (
        ("Accelerometer", "m/s²"),
        ("Gyroscope", "rad/s"),
        ("Magnetometer", "μT"),
    ):
        for axis in "XYZ":
            descriptions[sensor_id] = {
                "name": f"{location} {sensor_type} {axis}",
                "location": location,
                "type": sensor_type,
                "axis": axis,
                "unit": unit,
                "missing_rate": missing_rate,
            }
            sensor_id += 1
    return descriptions


sensor_descriptions = {
    # Heart Rate
    2: {
        "name": "Heart Rate",
        "location": "Chest",
        "type": "ECG",
        "unit": "BPM",
        "missing_rate": "0%",
        "description": "Heart rate measurement from chest ECG sensor"
    },
}

# Hand IMU (3-12), Chest IMU (13-22), Ankle IMU (23-32)
for _location, _first_id, _missing_rate in (
    ("Hand", 3, "4.67%"),
    ("Chest", 13, "7.10%"),
    ("Ankle", 23, "15.90%"),
):
    sensor_descriptions.update(_describe_imu(_location, _first_id, _missing_rate))

# Print sensor descriptions grouped by location
for location in ['Hand', 'Chest', 'Ankle']:
    print(f"\n=== {location} Sensors ===")
    location_sensors = {k: v for k, v in sensor_descriptions.items() if v['location'] == location}
    for sensor_id, desc in sorted(location_sensors.items()):
        print(f"Sensor {sensor_id}: {desc['name']}")
        print(f"  Type: {desc['type']}")
        print(f"  Unit: {desc['unit']}")
        print(f"  Missing Rate: {desc['missing_rate']}")
        if 'axis' in desc:
            print(f"  Axis: {desc['axis']}")
        print()

# Print summary statistics
print("\n=== Summary Statistics ===")
total_sensors = len(sensor_descriptions)
print(f"Total number of sensors: {total_sensors}")
print("Number of IMU locations: 3 (Hand, Chest, Ankle)")
print("Sensors per IMU:")
print("- 1 Temperature sensor")
print("- 3 Accelerometer axes (X, Y, Z)")
print("- 3 Gyroscope axes (X, Y, Z)")
print("- 3 Magnetometer axes (X, Y, Z)")

# Create Missing Value Handlers
Create a class that inherits from `HumanActivityDataset` and implements several imputation strategies (mean, mode, KNN, and linear interpolation).



In [None]:
# Create a class that inherits from HumanActivityDataset to implement different imputation strategies
class HumanActivityDatasetWithImputation(HumanActivityDataset):
    """
    Dataset helper that adds missing-value imputation strategies.

    Every strategy takes the sensors dictionary (sensor ID -> 2-D array),
    fills the points flagged as missing, standardizes the result with a
    StandardScaler fitted per sensor, and returns both the scaled data and the
    fitted scalers.
    """

    # Value used in the raw sensor files to flag a missing measurement.
    MISSING_VALUE = -999999.99

    def __init__(self, root_path):
        """
        Initialize the dataset with the root path

        Parameters:
        root_path (str): Root path containing 'LS' and 'TS' folders
        """
        super().__init__(root_path)

    def _with_nan_markers(self, sensor_data):
        """Return a float copy of sensor_data with missing points set to NaN."""
        data = np.array(sensor_data, dtype=float)
        data[data == self.MISSING_VALUE] = np.nan
        return data

    def _impute_and_scale(self, sensors, make_imputer):
        """Impute each sensor with a fresh imputer, then standardize it."""
        imputed_sensors = {}
        scalers = {}
        for sensor_id, sensor_data in sensors.items():
            # scikit-learn imputers look for NaN by default, so the sentinel
            # must be converted first; otherwise nothing would be imputed and
            # the sentinel itself would be standardized.
            # NOTE(review): by default scikit-learn imputers drop columns that
            # contain no valid value at all - confirm this cannot happen here.
            imputed_data = make_imputer().fit_transform(self._with_nan_markers(sensor_data))
            scaler = StandardScaler()
            imputed_sensors[sensor_id] = scaler.fit_transform(imputed_data)
            scalers[sensor_id] = scaler
        return imputed_sensors, scalers

    def mean_imputation(self, sensors):
        """
        Replace missing points with the mean of their column, then standardize

        Parameters:
        sensors (dict): Dictionary of sensor data

        Returns:
        tuple: (dict of imputed and scaled data, dict of fitted scalers)
        """
        return self._impute_and_scale(sensors, lambda: SimpleImputer(strategy='mean'))

    def mode_imputation(self, sensors):
        """
        Replace missing points with the most frequent value of their column,
        then standardize

        Parameters:
        sensors (dict): Dictionary of sensor data

        Returns:
        tuple: (dict of imputed and scaled data, dict of fitted scalers)
        """
        return self._impute_and_scale(sensors, lambda: SimpleImputer(strategy='most_frequent'))

    def knn_imputation(self, sensors, n_neighbors=5):
        """
        Replace missing points using the nearest samples, then standardize

        Parameters:
        sensors (dict): Dictionary of sensor data
        n_neighbors (int): Number of neighbouring samples used by KNNImputer

        Returns:
        tuple: (dict of imputed and scaled data, dict of fitted scalers)
        """
        return self._impute_and_scale(sensors, lambda: KNNImputer(n_neighbors=n_neighbors))

    def linear_interpolation(self, sensors):
        """
        Fill missing points by linear interpolation, then standardize

        Each column is interpolated along axis 0 (from one row to the next),
        extrapolating linearly beyond the first and last valid rows. A column
        with a single valid point is filled with that value, and a column
        without any valid point is filled with 0.

        Parameters:
        sensors (dict): Dictionary of sensor data

        Returns:
        tuple: (dict of imputed and scaled data, dict of fitted scalers)
        """
        # NOTE(review): rows are treated as samples elsewhere in this file, so
        # this interpolates between different samples rather than within one
        # sample's series - confirm that axis 0 is the intended direction.
        imputed_sensors = {}
        scalers = {}
        for sensor_id, sensor_data in sensors.items():
            imputed_data = np.array(sensor_data, dtype=float)
            for column_index in range(imputed_data.shape[1]):
                column = imputed_data[:, column_index]  # view: edits land in imputed_data
                valid = column != self.MISSING_VALUE
                n_valid = np.count_nonzero(valid)
                if n_valid == column.size:
                    continue
                if n_valid >= 2:
                    interp_func = interp1d(
                        np.flatnonzero(valid), column[valid],
                        bounds_error=False, fill_value="extrapolate"
                    )
                    column[~valid] = interp_func(np.flatnonzero(~valid))
                elif n_valid == 1:
                    # interp1d needs at least two points; repeat the only one
                    column[~valid] = column[valid][0]
                else:
                    # No information at all: never leave the sentinel in place
                    column[~valid] = 0.0
            scaler = StandardScaler()
            imputed_sensors[sensor_id] = scaler.fit_transform(imputed_data)
            scalers[sensor_id] = scaler
        return imputed_sensors, scalers

In [None]:
# Build the explorer that also provides the imputation strategies
dataset_explorer_imputation = HumanActivityDatasetWithImputation('.')

# Read the learning set (sensors, activity labels, subject IDs)
learning_sensors, activities, learning_subjects = dataset_explorer_imputation.load_dataset('learning')

# Run every imputation strategy on the same raw learning data
mean_imputed_sensors, mean_scalers = dataset_explorer_imputation.mean_imputation(learning_sensors)
mode_imputed_sensors, mode_scalers = dataset_explorer_imputation.mode_imputation(learning_sensors)
knn_imputed_sensors, knn_scalers = dataset_explorer_imputation.knn_imputation(learning_sensors)
linear_imputed_sensors, linear_scalers = dataset_explorer_imputation.linear_interpolation(learning_sensors)

# Persist each strategy's data and scalers next to the learning set, as
# '<strategy>_imputed_sensors.pkl' and '<strategy>_scalers.pkl'
_imputation_results = (
    ('mean', mean_imputed_sensors, mean_scalers),
    ('mode', mode_imputed_sensors, mode_scalers),
    ('knn', knn_imputed_sensors, knn_scalers),
    ('linear', linear_imputed_sensors, linear_scalers),
)
for _strategy, _imputed, _fitted_scalers in _imputation_results:
    _target_dir = dataset_explorer_imputation.learning_set_path
    joblib.dump(_imputed, os.path.join(_target_dir, f'{_strategy}_imputed_sensors.pkl'))
    joblib.dump(_fitted_scalers, os.path.join(_target_dir, f'{_strategy}_scalers.pkl'))

### Why We Impute with Different Methods

Imputation is the process of replacing missing data with substituted values. Different imputation methods are used to handle missing data in various ways:

- **Mean Imputation**: Replaces missing values with the mean of the available values. This method is simple and effective when the data is symmetrically distributed.
- **Mode Imputation**: Replaces missing values with the most frequent value (mode). This is useful for categorical data or when the data has a high frequency of certain values.
- **KNN Imputation**: Uses the k-nearest neighbors algorithm to impute missing values based on the values of the nearest neighbors. This method can capture the local structure of the data.
- **Linear Interpolation**: Estimates each missing value from its nearest valid neighbours by drawing a straight line between them. This method is useful when the signal varies smoothly between consecutive points.

Using different imputation methods allows us to compare their effectiveness and choose the best approach for our specific dataset.

### Why We Use .pkl Files and How to Work with Them

`.pkl` files are used to serialize Python objects using the `pickle` module. Serialization is the process of converting an object into a byte stream, which can be saved to a file and later deserialized to reconstruct the original object. We use `.pkl` files for the following reasons:

- **Efficiency**: Pickle files are efficient for saving and loading complex data structures, such as dictionaries and lists.
- **Convenience**: They allow us to save the state of an object, such as a trained model or preprocessed data, and load it later without having to recompute or retrain.
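- **Portability**: Pickle files can be shared and reused across environments, provided they run compatible versions of Python and of the libraries involved. Only load pickle files from a trusted source, because unpickling can execute arbitrary code.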

To work with `.pkl` files, we use the `joblib` library, which provides efficient serialization for large numpy arrays and other data structures:

```python
import joblib

# Save an object to a .pkl file
joblib.dump(object, 'filename.pkl')

# Load an object from a .pkl file
object = joblib.load('filename.pkl')
```

### Why We Standardize the Values and Why We Are Saving the Scalers for Later

Standardization is the process of scaling data to have a mean of 0 and a standard deviation of 1. This is important for the following reasons:

- **Consistency**: Standardizing ensures that all features contribute equally to the analysis, preventing features with larger scales from dominating. Our sensors do use very different units and scales (e.g., BPM vs. °C).
- **Improved Performance**: Many machine learning algorithms perform better when the data is standardized, as it helps with convergence and stability during training.
- **Comparability**: Standardized data allows for easier comparison between different features and datasets.

We save the scalers for later use to ensure that the same transformation is applied to new data, such as test data or future data points. This maintains consistency and allows us to accurately compare and evaluate the performance of our models:

```python
from sklearn.preprocessing import StandardScaler
import joblib

# Fit and transform the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Load the scaler and transform new data
scaler = joblib.load('scaler.pkl')
new_scaled_data = scaler.transform(new_data)
```

# Compare Imputation Methods
Show statistics to compare how different imputation methods handle the missing values

In [None]:
# Compare Imputation Methods

# Create visualizations and statistics to compare how different imputation methods handle the missing values

# Function to visualize imputed data
def visualize_imputed_data(imputed_sensors, title):
    """
    Plot the imputed data of five example sensors, one panel per sensor.

    Parameters:
    imputed_sensors (dict): Sensor ID -> imputed data; must contain the
        sensors 2, 4, 13, 24 and 30
    title (str): Title shown above the whole figure
    """
    example_sensor_ids = [2, 4, 13, 24, 30]  # Example sensor IDs

    figure, panels = plt.subplots(5, 1, figsize=(15, 25))
    figure.suptitle(title, fontsize=16)

    for panel, sensor_id in zip(panels, example_sensor_ids):
        panel.plot(imputed_sensors[sensor_id])
        panel.set_title(f'Sensor {sensor_id}')
        panel.set_xlabel('Sample')
        panel.set_ylabel('Value')

    plt.tight_layout()
    plt.show()

# # Visualize mean imputed data
# visualize_imputed_data(mean_imputed_sensors, 'Mean Imputation')

# # Visualize mode imputed data
# visualize_imputed_data(mode_imputed_sensors, 'Mode Imputation')

# # Visualize k-NN imputed data
# visualize_imputed_data(knn_imputed_sensors, 'k-NN Imputation')

# # Visualize linear interpolation imputed data
# visualize_imputed_data(linear_imputed_sensors, 'Linear Interpolation')

# Function to calculate and print statistics for imputed data
def print_imputation_statistics(imputed_sensors, method_name):
    """
    Print the overall mean and standard deviation of every sensor.

    Parameters:
    imputed_sensors (dict): Sensor ID -> array of values; the statistics are
        taken over all elements of the array
    method_name (str): Label of the imputation method, used in the header line
    """
    # NOTE(review): if the arrays were standardized beforehand, these values
    # will be close to 0 and 1 whatever the imputation method - confirm that
    # this is the comparison that is wanted.
    print(f"Statistics for {method_name}:")
    for sensor_id in imputed_sensors:
        values = imputed_sensors[sensor_id]
        print(f"Sensor {sensor_id}: Mean = {np.mean(values)}, Std Dev = {np.std(values)}")

# Print the same summary for every imputation strategy, in a fixed order:
# mean, mode, k-NN, then linear interpolation
for _imputed, _label in (
    (mean_imputed_sensors, 'Mean Imputation'),
    (mode_imputed_sensors, 'Mode Imputation'),
    (knn_imputed_sensors, 'k-NN Imputation'),
    (linear_imputed_sensors, 'Linear Interpolation'),
):
    print_imputation_statistics(_imputed, _label)

## Outliers Management

In [None]:
def analyze_sensor_outliers(root_path='LS'):
    """
    Plot value distributions per sensor type and print outlier statistics.

    For every sensor, 'LS_sensor_<id>.txt' is read from root_path and the
    points equal to -999999.99 (missing) are discarded. Two kinds of outliers
    are then counted among the valid points:
    - range outliers: values outside the valid range defined for the sensor type
    - statistical outliers: values more than 1.5 * IQR below the first or
      above the third quartile

    Parameters:
    root_path (str): Folder containing the learning-set sensor files

    Returns:
    None. The results are shown as histograms and printed.
    """
    missing_value = -999999.99

    # Define sensor groups and their valid ranges
    sensor_groups = {
        'heart_rate': {'sensors': [2], 'range': (30, 220), 'unit': 'bpm'},
        'temperature': {'sensors': [3, 13, 23], 'range': (20, 40), 'unit': '°C'},
        'acceleration': {'sensors': [4,5,6, 14,15,16, 24,25,26], 'range': (-50, 50), 'unit': 'm/s²'},
        'gyroscope': {'sensors': [7,8,9, 17,18,19, 27,28,29], 'range': (-20, 20), 'unit': 'rad/s'},
        'magnetometer': {'sensors': [10,11,12, 20,21,22, 30,31,32], 'range': (-100, 100), 'unit': 'µT'}
    }

    outlier_stats = defaultdict(dict)

    # Create subplots for each sensor type
    fig, axes = plt.subplots(len(sensor_groups), 1, figsize=(15, 5*len(sensor_groups)))
    fig.suptitle('Sensor Values Distribution with Outliers Highlighted', fontsize=16)

    for idx, (sensor_type, info) in enumerate(sensor_groups.items()):
        range_low, range_high = info['range']
        group_values = []

        for sensor_id in info['sensors']:
            # Load sensor data and keep only the valid points
            sensor_data = np.loadtxt(f'{root_path}/LS_sensor_{sensor_id}.txt')
            valid_data = sensor_data[sensor_data != missing_value]
            n_valid = valid_data.size

            if n_valid:
                # Outliers with respect to the valid range of the sensor type
                n_range = int(np.count_nonzero(
                    (valid_data < range_low) | (valid_data > range_high)
                ))

                # Statistical outliers (IQR method)
                q1, q3 = np.percentile(valid_data, [25, 75])
                iqr = q3 - q1
                n_statistical = int(np.count_nonzero(
                    (valid_data < q1 - 1.5*iqr) | (valid_data > q3 + 1.5*iqr)
                ))

                range_pct = (n_range / n_valid) * 100
                statistical_pct = (n_statistical / n_valid) * 100
            else:
                # A sensor without any valid point has no outliers; this also
                # avoids a division by zero and a percentile of an empty array
                n_range = n_statistical = 0
                range_pct = statistical_pct = 0.0

            # Store statistics. 'total_samples' counts the rows of the file,
            # whereas the other counts refer to individual valid points.
            outlier_stats[sensor_type][sensor_id] = {
                'total_samples': len(sensor_data),
                'valid_samples': n_valid,
                'range_outliers': n_range,
                'statistical_outliers': n_statistical,
                'range_outlier_pct': range_pct,
                'statistical_outlier_pct': statistical_pct
            }

            group_values.append(valid_data)

        # Pool the valid points of all sensors of this type in one array
        all_values = np.concatenate(group_values) if group_values else np.array([])

        # Plot distribution, with the valid range marked by dashed red lines
        sns.histplot(all_values, bins=100, ax=axes[idx])
        axes[idx].axvline(range_low, color='r', linestyle='--', alpha=0.5)
        axes[idx].axvline(range_high, color='r', linestyle='--', alpha=0.5)
        axes[idx].set_title(f'{sensor_type.capitalize()} Sensors ({info["unit"]})')
        axes[idx].set_xlabel(f'Value ({info["unit"]})')
        axes[idx].set_ylabel('Count')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\nOutlier Analysis Summary:")
    for sensor_type, sensors in outlier_stats.items():
        print(f"\n{sensor_type.upper()} SENSORS:")
        for sensor_id, stats in sensors.items():
            print(f"\nSensor {sensor_id}:")
            print(f"  Valid samples: {stats['valid_samples']:,}")
            print(f"  Range outliers: {stats['range_outliers']:,} ({stats['range_outlier_pct']:.2f}%)")
            print(f"  Statistical outliers: {stats['statistical_outliers']:,} ({stats['statistical_outlier_pct']:.2f}%)")

# Run the outlier analysis on the default learning-set folder ('LS'): this
# reads the sensor files, shows the histograms and prints the summary.
analyze_sensor_outliers()

## Suggestions

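Several sensors record the three axes (X, Y, Z) of the same physical quantity (acceleration, angular velocity, magnetic field). These could be combined into a single feature, for example the magnitude of the 3D vector.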