# Part 1: Data Exploration and Preprocessing

In this notebook, we will implement functions for loading, preprocessing, and visualizing physiological data from the wearable device dataset.

In [None]:
# Set plotting style
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so plain 'seaborn' raises OSError on current
# releases. Try the new name first and fall back to the legacy one so the
# notebook keeps working on older installations as well.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_context('notebook')

## 1. Data Loading

Implement the `load_data` function to load physiological data from the dataset.

In [None]:
def load_data(data_dir='data/raw'):
    """Load all physiological data from the dataset.

    The directory layout read by this function is
    ``<data_dir>/S*/<session>/{HR,EDA,TEMP}.csv``. Each CSV is read as two
    header-less columns: a timestamp and the signal value. The three signals
    of a session are outer-merged on ``timestamp``, so a timestamp present in
    only one file still yields a row (with NaN in the other signals).

    Parameters:
    -----------
    data_dir : str, optional
        Path to the data directory (default: 'data/raw')

    Returns:
    --------
    pandas.DataFrame
        DataFrame with columns: ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session'],
        sorted by timestamp. Empty (but with these columns) when no
        subject/session directories are found.

    Raises:
    -------
    FileNotFoundError
        If a session directory is missing one of HR.csv, EDA.csv or TEMP.csv.
    """
    # Import required libraries
    import os
    import pandas as pd
    from glob import glob

    columns = ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    # (file name, value column) for every signal stored in a session directory
    signal_files = (
        ('HR.csv', 'heart_rate'),
        ('EDA.csv', 'eda'),
        ('TEMP.csv', 'temperature'),
    )

    def _subdirs(pattern):
        # Keep directories only: a stray file matching the pattern (e.g. a
        # README next to the sessions) must not be treated as a data folder.
        return sorted(path for path in glob(pattern) if os.path.isdir(path))

    # One merged frame per (subject, session)
    all_data = []

    for subject_dir in _subdirs(os.path.join(data_dir, 'S*')):
        subject_id = os.path.basename(subject_dir)

        # Sorted so the load order is deterministic across file systems
        for session_dir in _subdirs(os.path.join(subject_dir, '*')):
            session = os.path.basename(session_dir)

            merged = None
            for filename, column in signal_files:
                signal = pd.read_csv(
                    os.path.join(session_dir, filename),
                    names=['timestamp', column],
                )
                if merged is None:
                    merged = signal
                else:
                    merged = pd.merge(merged, signal, on='timestamp', how='outer')

            # Add metadata
            merged['subject_id'] = subject_id
            merged['session'] = session

            all_data.append(merged)

    # pd.concat raises on an empty list; return an empty frame with the
    # documented schema instead so callers can handle "no data" uniformly.
    if not all_data:
        return pd.DataFrame(columns=columns)

    # Combine all data and order it chronologically
    combined_data = pd.concat(all_data, ignore_index=True)
    combined_data = combined_data.sort_values('timestamp')

    return combined_data

## 2. Data Preprocessing

Implement the `preprocess_data` function to handle missing values, resample time series, and remove outliers.

In [None]:
def preprocess_data(data, output_dir='data/processed'):
    """Preprocess physiological data.

    Each (subject, session) recording is processed on its own:

    1. timestamps are converted to datetimes (epoch seconds) unless they
       already are datetimes;
    2. the recording is placed on a regular 1-second grid, keeping only
       samples that fall exactly on a grid point;
    3. for each signal, gaps are time-interpolated when at most 1% of the
       values are missing, otherwise the rows missing that signal are dropped;
    4. values more than 3.5 standard deviations from the signal's mean are
       replaced with NaN.

    One CSV per subject is written to ``<output_dir>/<subject_id>_processed.csv``.

    Parameters:
    -----------
    data : pandas.DataFrame
        Input data with columns: ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    output_dir : str, optional
        Directory to save processed data (default: 'data/processed')

    Returns:
    --------
    pandas.DataFrame
        Preprocessed data for all subjects, with 'timestamp' as a regular
        column. Empty when the input contains no rows.
    """
    # Import required libraries
    import os
    import numpy as np
    import pandas as pd

    signal_cols = ['heart_rate', 'eda', 'temperature']

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    processed_data = []

    for subject_id in data['subject_id'].unique():
        subject_data = data[data['subject_id'] == subject_id]

        # Resample per session: putting a whole subject on one time grid
        # would create rows for the idle time between recordings.
        session_frames = []
        for session, session_data in subject_data.groupby('session', sort=False, dropna=False):
            session_data = session_data.copy()

            # Convert timestamp to datetime if needed
            if not pd.api.types.is_datetime64_any_dtype(session_data['timestamp']):
                session_data['timestamp'] = pd.to_datetime(session_data['timestamp'], unit='s')

            session_data = session_data.set_index('timestamp').sort_index()

            # asfreq() cannot reindex duplicate labels; keep the first row
            # recorded for each timestamp.
            session_data = session_data[~session_data.index.duplicated(keep='first')]

            # Resample to regular 1-second intervals ('1s' is the
            # non-deprecated spelling of the second alias)
            session_data = session_data.resample('1s').asfreq()

            # Rows inserted by the resampling carry NaN in every column;
            # restore the metadata so they stay attributed to this recording.
            session_data['subject_id'] = subject_id
            session_data['session'] = session

            # Handle missing values (up to 1%)
            for col in signal_cols:
                missing_pct = session_data[col].isna().mean() * 100
                if missing_pct <= 1:
                    # Interpolate missing values
                    session_data[col] = session_data[col].interpolate(method='time')
                else:
                    # Drop rows with missing values if > 1%
                    session_data = session_data.dropna(subset=[col])

            # Remove outliers using z-score method (flagged values become NaN)
            for col in signal_cols:
                z_scores = np.abs(
                    (session_data[col] - session_data[col].mean()) / session_data[col].std()
                )
                session_data.loc[z_scores > 3.5, col] = np.nan

            # Reset index to make timestamp a column again
            session_frames.append(session_data.reset_index())

        if not session_frames:
            # e.g. a NaN subject id, which never matches the equality filter
            continue

        subject_frame = pd.concat(session_frames, ignore_index=True)

        # Save processed data
        output_file = os.path.join(output_dir, f'{subject_id}_processed.csv')
        subject_frame.to_csv(output_file, index=False)

        processed_data.append(subject_frame)

    # pd.concat raises on an empty list; hand back an empty frame that keeps
    # the input's columns instead.
    if not processed_data:
        return data.iloc[0:0].copy()

    # Combine all processed data
    combined_data = pd.concat(processed_data, ignore_index=True)

    return combined_data

## 3. Data Visualization

Implement the `plot_physiological_signals` function to visualize the data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Plot heart rate, EDA and temperature for one subject/session.

    The three signals are drawn as stacked panels sharing the time axis, and
    the figure is written to ``<output_dir>/<subject_id>_<session>_signals.png``
    at 300 dpi. The figure is not closed, so the caller can still show it.

    Parameters:
    -----------
    data : pandas.DataFrame
        Input data with columns: ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    subject_id : str
        Subject ID to plot
    session : str
        Session to plot
    output_dir : str, optional
        Directory to save plots (default: 'plots')

    Returns:
    --------
    matplotlib.figure.Figure
        Figure object containing the plots
    """
    # Import required libraries
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    # Make sure the destination folder exists before anything is drawn
    os.makedirs(output_dir, exist_ok=True)

    # Rows belonging to the requested recording
    is_subject = data['subject_id'] == subject_id
    is_session = data['session'] == session
    selected = data[is_subject & is_session].copy()

    # Raw data carries epoch seconds; preprocessed data is already datetime
    if not pd.api.types.is_datetime64_any_dtype(selected['timestamp']):
        selected['timestamp'] = pd.to_datetime(selected['timestamp'], unit='s')

    # One stacked panel per signal, all sharing the x-axis
    fig, axes = plt.subplots(3, 1, figsize=(12, 8), sharex=True)
    fig.suptitle(f'Physiological Signals - Subject {subject_id}, {session}')

    # (column, line format, y-axis label) for each panel, top to bottom
    panels = (
        ('heart_rate', 'r-', 'Heart Rate (bpm)'),
        ('eda', 'b-', 'EDA (µS)'),
        ('temperature', 'g-', 'Temperature (°C)'),
    )
    for ax, (column, line_format, y_label) in zip(axes, panels):
        ax.plot(selected['timestamp'], selected[column], line_format)
        ax.set_ylabel(y_label)
        ax.grid(True)

    # Only the bottom panel needs the shared x-axis label
    axes[-1].set_xlabel('Time')

    plt.tight_layout()

    # Write the figure to disk
    plt.savefig(
        os.path.join(output_dir, f'{subject_id}_{session}_signals.png'),
        dpi=300,
        bbox_inches='tight',
    )

    return fig

## Example Usage

Here's how to use the implemented functions:

In [None]:
# Read every subject/session recording from the default raw-data folder
data = load_data()
print("Loaded data shape:", data.shape)

# Clean and resample the recordings (also writes one CSV per subject)
processed_data = preprocess_data(data)
print("\nProcessed data shape:", processed_data.shape)

# Plot the recording that the first processed row belongs to
subject_id, session = processed_data[['subject_id', 'session']].iloc[0]
fig = plot_physiological_signals(processed_data, subject_id, session)
plt.show()