# Beiwe Sample Dataset Accelerometer EDA Pt. 2

Prototype helper functions to load and summarize an hour's worth of accelerometer data 


In [7]:
%load_ext autoreload
%autoreload 2 

import pandas as pd
import numpy as np
import os

# TODO: move to its own script
def summarize_hourly_file(file_path, verbose=True):
    """Analyze a single hourly accelerometer CSV file."""
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            if verbose:
                print(f"Missing: {os.path.basename(file_path)}")
            return None
        # If it exists read CSV
        df = pd.read_csv(file_path)
        
        # Handle empty files
        if len(df) == 0:
            if verbose:
                print(f"Empty: {os.path.basename(file_path)}")
            return None
    
    except Exception as e:
        print(f"An unexpected error occurred: {e}")    

    # Convert timestamps
    df['datetime_utc'] = pd.to_datetime(df['timestamp'], unit='ms', utc=True)

    # Compute magnitude
    df['magnitude'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)

    # Calculate metrics
    n_rows = len(df)
    start_time = df['datetime_utc'].iloc[0]
    end_time = df['datetime_utc'].iloc[-1]
    
    # Duration in minutes
    duration_sec = (end_time - start_time).total_seconds()
    duration_min = duration_sec / 60
    
    # Sampling time in minutes (at 10 Hz)
    sampling_min = n_rows / 10 / 60
    
    # Duty cycle
    duty_cycle = sampling_min / duration_min
    
    # Count bursts
    df['time_diff_ms'] = df['timestamp'].diff()
    gaps = df['time_diff_ms'] > 1000 
    gaps_count = gaps.sum()
    n_bursts = gaps_count + 1
    
    # Mean magnitude
    mean_magnitude = df['magnitude'].mean()

    return {
        'n_rows': n_rows,
        'start_time': start_time,
        'end_time': end_time,
        'duration_min': duration_min,
        'sampling_min': sampling_min,
        'duty_cycle': duty_cycle,
       'n_bursts': n_bursts,
        'mean_magnitude': mean_magnitude
    }

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Check that summarize_hourly_file() works on the hr_9 CSV 

In [8]:
base_path = "/n/home01/egraff/sample_imputation/data/raw/3si9xdvl/accelerometer/"
test_file_path = base_path + "2022-03-26 09_00_00+00_00.csv"

result = summarize_hourly_file(test_file_path)
print(result)

{'n_rows': 18090, 'start_time': Timestamp('2022-03-26 09:00:38.792000+0000', tz='UTC'), 'end_time': Timestamp('2022-03-26 09:59:39.055000+0000', tz='UTC'), 'duration_min': 59.00438333333333, 'sampling_min': 30.15, 'duty_cycle': 0.5109789865894144, 'n_bursts': np.int64(30), 'mean_magnitude': np.float64(1.0069258994313353)}


Now try on different files 

In [None]:
test_hours = [0, 9, 16, 23]  # Variety: short, full, partial, end-of-day

print("\n" + "="*79)
for hour in test_hours:
    file_path = base_path + f"2022-03-26 {hour:02d}_00_00+00_00.csv"
    print(f"\nHour {hour:02d}:00")
    print("-" *79)
    
    try:
        result = summarize_hourly_file(file_path)
        
        print(f"Rows: {result['n_rows']:, }")
        print(f"Time span: {result['start_time'].strftime('%H:%M:%S')} to {result['end_time'].strftime('%H:%M:%S')}")
        print(f"Duration: {result['duration_min']:.1f} min")
        print(f"Sampling time: {result['sampling_min']:.1f} min")
        print(f"Duty cycle: {result['duty_cycle']:.1%}")
        print(f"Bursts: {result['n_bursts']}")
        print(f"Mean magnitude: {result['mean_magnitude']:.4f} g")
        
    except FileNotFoundError:
        print(f"File not found")
    except Exception as e:
        print(f"Error: {e}")

print("\n" + "="*79)



Hour 00:00
-------------------------------------------------------------------------------
Rows: 602
Time span: 00:26:29 to 00:27:29
Duration: 1.0 min
Sampling time: 1.0 min
Duty cycle: 100.6%
Bursts: 1
Mean magnitude: 1.0107 g

Hour 09:00
-------------------------------------------------------------------------------
Rows: 18,090
Time span: 09:00:38 to 09:59:39
Duration: 59.0 min
Sampling time: 30.1 min
Duty cycle: 51.1%
Bursts: 30
Mean magnitude: 1.0069 g

Hour 16:00
-------------------------------------------------------------------------------
Rows: 9,049
Time span: 16:30:01 to 16:59:02
Duration: 29.0 min
Sampling time: 15.1 min
Duty cycle: 52.0%
Bursts: 15
Mean magnitude: 1.0340 g

Hour 23:00
-------------------------------------------------------------------------------
Rows: 11,455
Time span: 23:22:06 to 23:59:06
Duration: 37.0 min
Sampling time: 19.1 min
Duty cycle: 51.6%
Bursts: 19
Mean magnitude: 1.0090 g



In [None]:
# Process all 24 hours for subject 3si9xdvl on 2022-03-26
date_str = "2022-03-26"
subject_id = "3si9xdvl"
# Store results
daily_summary = []

for hour in range(24):
    file_path = base_path + f"{date_str} {hour:02d}_00_00+00_00.csv"
    
    result = summarize_hourly_file(file_path)
    
    if result is not None:
        # Add hour and date info
        result['hour'] = hour
        result['date'] = date_str
        result['subject_id'] = subject_id
        daily_summary.append(result)

# Convert to df
summary_df = pd.DataFrame(daily_summary)

print(f"Summarized {len(summary_df)} out of 24 hours")
print(f"\n{summary_df.head(10)}")

Missing: 2022-03-26 01_00_00+00_00.csv
Missing: 2022-03-26 02_00_00+00_00.csv
Missing: 2022-03-26 03_00_00+00_00.csv
Missing: 2022-03-26 15_00_00+00_00.csv
Summarized 20 out of 24 hours

   n_rows                       start_time                         end_time  \
0     602 2022-03-26 00:26:29.955000+00:00 2022-03-26 00:27:29.812000+00:00   
1    8620 2022-03-26 04:31:15.460000+00:00 2022-03-26 04:59:33.337000+00:00   
2   11438 2022-03-26 05:00:33.399000+00:00 2022-03-26 05:59:34.726000+00:00   
3   18081 2022-03-26 06:00:34.770000+00:00 2022-03-26 06:59:34.988000+00:00   
4   18100 2022-03-26 07:00:35.057000+00:00 2022-03-26 07:59:37.352000+00:00   
5   18098 2022-03-26 08:00:37.442000+00:00 2022-03-26 08:59:38.720000+00:00   
6   18090 2022-03-26 09:00:38.792000+00:00 2022-03-26 09:59:39.055000+00:00   
7   18088 2022-03-26 10:00:39.127000+00:00 2022-03-26 10:59:39.403000+00:00   
8   18098 2022-03-26 11:00:39.477000+00:00 2022-03-26 11:59:40.737000+00:00   
9   18098 2022-03-26 12

In [14]:
# Want to see all rows
pd.set_option('display.max_rows', None)
print(summary_df)

    n_rows                       start_time                         end_time  \
0      602 2022-03-26 00:26:29.955000+00:00 2022-03-26 00:27:29.812000+00:00   
1     8620 2022-03-26 04:31:15.460000+00:00 2022-03-26 04:59:33.337000+00:00   
2    11438 2022-03-26 05:00:33.399000+00:00 2022-03-26 05:59:34.726000+00:00   
3    18081 2022-03-26 06:00:34.770000+00:00 2022-03-26 06:59:34.988000+00:00   
4    18100 2022-03-26 07:00:35.057000+00:00 2022-03-26 07:59:37.352000+00:00   
5    18098 2022-03-26 08:00:37.442000+00:00 2022-03-26 08:59:38.720000+00:00   
6    18090 2022-03-26 09:00:38.792000+00:00 2022-03-26 09:59:39.055000+00:00   
7    18088 2022-03-26 10:00:39.127000+00:00 2022-03-26 10:59:39.403000+00:00   
8    18098 2022-03-26 11:00:39.477000+00:00 2022-03-26 11:59:40.737000+00:00   
9    18098 2022-03-26 12:00:40.804000+00:00 2022-03-26 12:59:42.082000+00:00   
10   18090 2022-03-26 13:00:42.146000+00:00 2022-03-26 13:59:42.426000+00:00   
11   17487 2022-03-26 14:00:42.494000+00