In [None]:
import sys
import os
from dataclasses import asdict

# Add services directory to path
sys.path.insert(0, os.path.join(os.getcwd(), 'services'))
sys.path.insert(0, os.path.join(os.getcwd(), 'services/feature-producer'))

import pandas as pd
from datetime import datetime
from typing import List

from sensor_buffer import SensorBuffer
from featurizer import Featurizer
from entities.sensor import Sensor
from entities.feature_vector import FeatureVector


def generate_feature_vectors(sensor_measurements: List[Sensor]) -> List[FeatureVector]:
    buffer = SensorBuffer(max_hours=3)
    featurizer = Featurizer(buffer)
    feature_vectors = []
    
    for i, sensor in enumerate(sensor_measurements):
        # Add sensor to buffer
        buffer.add(sensor)
        
        # Only generate features after we have enough history (at least 3 hours)
        if i >= 36:  # 36 datapoints = 3 hours at 5-min intervals
            try:
                feature_vector = featurizer.extract_features(sensor)
                feature_vectors.append(feature_vector)
                
                if len(feature_vectors) % 1000 == 0:
                    print(f"    Generated {len(feature_vectors)} feature vectors")
            except Exception as e:
                print(f"    Error generating features at index {i}: {e}")
    
    return feature_vectors

Imports and generate_feature_vectors function loaded


In [8]:
# Load the CSV file
csv_file_path = 'data/sensor55__20250930_20251031.csv'
df_raw = pd.read_csv(csv_file_path)

print(f"Loaded {len(df_raw)} raw measurements from {csv_file_path}")
print(f"\nFirst 5 rows:")
df_raw.head()

Loaded 44573 raw measurements from data/sensor55__20250930_20251031.csv

First 5 rows:


Unnamed: 0,sensorId,timestamp,co2,humidity,light,motion,temperature
0,sensor55,2025-09-30T22:00:12Z,394.0,46.0,3.0,0.0,22.2
1,sensor55,2025-09-30T22:01:12Z,405.0,46.0,3.0,0.0,22.2
2,sensor55,2025-09-30T22:02:12Z,412.0,46.0,3.0,0.0,22.2
3,sensor55,2025-09-30T22:03:12Z,418.0,46.0,3.0,0.0,22.2
4,sensor55,2025-09-30T22:04:12Z,427.0,46.0,3.0,0.0,22.2


In [9]:
# Convert CSV rows to Sensor objects
sensor_list = []

for _, row in df_raw.iterrows():
    # Parse timestamp
    timestamp_str = row['timestamp']
    dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
    timestamp_unix = int(dt.timestamp())
    
    # Create Sensor object using the alias field name
    sensor = Sensor(
        sensorId=row['sensorId'],  # Use alias name, not Python field name
        timestamp=timestamp_unix,
        humidity=row['humidity'] if pd.notna(row['humidity']) else None,
        temperature=row['temperature'] if pd.notna(row['temperature']) else None,
        co2=row['co2'] if pd.notna(row['co2']) else None,
        motion=row['motion'] if pd.notna(row['motion']) else None,
        light=row['light'] if pd.notna(row['light']) else None
    )
    sensor_list.append(sensor)

# Sort by timestamp
sensor_list.sort(key=lambda s: s.timestamp)

print(f"Converted {len(sensor_list)} rows to Sensor objects")
print(f"Date range: {datetime.fromtimestamp(sensor_list[0].timestamp)} to {datetime.fromtimestamp(sensor_list[-1].timestamp)}")

Converted 44573 rows to Sensor objects
Date range: 2025-10-01 00:00:12 to 2025-10-31 23:58:35


In [10]:
# Generate feature vectors from the sensor data
print("Generating feature vectors...")
csv_feature_vectors = generate_feature_vectors(sensor_list)

print(f"\nGenerated {len(csv_feature_vectors)} feature vectors")
print(f"Skipped first {min(36, len(sensor_list))} measurements (need 3h history)")

Generating feature vectors...
    Generated 1000 feature vectors
    Generated 1000 feature vectors
    Generated 2000 feature vectors
    Generated 2000 feature vectors
    Generated 3000 feature vectors
    Generated 3000 feature vectors
    Generated 4000 feature vectors
    Generated 4000 feature vectors
    Generated 5000 feature vectors
    Generated 5000 feature vectors
    Generated 6000 feature vectors
    Generated 6000 feature vectors
    Generated 7000 feature vectors
    Generated 7000 feature vectors
    Generated 8000 feature vectors
    Generated 8000 feature vectors
    Generated 9000 feature vectors
    Generated 9000 feature vectors
    Generated 10000 feature vectors
    Generated 10000 feature vectors
    Generated 11000 feature vectors
    Generated 11000 feature vectors
    Generated 12000 feature vectors
    Generated 12000 feature vectors
    Generated 13000 feature vectors
    Generated 13000 feature vectors
    Generated 14000 feature vectors
    Generated 14

In [11]:
# Convert to DataFrame
csv_feature_data = [asdict(fv) for fv in csv_feature_vectors]
df_csv_features = pd.DataFrame(csv_feature_data)

print(f"Feature DataFrame shape: {df_csv_features.shape}")
print(f"Columns: {df_csv_features.shape[1]}")
print(f"\nFirst 5 feature vectors:")
df_csv_features.head()

Feature DataFrame shape: (44537, 44)
Columns: 44

First 5 feature vectors:


Unnamed: 0,sensor_id,timestamp,humidity,temperature,co2,motion,light,avg_humidity_60m,avg_humidity_120m,avg_humidity_180m,...,is_off_hours,is_night,season,residual_co2_recent_motion,rising_co2_recent_motion,light_recent_motion,temperature_humidity,motion_off_hours,light_on_at_night,schema_version
0,sensor55,1759271772,46.0,22.1,417.0,0.0,3.0,46.0,46.0,46.0,...,1,1,2,0.0,0.0,0.0,1016.6,0.0,0.0,1
1,sensor55,1759271832,46.0,22.2,409.0,0.0,3.0,46.0,46.0,46.0,...,1,1,2,-0.0,0.0,0.0,1021.2,0.0,0.0,1
2,sensor55,1759271892,46.0,22.1,412.0,0.0,3.0,46.0,46.0,46.0,...,1,1,2,0.0,-0.0,0.0,1016.6,0.0,0.0,1
3,sensor55,1759271952,46.0,22.1,406.0,0.0,3.0,46.0,46.0,46.0,...,1,1,2,-0.0,0.0,0.0,1016.6,0.0,0.0,1
4,sensor55,1759272012,46.0,22.2,417.0,0.0,3.0,46.0,46.0,46.0,...,1,1,2,0.0,-0.0,0.0,1021.2,0.0,0.0,1


In [12]:
# Save feature vectors to CSV
output_csv = 'sensor55_feature_vectors.csv'
df_csv_features.to_csv(output_csv, index=False)

print(f"Feature vectors saved to {output_csv}")
print(f"File contains {len(df_csv_features)} feature vectors with {df_csv_features.shape[1]} features each")

Feature vectors saved to sensor55_feature_vectors.csv
File contains 44537 feature vectors with 44 features each


In [13]:
# Display summary of generated features
print("\nFeature Vector Summary")
print("="*60)
print(f"Total vectors: {len(df_csv_features)}")
print(f"Sensor ID: {df_csv_features['sensor_id'].unique()[0]}")
print(f"Time range: {pd.to_datetime(df_csv_features['timestamp'], unit='s').min()} to {pd.to_datetime(df_csv_features['timestamp'], unit='s').max()}")
print(f"\nFeature statistics (first 10 features):")
df_csv_features.iloc[:, :10].describe()


Feature Vector Summary
Total vectors: 44537
Sensor ID: sensor55
Time range: 2025-09-30 22:36:12 to 2025-10-31 22:58:35

Feature statistics (first 10 features):


Unnamed: 0,timestamp,humidity,temperature,co2,motion,light,avg_humidity_60m,avg_humidity_120m,avg_humidity_180m
count,44537.0,44537.0,44537.0,44537.0,44537.0,44537.0,44537.0,44537.0,44537.0
mean,1760611000.0,43.909693,21.023134,700.055055,0.200799,35.27613,43.908907,43.908907,43.908907
std,773851.7,6.242234,0.861742,505.027993,0.78553,319.446035,6.226271,6.226271,6.226271
min,1759272000.0,29.0,18.9,357.0,0.0,3.0,29.861111,29.861111,29.861111
25%,1759941000.0,40.0,20.4,408.0,0.0,3.0,40.0,40.0,40.0
50%,1760611000.0,43.0,21.1,470.0,0.0,3.0,43.0,43.0,43.0
75%,1761282000.0,48.0,21.6,732.0,0.0,20.0,47.833333,47.833333,47.833333
max,1761952000.0,65.0,23.6,3253.0,8.0,12291.0,63.944444,63.944444,63.944444
