# People detection using indoor air sensors

### Dependencies

#### Package installation

This notebook requires NumPy, scikit-learn, pandas and wget to run.

In [None]:
import sys

!{sys.executable} -m pip install numpy D pandas wget




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import os
import zipfile
import wget

#### Data download

The dataset required to run this notebook is available on Zenodo.

URL for the dataset is https://zenodo.org/records/15224267

In [2]:
# Zenodo direct-download link and the local path the archive is saved to
url = 'https://zenodo.org/records/15224267/files/data.zip?download=1'
filepath = './data/data.zip'

# Make sure the target folder exists before downloading into it
os.makedirs(os.path.dirname(filepath), exist_ok=True)
wget.download(url, out=filepath)

'./data/data.zip'

The data is archived as a zip file. Its contents are extracted into the `data` folder, and the archive is deleted afterwards.

In [3]:
zip_file_path = './data/data.zip'
extraction_dir = '.'

# Unpack the whole archive relative to the current directory
os.makedirs(extraction_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extraction_dir)

# The archive is no longer needed once its contents are on disk
os.remove(zip_file_path)

### Data processing

#### Loading data

The script uses four parameters from the indoor and outdoor sensor data: CO2 level, temperature, relative humidity and PM10 level. The map below lists, for each sensor provider, the columns that contain these parameters (in that order).

In [4]:
# Column names for each data source, always ordered as
# [CO2, temperature, relative humidity, PM10].
# The names are used verbatim to select columns, so spacing and units must match the CSV headers.
sensor_param_map = {
    'df_4': [
        "CO2 (ppm)_sensor_1",
        "Temperature (°C)_sensor_1",
        "Humidity (%)_sensor_1",
        "PM10 (µg/m3)_sensor_1",
    ],
    'df_1': [
        "CO2[ppm]",
        "T[°C]",
        "RH[%]",
        "PM10[ugm3]",
    ],
    'df_2': [
        "CO2 (ppm)",
        "Temperature (°C)",
        "Humidity (%)",
        "PM 10 (µg/m3)",
    ],
    'df_3': [
        "CO2 (ppm)",
        "Temperature (°C)",
        "Humidity (%) ",  # NOTE(review): trailing space kept on purpose; presumably matches the CSV header
        "PM 10 (µg/m3)",
    ],
    'df_5': [
        "CO2 (ppm)_sensor_1",
        "Temperature °C_sensor_1",
        "Humidity %_sensor_1",
        "PM10 (ppb)_sensor_1",
    ],
    'df_6': [
        "CO2 (ppm)_sensor_10",
        "Temperature (°C)_sensor_10",
        "Humidity (%)_sensor_10",
        "PM10 (µg/m3)_sensor_10",
    ],
    'df_ref': [
        "CO2 [ppm]",
        "T [°C]",
        "rH [%]",
        "PM10 [µg/m³]",
    ],
    'df_out': [
        "CO2(mg/m³)",
        "temperature(C)",
        "relativeHumidity(%)",
        "PM10(µg/m³)",
    ],
}

The `indoor_data_load()` function loads the data of a given indoor sensor provider and resamples it at the given rate.

In [5]:
def indoor_data_load(sel_df, resample_rate):
    """Load the readings of one indoor sensor provider and resample them.

    Only the CSV file belonging to ``sel_df`` is read.

    Parameters
    ----------
    sel_df : str
        Key of the indoor data source: 'df_1' ... 'df_6' or 'df_ref'.
    resample_rate : str
        Resampling rule passed to ``DataFrame.resample`` (e.g. "10min").

    Returns
    -------
    pandas.DataFrame
        Datetime-indexed frame with the columns "Indoor CO2",
        "Indoor Temperature", "Indoor Humidity" and "Indoor PM10", each
        holding the mean of the readings that fall into a resampling bin.

    Raises
    ------
    KeyError
        If ``sel_df`` is not one of the known indoor data sources.
    """
    files = {
        'df_4': 'Indoor_sensor_provider_4.csv',
        'df_1': 'Indoor_sensor_provider_1.csv',
        'df_2': 'Indoor_sensor_provider_2.csv',
        'df_3': 'Indoor_sensor_provider_3.csv',
        'df_5': 'Indoor_sensor_provider_5.csv',
        'df_6': 'Indoor_sensor_provider_6.csv',
        'df_ref': 'Indoor_referent_device.csv'
    }
    if sel_df not in files:
        raise KeyError(
            f"Unknown indoor sensor '{sel_df}'; expected one of {sorted(files)}"
        )

    # Read just the requested file; the first CSV column becomes the index
    df = pd.read_csv(f'./data/{files[sel_df]}', index_col=0)

    # The reference device file carries its timestamps in a separate
    # 'Timestamp' column, whereas the provider files have them in the first column.
    if sel_df == 'df_ref':
        df = df.set_index('Timestamp')
    df.index = pd.to_datetime(df.index)

    # Keep the four parameter columns of this source and average them per bin
    resampled = df[sensor_param_map[sel_df]].resample(resample_rate).mean()
    resampled.columns = ["Indoor CO2", "Indoor Temperature", "Indoor Humidity", "Indoor PM10"]

    return resampled

The `outdoor_data_load()` function loads the outdoor data and resamples it at the given rate.

In [6]:
def outdoor_data_load(resample_rate):
    """Load the outdoor sensor readings and resample them.

    Reads ``./data/Outdoor_sensor_provider_2.csv``, indexes it by its
    'Timestamp' column, keeps the columns listed under
    ``sensor_param_map["df_out"]`` and returns their per-bin mean with the
    columns renamed to "Outdoor CO2", "Outdoor Temperature",
    "Outdoor Humidity" and "Outdoor PM10".

    ``resample_rate`` is the rule passed to ``DataFrame.resample``.
    """
    raw = pd.read_csv('./data/Outdoor_sensor_provider_2.csv', index_col=0)

    # Use the parsed timestamps as the index so the frame can be resampled
    raw = raw.set_index('Timestamp')
    raw.index = pd.to_datetime(raw.index)

    outdoor = raw[sensor_param_map["df_out"]].resample(resample_rate).mean()
    outdoor.columns = ["Outdoor CO2", "Outdoor Temperature", "Outdoor Humidity", "Outdoor PM10"]
    return outdoor
    

The `target_load()` function loads the targets. In this case, the target is human presence in the room at a given time.

In [7]:
def target_load(resample_rate):
    """Load the occupancy targets and resample them.

    Reads ``./data/People.csv``, indexes it by its 'Timestamp' column and
    takes the maximum of every column per resampling bin. Bins without any
    record are filled with 0, and values of 'People in a room' above 1 are
    capped at 1.

    ``resample_rate`` is the rule passed to ``DataFrame.resample``.
    """
    people = pd.read_csv('./data/People.csv', index_col=0).set_index('Timestamp')
    people.index = pd.to_datetime(people.index)

    # Per-bin maximum; empty bins become 0
    people = people.resample(resample_rate).max().fillna(0)

    # Cap counts above one person at 1
    more_than_one = people['People in a room'] > 1
    people.loc[more_than_one, 'People in a room'] = 1
    return people

Select an indoor sensor and resample rate. Feel free to change these parameters

In [8]:
# Indoor sensor to analyse: one of df_1, df_2, df_3, df_4, df_5, df_6 or df_ref
sel_df = "df_3"
# Resampling interval; values below 5 minutes are not recommended
resample_rate = "10min"

#### Loading data

Indoor, outdoor and target data are inner-joined on their timestamps to ensure alignment.

In [9]:
targets = target_load(resample_rate)  # occupancy targets
outdoor_data = outdoor_data_load(resample_rate)  # outdoor readings

# Start from the selected indoor sensor and inner-join the other two frames,
# keeping only timestamps that are present in all of them
dataset = (
    indoor_data_load(sel_df, resample_rate)
    .join(outdoor_data, how="inner")
    .join(targets, how="inner")
)
dataset

Unnamed: 0_level_0,Indoor CO2,Indoor Temperature,Indoor Humidity,Indoor PM10,Outdoor CO2,Outdoor Temperature,Outdoor Humidity,Outdoor PM10,People in a room
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-10-11 13:40:00,674.750000,22.817500,57.888750,1.000000,714.0,27.720,38.00,1.0,0.0
2024-10-11 13:50:00,724.800000,23.271000,58.454000,0.400000,720.0,27.850,37.50,1.0,0.0
2024-10-11 14:00:00,869.181818,23.920000,58.523636,0.454545,707.5,27.975,37.10,1.0,0.0
2024-10-11 14:10:00,1013.200000,24.347000,58.367000,0.300000,712.0,26.535,39.15,3.0,1.0
2024-10-11 14:20:00,1166.400000,24.608000,58.875000,0.700000,716.0,25.990,40.80,1.5,1.0
...,...,...,...,...,...,...,...,...,...
2024-10-31 10:40:00,820.400000,24.532000,54.657000,79.500000,,,,,1.0
2024-10-31 10:50:00,765.000000,24.537000,53.886000,74.700000,,,,,0.0
2024-10-31 11:00:00,743.300000,24.236000,52.985000,69.100000,,,,,1.0
2024-10-31 11:10:00,639.200000,23.872000,52.628000,53.600000,,,,,1.0


Rows with empty values are dropped to account for sensor downtime

In [10]:
# Discard every row that has at least one missing value
dataset = dataset.dropna(axis=0, how="any")
dataset

Unnamed: 0_level_0,Indoor CO2,Indoor Temperature,Indoor Humidity,Indoor PM10,Outdoor CO2,Outdoor Temperature,Outdoor Humidity,Outdoor PM10,People in a room
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-10-11 13:40:00,674.750000,22.8175,57.888750,1.000000,714.0,27.720,38.00,1.0,0.0
2024-10-11 13:50:00,724.800000,23.2710,58.454000,0.400000,720.0,27.850,37.50,1.0,0.0
2024-10-11 14:00:00,869.181818,23.9200,58.523636,0.454545,707.5,27.975,37.10,1.0,0.0
2024-10-11 14:10:00,1013.200000,24.3470,58.367000,0.300000,712.0,26.535,39.15,3.0,1.0
2024-10-11 14:20:00,1166.400000,24.6080,58.875000,0.700000,716.0,25.990,40.80,1.5,1.0
...,...,...,...,...,...,...,...,...,...
2024-10-29 23:10:00,1195.100000,23.9350,61.532000,138.800000,1008.5,13.780,74.10,28.5,0.0
2024-10-29 23:20:00,1216.800000,23.9070,61.718000,143.400000,1001.5,13.475,73.80,28.0,0.0
2024-10-29 23:30:00,1230.200000,23.8840,61.814000,146.800000,1046.0,13.430,72.90,24.5,0.0
2024-10-29 23:40:00,1248.700000,23.8690,62.067000,150.800000,1024.5,13.325,74.00,32.0,0.0


Aligned features and targets are extracted

In [11]:
# Features are every column except the last; the last column is the target
x, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [12]:
# Show the feature matrix for a visual check
x

Unnamed: 0_level_0,Indoor CO2,Indoor Temperature,Indoor Humidity,Indoor PM10,Outdoor CO2,Outdoor Temperature,Outdoor Humidity,Outdoor PM10
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-10-11 13:40:00,674.750000,22.8175,57.888750,1.000000,714.0,27.720,38.00,1.0
2024-10-11 13:50:00,724.800000,23.2710,58.454000,0.400000,720.0,27.850,37.50,1.0
2024-10-11 14:00:00,869.181818,23.9200,58.523636,0.454545,707.5,27.975,37.10,1.0
2024-10-11 14:10:00,1013.200000,24.3470,58.367000,0.300000,712.0,26.535,39.15,3.0
2024-10-11 14:20:00,1166.400000,24.6080,58.875000,0.700000,716.0,25.990,40.80,1.5
...,...,...,...,...,...,...,...,...
2024-10-29 23:10:00,1195.100000,23.9350,61.532000,138.800000,1008.5,13.780,74.10,28.5
2024-10-29 23:20:00,1216.800000,23.9070,61.718000,143.400000,1001.5,13.475,73.80,28.0
2024-10-29 23:30:00,1230.200000,23.8840,61.814000,146.800000,1046.0,13.430,72.90,24.5
2024-10-29 23:40:00,1248.700000,23.8690,62.067000,150.800000,1024.5,13.325,74.00,32.0


In [13]:
# Show the target series for a visual check
y

Timestamp
2024-10-11 13:40:00    0.0
2024-10-11 13:50:00    0.0
2024-10-11 14:00:00    0.0
2024-10-11 14:10:00    1.0
2024-10-11 14:20:00    1.0
                      ... 
2024-10-29 23:10:00    0.0
2024-10-29 23:20:00    0.0
2024-10-29 23:30:00    0.0
2024-10-29 23:40:00    0.0
2024-10-29 23:50:00    0.0
Name: People in a room, Length: 2647, dtype: float64

Dataset is split into training and test sets

In [14]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

Features are rescaled using the standard scaler

In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same scaling to both sets
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
x_test

array([[ 1.89281314,  0.33427559,  0.83205891, ..., -0.03783704,
         0.42633843,  1.72712422],
       [ 0.11085314, -0.76497459, -0.69472691, ..., -2.07974906,
         0.91709027,  0.26433855],
       [-1.04136071, -0.51974627, -1.72635706, ...,  0.65836582,
        -1.1616228 , -0.44921543],
       ...,
       [-0.67127943,  0.22877039, -1.35180764, ..., -0.22780702,
        -0.71076948, -1.02005862],
       [-0.63264349,  0.48683042,  1.67406923, ...,  1.15929226,
        -0.45940879, -0.30650464],
       [ 0.30756301, -0.24885453, -0.26746809, ..., -0.72979475,
         0.80936426, -0.59192623]])

### Random forest classifier

This classifier uses a random forest model with 15 estimators and a max depth of 4

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 15 trees of depth <= 4, entropy criterion, balanced class weights
rf_classifier = RandomForestClassifier(
    n_estimators=15,
    max_depth=4,
    criterion='entropy',
    min_samples_split=6,
    class_weight="balanced",
    random_state=42,
)
rf_classifier.fit(x_train, y_train)

# Predict the test set and keep the timestamps of y_test as the index
rf_y_predict = pd.Series(rf_classifier.predict(x_test), index=y_test.index, name='Prediction')

# Targets and predictions side by side for visual comparison
rf_results = pd.concat([y_test, rf_y_predict], axis=1)
rf_results

Unnamed: 0_level_0,People in a room,Prediction
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-24 20:10:00,0.0,0.0
2024-10-21 07:30:00,0.0,0.0
2024-10-19 15:50:00,0.0,1.0
2024-10-12 05:40:00,0.0,0.0
2024-10-16 14:50:00,0.0,1.0
...,...,...
2024-10-19 04:20:00,0.0,0.0
2024-10-17 15:10:00,0.0,0.0
2024-10-15 23:40:00,0.0,0.0
2024-10-26 17:10:00,0.0,0.0


Model confusion matrix

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix on the test set
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_y_predict)
print(rf_cm)

[[459  52]
 [  9  10]]


Training accuracy score for random forest classifier

In [18]:
# Training accuracy in percent
100 * accuracy_score(y_true=y_train, y_pred=rf_classifier.predict(x_train))

90.93056211620217

Test accuracy score for random forest classifier

In [19]:
# Test accuracy in percent
100 * accuracy_score(y_true=y_test, y_pred=rf_y_predict)

88.49056603773585

### Logistic regression classifier

This classifier uses a logistic regression model

In [20]:
from sklearn.linear_model import LogisticRegression

# Elastic-net regularised logistic regression fitted with the saga solver.
# NOTE(review): no class_weight is set here, unlike the random forest cell;
# the recorded confusion matrix for this model contains no positive predictions.
lr_classifier = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.001,
    max_iter=1000,
    random_state=0,
)
lr_classifier.fit(x_train, y_train)

# Predict the test set and keep the timestamps of y_test as the index
lr_y_predict = pd.Series(lr_classifier.predict(x_test), index=y_test.index, name='Prediction')

# Targets and predictions side by side for visual comparison
lr_results = pd.concat([y_test, lr_y_predict], axis=1)
lr_results

Unnamed: 0_level_0,People in a room,Prediction
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-24 20:10:00,0.0,0.0
2024-10-21 07:30:00,0.0,0.0
2024-10-19 15:50:00,0.0,0.0
2024-10-12 05:40:00,0.0,0.0
2024-10-16 14:50:00,0.0,0.0
...,...,...
2024-10-19 04:20:00,0.0,0.0
2024-10-17 15:10:00,0.0,0.0
2024-10-15 23:40:00,0.0,0.0
2024-10-26 17:10:00,0.0,0.0


Confusion matrix for logistic regression classifier

In [21]:
# Confusion matrix on the test set
lr_cm = confusion_matrix(y_true=y_test, y_pred=lr_y_predict)
print(lr_cm)

[[511   0]
 [ 19   0]]


Training accuracy score for logistic regression classifier

In [22]:
# Training accuracy in percent
100 * accuracy_score(y_true=y_train, y_pred=lr_classifier.predict(x_train))

96.36277751535192

Test accuracy score for logistic regression classifier

In [23]:
# Test accuracy in percent
100 * accuracy_score(y_true=y_test, y_pred=lr_y_predict)

96.41509433962264

### Neural network classifier

This classifier uses a neural network with a single hidden layer of 4 neurons

In [24]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with one hidden layer of 4 neurons.
# hidden_layer_sizes is written as a one-element tuple, because "(4)" is just the integer 4.
# random_state fixes the seed so repeated runs give the same model,
# as the random forest and logistic regression cells already do.
nn_classifier = MLPClassifier(hidden_layer_sizes=(4,), max_iter=1000, random_state=42)
nn_classifier.fit(x_train, y_train)  # fit the model to the training data

# Predict the test set and keep the timestamps of y_test as the index
nn_y_predict = nn_classifier.predict(x_test)
nn_y_predict = pd.Series(nn_y_predict, name='Prediction', index=y_test.index)

# Targets and predictions side by side for visual comparison
nn_results = pd.concat([y_test, nn_y_predict], axis=1)
nn_results

Unnamed: 0_level_0,People in a room,Prediction
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-24 20:10:00,0.0,0.0
2024-10-21 07:30:00,0.0,0.0
2024-10-19 15:50:00,0.0,0.0
2024-10-12 05:40:00,0.0,0.0
2024-10-16 14:50:00,0.0,0.0
...,...,...
2024-10-19 04:20:00,0.0,0.0
2024-10-17 15:10:00,0.0,0.0
2024-10-15 23:40:00,0.0,0.0
2024-10-26 17:10:00,0.0,0.0


Confusion matrix for neural network classifier

In [25]:
# Confusion matrix on the test set
nn_cm = confusion_matrix(y_true=y_test, y_pred=nn_y_predict)
print(nn_cm)

[[511   0]
 [ 19   0]]


Training accuracy score for neural network classifier

In [26]:
# Training accuracy in percent
100 * accuracy_score(y_true=y_train, y_pred=nn_classifier.predict(x_train))

96.41001417099669

Test accuracy score for neural network classifier

In [27]:
# Test accuracy in percent
100 * accuracy_score(y_true=y_test, y_pred=nn_y_predict)

96.41509433962264