# Imports

In [88]:
import pandas as pd
import numpy as np
import glob

from tqdm import tqdm

%matplotlib inline

In [55]:
# Discover the per-dwelling folders of the Honeywell_season_01 data set

path = "/datc/opschaler/Honeywell_season_01"

# Every entry directly below `path`
sub_dirs = np.array(glob.glob(path + "/*"))

# The dwelling ID is taken as the last 11 characters of each entry's path
dwellings = np.array([sub_path[-11:] for sub_path in sub_dirs])
dwellings

array(['P01S01W6595', 'P01S01W3955', 'P01S01W7042', 'P01S01W7071',
       'P01S01W4569', 'P01S01W1347', 'P01S01W8669', 'P01S01W0378',
       'P01S01W7980', 'P01S01W8239', 'P01S01W6240', 'P01S01W9431',
       'P01S01W5855', 'P01S01W5588', 'P01S01W6271', 'P01S01W8171',
       'P01S01W6289', 'P01S01W4091', 'P01S01W4313', 'P01S01W6959',
       'P01S01W5292', 'P01S01W0998', 'P01S01W4579', 'P01S01W4002',
       'P01S01W8743', 'P01S01W7826', 'P01S01W8828', 'P01S01W0373',
       'P01S01W9617', 'P01S01W6549', 'P01S01W1341', 'P01S01W5040',
       'P01S01W1270', 'P01S01W5564', 'P01S01W4489', 'P01S01W4589',
       'P01S01W3497', 'P01S01W7548', 'P01S01W1554', 'P01S01W2581',
       'P01S01W3155', 'P01S01W6835', 'P01S01W4979', 'P01S01W5476',
       'P01S01W5746'], dtype='<U11')

The next step is to iterate over each dwelling and combine all of its available data from the files in its subdirectory.

In [56]:
# Select a sub dir and get all the available files

# `dwellings` is derived element-wise from `sub_dirs`, so one index selects
# both the directory and its matching dwelling ID.
dwelling_idx = 0
sub_dir = sub_dirs[dwelling_idx]
# `dwelling` was previously never assigned in this notebook (NameError on a
# fresh kernel); define it explicitly next to the directory it belongs to.
dwelling = dwellings[dwelling_idx]

print("Selected dwelling: %s " % dwelling)

# All the datalog files from the selected dwelling
files = glob.glob(sub_dir + "/datalogFile*")

files[:5]

Selected dwelling: P01S01W6595 


['/datc/opschaler/Honeywell_season_01/P01S01W6595/datalogFile_1DC107_20170528_000.csv',
 '/datc/opschaler/Honeywell_season_01/P01S01W6595/datalogFile_1DC107_20170501_000.csv',
 '/datc/opschaler/Honeywell_season_01/P01S01W6595/datalogFile_1DC015_20170405_000.csv',
 '/datc/opschaler/Honeywell_season_01/P01S01W6595/datalogFile_1DC015_20170403_000.csv',
 '/datc/opschaler/Honeywell_season_01/P01S01W6595/datalogFile_1DC10F_20170515_000.csv']

In [63]:
# Parse every datalog file of the selected dwelling into its own DataFrame.
dfs = []

for file in files:
    # The files are ';'-separated; 'Timestamp' is parsed into datetimes on load.
    df = pd.read_csv(file, sep=';', parse_dates=['Timestamp'])
    dfs.append(df)

In [80]:
# Sample rate is 5 minutes
# Stack all per-file frames and index the result by timestamp ('datetime').
data = (
    pd.concat(dfs)
    .rename(columns={'Timestamp': 'datetime'})
    .set_index(['datetime'])
)

# Count the missing values per column
data.isna().sum()

Device              0
Co2Value            0
RoomTemp            0
Humidity            0
HumidityBathRoom    0
VentilationLevel    0
Presence            0
dtype: int64

In [84]:
# Resample onto the regular 5-minute grid so that timestamps without a
# registered reading show up as NaN rows.
#
# Only numeric (and boolean) columns are averaged: 'Device' is non-numeric and
# cannot be meaned. Older pandas silently dropped such columns, recent pandas
# raises a TypeError, so the selection is made explicit here.
# '5min' is used instead of the deprecated '5T' alias (same 5-minute frequency).
data = data.select_dtypes(include=['number', 'bool']).resample('5min').mean()

data.isnull().sum()

Co2Value            71134
RoomTemp            71134
Humidity            71134
HumidityBathRoom    71134
VentilationLevel    71134
Presence            71134
dtype: int64

The sensor data encodes errors as numeric codes:  
For 'HumidityBathRoom'  
    - 255 'no connection'  
    - 242 'connection error'
  
For 'VentilationLevel'  
    - 255 'not used'  

For 'Presence':      
    - 255 'no connection'  
    - 242 'connection error'
   

In [103]:
# Replace the sensor error codes with NaNs, per column.
# Codes as documented for this data set:
#   HumidityBathRoom / Presence : 255 'no connection', 242 'connection error'
#   VentilationLevel            : 255 'not used' only
# 242 is not a listed error code for VentilationLevel, so it is left untouched
# there instead of being blanked out together with the other columns.
ERROR_CODES = {
    'HumidityBathRoom': [255, 242],
    'VentilationLevel': [255],
    'Presence': [255, 242],
}

for column, codes in ERROR_CODES.items():
    data[column] = data[column].replace(codes, np.nan)