The sensordata contains error as numbers:  
For 'HumidityBathroom'  
    - 255 'no connection'  
    - 242 'connection error'
  
For 'VentilationLevel'  
    - 255 'not used'  

For 'Presence':    
    - 255 'no connection'  
    - 242 'connection error'
   

# Imports

In [18]:
import pandas as pd
import numpy as np
import glob

from tqdm import tqdm

%matplotlib inline

Manually change 'Honeywell_season_01' to 'Honeywell_season_01_02' so both honeywell folders are processed.  
Could also make a loop for this, to iterate over the 2 different honeywell.  

In [19]:
# The Honeywell_season_01 data
path = "/datc/opschaler/Honeywell_season_01"
#path = "/datc/opschaler/Honeywell_season_01_02"
sub_dirs = np.array(glob.glob(path + "/*")) # Get location to all subdirectories
dwellings = np.array(list((map(lambda x: x[-11:], sub_dirs))))

In [20]:
"""
Read the serial to room data.
"""

labels = pd.read_excel('//datc//opschaler//honeywell_sensors_per_dwelling_combined//honeywell_serial_to_room.xlsx')
labels.head()

Unnamed: 0,Living room,Kitchen,Bedroom 1,Bedroom 2
0,1dc018,1dc00e,1dc012,1dc010
1,1dc029,1dc011,1dc019,1dc01b
2,1dc02c,1dc015,1dc01f,1dc024
3,1dc036,1dc01c,1dc026,1dc025
4,1dc03c,1dc021,1dc031,1dc027


In [21]:
# Next thing to do is to iterate over each dwelling and combine all its available data from within the sub directories.

final_dfs = []

In [22]:
for i, sub_dir in enumerate(tqdm(sub_dirs[:1])):
    # Select a sub dir and get all the available files
    sub_dir = sub_dirs[i]
    dwelling = dwellings[i]

    # All the datalog files from the selected dwelling
    files = glob.glob(sub_dir +"/datalogFile_******_201*.csv")
    
    sensor_ids = np.array(list((map(lambda x: x[-23:-17], files))))
    sensor_ids = np.array([x.lower() if isinstance(x, str) else x for x in sensor_ids]) # Concert str to lowercase to match the labels df.
    
    dfs = []
    
    # Read all the files in the selected sub dir, append to a list
    for j, file in enumerate(files):
        sensor_id = sensor_ids[j]
        living_room = sensor_id in labels['Living room'].unique() # Returns true if sensor_id is in the living room series
        kitchen = sensor_id in labels['Kitchen'].unique()
        bedroom1 = sensor_id in labels['Bedroom 1'].unique()
        bedroom2 = sensor_id in labels['Bedroom 2'].unique()

        df = pd.read_csv(file, delimiter=';', parse_dates=['Timestamp'])
        
        """
        Resample to original samplerate (so not registedred timestamps appear) before adding the room category. 
        If doing this after, the room category will be removed by the resampler.
        Note that the device column is lost upon resampling, but the device ID is gathered from the filename, not this column.
        """ 
        
        df = df.rename(columns={'Timestamp': 'datetime'})
        df = df.set_index(['datetime'])
        df = df.resample('5T').mean()
            
        # Add a column name containing the room where the sensor is in.
        if living_room:
            df['room'] = 'living room'
        elif kitchen:
            df['room'] = 'kitchen'
        elif bedroom1:
            df['room'] = 'bedroom1'
        elif bedroom2:
            df['room'] = 'bedroom2'
        else:
            df['room'] = 'unknown'
        
        dfs.append(df)

    # Concatenate the dfs list into one df. 
    # Original sample rate is 5 minutes.
    data = pd.concat(dfs)
    data['room'] = data['room'].astype('category') # change datetypes to category
    
    # Replace the 255 and 242 values with NaNs in the respective columns.
    data[['HumidityBathRoom', 'VentilationLevel', 'Presence']] = data[['HumidityBathRoom', 'VentilationLevel', 'Presence']].replace({255:np.nan, 242:np.nan})
    data['dwelling'] = dwelling
    
    final_dfs.append(data)

100%|██████████| 1/1 [00:11<00:00, 11.08s/it]


# Concat all the dataframes

In [23]:
final_df = pd.concat(final_dfs)
final_df.head()

Unnamed: 0_level_0,Co2Value,RoomTemp,Humidity,HumidityBathRoom,VentilationLevel,Presence,room,dwelling
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-05-26 23:50:00,430.0,25.3,38.0,,,0.0,bedroom1,P01S01W6595
2017-05-26 23:55:00,421.0,25.3,38.0,,,0.0,bedroom1,P01S01W6595
2017-05-27 00:00:00,428.0,25.19,38.0,,,0.0,bedroom1,P01S01W6595
2017-05-27 00:05:00,430.0,25.23,38.0,,,0.0,bedroom1,P01S01W6595
2017-05-27 00:10:00,423.0,25.24,39.0,,,0.0,bedroom1,P01S01W6595


# Check the created df

In [24]:
# What happens when 4 sensors from different rooms gather data on the same timestamp? Does the same timestap appear 4 times with 4 different rooms?
final_df = final_df.sort_index()
final_df.head()

# Looks like this is the case!

Unnamed: 0_level_0,Co2Value,RoomTemp,Humidity,HumidityBathRoom,VentilationLevel,Presence,room,dwelling
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-04 00:00:00,1446.0,21.6,52.0,59.0,,,living room,P01S01W6595
2017-01-04 00:00:00,780.0,21.0,47.0,,,1.0,bedroom1,P01S01W6595
2017-01-04 00:00:00,1358.0,21.19,54.0,,,0.0,kitchen,P01S01W6595
2017-01-04 00:00:00,924.0,20.79,49.0,,,0.0,bedroom2,P01S01W6595
2017-01-04 00:05:00,899.0,21.09,47.0,,,0.0,bedroom1,P01S01W6595


# Deal with NaNs

In [25]:
"""
Note that the currently used column in analysis from this data will be Presence, therefor the other NaNs are ignored.
"""

final_df['Presence'] = final_df['Presence'].fillna(0)
final_df.isnull().sum()

Co2Value            2095513
RoomTemp            2095513
Humidity            2095513
HumidityBathRoom    2172294
VentilationLevel    2197876
Presence                  0
room                      0
dwelling                  0
dtype: int64

In [26]:
final_df.head()

Unnamed: 0_level_0,Co2Value,RoomTemp,Humidity,HumidityBathRoom,VentilationLevel,Presence,room,dwelling
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-04 00:00:00,1446.0,21.6,52.0,59.0,,0.0,living room,P01S01W6595
2017-01-04 00:00:00,780.0,21.0,47.0,,,1.0,bedroom1,P01S01W6595
2017-01-04 00:00:00,1358.0,21.19,54.0,,,0.0,kitchen,P01S01W6595
2017-01-04 00:00:00,924.0,20.79,49.0,,,0.0,bedroom2,P01S01W6595
2017-01-04 00:05:00,899.0,21.09,47.0,,,0.0,bedroom1,P01S01W6595


final_df.to_csv('/datc/opschaler/honeywell_sensors_per_dwelling_combined/honeywell_all_dwellings_combined.csv', sep='\t', index=True)

#  TO DO: Transform the df

df = pd.read_csv('/datc/opschaler/honeywell_sensors_per_dwelling_combined/honeywell_all_dwellings_combined.csv', delimiter='\t', parse_dates=['datetime'])
df = df.set_index(['datetime'])

In [34]:
"""
Create one column per feature, per room. 
i.e. for precense:
    presence_living_room, presence_bedroom1, precense_bedroom2, presence_kitchen
    
This way there will be one row of features per datetime.
"""

array(['kitchen'], dtype=object)

In [35]:
t = final_df
columns = ['Co2Value', 'RoomTemp', 'Humidity', 'HumidityBathRoom',
       'VentilationLevel', 'Presence', 'dwelling']

rooms = ['living room', 'bedroom1', 'bedroom2', 'kitchen']


df.columns = ['{}_{}'.format(var,room) for var, room in df.columns]

ValueError: too many values to unpack (expected 2)

In [None]:
t[:200]