The sensordata contains error as numbers:  
For 'HumidityBathroom'  
    - 255 'no connection'  
    - 242 'connection error'
  
For 'VentilationLevel'  
    - 255 'not used'  

For 'Presence':    
    - 255 'no connection'  
    - 242 'connection error'
   

# Imports

In [1]:
import pandas as pd
import numpy as np
import glob

from tqdm import tqdm

%matplotlib inline

Manually change 'Honeywell_season_01' to 'Honeywell_season_01_02' so both honeywell folders are processed.  
Could also make a loop for this, to iterate over the 2 different honeywell.  

In [19]:
# The Honeywell_season_01 data
path = "/datc/opschaler/Honeywell_season_01"
#path = "/datc/opschaler/Honeywell_season_01_02"
sub_dirs = np.array(glob.glob(path + "/*")) # Get location to all subdirectories
dwellings = np.array(list((map(lambda x: x[-11:], sub_dirs))))

In [20]:
"""
Read the serial to room data.
"""

labels = pd.read_excel('//datc//opschaler//honeywell_sensors_per_dwelling_combined//honeywell_serial_to_room.xlsx')
labels.head()

Unnamed: 0,Living room,Kitchen,Bedroom 1,Bedroom 2
0,1dc018,1dc00e,1dc012,1dc010
1,1dc029,1dc011,1dc019,1dc01b
2,1dc02c,1dc015,1dc01f,1dc024
3,1dc036,1dc01c,1dc026,1dc025
4,1dc03c,1dc021,1dc031,1dc027


In [21]:
# Next thing to do is to iterate over each dwelling and combine all its available data from within the sub directories.

final_dfs = []

In [22]:
for i, sub_dir in enumerate(tqdm(sub_dirs[:1])):
    # Select a sub dir and get all the available files
    sub_dir = sub_dirs[i]
    dwelling = dwellings[i]

    # All the datalog files from the selected dwelling
    files = glob.glob(sub_dir +"/datalogFile_******_201*.csv")
    
    sensor_ids = np.array(list((map(lambda x: x[-23:-17], files))))
    sensor_ids = np.array([x.lower() if isinstance(x, str) else x for x in sensor_ids]) # Concert str to lowercase to match the labels df.
    
    dfs = []
    
    # Read all the files in the selected sub dir, append to a list
    for j, file in enumerate(files):
        sensor_id = sensor_ids[j]
        living_room = sensor_id in labels['Living room'].unique() # Returns true if sensor_id is in the living room series
        kitchen = sensor_id in labels['Kitchen'].unique()
        bedroom1 = sensor_id in labels['Bedroom 1'].unique()
        bedroom2 = sensor_id in labels['Bedroom 2'].unique()

        df = pd.read_csv(file, delimiter=';', parse_dates=['Timestamp'])
        
        """
        Resample to original samplerate (so not registedred timestamps appear) before adding the room category. 
        If doing this after, the room category will be removed by the resampler.
        Note that the device column is lost upon resampling, but the device ID is gathered from the filename, not this column.
        """ 
        
        df = df.rename(columns={'Timestamp': 'datetime'})
        df = df.set_index(['datetime'])
        df = df.resample('5T').mean()
            
        # Add a column name containing the room where the sensor is in.
        if living_room:
            df['room'] = 'living room'
        elif kitchen:
            df['room'] = 'kitchen'
        elif bedroom1:
            df['room'] = 'bedroom1'
        elif bedroom2:
            df['room'] = 'bedroom2'
        else:
            df['room'] = 'unknown'
        
        dfs.append(df)

    # Concatenate the dfs list into one df. 
    # Original sample rate is 5 minutes.
    data = pd.concat(dfs)
    data['room'] = data['room'].astype('category') # change datetypes to category
    
    # Replace the 255 and 242 values with NaNs in the respective columns.
    data[['HumidityBathRoom', 'VentilationLevel', 'Presence']] = data[['HumidityBathRoom', 'VentilationLevel', 'Presence']].replace({255:np.nan, 242:np.nan})
    data['dwelling'] = dwelling
    
    final_dfs.append(data)

100%|██████████| 1/1 [00:11<00:00, 11.08s/it]


# Concat all the dataframes

In [5]:
final_df = pd.concat(final_dfs)
final_df.head()

NameError: name 'final_dfs' is not defined

# Check the created df

In [24]:
# What happens when 4 sensors from different rooms gather data on the same timestamp? Does the same timestap appear 4 times with 4 different rooms?
final_df = final_df.sort_index()
final_df.head()

# Looks like this is the case!

Unnamed: 0_level_0,Co2Value,RoomTemp,Humidity,HumidityBathRoom,VentilationLevel,Presence,room,dwelling
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-04 00:00:00,1446.0,21.6,52.0,59.0,,,living room,P01S01W6595
2017-01-04 00:00:00,780.0,21.0,47.0,,,1.0,bedroom1,P01S01W6595
2017-01-04 00:00:00,1358.0,21.19,54.0,,,0.0,kitchen,P01S01W6595
2017-01-04 00:00:00,924.0,20.79,49.0,,,0.0,bedroom2,P01S01W6595
2017-01-04 00:05:00,899.0,21.09,47.0,,,0.0,bedroom1,P01S01W6595


# Deal with NaNs

In [25]:
"""
Note that the currently used column in analysis from this data will be Presence, therefor the other NaNs are ignored.
"""

final_df['Presence'] = final_df['Presence'].fillna(0)
final_df.isnull().sum()

Co2Value            2095513
RoomTemp            2095513
Humidity            2095513
HumidityBathRoom    2172294
VentilationLevel    2197876
Presence                  0
room                      0
dwelling                  0
dtype: int64

In [26]:
final_df.head()

Unnamed: 0_level_0,Co2Value,RoomTemp,Humidity,HumidityBathRoom,VentilationLevel,Presence,room,dwelling
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-04 00:00:00,1446.0,21.6,52.0,59.0,,0.0,living room,P01S01W6595
2017-01-04 00:00:00,780.0,21.0,47.0,,,1.0,bedroom1,P01S01W6595
2017-01-04 00:00:00,1358.0,21.19,54.0,,,0.0,kitchen,P01S01W6595
2017-01-04 00:00:00,924.0,20.79,49.0,,,0.0,bedroom2,P01S01W6595
2017-01-04 00:05:00,899.0,21.09,47.0,,,0.0,bedroom1,P01S01W6595


final_df.to_csv('/datc/opschaler/honeywell_sensors_per_dwelling_combined/honeywell_all_dwellings_combined.csv', sep='\t', index=True)

#  TO DO: Transform the df

In [2]:
df = pd.read_csv('/datc/opschaler/honeywell_sensors_per_dwelling_combined/honeywell_all_dwellings_combined.csv', delimiter='\t', parse_dates=['datetime'])
df = df.set_index(['datetime'])

In [1]:
"""
Create one column per feature, per room. 
i.e. for precense:
    presence_living_room, presence_bedroom1, precense_bedroom2, presence_kitchen
    
This way there will be one row of features per datetime.
"""

'\nCreate one column per feature, per room. \ni.e. for precense:\n    presence_living_room, presence_bedroom1, precense_bedroom2, presence_kitchen\n    \nThis way there will be one row of features per datetime.\n'

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37376938 entries, 2012-05-10 22:40:00 to 2017-12-17 23:55:00
Data columns (total 8 columns):
Co2Value            float64
Humidity            float64
HumidityBathRoom    float64
Presence            float64
RoomTemp            float64
VentilationLevel    float64
dwelling            object
room                object
dtypes: float64(6), object(2)
memory usage: 2.5+ GB


In [11]:

t.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000000 entries, 2012-05-10 22:40:00 to 2015-01-25 02:25:00
Data columns (total 8 columns):
Co2Value            323259 non-null float64
Humidity            323259 non-null float64
HumidityBathRoom    143000 non-null float64
Presence            1000000 non-null float64
RoomTemp            323259 non-null float64
VentilationLevel    144 non-null float64
dwelling            1000000 non-null object
room                1000000 non-null object
dtypes: float64(6), object(2)
memory usage: 68.7+ MB


In [46]:
t = df[:1000000]
t = t.set_index([t.index, 'dwelling'], append=False)
t.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Co2Value,Humidity,HumidityBathRoom,Presence,RoomTemp,VentilationLevel,room
datetime,dwelling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-25 02:25:00,P01S01W4589,503.0,31.0,38.0,0.0,18.15,,living room
2015-01-25 02:25:00,P01S01W6289,1227.0,44.0,,0.0,18.5,,bedroom2
2015-01-25 02:25:00,P01S01W4313,654.0,25.0,,0.0,23.82,,living room
2015-01-25 02:25:00,P01S01W1554,1821.0,47.0,,0.0,18.07,,bedroom1
2015-01-25 02:25:00,P01S01W4313,793.0,33.0,,0.0,19.81,,bedroom1


In [49]:
a= t.pivot_table(columns='room')
a.tail()

room,bedroom1,bedroom2,kitchen,living room,unknown
Humidity,43.380019,35.627004,37.929715,33.981186,48.437259
HumidityBathRoom,57.48358,,,57.779543,47.618827
Presence,0.017195,0.014709,0.048281,0.064814,0.047931
RoomTemp,20.841977,20.750225,21.123434,22.72429,21.763935
VentilationLevel,224.840278,,,,


In [30]:
a.columns = ['{}_{}'.format(var,room) for var, room in a.columns]
a.columns

Index(['datetime_bedroom1', 'datetime_bedroom2', 'datetime_kitchen',
       'datetime_living room', 'datetime_unknown', 'dwelling_bedroom1',
       'dwelling_bedroom2', 'dwelling_kitchen', 'dwelling_living room',
       'dwelling_unknown', 'Co2Value_bedroom1', 'Co2Value_bedroom2',
       'Co2Value_kitchen', 'Co2Value_living room', 'Co2Value_unknown',
       'Humidity_bedroom1', 'Humidity_bedroom2', 'Humidity_kitchen',
       'Humidity_living room', 'Humidity_unknown', 'HumidityBathRoom_bedroom1',
       'HumidityBathRoom_bedroom2', 'HumidityBathRoom_kitchen',
       'HumidityBathRoom_living room', 'HumidityBathRoom_unknown',
       'Presence_bedroom1', 'Presence_bedroom2', 'Presence_kitchen',
       'Presence_living room', 'Presence_unknown', 'RoomTemp_bedroom1',
       'RoomTemp_bedroom2', 'RoomTemp_kitchen', 'RoomTemp_living room',
       'RoomTemp_unknown', 'VentilationLevel_bedroom1',
       'VentilationLevel_bedroom2', 'VentilationLevel_kitchen',
       'VentilationLevel_living ro

In [33]:
a.head()

Unnamed: 0,datetime_bedroom1,datetime_bedroom2,datetime_kitchen,datetime_living room,datetime_unknown,dwelling_bedroom1,dwelling_bedroom2,dwelling_kitchen,dwelling_living room,dwelling_unknown,...,RoomTemp_bedroom1,RoomTemp_bedroom2,RoomTemp_kitchen,RoomTemp_living room,RoomTemp_unknown,VentilationLevel_bedroom1,VentilationLevel_bedroom2,VentilationLevel_kitchen,VentilationLevel_living room,VentilationLevel_unknown
0,2012-05-10 22:40:00,NaT,NaT,NaT,NaT,P01S01W1554,,,,,...,21.34,,,,,,,,,
1,2012-05-10 22:45:00,NaT,NaT,NaT,NaT,P01S01W1554,,,,,...,21.29,,,,,,,,,
2,2012-05-10 22:50:00,NaT,NaT,NaT,NaT,P01S01W1554,,,,,...,21.29,,,,,,,,,
3,2012-05-10 22:55:00,NaT,NaT,NaT,NaT,P01S01W1554,,,,,...,21.29,,,,,,,,,
4,2012-05-10 23:00:00,NaT,NaT,NaT,NaT,P01S01W1554,,,,,...,21.28,,,,,,,,,
