In [1]:
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import numpy as np
import datetime

In [2]:
# Load the raw sensor log and the activity-label log.
# The first CSV line of each file is used as the header row.

# The sensor file's 'ID' column is renamed to 'Sensor_ID' right after loading.
senseData = pd.read_csv("kasterenSenseData.csv", header=0).rename(
    columns={'ID': 'Sensor_ID'}
)
actData = pd.read_csv("kasterenActData.csv", header=0)

senseData

Unnamed: 0,Start time,End time,Sensor_ID,Val
0,25/02/2008 00:20,25/02/2008 00:22,24,1
1,25/02/2008 09:33,25/02/2008 09:33,24,1
2,25/02/2008 09:33,25/02/2008 17:21,24,1
3,25/02/2008 09:36,25/02/2008 09:37,5,1
4,25/02/2008 09:37,25/02/2008 09:37,6,1
...,...,...,...,...
1314,21/03/2008 18:27,21/03/2008 18:27,8,1
1315,21/03/2008 18:28,21/03/2008 18:28,8,1
1316,21/03/2008 18:28,21/03/2008 18:28,17,1
1317,21/03/2008 19:11,21/03/2008 19:11,12,1


In [3]:
# Parse the 'Start time' / 'End time' text columns of both logs into datetimes.
# The timestamps are written day-first with minute resolution.
for frame in (senseData, actData):
    for time_column in ('Start time', 'End time'):
        frame[time_column] = pd.to_datetime(frame[time_column], format='%d/%m/%Y %H:%M')

In [4]:
# Activity ID -> activity name, as listed in the dataset documentation.
# (Invert with {v: k for k, v in dd.items()} if a name -> ID lookup is needed.)
dd = {
    1: 'leave house',
    4: 'use toilet',
    5: 'take shower',
    10: 'go to bed',
    13: 'prepare Breakfast',
    15: 'prepare Dinner',
    17: 'get drink',
}
dd


{1: 'leave house',
 4: 'use toilet',
 5: 'take shower',
 10: 'go to bed',
 13: 'prepare Breakfast',
 15: 'prepare Dinner',
 17: 'get drink'}

In [5]:
# Label every sensor event with the activity during which it happened.
#
# For each activity row in actData, select the sensor events that lie entirely
# inside the activity's time window and tag them with:
#   Label       - activity name (looked up in dd)
#   Label_ID    - activity ID from actData
#   Activity_ID - index of the activity row in actData
#   Case_ID     - "<day>_<month>" of the activity start
# Activities matching fewer than two sensor events are dropped.
#
# The empty template comes first so its column order is kept; the per-activity
# frames are concatenated once at the end instead of growing the frame in the loop.
newdataset = pd.DataFrame(columns = ["Case_ID","Start time","End time","Sensor_ID","Label","Label_ID","Activity_ID",])
labeled_frames = [newdataset]
last_position = len(actData) - 1
for index, row in actData.iterrows():
    activity_start = row["Start time"]
    activity_end = row["End time"]

    # An activity that ends before it starts is inconsistent: report and skip it.
    if activity_start > activity_end:
        print("There is an error in the labels dataset, " + str(index))
        continue

    # If this activity overlaps the next one, cut its window at the next start time.
    # NOTE: index + 1 is a label lookup, so this relies on actData having the
    # default 0..n-1 index produced by read_csv.
    window_end = activity_end
    if index < last_position:
        next_start = actData.loc[index + 1, "Start time"]
        if next_start < activity_end:
            window_end = next_start

    # Keep only the sensor events fully contained in [activity_start, window_end].
    mask = (senseData['Start time'] >= activity_start) & (senseData['End time'] <= window_end)
    temp = senseData.loc[mask]
    if len(temp) > 1:
        labeled_frames.append(
            temp.assign(
                Label=dd[row["ID"]],
                Label_ID=row["ID"],
                Activity_ID=index,
                Case_ID=str(activity_start.day) + "_" + str(activity_start.month),
            )
        )
    # ATTENTION
    # There are cases in which the time range of the activity label does not match any measurement.
    # E.g. for the following activity (the example appears to come from a different dataset):
    #   2011-11-28 20:21:15,2011-11-29 02:06:00,Spare_Time/TV
    # the corresponding sensor measurement is:
    #   2011-11-28 20:21:15,2011-11-29 02:06:51,Seat,Pressure,Living
    # The measurement ends later than the activity, so the containment test above excludes it.

newdataset = pd.concat(labeled_frames, ignore_index=True)
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor_ID,Label,Label_ID,Activity_ID,Val
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,6,use toilet,4,1,1.0
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use toilet,4,1,1.0
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use toilet,4,1,1.0
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,6,use toilet,4,1,1.0
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,9,prepare Breakfast,13,2,1.0
...,...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use toilet,4,243,1.0
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use toilet,4,243,1.0
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,6,use toilet,4,243,1.0
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,12,leave house,1,244,1.0


In [6]:
# Sensor ID -> human-readable sensor name.
sensor_names = {
    1: 'Microwave',
    5: 'Hall-Toilet door',
    6: 'Hall-Bathroom door',
    7: 'Cups cupboard',
    8: 'Fridge',
    9: 'Plates cupboard',
    12: 'Frontdoor',
    13: 'Dishwasher',
    14: 'ToiletFlush',
    17: 'Freezer',
    18: 'Pans Cupboard',
    20: 'Washingmachine',
    23: 'Groceries Cupboard',
    24: 'Hall-Bedroom door',
}

# Swap the numeric IDs in the Sensor_ID column for their names; other columns are untouched.
per_column_replacements = {"Sensor_ID": sensor_names}
newdataset = newdataset.replace(per_column_replacements)
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor_ID,Label,Label_ID,Activity_ID,Val
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,Hall-Bathroom door,use toilet,4,1,1.0
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use toilet,4,1,1.0
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use toilet,4,1,1.0
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,Hall-Bathroom door,use toilet,4,1,1.0
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,Plates cupboard,prepare Breakfast,13,2,1.0
...,...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet door,use toilet,4,243,1.0
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet door,use toilet,4,243,1.0
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,Hall-Bathroom door,use toilet,4,243,1.0
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,Frontdoor,leave house,1,244,1.0


In [7]:
# Fold 'take shower' (ID 5) into 'use toilet' (ID 4), in both the ID and the name column.
# Only the matching cells are rewritten; every other value is kept as is.
is_shower_id = newdataset["Label_ID"] == 5
is_shower_name = newdataset["Label"] == "take shower"

newdataset["Label_ID"] = np.where(is_shower_id, 4, newdataset["Label_ID"])
newdataset["Label"] = np.where(is_shower_name, "use toilet", newdataset["Label"])
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor_ID,Label,Label_ID,Activity_ID,Val
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,Hall-Bathroom door,use toilet,4,1,1.0
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use toilet,4,1,1.0
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use toilet,4,1,1.0
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,Hall-Bathroom door,use toilet,4,1,1.0
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,Plates cupboard,prepare Breakfast,13,2,1.0
...,...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet door,use toilet,4,243,1.0
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet door,use toilet,4,243,1.0
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,Hall-Bathroom door,use toilet,4,243,1.0
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,Frontdoor,leave house,1,244,1.0


In [8]:
# Merge consecutive activities of the same kind within a case.
#
# Inside each Case_ID, activities are visited in Activity_ID order (groupby sorts
# its keys). When an activity has the same Label_ID as the one right before it,
# its rows get the Activity_ID of the first activity of that run, so a run of
# identical activities ends up sharing a single Activity_ID.
grouped_caseid = newdataset.groupby("Case_ID")
for case_id, case_rows in grouped_caseid:
    # One representative row per activity is enough to read its label.
    first_rows = case_rows.groupby("Activity_ID").first().reset_index()

    # None marks "no previous activity yet"; it can never equal a real label ID.
    previous_label_id = None
    run_activity_id = None
    for activity_id, label_id in zip(first_rows["Activity_ID"], first_rows["Label_ID"]):
        if label_id == previous_label_id:
            # Same label as the previous activity: relabel this activity's rows
            # with the ID of the activity that started the run.
            newdataset["Activity_ID"] = np.where(
                newdataset["Activity_ID"] == activity_id,
                run_activity_id,
                newdataset["Activity_ID"],
            )
        else:
            # A different label starts a new run.
            previous_label_id = label_id
            run_activity_id = activity_id

In [9]:
# Keep only the events of the 8-13 timeframe: the start hour must be at least 8
# and the end hour at most 13. The test is on the hour field only, so events
# ending anywhere within 13:00-13:59 are still included.
starts_from_8 = newdataset['Start time'].dt.hour >= 8
ends_by_13 = newdataset['End time'].dt.hour <= 13
newdataset = newdataset.loc[starts_from_8 & ends_by_13]

# Write the filtered, labeled log to disk without the index column.
newdataset.to_csv("log_labeled.csv", index=False)

In [10]:
# Empty frame for the transposed log: the two time columns, one column per
# distinct sensor seen in newdataset (in order of first appearance), then the label columns.
distinct_sensors = newdataset["Sensor_ID"].unique().tolist()
columns = ["Start time", "End time", *distinct_sensors, "Label", "Label_ID"]
labeled_senseData = pd.DataFrame(columns=columns)

In [11]:
# Transpose the log: one output row per sensor activation, carrying the running
# count of activations of every sensor seen so far within the same activity.
#
# For each Activity_ID group with more than one event, the events are walked in
# order; each event increments the counter of its sensor and emits a row with the
# event's times and label plus the current value of every counter. Sensors that
# have not fired yet in the activity are left missing.
#
# Rows are collected in a list and the frame is built once, instead of
# concatenating a one-row frame per event. The column layout is taken from the
# empty labeled_senseData template.
meta_columns = ["Start time", "End time", "Label", "Label_ID"]
output_columns = list(labeled_senseData.columns)
sensor_columns = [name for name in output_columns if name not in meta_columns]

records = []
for activity_id, group in newdataset.groupby("Activity_ID"):
    # Activities with a single event are skipped.
    if len(group) <= 1:
        continue
    counter = {}
    for _, el in group.iterrows():
        sensor = el["Sensor_ID"]
        counter[sensor] = counter.get(sensor, 0) + 1
        records.append({
            "Start time": el["Start time"],
            "End time": el["End time"],
            "Label": el["Label"],
            "Label_ID": el["Label_ID"],
            **counter,  # snapshot of all running counts at this event
        })

# Nullable integers keep the counts integral even though unseen sensors are missing.
labeled_senseData = pd.DataFrame(records, columns=output_columns).astype(
    {name: "Int64" for name in sensor_columns}
)
labeled_senseData

Unnamed: 0,Start time,End time,Hall-Bathroom door,ToiletFlush,Plates cupboard,Fridge,Microwave,Groceries Cupboard,Hall-Toilet door,Frontdoor,Hall-Bedroom door,Pans Cupboard,Freezer,Cups cupboard,Dishwasher,Washingmachine,Label,Label_ID
0,2008-02-25 09:37:00,2008-02-25 09:37:00,1,,,,,,,,,,,,,,use toilet,4
1,2008-02-25 09:37:00,2008-02-25 09:37:00,1,1,,,,,,,,,,,,,use toilet,4
2,2008-02-25 09:37:00,2008-02-25 09:37:00,1,2,,,,,,,,,,,,,use toilet,4
3,2008-02-25 09:37:00,2008-02-25 09:38:00,2,2,,,,,,,,,,,,,use toilet,4
4,2008-02-25 09:49:00,2008-02-25 09:49:00,,,1,,,,,,,,,,,,prepare Breakfast,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,2008-03-21 09:13:00,2008-03-21 09:13:00,3,1,,,,,,,1,,,,,,use toilet,4
409,2008-03-21 09:14:00,2008-03-21 09:14:00,3,2,,,,,,,1,,,,,,use toilet,4
410,2008-03-21 09:14:00,2008-03-21 09:15:00,4,2,,,,,,,1,,,,,,use toilet,4
411,2008-03-21 09:17:00,2008-03-21 09:17:00,4,2,,,,,1,,1,,,,,,use toilet,4


In [12]:
# Rename any column still named by a numeric sensor ID to the sensor's name.
# The mapping is the sensor_names dictionary defined earlier in the notebook,
# reused here so the two ID -> name tables cannot drift apart.
# NOTE: Sensor_ID values were already replaced by names before the transposition,
# so the columns are expected to be named already and this acts as a safeguard.
labeled_senseData = labeled_senseData.rename(columns=sensor_names)
labeled_senseData

Unnamed: 0,Start time,End time,Hall-Bathroom door,ToiletFlush,Plates cupboard,Fridge,Microwave,Groceries Cupboard,Hall-Toilet door,Frontdoor,Hall-Bedroom door,Pans Cupboard,Freezer,Cups cupboard,Dishwasher,Washingmachine,Label,Label_ID
0,2008-02-25 09:37:00,2008-02-25 09:37:00,1,,,,,,,,,,,,,,use toilet,4
1,2008-02-25 09:37:00,2008-02-25 09:37:00,1,1,,,,,,,,,,,,,use toilet,4
2,2008-02-25 09:37:00,2008-02-25 09:37:00,1,2,,,,,,,,,,,,,use toilet,4
3,2008-02-25 09:37:00,2008-02-25 09:38:00,2,2,,,,,,,,,,,,,use toilet,4
4,2008-02-25 09:49:00,2008-02-25 09:49:00,,,1,,,,,,,,,,,,prepare Breakfast,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,2008-03-21 09:13:00,2008-03-21 09:13:00,3,1,,,,,,,1,,,,,,use toilet,4
409,2008-03-21 09:14:00,2008-03-21 09:14:00,3,2,,,,,,,1,,,,,,use toilet,4
410,2008-03-21 09:14:00,2008-03-21 09:15:00,4,2,,,,,,,1,,,,,,use toilet,4
411,2008-03-21 09:17:00,2008-03-21 09:17:00,4,2,,,,,1,,1,,,,,,use toilet,4


In [13]:
# Write the transposed, labeled log to disk without the index column.
transposed_log_path = "log_labeled_trans.csv"
labeled_senseData.to_csv(transposed_log_path, index=False)