In [1]:
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import numpy as np
import datetime

In [2]:
# Load the raw sensor readings and the activity annotations.
# The sensor file's 'ID' column is exposed as 'Sensor_ID' from here on.
senseData = pd.read_csv("kasterenSenseData.csv", header=0).rename(
    columns={'ID': 'Sensor_ID'}
)
actData = pd.read_csv("kasterenActData.csv", header=0)

senseData

Unnamed: 0,Start time,End time,Sensor_ID,Val
0,25/02/2008 00:20,25/02/2008 00:22,24,1
1,25/02/2008 09:33,25/02/2008 09:33,24,1
2,25/02/2008 09:33,25/02/2008 17:21,24,1
3,25/02/2008 09:36,25/02/2008 09:37,5,1
4,25/02/2008 09:37,25/02/2008 09:37,6,1
...,...,...,...,...
1314,21/03/2008 18:27,21/03/2008 18:27,8,1
1315,21/03/2008 18:28,21/03/2008 18:28,8,1
1316,21/03/2008 18:28,21/03/2008 18:28,17,1
1317,21/03/2008 19:11,21/03/2008 19:11,12,1


In [3]:
# Parse the textual day/month/year hour:minute timestamps of both tables
# into proper datetime columns.
TIMESTAMP_FORMAT = '%d/%m/%Y %H:%M'
for frame in (senseData, actData):
    for time_column in ('Start time', 'End time'):
        frame[time_column] = pd.to_datetime(frame[time_column], format=TIMESTAMP_FORMAT)

In [4]:
# Activity id -> activity name, as listed in the dataset documentation.
dd = dict([
    (1, 'leave house'),
    (4, 'use toilet'),
    (5, 'take shower'),
    (10, 'go to bed'),
    (13, 'prepare Breakfast'),
    (15, 'prepare Dinner'),
    (17, 'get drink'),
])
dd


{1: 'leave house',
 4: 'use toilet',
 5: 'take shower',
 10: 'go to bed',
 13: 'prepare Breakfast',
 15: 'prepare Dinner',
 17: 'get drink'}

In [5]:
# Label every sensor reading that lies entirely inside an annotated activity.
#
# For each activity row in `actData` we select the readings of `senseData`
# whose [Start time, End time] interval is fully contained in the activity
# window and tag them with the activity name (Label), its numeric id
# (Label_ID), the activity's row index (Activity_ID) and a "<day>_<month>"
# key (Case_ID).
#
# ATTENTION: containment is strict. A reading that ends after the activity
# ends is NOT matched, so some activities end up with no readings at all.
# E.g. an activity annotated as
#   2011-11-28 20:21:15,2011-11-29 02:06:00,Spare_Time/TV
# does not capture the reading
#   2011-11-28 20:21:15,2011-11-29 02:06:51,Seat,Pressure,Living
# because the reading finishes 51 seconds after the activity.
output_columns = ["Case_ID", "Start time", "End time", "Sensor_ID", "Label", "Label_ID", "Activity_ID"]

# Start time of the activity that follows each row (NaT for the last row).
next_starts = actData["Start time"].shift(-1)

# Collect the per-activity slices and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with
# an empty frame degrades the column dtypes.
labeled_chunks = []
for (index, row), next_start in zip(actData.iterrows(), next_starts):
    window_start, window_end = row["Start time"], row["End time"]

    # A label whose start is after its end is malformed: report it and skip it.
    if window_start > window_end:
        print("There is an error in the labels dataset, " + str(index))
        continue

    # If this activity overlaps the next one, cut its window at the start of
    # the next activity.
    if pd.notna(next_start) and next_start < window_end:
        window_end = next_start

    in_window = (senseData["Start time"] >= window_start) & (senseData["End time"] <= window_end)
    if not in_window.any():
        continue

    labeled_chunks.append(
        senseData.loc[in_window].assign(
            Label=dd[row["ID"]],
            Label_ID=row["ID"],
            Activity_ID=index,
            Case_ID=str(window_start.day) + "_" + str(window_start.month),
        )
    )

if labeled_chunks:
    newdataset = pd.concat(labeled_chunks, ignore_index=True)
    # Declared columns first, followed by any remaining sensor columns (e.g. Val).
    remaining_columns = [name for name in newdataset.columns if name not in output_columns]
    newdataset = newdataset[output_columns + remaining_columns]
else:
    # No reading matched any activity: keep an empty frame with the expected layout.
    newdataset = pd.DataFrame(columns=output_columns)

newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor_ID,Label,Label_ID,Activity_ID,Val
0,25_2,2008-02-25 09:33:00,2008-02-25 09:33:00,24,go to bed,10,0,1.0
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,6,use toilet,4,1,1.0
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use toilet,4,1,1.0
3,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use toilet,4,1,1.0
4,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,6,use toilet,4,1,1.0
...,...,...,...,...,...,...,...,...
1077,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use toilet,4,243,1.0
1078,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use toilet,4,243,1.0
1079,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,6,use toilet,4,243,1.0
1080,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,12,leave house,1,244,1.0


In [6]:
# Restrict the log to the 8-13 timeframe: a reading is kept when its start
# hour is at least 8 and its end hour is at most 13.
# NOTE(review): the hour comparison is inclusive, so readings ending anywhere
# in the 13:xx hour are kept, and the calendar date is not compared - confirm
# this is the intended window.
starts_in_window = newdataset['Start time'].dt.hour >= 8
ends_in_window = newdataset['End time'].dt.hour <= 13
newdataset = newdataset.loc[starts_in_window & ends_in_window]

# Persist the filtered, labeled log (row index omitted).
newdataset.to_csv("log_labeled.csv", index=False)

In [7]:
# Column layout of the per-activity table: the time span, one column per
# sensor id present in the filtered log, then the activity label fields.
sensor_ids = newdataset["Sensor_ID"].unique().tolist()
columns = ["Start time", "End time", *sensor_ids, "Label", "Label_ID"]
labeled_senseData = pd.DataFrame(columns=columns)

In [8]:
# Collapse the labeled log into one row per activity.
#
# Each output row holds:
#   - Start time / Label / Label_ID taken from the first reading of the activity
#   - End time taken from the last reading of the activity
#   - one column per sensor id with the number of readings of that sensor
#   - Activity_Size, the total number of readings in the activity
#
# The empty template is rebuilt here from `newdataset` (same layout: time
# span, sensor ids, label fields) instead of appending to whatever
# `labeled_senseData` currently holds, so re-running this cell does not
# duplicate rows or mix in already-renamed columns.
activity_template = pd.DataFrame(
    columns=["Start time", "End time", *newdataset["Sensor_ID"].unique().tolist(), "Label", "Label_ID"]
)

grouped = newdataset.groupby("Activity_ID")
activity_rows = []
for _, group in grouped:
    first_reading, last_reading = group.iloc[0], group.iloc[-1]
    newrow = {
        "Activity_Size": len(group),
        "Start time": first_reading["Start time"],
        "Label": first_reading["Label"],
        "Label_ID": first_reading["Label_ID"],
        "End time": last_reading["End time"],
    }
    # Count how many readings each sensor contributed to this activity.
    for sensor_id in group["Sensor_ID"]:
        newrow[sensor_id] = newrow.get(sensor_id, 0) + 1
    # One single-row frame per activity, so sensors that did not fire are
    # simply absent from the row and become missing values after the concat.
    activity_rows.append(pd.DataFrame([newrow]))

# Concatenate once instead of growing the frame inside the loop (which is quadratic).
labeled_senseData = pd.concat([activity_template, *activity_rows], ignore_index=True)
labeled_senseData

Unnamed: 0,Start time,End time,24,6,14,9,8,1,23,5,12,18,17,7,13,20,Label,Label_ID,Activity_Size
0,2008-02-25 09:33:00,2008-02-25 09:33:00,1,,,,,,,,,,,,,,go to bed,10,1.0
1,2008-02-25 09:37:00,2008-02-25 09:38:00,,2,2,,,,,,,,,,,,use toilet,4,4.0
2,2008-02-25 09:49:00,2008-02-25 09:53:00,,,,2,2,2,2,,,,,,,,prepare Breakfast,13,8.0
3,2008-02-25 10:02:00,2008-02-25 10:10:00,,,,,,,,2,,,,,,,take shower,5,2.0
4,2008-02-25 10:19:00,2008-02-25 10:19:00,,,,,,,,,1,,,,,,leave house,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,2008-03-20 09:57:00,2008-03-20 09:57:00,,,,,,,,,1,,,,,,leave house,1,1.0
110,2008-03-21 08:30:00,2008-03-21 08:31:00,1,2,1,,,,,,,,,,,,use toilet,4,4.0
111,2008-03-21 09:13:00,2008-03-21 09:15:00,,2,1,,,,,,,,,,,,use toilet,4,3.0
112,2008-03-21 09:17:00,2008-03-21 09:24:00,,,,,,,,2,,,,,,,take shower,5,2.0


In [9]:
# Replace the numeric sensor-id column headers with readable sensor names.
sensor_names = {
    1: 'Microwave',
    5: 'Hall-Toilet door',
    6: 'Hall-Bathroom door',
    7: 'Cups cupboard',
    8: 'Fridge',
    9: 'Plates cupboard',
    12: 'Frontdoor',
    13: 'Dishwasher',
    14: 'ToiletFlush',
    17: 'Freezer',
    18: 'Pans Cupboard',
    20: 'Washingmachine',
    23: 'Groceries Cupboard',
    24: 'Hall-Bedroom door',
}
labeled_senseData = labeled_senseData.rename(columns=sensor_names)
labeled_senseData

Unnamed: 0,Start time,End time,Hall-Bedroom door,Hall-Bathroom door,ToiletFlush,Plates cupboard,Fridge,Microwave,Groceries Cupboard,Hall-Toilet door,Frontdoor,Pans Cupboard,Freezer,Cups cupboard,Dishwasher,Washingmachine,Label,Label_ID,Activity_Size
0,2008-02-25 09:33:00,2008-02-25 09:33:00,1,,,,,,,,,,,,,,go to bed,10,1.0
1,2008-02-25 09:37:00,2008-02-25 09:38:00,,2,2,,,,,,,,,,,,use toilet,4,4.0
2,2008-02-25 09:49:00,2008-02-25 09:53:00,,,,2,2,2,2,,,,,,,,prepare Breakfast,13,8.0
3,2008-02-25 10:02:00,2008-02-25 10:10:00,,,,,,,,2,,,,,,,take shower,5,2.0
4,2008-02-25 10:19:00,2008-02-25 10:19:00,,,,,,,,,1,,,,,,leave house,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,2008-03-20 09:57:00,2008-03-20 09:57:00,,,,,,,,,1,,,,,,leave house,1,1.0
110,2008-03-21 08:30:00,2008-03-21 08:31:00,1,2,1,,,,,,,,,,,,use toilet,4,4.0
111,2008-03-21 09:13:00,2008-03-21 09:15:00,,2,1,,,,,,,,,,,,use toilet,4,3.0
112,2008-03-21 09:17:00,2008-03-21 09:24:00,,,,,,,,2,,,,,,,take shower,5,2.0


In [10]:
# Write the per-activity table to disk, leaving out the row index.
labeled_senseData.to_csv(path_or_buf="log_labeled_trans.csv", index=False)