In [244]:
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import numpy as np

In [245]:
# Load the raw sensor events and the annotated activities.
# The sensor table's "ID" column is renamed to "Sensor_ID" as part of the load.

senseData = pd.read_csv("kasterenSenseData.csv", header=0).rename(columns={"ID": "Sensor_ID"})
actData = pd.read_csv("kasterenActData.csv", header=0)

senseData

Unnamed: 0,Start time,End time,Sensor_ID,Val
0,25/02/2008 00:20,25/02/2008 00:22,24,1
1,25/02/2008 09:33,25/02/2008 09:33,24,1
2,25/02/2008 09:33,25/02/2008 17:21,24,1
3,25/02/2008 09:36,25/02/2008 09:37,5,1
4,25/02/2008 09:37,25/02/2008 09:37,6,1
...,...,...,...,...
1314,21/03/2008 18:27,21/03/2008 18:27,8,1
1315,21/03/2008 18:28,21/03/2008 18:28,8,1
1316,21/03/2008 18:28,21/03/2008 18:28,17,1
1317,21/03/2008 19:11,21/03/2008 19:11,12,1


In [246]:
# Parse the day-first "dd/mm/YYYY HH:MM" timestamp strings of both tables
# into pandas datetimes.

for frame in (senseData, actData):
    for column in ("Start time", "End time"):
        frame[column] = pd.to_datetime(frame[column], format="%d/%m/%Y %H:%M")

In [247]:
# Activity ID -> activity name lookup, taken from the dataset documentation.
dd = {
    1: "leave_house",
    4: "use_toilet",
    5: "take_shower",
    10: "go_to_bed",
    13: "prepare_breakfast",
    15: "prepare_dinner",
    17: "get_drink",
}
dd


{1: 'leave_house',
 4: 'use_toilet',
 5: 'take_shower',
 10: 'go_to_bed',
 13: 'prepare_breakfast',
 15: 'prepare_dinner',
 17: 'get_drink'}

In [248]:
# Attach an activity label to every sensor event that falls inside a labelled
# activity interval.
#
# For each activity row in actData:
#   * a row whose start is later than its end is reported and skipped;
#   * if the next activity starts before this one ends, the interval is cut
#     short at the next activity's start time;
#   * sensor events lying entirely inside the interval are selected
#     (Start time >= interval start and End time <= interval end);
#   * the activity is kept only when more than one event was selected.
#
# Caveat: a sensor event whose end time runs past the end of the activity is
# not selected, so some labelled activities end up without any measurements.
#
# NOTE(review): the look-ahead actData.loc[index + 1] assumes actData still has
# its default 0..n-1 integer index — confirm if actData is ever filtered first.
labeled_columns = ["Case_ID", "Start time", "End time", "Sensor_ID", "Label", "Label_ID", "Activity_ID"]
labeled_chunks = []  # one frame per kept activity; concatenated once after the loop
last_index = len(actData) - 1

for index, row in actData.iterrows():
    start, end = row["Start time"], row["End time"]
    if start > end:
        print("There is an error in the labels dataset, " + str(index))
        continue

    # Overlapping activities: stop this one where the next one begins.
    if index < last_index:
        next_start = actData.loc[index + 1, "Start time"]
        if next_start < end:
            end = next_start

    events = senseData.loc[(senseData["Start time"] >= start) & (senseData["End time"] <= end)]
    if len(events) > 1:
        labeled_chunks.append(
            events.drop(columns="Val").assign(
                Label=dd[row["ID"]],
                Label_ID=row["ID"],
                Activity_ID=index,
                # Case identifier is "<day>_<month>" of the activity start.
                Case_ID=str(start.day) + "_" + str(start.month),
            )
        )

# Build the result in a single concat instead of growing a frame inside the loop.
if labeled_chunks:
    newdataset = pd.concat(labeled_chunks, ignore_index=True)[labeled_columns]
else:
    newdataset = pd.DataFrame(columns=labeled_columns)
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor_ID,Label,Label_ID,Activity_ID
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,6,use_toilet,4,1
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use_toilet,4,1
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,14,use_toilet,4,1
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,6,use_toilet,4,1
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,9,prepare_breakfast,13,2
...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use_toilet,4,243
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,5,use_toilet,4,243
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,6,use_toilet,4,243
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,12,leave_house,1,244


In [249]:
# Sensor ID -> sensor name lookup.
sensor_names = {
    1: "Microwave",
    5: "Hall-Toilet_door",
    6: "Hall-Bathroom_door",
    7: "Cups_cupboard",
    8: "Fridge",
    9: "Plates_cupboard",
    12: "Frontdoor",
    13: "Dishwasher",
    14: "ToiletFlush",
    17: "Freezer",
    18: "Pans_Cupboard",
    20: "Washingmachine",
    23: "Groceries_Cupboard",
    24: "Hall-Bedroom_door",
}

# Swap the numeric IDs for their names, then rename the column to match its new content.
newdataset = (
    newdataset
    .replace({"Sensor_ID": sensor_names})
    .rename(columns={"Sensor_ID": "Sensor"})
)
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor,Label,Label_ID,Activity_ID
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,Hall-Bathroom_door,use_toilet,4,1
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use_toilet,4,1
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use_toilet,4,1
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,Hall-Bathroom_door,use_toilet,4,1
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,Plates_cupboard,prepare_breakfast,13,2
...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet_door,use_toilet,4,243
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet_door,use_toilet,4,243
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,Hall-Bathroom_door,use_toilet,4,243
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,Frontdoor,leave_house,1,244


In [250]:
# Merge "take_shower" into "use_toilet", in both the numeric and the text label:
# Label_ID 5 becomes 4, and Label "take_shower" becomes "use_toilet".
shower_by_id = newdataset["Label_ID"] == 5
shower_by_name = newdataset["Label"] == "take_shower"
newdataset["Label_ID"] = np.where(shower_by_id, 4, newdataset["Label_ID"])
newdataset["Label"] = np.where(shower_by_name, "use_toilet", newdataset["Label"])
# Optional filter restricting the log to the 8-13 timeframe (currently disabled):
# newdataset = newdataset.loc[(newdataset['Start time'].dt.hour >= 8) & (newdataset['End time'].dt.hour <=13)]
newdataset

Unnamed: 0,Case_ID,Start time,End time,Sensor,Label,Label_ID,Activity_ID
0,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,Hall-Bathroom_door,use_toilet,4,1
1,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use_toilet,4,1
2,25_2,2008-02-25 09:37:00,2008-02-25 09:37:00,ToiletFlush,use_toilet,4,1
3,25_2,2008-02-25 09:37:00,2008-02-25 09:38:00,Hall-Bathroom_door,use_toilet,4,1
4,25_2,2008-02-25 09:49:00,2008-02-25 09:49:00,Plates_cupboard,prepare_breakfast,13,2
...,...,...,...,...,...,...,...
1064,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet_door,use_toilet,4,243
1065,21_3,2008-03-21 18:24:00,2008-03-21 18:24:00,Hall-Toilet_door,use_toilet,4,243
1066,21_3,2008-03-21 18:25:00,2008-03-21 18:25:00,Hall-Bathroom_door,use_toilet,4,243
1067,21_3,2008-03-21 19:11:00,2008-03-21 19:11:00,Frontdoor,leave_house,1,244


In [251]:
# Re-label the log in fixed 2-minute windows.
#
# Within each case (Case_ID) the events are bucketed by "Start time" into
# 2-minute bins. Every non-empty bin becomes a new activity: all of its events
# receive the bin's most frequent label and a fresh, sequential Activity_ID.
#
# Loop variables deliberately avoid the names `dict` and `id`, so the built-ins
# stay usable in every later cell of the notebook.
window_columns = ["Case_ID", "Start time", "End time", "Sensor"]
window_chunks = []  # one frame per non-empty window; concatenated once after the loops
i = 0  # running Activity_ID, shared across all cases

for case_id, case in newdataset.groupby("Case_ID"):
    for window_start, window in case.groupby(pd.Grouper(freq="2min", key="Start time")):
        if window.empty:
            continue
        most_freq_label = window["Label"].value_counts().idxmax()
        window_chunks.append(
            window[window_columns].assign(Label=most_freq_label, Activity_ID=i)
        )
        i = i + 1

# Build the result in a single concat instead of growing a frame inside the loop.
if window_chunks:
    out = pd.concat(window_chunks, ignore_index=True)
else:
    out = pd.DataFrame(columns=window_columns + ["Label", "Activity_ID"])

In [252]:
# Write the windowed log to disk, followed by a random 70% / 30% train/test split of its rows.
# NOTE(review): the split is drawn over individual event rows, so events that share an
# Activity_ID can land on both sides of the split — confirm this is intended.
out.to_csv("log_labeled.csv", index=False)
Train_set, Test_set = train_test_split(out, test_size=0.3, random_state=1)
for subset, csv_path in ((Train_set, "log_labeled_train.csv"), (Test_set, "log_labeled_test.csv")):
    subset.to_csv(csv_path, index=False)

In [253]:
# Column layout of the transposed log: the time span, one column per distinct
# sensor (in order of first appearance in `out`), and finally the label.
sensor_columns = out["Sensor"].unique().tolist()
columns = ["Start time", "End time", *sensor_columns, "Label"]
labeled_senseData = pd.DataFrame(columns=columns)

In [254]:
# Transpose the log: one row per Activity_ID group, with a 1 in the column of
# every sensor that appears in that group (the other sensor columns stay NaN).
#
# Rows are collected in a list and turned into a frame in one step:
# DataFrame.append was removed in pandas 2.0, and building the frame once also
# makes this cell safe to re-run without duplicating rows.
grouped = out.groupby("Activity_ID")
transposed_rows = []
for activity_id, group in grouped:
    newrow = {
        "Start time": group["Start time"].iloc[0],  # start of the group's first event
        "End time": group["End time"].iloc[-1],     # end of the group's last event
        "Label": group["Label"].iloc[0],
    }
    for sensor in group["Sensor"]:
        newrow[sensor] = 1
    transposed_rows.append(newrow)

# Reuse the column layout carried by the existing labeled_senseData frame.
# dtype=object keeps the sensor flags as the integer 1 (not 1.0) next to NaN.
labeled_senseData = pd.DataFrame(transposed_rows, columns=labeled_senseData.columns, dtype=object)
labeled_senseData

Unnamed: 0,Start time,End time,Frontdoor,Hall-Bedroom_door,Hall-Bathroom_door,ToiletFlush,Hall-Toilet_door,Plates_cupboard,Groceries_Cupboard,Fridge,Freezer,Cups_cupboard,Dishwasher,Microwave,Washingmachine,Pans_Cupboard,Label
0,2008-03-10 06:21:00,2008-03-10 06:21:00,1,,,,,,,,,,,,,,go_to_bed
1,2008-03-10 08:33:00,2008-03-10 08:33:00,,1,,,,,,,,,,,,,go_to_bed
2,2008-03-10 08:35:00,2008-03-10 08:35:00,,,1,1,,,,,,,,,,,use_toilet
3,2008-03-10 08:49:00,2008-03-10 08:49:00,,,1,,,,,,,,,,,,use_toilet
4,2008-03-10 08:51:00,2008-03-10 08:51:00,,,1,1,1,,,,,,,,,,use_toilet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,2008-03-08 11:12:00,2008-03-08 11:12:00,,,1,,1,,,,,,,,,,use_toilet
457,2008-03-08 11:15:00,2008-03-08 11:16:00,,,1,1,1,,,,,,,,,,use_toilet
458,2008-03-08 11:22:00,2008-03-08 11:22:00,,,,,1,,,,,,,,,,use_toilet
459,2008-03-08 11:36:00,2008-03-08 11:36:00,1,,,,,,,,,,,,,,leave_house


In [255]:
# Save the transposed log to CSV, leaving out the DataFrame index
labeled_senseData.to_csv("log_labeled_trans.csv", index=False)