# Step 1: Data munging

## Les choses à faire dans cette étape:
    1) Lire les jeux de données et nettoyer les jeux de données: les données des capteurs sont mises sous forme discrète avec delta t = 60s. 
    2) Calcule les paramètres du modèle de Markov caché: la fréquence de transition et d'observation.

In [151]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
from datetime import datetime
import math
#from __future__ import print_function
from operator import itemgetter
from collections import defaultdict

In [152]:
sensorData = pd.read_table("SensorData.txt")
activityData = pd.read_table("activitiesData.txt")

On élimine la première ligne de chaque jeu de données, ce qui servent à rien

In [153]:
sensorData = sensorData.ix[1:]
activityData = activityData.ix[1:]

In [154]:
activityData.head()

Unnamed: 0,Start time,End time,ID
1,25-Feb-2008 00:22:46,25-Feb-2008 09:34:12,10
2,25-Feb-2008 09:37:17,25-Feb-2008 09:38:02,4
3,25-Feb-2008 09:49:23,25-Feb-2008 09:53:28,13
4,25-Feb-2008 10:02:28,25-Feb-2008 10:12:42,5
5,25-Feb-2008 10:19:06,25-Feb-2008 16:55:38,1


On vérifie si il y a des valeurs manquantes dans notre jeu de données:

In [155]:
sensorData = sensorData[sensorData['End time'].notnull()]

In [156]:
activityData = activityData[activityData['End time'].notnull()]

On crée les listes des activités et des capteurs:

In [157]:
activityLabel = Series(['nothing','leave house','use toilet','take shower','go to bed','prepare breakfast','prepare dinner',
                        'get drink'], index=[0,1,4,5,10,13,15,17])
activityLabel

0               nothing
1           leave house
4            use toilet
5           take shower
10            go to bed
13    prepare breakfast
15       prepare dinner
17            get drink
dtype: object

In [158]:
sensorLabel = Series(['Microwave','Hall-Toilet door','Hall-Bathroom door','Cups cupboard','Fridge',
                      'Plates cupboard','Front door','Dishwasher','ToiletFlush','Freezer','Pans Cupboard',
                      'Washing machine','Groceries Cupboard','Hall-Bedroom door'],
                     index = [1,5,6,7,8,9,12,13,14,17,18,20,23,24])
sensorLabel

1              Microwave
5       Hall-Toilet door
6     Hall-Bathroom door
7          Cups cupboard
8                 Fridge
9        Plates cupboard
12            Front door
13            Dishwasher
14           ToiletFlush
17               Freezer
18         Pans Cupboard
20       Washing machine
23    Groceries Cupboard
24     Hall-Bedroom door
dtype: object

Maintenant on cherche à gérer les 2 colonnes de temps : Start Time et End Time

In [159]:
x1 = '25-Feb-2008 00:22:46'
x2 = '25-Feb-2008 00:23:47'
y1 = pd.to_datetime(x1)
y2 = pd.to_datetime(x2)
print(type(y2-y1))
print((y2-y1)/pd.Timedelta(1,'s'))

<class 'pandas.tslib.Timedelta'>
61.0


In [160]:
activityData['Start time'] = pd.to_datetime(activityData['Start time'])

In [161]:
activityData['End time'] = pd.to_datetime(activityData['End time'])

In [162]:
sensorData['Start time'] = pd.to_datetime(sensorData['Start time'])

In [163]:
sensorData['End time'] = pd.to_datetime(sensorData['End time'])

Convert ID columns from string to int:

In [164]:
sensorData.ID = sensorData['ID'].apply(int)

In [165]:
activityData.ID = activityData['ID'].apply(int)

Convert to raw feature matrix:

In [166]:
def convert2RawFeatMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    if sensorData['Start time'][1] < activityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
        beginTime = sensorData['Start time'][1]
    else:
        beginTime = activityData['Start time'][1]
        
    if sensorData['End time'][len(sensorData)] > activityData['End time'][len(activityData)]:
        endTime = sensorData['End time'][len(sensorData)]
    else:
        endTime = activityData['End time'][len(activityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep:endStep] = 1
        
    for index, rowActivity in trainingActivityData.iterrows(): 
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
        
    return featureMatrix, labels
    
        
                                                                              
    

In [167]:
featureMatrix, labels = convert2RawFeatMatrix(sensorData, activityData, 60)

In [168]:
featureMatrix.ix[:,565:584]


Unnamed: 0,565.0,566.0,567.0,568.0,569.0,570.0,571.0,572.0,573.0,574.0,575.0,576.0,577.0,578.0,579.0,580.0,581.0,582.0,583.0,584.0
1,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Convert to last sensor fired representation, in which the last sensor that changed state continues to give 1 and changes to 0 when a different sensor changes state.

In [169]:
def convert2LastFiredFeatureMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    #if trainingSensorData['Start time'][1] < trainingActivityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
    beginTime = trainingSensorData['Start time'][1]
    #else:
        #beginTime = trainingActivityData['Start time'][1]
        
    #if trainingSensorData['End time'][len(trainingSensorData)] > trainingActivityData['End time'][len(trainingActivityData)]:
    endTime = trainingSensorData['End time'][len(trainingSensorData)] 
    #else:
        #endTime = trainingActivityData['End time'][len(trainingActivityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    finalFeatureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep)) 
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep] = 1
        featureMatrix.loc[rowSensor.ID][endStep-1] = 1
    
    x,y = np.where(featureMatrix == 1)
    lst = []
    for i in range(len(x)): 
        lst.append((x[i],y[i]))
        
    lstSorted = sorted(lst, key = lambda pos:pos[1])
    
    #print(lstSorted)
    
    for i in range(len(lstSorted)-1): 
        finalFeatureMatrix.iloc[lstSorted[i][0]][lstSorted[i][1]: lstSorted[i+1][1]-1] = 1 
            
    for index, rowActivity in trainingActivityData.iterrows(): 
        if rowActivity['Start time'] < beginTime or rowActivity['End time'] > endTime: 
            continue
        
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
    
    finalFeatureMatrix = finalFeatureMatrix.drop(labels = len(finalFeatureMatrix.iloc[0])-1, axis = 1 )
    #print len(labels)
    labels = labels[:len(labels)-2]
    #print len(labels)
    return finalFeatureMatrix, labels
    
        
                                                                              
    

Now we create the training set

In [170]:
testDay = [25,28,5,10,16]

In [171]:
trainingSensor = [sensorData['Start time'][x].day not in testDay and sensorData['End time'][x].day not in testDay for x in range(1,len(sensorData))]
trainingSensor = trainingSensor + [False]
trainingSensorData = sensorData[trainingSensor]
trainingSensorData.index = np.arange(1,len(trainingSensorData)+1)
trainingSensorData.head()

Unnamed: 0,Start time,End time,ID,Val
1,2008-02-26 00:38:49,2008-02-26 00:38:50,24,1
2,2008-02-26 00:38:52,2008-02-26 00:38:59,5,1
3,2008-02-26 00:38:53,2008-02-26 00:39:41,24,1
4,2008-02-26 00:39:00,2008-02-26 00:39:01,5,1
5,2008-02-26 00:39:03,2008-02-26 00:39:04,5,1


In [172]:
trainingActivity = [activityData['Start time'][x].day not in testDay and activityData['End time'][x].day not in testDay for x in range(1,len(activityData))]
trainingActivity = trainingActivity + [False]
trainingActivityData = activityData[trainingActivity]
trainingActivityData.index = np.arange(1,len(trainingActivityData)+1)
trainingActivityData.head()

Unnamed: 0,Start time,End time,ID
1,2008-02-26 00:39:24,2008-02-26 00:39:40,4
2,2008-02-26 03:13:40,2008-02-26 03:14:41,4
3,2008-02-26 08:35:59,2008-02-26 08:36:38,4
4,2008-02-26 09:15:40,2008-02-26 09:19:00,4
5,2008-02-26 09:26:42,2008-02-26 09:29:09,13


In [173]:
finalFeatureMatrix, labels = convert2LastFiredFeatureMatrix(trainingSensorData,trainingActivityData, 60)

In [174]:
labels

0        4
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
35642    0
35643    0
35644    0
35645    0
35646    0
35647    0
35648    0
35649    0
35650    0
35651    0
35652    0
35653    0
35654    0
35655    0
35656    0
35657    0
35658    0
35659    0
35660    0
35661    0
35662    0
35663    0
35664    0
35665    0
35666    0
35667    0
35668    0
35669    0
35670    0
35671    0
dtype: int64

In [175]:
finalFeatureMatrix

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,35662.0,35663.0,35664.0,35665.0,35666.0,35667.0,35668.0,35669.0,35670.0,35671.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [176]:
def cumulationTable(finalFeatureMatrix, labels):
    featureMat = finalFeatureMatrix.copy()
    lab = labels.copy()
    cumuSensor = [featureMat[0]]
    cumuActivity = [lab[0]]
    for x in xrange(1,len(lab)):
        if lab[x] == lab[x-1]:
            cumuSensor[-1] += featureMat[x]
        else:
            cumuSensor.append(featureMat[x])
            cumuActivity.append(lab[x])
    
    return cumuSensor, cumuActivity

In [177]:
cumuSensor, cumuActivity = cumulationTable(finalFeatureMatrix, labels)

In [178]:
len(cumuSensor)

326

In [179]:
len(cumuActivity)

326

In [180]:
len(labels)

35672

In [181]:
activityLabel

0               nothing
1           leave house
4            use toilet
5           take shower
10            go to bed
13    prepare breakfast
15       prepare dinner
17            get drink
dtype: object

In [182]:
sensorLabel

1              Microwave
5       Hall-Toilet door
6     Hall-Bathroom door
7          Cups cupboard
8                 Fridge
9        Plates cupboard
12            Front door
13            Dishwasher
14           ToiletFlush
17               Freezer
18         Pans Cupboard
20       Washing machine
23    Groceries Cupboard
24     Hall-Bedroom door
dtype: object

In [183]:
t = finalFeatureMatrix[finalFeatureMatrix == 0]

We now calculate the state transition frequencies

In [184]:
def labelFrequencyCounting(labels):
    activityDict = defaultdict(lambda: defaultdict(int)) #create a defaultdict of defaultdict
    
    for x in range(len(labels)-2): #we stop at the nth-1 number
        activityDict[labels[x]][labels[x+1]] += 1 
    return activityDict
    

In [185]:
labelFrequencyCounting = labelFrequencyCounting(labels)

In [186]:
labelFrequencyCounting

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 14324,
                          1: 24,
                          4: 60,
                          5: 13,
                          10: 8,
                          13: 8,
                          15: 4,
                          17: 14}),
             1: defaultdict(int, {0: 17, 1: 14104, 4: 6, 15: 2}),
             4: defaultdict(int,
                         {0: 53,
                          1: 1,
                          4: 149,
                          5: 5,
                          10: 19,
                          13: 4,
                          15: 2,
                          17: 1}),
             5: defaultdict(int, {0: 18, 5: 172}),
             10: defaultdict(int, {0: 8, 4: 17, 10: 6305, 13: 2}),
             13: defaultdict(int, {0: 15, 13: 59}),
             15: defaultdict(int, {0: 6, 4: 1, 15: 221, 17: 1}),
             17: defaultdict(int, {0: 15, 13: 1, 17: 11

In [187]:
def sumOfDict(dct):
    sum = 0
    for value in dct.values():
        sum += value
    return sum

In [188]:
def frequencyMatrix(frequencyCounting): # on va utiliser cette méthode pour calculer transitionMatrix et observationMatrix
    frequencyMat = frequencyCounting.copy()
    for miniDict in frequencyMat.values():
        temporarySum = sumOfDict(miniDict)
        for key,value in miniDict.items():
            miniDict[key] = float(value)/temporarySum
    return frequencyMat
            

In [189]:
def labelTransitionMatrix(labelFrequencyCounting):
    return frequencyMatrix(labelFrequencyCounting)     

In [190]:
labelTransitionMatrix = labelTransitionMatrix(labelFrequencyCounting)

In [191]:
x,y = np.where(finalFeatureMatrix == 1)
lst = []
for i in range(len(x)): 
    lst.append((x[i],y[i]))

lstSorted = sorted(lst, key = lambda pos:pos[1])

sensorList = sensorLabel.index.tolist()
indexSensorList = []
for sensor in lstSorted:
    indexSensorList.append(sensorList[sensor[0]])

#set(indexSensorList)


In [192]:
len(lst)

35672

In [193]:
len(indexSensorList)

35672

In [194]:
len(finalFeatureMatrix.iloc[1])

35672

In [195]:
len(labels)

35672

In [196]:
def observationFrequencyCounting(labels, indexSensorList):
    observationMat = defaultdict(lambda: defaultdict(int))
    for x in range(len(labels)):
        observationMat[labels[x]][indexSensorList[x]] += 1
    return observationMat

In [197]:
#observationFrequencyCounting = observationFrequencyCounting(labels, indexSensorList)
#observationFrequencyCounting

In [198]:
def observationMatrix(labels, indexSensorList):
    observationFrequencyCount = observationFrequencyCounting(labels, indexSensorList)
    return frequencyMatrix(observationFrequencyCount)
    

In [199]:
observationMatrix = observationMatrix(labels, indexSensorList)

In [200]:
observationMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {1: 0.0029051670471052084,
                          5: 0.016946807774780383,
                          6: 0.04177906896313205,
                          7: 0.005256968942380854,
                          8: 0.17175070899910078,
                          9: 0.003112678979041295,
                          12: 0.44096285536418345,
                          13: 0.008853842429273017,
                          14: 0.08438818565400844,
                          17: 0.009614719513038666,
                          18: 0.14470498720343086,
                          20: 0.0033893615549560768,
                          23: 0.014387493947568651,
                          24: 0.05194715362800028}),
             1: defaultdict(int,
                         {5: 0.00028310566919102553,
                          6: 0.0003538820864887819,
                          8: 0.00014155283459551277,
              

Enfin, on va mettre ces 2 matrices sous forme DataFrame: 

In [201]:
labelTransitionMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 0.9909373919059149,
                          1: 0.0016603251470079558,
                          4: 0.0041508128675198895,
                          5: 0.0008993427879626426,
                          10: 0.0005534417156693186,
                          13: 0.0005534417156693186,
                          15: 0.0002767208578346593,
                          17: 0.0009685230024213075}),
             1: defaultdict(int,
                         {0: 0.0012031990940618586,
                          1: 0.9982305895675561,
                          4: 0.0004246585037865383,
                          15: 0.00014155283459551277}),
             4: defaultdict(int,
                         {0: 0.2264957264957265,
                          1: 0.004273504273504274,
                          4: 0.6367521367521367,
                          5: 0.021367521367521368,
                          10: 0

In [202]:
transitionDistribution = pd.DataFrame(0.0, index = activityLabel.index, columns = activityLabel.index)
for key,valDict in labelTransitionMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        transitionDistribution.loc[miniKey][key] = val
transitionDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
0,0.990937,0.001203,0.226496,0.094737,0.001263,0.202703,0.026201,0.555556
1,0.00166,0.998231,0.004274,0.0,0.0,0.0,0.0,0.0
4,0.004151,0.000425,0.636752,0.0,0.002685,0.0,0.004367,0.0
5,0.000899,0.0,0.021368,0.905263,0.0,0.0,0.0,0.0
10,0.000553,0.0,0.081197,0.0,0.995736,0.0,0.0,0.0
13,0.000553,0.0,0.017094,0.0,0.000316,0.797297,0.0,0.037037
15,0.000277,0.000142,0.008547,0.0,0.0,0.0,0.965066,0.0
17,0.000969,0.0,0.004274,0.0,0.0,0.0,0.004367,0.407407


In [203]:
observationDistribution = pd.DataFrame(0.0, index = sensorLabel.index, columns = activityLabel.index)
for key,valDict in observationMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        observationDistribution.loc[miniKey][key] = val
observationDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
1,0.002905,0.0,0.0,0.0,0.0,0.067568,0.056769,0.0
5,0.016947,0.000283,0.012821,0.878947,0.151453,0.0,0.0,0.0
6,0.041779,0.000354,0.226496,0.0,0.023215,0.0,0.004367,0.0
7,0.005257,0.0,0.0,0.0,0.0,0.0,0.004367,0.111111
8,0.171751,0.000142,0.004274,0.0,0.0,0.297297,0.091703,0.777778
9,0.003113,0.0,0.0,0.0,0.0,0.040541,0.257642,0.0
12,0.440963,0.99908,0.025641,0.0,0.0,0.0,0.0,0.0
13,0.008854,0.0,0.0,0.0,0.0,0.0,0.017467,0.0
14,0.084388,0.0,0.517094,0.121053,0.109128,0.081081,0.0,0.0
17,0.009615,0.000142,0.0,0.0,0.0,0.162162,0.100437,0.0
