# Step 1: Data munging

## Les choses à faire dans cette étape:
    1) Lire les jeux de données et nettoyer les jeux de données: les données des capteurs sont mises sous forme discrète avec delta t = 60s. 
    2) Calcule les paramètres du modèle de Markov caché: la fréquence de transition et d'observation.

In [1]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
from datetime import datetime
import math
#from __future__ import print_function
from operator import itemgetter
from collections import defaultdict

In [2]:
sensorData = pd.read_table("SensorData.txt")
activityData = pd.read_table("activitiesData.txt")

On élimine la première ligne de chaque jeu de données, ce qui servent à rien

In [3]:
sensorData = sensorData.ix[1:]
activityData = activityData.ix[1:]

In [4]:
activityData.head()

Unnamed: 0,Start time,End time,ID
1,25-Feb-2008 00:22:46,25-Feb-2008 09:34:12,10
2,25-Feb-2008 09:37:17,25-Feb-2008 09:38:02,4
3,25-Feb-2008 09:49:23,25-Feb-2008 09:53:28,13
4,25-Feb-2008 10:02:28,25-Feb-2008 10:12:42,5
5,25-Feb-2008 10:19:06,25-Feb-2008 16:55:38,1


On vérifie si il y a des valeurs manquantes dans notre jeu de données:

In [5]:
sensorData = sensorData[sensorData['End time'].notnull()]

In [6]:
activityData = activityData[activityData['End time'].notnull()]

On crée les listes des activités et des capteurs:

In [7]:
activityLabel = Series(['nothing','leave house','use toilet','take shower','go to bed','prepare breakfast','prepare dinner',
                        'get drink'], index=[0,1,4,5,10,13,15,17])
activityLabel

0               nothing
1           leave house
4            use toilet
5           take shower
10            go to bed
13    prepare breakfast
15       prepare dinner
17            get drink
dtype: object

In [8]:
sensorLabel = Series(['Microwave','Hall-Toilet door','Hall-Bathroom door','Cups cupboard','Fridge',
                      'Plates cupboard','Front door','Dishwasher','ToiletFlush','Freezer','Pans Cupboard',
                      'Washing machine','Groceries Cupboard','Hall-Bedroom door'],
                     index = [1,5,6,7,8,9,12,13,14,17,18,20,23,24])
sensorLabel

1              Microwave
5       Hall-Toilet door
6     Hall-Bathroom door
7          Cups cupboard
8                 Fridge
9        Plates cupboard
12            Front door
13            Dishwasher
14           ToiletFlush
17               Freezer
18         Pans Cupboard
20       Washing machine
23    Groceries Cupboard
24     Hall-Bedroom door
dtype: object

Maintenant on cherche à gérer les 2 colonnes de temps : Start Time et End Time

In [9]:
x1 = '25-Feb-2008 00:22:46'
x2 = '25-Feb-2008 00:23:47'
y1 = pd.to_datetime(x1)
y2 = pd.to_datetime(x2)
print(type(y2-y1))
print((y2-y1)/pd.Timedelta(1,'s'))

<class 'pandas.tslib.Timedelta'>
61.0


In [10]:
activityData['Start time'] = pd.to_datetime(activityData['Start time'])

In [11]:
activityData['End time'] = pd.to_datetime(activityData['End time'])

In [12]:
sensorData['Start time'] = pd.to_datetime(sensorData['Start time'])

In [13]:
sensorData['End time'] = pd.to_datetime(sensorData['End time'])

Convert ID columns from string to int:

In [14]:
sensorData.ID = sensorData['ID'].apply(int)

In [15]:
activityData.ID = activityData['ID'].apply(int)

Convert to raw feature matrix:

In [16]:
def convert2RawFeatMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    if sensorData['Start time'][1] < activityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
        beginTime = sensorData['Start time'][1]
    else:
        beginTime = activityData['Start time'][1]
        
    if sensorData['End time'][len(sensorData)] > activityData['End time'][len(activityData)]:
        endTime = sensorData['End time'][len(sensorData)]
    else:
        endTime = activityData['End time'][len(activityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep:endStep] = 1
        
    for index, rowActivity in trainingActivityData.iterrows(): 
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
        
    return featureMatrix, labels
    
        
                                                                              
    

In [17]:
featureMatrix, labels = convert2RawFeatMatrix(sensorData, activityData, 60)

In [18]:
featureMatrix.ix[:,565:584]


Unnamed: 0,565.0,566.0,567.0,568.0,569.0,570.0,571.0,572.0,573.0,574.0,575.0,576.0,577.0,578.0,579.0,580.0,581.0,582.0,583.0,584.0
1,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Convert to last sensor fired representation, in which the last sensor that changed state continues to give 1 and changes to 0 when a different sensor changes state.

In [109]:
def convert2LastFiredFeatureMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    #if trainingSensorData['Start time'][1] < trainingActivityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
    beginTime = trainingSensorData['Start time'][1]
    #else:
        #beginTime = trainingActivityData['Start time'][1]
        
    #if trainingSensorData['End time'][len(trainingSensorData)] > trainingActivityData['End time'][len(trainingActivityData)]:
    endTime = trainingSensorData['End time'][len(trainingSensorData)] 
    #else:
        #endTime = trainingActivityData['End time'][len(trainingActivityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    finalFeatureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep)) 
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep] = 1
        featureMatrix.loc[rowSensor.ID][endStep-1] = 1
    
    x,y = np.where(featureMatrix == 1)
    lst = []
    for i in range(len(x)): 
        lst.append((x[i],y[i]))
        
    lstSorted = sorted(lst, key = lambda pos:pos[1])
    
    #print(lstSorted)
    
    for i in range(len(lstSorted)-1): 
        finalFeatureMatrix.iloc[lstSorted[i][0]][lstSorted[i][1]: lstSorted[i+1][1]-1] = 1 
            
    for index, rowActivity in trainingActivityData.iterrows(): 
        if rowActivity['Start time'] < beginTime or rowActivity['End time'] > endTime: 
            continue
        
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
    
    finalFeatureMatrix = finalFeatureMatrix.drop(labels = len(finalFeatureMatrix.iloc[0])-1, axis = 1 )
    #print len(labels)
    labels = labels[:len(labels)-2]
    #print len(labels)
    return finalFeatureMatrix, labels
    
        
                                                                              
    

Now we create the training set

In [110]:
testDay = [25,28,5,10,16]

In [111]:
trainingSensor = [sensorData['Start time'][x].day not in testDay and sensorData['End time'][x].day not in testDay for x in range(1,len(sensorData))]
trainingSensor = trainingSensor + [False]
trainingSensorData = sensorData[trainingSensor]
trainingSensorData.index = np.arange(1,len(trainingSensorData)+1)
trainingSensorData.head()

Unnamed: 0,Start time,End time,ID,Val
1,2008-02-25 00:20:14,2008-02-25 00:22:57,24,1
2,2008-02-25 09:33:41,2008-02-25 09:33:42,24,1
3,2008-02-25 09:36:43,2008-02-25 09:37:04,5,1
4,2008-02-25 09:37:20,2008-02-25 09:37:23,6,1
5,2008-02-25 09:37:51,2008-02-25 09:37:52,14,1


In [113]:
trainingActivity = [activityData['Start time'][x].day not in testDay and activityData['End time'][x].day not in testDay for x in range(1,len(activityData))]
trainingActivity = trainingActivity + [False]
trainingActivityData = activityData[trainingActivity]
trainingActivityData.index = np.arange(1,len(trainingActivityData)+1)
trainingActivityData.head()

Unnamed: 0,Start time,End time,ID
1,2008-02-25 00:22:46,2008-02-25 09:34:12,10
2,2008-02-25 09:37:17,2008-02-25 09:38:02,4
3,2008-02-25 09:49:23,2008-02-25 09:53:28,13
4,2008-02-25 10:02:28,2008-02-25 10:12:42,5
5,2008-02-25 10:19:06,2008-02-25 16:55:38,1


In [114]:
finalFeatureMatrix, labels = convert2LastFiredFeatureMatrix(trainingSensorData,trainingActivityData, 60)

In [115]:
finalFeatureMatrix

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,37120.0,37121.0,37122.0,37123.0,37124.0,37125.0,37126.0,37127.0,37128.0,37129.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [116]:
t = finalFeatureMatrix[finalFeatureMatrix == 0]

We now calculate the state transition frequencies

In [117]:
def labelFrequencyCounting(labels):
    activityDict = defaultdict(lambda: defaultdict(int)) #create a defaultdict of defaultdict
    
    for x in range(len(labels)-2): #we stop at the nth-1 number
        activityDict[labels[x]][labels[x+1]] += 1 
    return activityDict
    

In [118]:
labelFrequencyCounting = labelFrequencyCounting(labels)

In [119]:
labelFrequencyCounting

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 17071,
                          1: 22,
                          4: 58,
                          5: 15,
                          10: 12,
                          13: 9,
                          15: 6,
                          17: 13}),
             1: defaultdict(int, {0: 15, 1: 10090, 4: 8, 15: 1}),
             4: defaultdict(int,
                         {0: 57, 1: 2, 4: 146, 5: 3, 10: 27, 13: 5, 15: 3}),
             5: defaultdict(int, {0: 18, 5: 161}),
             10: defaultdict(int, {0: 10, 4: 29, 10: 9000}),
             13: defaultdict(int, {0: 15, 13: 53}),
             15: defaultdict(int, {0: 4, 4: 2, 15: 234, 17: 4}),
             17: defaultdict(int, {0: 16, 13: 1, 17: 18})})

In [120]:
def sumOfDict(dct):
    sum = 0
    for value in dct.values():
        sum += value
    return sum

In [121]:
def frequencyMatrix(frequencyCounting): # on va utiliser cette méthode pour calculer transitionMatrix et observationMatrix
    frequencyMat = frequencyCounting.copy()
    for miniDict in frequencyMat.values():
        temporarySum = sumOfDict(miniDict)
        for key,value in miniDict.items():
            miniDict[key] = float(value)/temporarySum
    return frequencyMat
            

In [122]:
def labelTransitionMatrix(labelFrequencyCounting):
    return frequencyMatrix(labelFrequencyCounting)     

In [123]:
labelTransitionMatrix = labelTransitionMatrix(labelFrequencyCounting)

In [124]:
x,y = np.where(finalFeatureMatrix == 1)
lst = []
for i in range(len(x)): 
    lst.append((x[i],y[i]))
        
lstSorted = sorted(lst, key = lambda pos:pos[1])

sensorList = sensorLabel.index.tolist()
indexSensorList = []
for sensor in lstSorted:
    indexSensorList.append(sensorList[sensor[0]])

#set(indexSensorList)


In [125]:
len(indexSensorList)

37130

In [126]:
len(finalFeatureMatrix.iloc[1])

37130

In [127]:
len(labels)

37130

In [128]:
def observationFrequencyCounting(labels, indexSensorList):
    observationMat = defaultdict(lambda: defaultdict(int))
    for x in range(len(labels)):
        observationMat[labels[x]][indexSensorList[x]] += 1
    return observationMat

In [129]:
#observationFrequencyCounting = observationFrequencyCounting(labels, indexSensorList)
#observationFrequencyCounting

In [130]:
def observationMatrix(labels, indexSensorList):
    observationFrequencyCount = observationFrequencyCounting(labels, indexSensorList)
    return frequencyMatrix(observationFrequencyCount)
    

In [131]:
observationMatrix = observationMatrix(labels, indexSensorList)

In [132]:
observationMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {1: 0.006334263133426313,
                          5: 0.020223152022315203,
                          6: 0.02992794049279405,
                          7: 0.0023245002324500234,
                          8: 0.03666899116689912,
                          9: 0.014470013947001394,
                          12: 0.2551720130172013,
                          13: 0.008251975825197582,
                          14: 0.25029056252905624,
                          17: 0.008310088331008833,
                          18: 0.007147838214783821,
                          20: 0.002673175267317527,
                          23: 0.02109483960948396,
                          24: 0.3371106462110646}),
             1: defaultdict(int,
                         {1: 0.0028673126359501683,
                          5: 0.005438006723353767,
                          6: 0.0022740755388570297,
                    

Enfin, on va mettre ces 2 matrices sous forme DataFrame: 

In [133]:
labelTransitionMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 0.9921538998023945,
                          1: 0.0012786237359060793,
                          4: 0.003370917121934209,
                          5: 0.0008717889108450541,
                          10: 0.0006974311286760432,
                          13: 0.0005230733465070325,
                          15: 0.0003487155643380216,
                          17: 0.0007555503893990469}),
             1: defaultdict(int,
                         {0: 0.0014830927427328456,
                          1: 0.9976270516116275,
                          4: 0.0007909827961241843,
                          15: 9.887284951552304e-05}),
             4: defaultdict(int,
                         {0: 0.2345679012345679,
                          1: 0.00823045267489712,
                          4: 0.6008230452674898,
                          5: 0.012345679012345678,
                          10: 0.11

In [134]:
transitionDistribution = pd.DataFrame(0.0, index = activityLabel.index, columns = activityLabel.index)
for key,valDict in labelTransitionMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        transitionDistribution.loc[miniKey][key] = val
transitionDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
0,0.992154,0.001483,0.234568,0.100559,0.001106,0.220588,0.016393,0.457143
1,0.001279,0.997627,0.00823,0.0,0.0,0.0,0.0,0.0
4,0.003371,0.000791,0.600823,0.0,0.003208,0.0,0.008197,0.0
5,0.000872,0.0,0.012346,0.899441,0.0,0.0,0.0,0.0
10,0.000697,0.0,0.111111,0.0,0.995685,0.0,0.0,0.0
13,0.000523,0.0,0.020576,0.0,0.0,0.779412,0.0,0.028571
15,0.000349,9.9e-05,0.012346,0.0,0.0,0.0,0.959016,0.0
17,0.000756,0.0,0.0,0.0,0.0,0.0,0.016393,0.514286


In [135]:
observationDistribution = pd.DataFrame(0.0, index = sensorLabel.index, columns = activityLabel.index)
for key,valDict in observationMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        observationDistribution.loc[miniKey][key] = val
observationDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
1,0.006334,0.002867,0.0,0.0,0.0,0.088235,0.127049,0.0
5,0.020223,0.005438,0.00823,0.826816,0.099679,0.0,0.0,0.0
6,0.029928,0.002274,0.209877,0.044693,0.073459,0.0,0.008197,0.0
7,0.002325,0.0,0.0,0.0,0.0,0.0,0.036885,0.057143
8,0.036669,0.000198,0.00823,0.0,0.0,0.147059,0.139344,0.8
9,0.01447,0.000494,0.0,0.0,0.0,0.044118,0.151639,0.0
12,0.255172,0.96668,0.004115,0.0,0.014493,0.0,0.0,0.0
13,0.008252,0.006526,0.0,0.0,0.0,0.0,0.0,0.0
14,0.250291,0.007514,0.588477,0.128492,0.291846,0.088235,0.0,0.0
17,0.00831,0.000494,0.0,0.0,0.0,0.117647,0.184426,0.114286
