# Step 1: Data munging

## Les choses à faire dans cette étape:
    1) Lire les jeux de données et nettoyer les jeux de données: les données des capteurs sont mises sous forme discrète avec delta t = 60s. 
    2) Calcule les paramètres du modèle de Markov caché: la fréquence de transition et d'observation.

In [1]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
from datetime import datetime
import math
#from __future__ import print_function
from operator import itemgetter
from collections import defaultdict

In [2]:
sensorData = pd.read_table("SensorData.txt")
activityData = pd.read_table("activitiesData.txt")

On élimine la première ligne de chaque jeu de données, ce qui servent à rien

In [3]:
sensorData = sensorData.ix[1:]
activityData = activityData.ix[1:]

In [4]:
activityData.head()

Unnamed: 0,Start time,End time,ID
1,25-Feb-2008 00:22:46,25-Feb-2008 09:34:12,10
2,25-Feb-2008 09:37:17,25-Feb-2008 09:38:02,4
3,25-Feb-2008 09:49:23,25-Feb-2008 09:53:28,13
4,25-Feb-2008 10:02:28,25-Feb-2008 10:12:42,5
5,25-Feb-2008 10:19:06,25-Feb-2008 16:55:38,1


On vérifie si il y a des valeurs manquantes dans notre jeu de données:

In [5]:
sensorData = sensorData[sensorData['End time'].notnull()]

In [6]:
activityData = activityData[activityData['End time'].notnull()]

On crée les listes des activités et des capteurs:

In [7]:
activityLabel = Series(['nothing','leave house','use toilet','take shower','go to bed','prepare breakfast','prepare dinner',
                        'get drink'], index=[0,1,4,5,10,13,15,17])
activityLabel

0               nothing
1           leave house
4            use toilet
5           take shower
10            go to bed
13    prepare breakfast
15       prepare dinner
17            get drink
dtype: object

In [8]:
sensorLabel = Series(['Microwave','Hall-Toilet door','Hall-Bathroom door','Cups cupboard','Fridge',
                      'Plates cupboard','Front door','Dishwasher','ToiletFlush','Freezer','Pans Cupboard',
                      'Washing machine','Groceries Cupboard','Hall-Bedroom door'],
                     index = [1,5,6,7,8,9,12,13,14,17,18,20,23,24])
sensorLabel

1              Microwave
5       Hall-Toilet door
6     Hall-Bathroom door
7          Cups cupboard
8                 Fridge
9        Plates cupboard
12            Front door
13            Dishwasher
14           ToiletFlush
17               Freezer
18         Pans Cupboard
20       Washing machine
23    Groceries Cupboard
24     Hall-Bedroom door
dtype: object

Maintenant on cherche à gérer les 2 colonnes de temps : Start Time et End Time

In [9]:
x1 = '25-Feb-2008 00:22:46'
x2 = '25-Feb-2008 00:23:47'
y1 = pd.to_datetime(x1)
y2 = pd.to_datetime(x2)
print(type(y2-y1))
print((y2-y1)/pd.Timedelta(1,'s'))

<class 'pandas.tslib.Timedelta'>
61.0


In [10]:
activityData['Start time'] = pd.to_datetime(activityData['Start time'])

In [11]:
activityData['End time'] = pd.to_datetime(activityData['End time'])

In [12]:
sensorData['Start time'] = pd.to_datetime(sensorData['Start time'])

In [13]:
sensorData['End time'] = pd.to_datetime(sensorData['End time'])

Convert ID columns from string to int:

In [14]:
sensorData.ID = sensorData['ID'].apply(int)

In [15]:
activityData.ID = activityData['ID'].apply(int)

Convert to raw feature matrix:

In [16]:
def convert2RawFeatMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    if sensorData['Start time'][1] < activityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
        beginTime = sensorData['Start time'][1]
    else:
        beginTime = activityData['Start time'][1]
        
    if sensorData['End time'][len(sensorData)] > activityData['End time'][len(activityData)]:
        endTime = sensorData['End time'][len(sensorData)]
    else:
        endTime = activityData['End time'][len(activityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep:endStep] = 1
        
    for index, rowActivity in trainingActivityData.iterrows(): 
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
        
    return featureMatrix, labels
    
        
                                                                              
    

In [17]:
featureMatrix, labels = convert2RawFeatMatrix(sensorData, activityData, 60)

In [18]:
featureMatrix.ix[:,565:584]


Unnamed: 0,565.0,566.0,567.0,568.0,569.0,570.0,571.0,572.0,573.0,574.0,575.0,576.0,577.0,578.0,579.0,580.0,581.0,582.0,583.0,584.0
1,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Convert to last sensor fired representation, in which the last sensor that changed state continues to give 1 and changes to 0 when a different sensor changes state.

In [19]:
def convert2LastFiredFeatureMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    if sensorData['Start time'][1] < activityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
        beginTime = sensorData['Start time'][1]
    else:
        beginTime = activityData['Start time'][1]
        
    if sensorData['End time'][len(sensorData)] > activityData['End time'][len(activityData)]:
        endTime = sensorData['End time'][len(sensorData)]
    else:
        endTime = activityData['End time'][len(activityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    #print beginTime
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    finalFeatureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep)) 
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep] = 1
        featureMatrix.loc[rowSensor.ID][endStep] = 1
    
    x,y = np.where(featureMatrix == 1)
    lst = []
    for i in range(len(x)): 
        lst.append((x[i],y[i]))
        
    lstSorted = sorted(lst, key = lambda pos:pos[1])
    
    #print(lstSorted)
    
    for i in range(len(lstSorted)-1): 
        finalFeatureMatrix.iloc[lstSorted[i][0]][lstSorted[i][1]: lstSorted[i+1][1]-1] = 1 
            
    for index, rowActivity in trainingActivityData.iterrows(): 
        differenceStart = rowActivity['Start time'] - beginTime
        #print differenceStart
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = int(differenceStart / pd.Timedelta(timeStep,'s'))
        #print startStep, 
        endStep = int(differenceEnd / pd.Timedelta(timeStep,'s'))
        #print endStep
        
        labels[startStep:endStep] = rowActivity.ID
    
    finalFeatureMatrix = finalFeatureMatrix.drop(labels = len(finalFeatureMatrix.iloc[0])-1, axis = 1 )
    #print len(labels)
    labels = labels[:len(labels)-2]
    #print len(labels)
    return finalFeatureMatrix, featureMatrix, labels
    
        
                                                                              
    

In [20]:
finalFeatureMatrix, featureMatrix, labels = convert2LastFiredFeatureMatrix(sensorData, activityData, 60)

In [21]:
finalFeatureMatrix.ix[:,39990:]

Unnamed: 0,39990.0,39991.0,39992.0,39993.0,39994.0,39995.0,39996.0,39997.0,39998.0,39999.0,40000.0,40001.0,40002.0,40003.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12,1,1,1,1,1,1,1,1,1,1,1,1,1,1
13,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We now calculate the state transition frequencies

In [22]:
def labelFrequencyCounting(labels):
    activityDict = defaultdict(lambda: defaultdict(int)) #create a defaultdict of defaultdict
    
    for x in range(len(labels)-2): #we stop at the nth-1 number
        activityDict[labels[x]][labels[x+1]] += 1 
    return activityDict
    

In [23]:
labelFrequencyCounting = labelFrequencyCounting(labels)

In [24]:
labelFrequencyCounting

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 4625,
                          1: 31,
                          4: 69,
                          5: 17,
                          10: 15,
                          13: 11,
                          15: 7,
                          17: 14}),
             1: defaultdict(int, {0: 22, 1: 22558, 4: 9, 15: 2}),
             4: defaultdict(int,
                         {0: 62,
                          1: 3,
                          4: 178,
                          5: 6,
                          10: 33,
                          13: 6,
                          15: 3,
                          17: 1}),
             5: defaultdict(int, {0: 23, 5: 219}),
             10: defaultdict(int, {0: 12, 4: 34, 10: 11565, 13: 2}),
             13: defaultdict(int, {0: 20, 13: 70}),
             15: defaultdict(int, {0: 5, 4: 2, 15: 333, 17: 5}),
             17: defaultdict(int, {0: 19, 13: 1, 17:

In [25]:
def sumOfDict(dct):
    sum = 0
    for value in dct.values():
        sum += value
    return sum

In [26]:
def frequencyMatrix(frequencyCounting): # on va utiliser cette méthode pour calculer transitionMatrix et observationMatrix
    frequencyMat = frequencyCounting.copy()
    for miniDict in frequencyMat.values():
        temporarySum = sumOfDict(miniDict)
        for key,value in miniDict.items():
            miniDict[key] = float(value)/temporarySum
    return frequencyMat
            

In [27]:
def labelTransitionMatrix(labelFrequencyCounting):
    return frequencyMatrix(labelFrequencyCounting)     

In [28]:
labelTransitionMatrix = labelTransitionMatrix(labelFrequencyCounting)

In [29]:
labelTransitionMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 0.965754854875757,
                          1: 0.006473167675923992,
                          4: 0.014408018375443725,
                          5: 0.003549801628732512,
                          10: 0.0031321779077051574,
                          13: 0.0022969304656504487,
                          15: 0.0014616830235957402,
                          17: 0.0029233660471914805}),
             1: defaultdict(int,
                         {0: 0.000973839139480324,
                          1: 0.9985392412907795,
                          4: 0.0003983887388783144,
                          15: 8.853083086184763e-05}),
             4: defaultdict(int,
                         {0: 0.21232876712328766,
                          1: 0.010273972602739725,
                          4: 0.6095890410958904,
                          5: 0.02054794520547945,
                          10: 0.11301

In [30]:
finalFeatureMatrix

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,39994.0,39995.0,39996.0,39997.0,39998.0,39999.0,40000.0,40001.0,40002.0,40003.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
x,y = np.where(finalFeatureMatrix == 1)
lst = []
for i in range(len(x)): 
    lst.append((x[i],y[i]))
        
lstSorted = sorted(lst, key = lambda pos:pos[1])

sensorList = sensorLabel.index.tolist()
indexSensorList = []
for sensor in lstSorted:
    indexSensorList.append(sensorList[sensor[0]])

#set(indexSensorList)
    

In [32]:
def observationFrequencyCounting(labels, indexSensorList):
    observationMat = defaultdict(lambda: defaultdict(int))
    for x in range(len(labels)):
        observationMat[labels[x]][indexSensorList[x]] += 1
    return observationMat

In [33]:
#observationFrequencyCounting = observationFrequencyCounting(labels, indexSensorList)

In [34]:
def observationMatrix(labels, indexSensorList):
    observationFrequencyCount = observationFrequencyCounting(labels, indexSensorList)
    return frequencyMatrix(observationFrequencyCount)

    

In [35]:
observationMatrix = observationMatrix(labels, indexSensorList)

In [36]:
observationMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {1: 0.0238045520985592,
                          5: 0.09835038630194196,
                          6: 0.12361662142409689,
                          7: 0.00897891000208812,
                          8: 0.1703904781791606,
                          9: 0.05345583629150136,
                          12: 0.06577573606180831,
                          13: 0.03194821465859261,
                          14: 0.1904364167884736,
                          17: 0.02965128419294216,
                          18: 0.04510336187095427,
                          20: 0.00960534558362915,
                          23: 0.1014825642096471,
                          24: 0.04740029233660472}),
             1: defaultdict(int,
                         {1: 0.001283583410790953,
                          5: 0.0027442128092772097,
                          6: 0.0018589828708006906,
                          8: 0.

Enfin, on va mettre ces 2 matrices sous forme DataFrame: 

In [37]:
labelTransitionMatrix

defaultdict(<function __main__.<lambda>>,
            {0: defaultdict(int,
                         {0: 0.965754854875757,
                          1: 0.006473167675923992,
                          4: 0.014408018375443725,
                          5: 0.003549801628732512,
                          10: 0.0031321779077051574,
                          13: 0.0022969304656504487,
                          15: 0.0014616830235957402,
                          17: 0.0029233660471914805}),
             1: defaultdict(int,
                         {0: 0.000973839139480324,
                          1: 0.9985392412907795,
                          4: 0.0003983887388783144,
                          15: 8.853083086184763e-05}),
             4: defaultdict(int,
                         {0: 0.21232876712328766,
                          1: 0.010273972602739725,
                          4: 0.6095890410958904,
                          5: 0.02054794520547945,
                          10: 0.11301

In [38]:
transitionDistribution = pd.DataFrame(0.0, index = activityLabel.index, columns = activityLabel.index)
for key,valDict in labelTransitionMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        transitionDistribution.loc[miniKey][key] = val
transitionDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
0,0.965755,0.000974,0.212329,0.095041,0.001033,0.222222,0.014493,0.475
1,0.006473,0.998539,0.010274,0.0,0.0,0.0,0.0,0.0
4,0.014408,0.000398,0.609589,0.0,0.002928,0.0,0.005797,0.0
5,0.00355,0.0,0.020548,0.904959,0.0,0.0,0.0,0.0
10,0.003132,0.0,0.113014,0.0,0.995867,0.0,0.0,0.0
13,0.002297,0.0,0.020548,0.0,0.000172,0.777778,0.0,0.025
15,0.001462,8.9e-05,0.010274,0.0,0.0,0.0,0.965217,0.0
17,0.002923,0.0,0.003425,0.0,0.0,0.0,0.014493,0.5


In [39]:
observationDistribution = pd.DataFrame(0.0, index = sensorLabel.index, columns = activityLabel.index)
for key,valDict in observationMatrix.items():
    for miniKey, val in valDict.items(): 
        #print miniKey,key, val
        observationDistribution.loc[miniKey][key] = val
observationDistribution

Unnamed: 0,0,1,4,5,10,13,15,17
1,0.023805,0.001284,0.0,0.0,0.0,0.088889,0.092754,0.025
5,0.09835,0.002744,0.041096,0.859504,0.001378,0.0,0.0,0.0
6,0.123617,0.001859,0.386986,0.066116,0.0,0.011111,0.0,0.025
7,0.008979,0.0,0.0,0.0,0.0,0.011111,0.002899,0.075
8,0.17039,0.000177,0.020548,0.0,0.0,0.222222,0.142029,0.675
9,0.053456,0.000266,0.003425,0.0,0.0,0.055556,0.156522,0.0
12,0.065776,0.985172,0.037671,0.0,0.0,0.0,0.002899,0.025
13,0.031948,0.002966,0.0,0.004132,0.0,0.0,0.017391,0.05
14,0.190436,0.001903,0.34589,0.066116,0.000431,0.077778,0.005797,0.025
17,0.029651,0.000266,0.0,0.0,0.0,0.122222,0.144928,0.05
