In [385]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
from datetime import datetime
import math

In [386]:
sensorData = pd.read_table("SensorData.txt")
activityData = pd.read_table("activitiesData.txt")

On élimine la première ligne de chaque jeux de données, ce qui servent à rien

In [387]:
sensorData = sensorData.ix[1:]
activityData = activityData.ix[1:]

In [388]:
activityData.head()

Unnamed: 0,Start time,End time,ID
1,25-Feb-2008 00:22:46,25-Feb-2008 09:34:12,10
2,25-Feb-2008 09:37:17,25-Feb-2008 09:38:02,4
3,25-Feb-2008 09:49:23,25-Feb-2008 09:53:28,13
4,25-Feb-2008 10:02:28,25-Feb-2008 10:12:42,5
5,25-Feb-2008 10:19:06,25-Feb-2008 16:55:38,1


On vérifie si il y a des valeurs manquantes dans notre jeux de données:

In [389]:
print sensorData.isnull().sum()

Start time    0
End time      1
ID            1
Val           1
dtype: int64


In [390]:
sensorData = sensorData[sensorData['End time'].notnull()]

In [391]:
sensorData.isnull().sum()

Start time    0
End time      0
ID            0
Val           0
dtype: int64

In [392]:
print len(sensorData)

1319


In [393]:
sensorData.iloc[-1]

Start time    23-Mar-2008 19:04:46
End time      23-Mar-2008 19:04:47
ID                              12
Val                              1
Name: 1319, dtype: object

In [394]:
print activityData.isnull().sum()

Start time    0
End time      1
ID            1
dtype: int64


In [395]:
activityData.iloc[-1]

Start time    Length: 245
End time              NaN
ID                    NaN
Name: 246, dtype: object

In [396]:
activityData = activityData[activityData['End time'].notnull()]

On crée les listes des activités et des capteurs:

In [397]:
activityLabel = Series(['leave house','use toilet','take shower','go to bed','prepare breakfast','prepare dinner',
                        'get drink'], index=[1,4,5,10,13,15,17])
activityLabel

1           leave house
4            use toilet
5           take shower
10            go to bed
13    prepare breakfast
15       prepare dinner
17            get drink
dtype: object

In [398]:
sensorLabel = Series(['Microwave','Hall-Toilet door','Hall-Bathroom door','Cups cupboard','Fridge',
                      'Plates cupboard','Front door','Dishwasher','ToiletFlush','Freezer','Pans Cupboard',
                      'Washing machine','Groceries Cupboard','Hall-Bedroom door'],
                     index = [1,5,6,7,8,9,12,13,14,17,18,20,23,24])
sensorLabel

1              Microwave
5       Hall-Toilet door
6     Hall-Bathroom door
7          Cups cupboard
8                 Fridge
9        Plates cupboard
12            Front door
13            Dishwasher
14           ToiletFlush
17               Freezer
18         Pans Cupboard
20       Washing machine
23    Groceries Cupboard
24     Hall-Bedroom door
dtype: object

Maintenant on cherche à gérer les 2 colonnes de temps : Start Time et End Time

In [399]:
x1 = '25-Feb-2008 00:22:46'
x2 = '25-Feb-2008 00:23:47'
y1 = pd.to_datetime(x1)
y2 = pd.to_datetime(x2)
print type(y2-y1)
print (y2-y1)/pd.Timedelta(1,'s')

<class 'pandas.tslib.Timedelta'>
61.0


In [400]:
activityData['Start time'] = pd.to_datetime(activityData['Start time'])

In [401]:
activityData['End time'] = pd.to_datetime(activityData['End time'])

In [402]:
sensorData['Start time'] = pd.to_datetime(sensorData['Start time'])

In [404]:
sensorData['End time'] = pd.to_datetime(sensorData['End time'])

Convert ID columns from string to int:

In [416]:
sensorData.ID = sensorData['ID'].apply(int)

In [418]:
activityData.ID = activityData['ID'].apply(int)

Convert to raw feature matrix:

In [448]:
def convert2RawFeatMatrix(trainingSensorData, trainingActivityData, timeStep):
    beginTime = 0 
    endTime = 0
    
    if sensorData['Start time'][1] < activityData['Start time'][1]: # we compare 1st time step of activity data vs sensor data
        beginTime = sensorData['Start time'][1]
    else:
        beginTime = activityData['Start time'][1]
        
    if sensorData['End time'][len(sensorData)] > activityData['End time'][len(activityData)]:
        endTime = sensorData['End time'][len(sensorData)]
    else:
        endTime = activityData['End time'][len(activityData)]
        
    duration = endTime - beginTime
    numberOfTimeStep = math.ceil(duration/pd.Timedelta(timeStep,'s'))
    
    featureMatrix = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(numberOfTimeStep))
    labels = Series(0, index = np.arange(numberOfTimeStep))
    
    for index,rowSensor in trainingSensorData.iterrows():
        differenceStart = rowSensor['Start time'] - beginTime
        differenceEnd = rowSensor['End time'] - beginTime
        
        startStep = round(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = round(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        featureMatrix.loc[rowSensor.ID][startStep:endStep+1] = 1
        
    for index, rowActivity in trainingActivityData.iterrows(): 
        differenceStart = rowActivity['Start time'] - beginTime
        differenceEnd = rowActivity['End time'] - beginTime
        
        startStep = round(differenceStart / pd.Timedelta(timeStep,'s'))
        endStep = round(differenceEnd / pd.Timedelta(timeStep,'s'))
        
        labels[startStep:endStep] = rowActivity.ID
        
    return featureMatrix, labels
    
        
                                                                              
    

In [449]:
featureMatrix, labels = convert2RawFeatMatrix(sensorData, activityData, 60)

In [455]:
featureMatrix

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,39995.0,39996.0,39997.0,39998.0,39999.0,40000.0,40001.0,40002.0,40003.0,40004.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [454]:
labels

0         0
1         0
2         0
3        10
4        10
5        10
6        10
7        10
8        10
9        10
10       10
11       10
12       10
13       10
14       10
15       10
16       10
17       10
18       10
19       10
20       10
21       10
22       10
23       10
24       10
25       10
26       10
27       10
28       10
29       10
         ..
39975     1
39976     1
39977     1
39978     1
39979     1
39980     1
39981     1
39982     1
39983     1
39984     1
39985     1
39986     1
39987     1
39988     1
39989     1
39990     1
39991     1
39992     1
39993     1
39994     1
39995     1
39996     1
39997     1
39998     1
39999     1
40000     1
40001     1
40002     1
40003     1
40004     1
dtype: int64

In [447]:
x = Series(np.arange(10))
x[2:3] = 9999
x

0       0
1       1
2    9999
3       3
4       4
5       5
6       6
7       7
8       8
9       9
dtype: int64

In [445]:
round(2.3)

2.0

In [356]:
print activityData.head()
print sensorData.head()

           Start time            End time  ID
1 2008-02-25 00:22:46 2008-02-25 09:34:12  10
2 2008-02-25 09:37:17 2008-02-25 09:38:02   4
3 2008-02-25 09:49:23 2008-02-25 09:53:28  13
4 2008-02-25 10:02:28 2008-02-25 10:12:42   5
5 2008-02-25 10:19:06 2008-02-25 16:55:38   1
           Start time            End time  ID Val
1 2008-02-25 00:20:14 2008-02-25 00:22:57  24   1
2 2008-02-25 09:33:41 2008-02-25 09:33:42  24   1
3 2008-02-25 09:33:47 2008-02-25 17:21:12  24   1
4 2008-02-25 09:36:43 2008-02-25 09:37:04   5   1
5 2008-02-25 09:37:20 2008-02-25 09:37:23   6   1


In [249]:
print sensorData['End time'][len(sensorData)]
print activityData['End time'][len(activityData)]
a =  activityData['End time'][len(activityData)] - sensorData['End time'][len(sensorData)]  
print a 
print round(a/ pd.Timedelta(10,'s'))

2008-03-23 19:04:47
2008-03-23 19:04:58
0 days 00:00:11
1.0


In [355]:
d = pd.DataFrame(0, index = sensorLabel.index, columns = np.arange(3))
d.loc[6][:3] = 222
d

Unnamed: 0,0,1,2
1,0,0,0
5,0,0,0
6,222,222,222
7,0,0,0
8,0,0,0
9,0,0,0
12,0,0,0
13,0,0,0
14,0,0,0
17,0,0,0


In [313]:
l = sensorData.iloc[:3]

for index,row in l.iterrows():
    print index,
    print row['Start time'],
    print row.ID

3

TypeError: tuple indices must be integers, not str

In [362]:
labels = Series(0, index = np.arange(5))
labels[1:3] = 245
labels

0      0
1    245
2    245
3      0
4      0
dtype: int64