In [1]:
#imports
import pandas as pd
import numpy as np
import datetime as dt
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from operator import itemgetter

In [2]:
#create engine and load the full database into a pandas data frame
#This database is created from the csv files by running makeSensorDatabase.py
disk_engine = create_engine('sqlite:///sensorData.db')
df = pd.read_sql_query('SELECT * FROM full_data',disk_engine)
df.head(10)

Unnamed: 0,index,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,participant_id
0,0,0,1502,2215,2153,1,1
1,1,1,1667,2072,2047,1,1
2,2,2,1611,1957,1906,1,1
3,3,3,1601,1939,1831,1,1
4,4,4,1643,1965,1879,1,1
5,5,5,1604,1959,1921,1,1
6,6,6,1640,1829,1940,1,1
7,7,7,1607,1910,1910,1,1
8,8,8,1546,2045,1910,1,1
9,9,9,1529,2049,1972,1,1


Explanation of columns:

participant_id: a number 1-15 labeling the person wearing the gear.

sequential_number labels the data points in sequential order for each participant.

x_acceleration, y_acceleration, and z_acceleration: the x,y and z direction accelerometer readings respectively

label: a number 1-7 labeling the type of activity of the accelerometer wearer. 
       
       --- 1: Working at Computer
       
       --- 2: Standing Up, Walking and Going up\down stairs
       
       --- 3: Standing
       
       --- 4: Walking
       
       --- 5: Going Up\Down Stairs
       
       --- 6: Walking and Talking with Someone
       
       --- 7: Talking while Standing

In [3]:
df.columns

Index([u'index', u'sequential_number', u'x_acceleration', u'y_acceleration', u'z_acceleration', u'label', u'participant_id'], dtype='object')

Our goal is to predict the type of activity (label) of the wearer from the accelerometer data.  Before we predict with any model, we will first establish some "naive" null baseline predictions to get a better idea of the predictive power of our models.

In [4]:
#Find a count of each label
df['label'].value_counts()

1    608667
7    593563
4    357064
3    216737
5     51498
2     47878
6     47770
0      3719
dtype: int64

In [5]:
#drop values where label is 0 since this corresponds to an unclassified activity
df = df[df['label'] != 0]
df['label'].value_counts()

1    608667
7    593563
4    357064
3    216737
5     51498
2     47878
6     47770
dtype: int64

In [6]:
#compute % of total labels that are 1, the most common label
df['label'].value_counts()[1]/float(len(df.index))

0.31649036984115347

From the above observation, we se that 31% of the data points correspond to wearers participating in activity 1 (working at a computer).  So simply by always guessing label 1, we can get 31.65% accuracy, higher than the 14.29% we would get by always guessing one of the 7 labels if there were equal amounts of data points for each.  Any effective model should be able to predict with accuracy significantly higher than the 31.65% bench mark on this data set.

As a second naive baseline, we will train a logistic regression on a random subset of 75% of the data and testing on the rest.  We will simply use x_acceleration, y_acceleration, z_acceleration as the features.  So we're simply and naively predicting the activity of the participant from a single accelerometer reading.

In [9]:
#make features matrix X and label vector y
X = df[['x_acceleration','y_acceleration','z_acceleration']]
#flatten the label vector.
y = np.ravel(df['label'])
#split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [10]:
#Initialize and train our logistic regression model
toy_model = LogisticRegression()
toy_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [11]:
#predict test labels on test set and compute the accuracy
predictions = toy_model.predict(X_test)
print metrics.accuracy_score(y_test,predictions)

0.347684564107


As we might expect with such a basic strategy, our results are not much better than the strategy of simply predicting 1 for every label.  The confusion matrix computed below also confirms this notion. By definition a confusion matrix C is such that C_{i, j} is equal to the number of observations known to be in group i but predicted to be in group j.

In [13]:
C=  metrics.confusion_matrix(y_test,predictions)
print C
print metrics.classification_report(y_test, predictions)

[[106485      7      0     10      6      7  45777]
 [  7613      0      0      2      0      0   4383]
 [ 28899      0      0      0      0      0  25473]
 [ 58066      0      0      0      0      0  30961]
 [  7595      0      0      0      0      0   5102]
 [  5310      0      0      0      0      0   6772]
 [ 87647      0      0      0      0      0  60680]]
             precision    recall  f1-score   support

          1       0.35      0.70      0.47    152292
          2       0.00      0.00      0.00     11998
          3       0.00      0.00      0.00     54372
          4       0.00      0.00      0.00     89027
          5       0.00      0.00      0.00     12697
          6       0.00      0.00      0.00     12082
          7       0.34      0.41      0.37    148327

avg / total       0.22      0.35      0.26    480795



In [16]:
np.sum(C, axis=0)

array([301615,      7,      0,     12,      6,      7, 179148])

From the confusion matrix we can see our toy linear regression model classifies (seemingly randomly) almost every activity as either 1 or 7.  Note above that 7 (walking in a straight line) is the second most common activity in the data set, having nearly as many data points as 1 label one, and together they constitute around 60% of the total data points.  

  So as one might expect, we cannot meaningfully classify the activity from a single accelerometer reading.  For a more effective strategy, we need to reorgnize our data frame.  instead of predicting activity from an instaneous window, we will group our data points by participant and by activity label.  Then for a fixed participant performing a fixed activity, we will cluster consecutive data points into windows of 2 seconds.  So we will the predict the activity of the wearer over features derived from this 2 second window of data points

In [17]:
'''Reorganize our data frame to group data points into 2 second windows and derive features from these


First add column to record ordered counts of the data point grouped for each participant and each label
This will be used to group consecutive data points into the windows.
'''
temp = df[['participant_id','label']]
temp['count']=1
counts = temp.groupby(['participant_id','label'])['count'].cumsum()
df['record_counts']=counts
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,index,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,participant_id,record_counts
0,0,0,1502,2215,2153,1,1,1
1,1,1,1667,2072,2047,1,1,2
2,2,2,1611,1957,1906,1,1,3
3,3,3,1601,1939,1831,1,1,4
4,4,4,1643,1965,1879,1,1,5


From the data descrition, the sampling frequency of the accelerometer is 52 Hz.  We'll initialize variables to represent the sampling frequency and our desired window length (2 seconds)

In [18]:
  
#one sample taken every 1/52 seconds (52 observations per second)
frequency = 52
#length of observation window in seconds
window_length = 2

In [19]:
'''for each participant add window column which records which window the activity took place in
this will allow for features based on values computed over observations in the window

'''

windows = df['record_counts'].apply(lambda x: int(x/(frequency*window_length)))
df['window'] = windows

In [20]:
df.groupby(['participant_id','label'])['window'].value_counts()

participant_id  label     
1               1      319    104
                       231    104
                       297    104
                       10     104
                       42     104
                       74     104
                       106    104
                       138    104
                       170    104
                       202    104
                       234    104
                       266    104
                       298    104
                       11     104
                       43     104
...
15              7      27     104
                       11     104
                       154    104
                       138    104
                       122    104
                       106    104
                       90     104
                       74     104
                       58     104
                       42     104
                       26     104
                       10     104
                       153    104
                 

So from above we see that for a fixed participant id and label the data points are partition according to their window number, and each window has 104 points (52 observations per second (frequency) x 2 seconds (window length) except for windows at the end of the observation period (the total activity observation period may not be a multiple of 2 seconds)

For a derived features, we will also add columns to record the jerk in each direction (the derivative of acceleration)

In [21]:
grouped = df.groupby(['participant_id','label'])
#add features for jerk
x_jerk = grouped['x_acceleration'].apply(lambda x: (x - x.shift(1))*frequency)
y_jerk = grouped['y_acceleration'].apply(lambda x: (x - x.shift(1))*frequency)
z_jerk = grouped['z_acceleration'].apply(lambda x: (x - x.shift(1))*frequency)
df['x_jerk'] = x_jerk
df['y_jerk'] = y_jerk
df['z_jerk'] = z_jerk
df.head(10)

Unnamed: 0,index,sequential_number,x_acceleration,y_acceleration,z_acceleration,label,participant_id,record_counts,window,x_jerk,y_jerk,z_jerk
0,0,0,1502,2215,2153,1,1,1,0,,,
1,1,1,1667,2072,2047,1,1,2,0,8580.0,-7436.0,-5512.0
2,2,2,1611,1957,1906,1,1,3,0,-2912.0,-5980.0,-7332.0
3,3,3,1601,1939,1831,1,1,4,0,-520.0,-936.0,-3900.0
4,4,4,1643,1965,1879,1,1,5,0,2184.0,1352.0,2496.0
5,5,5,1604,1959,1921,1,1,6,0,-2028.0,-312.0,2184.0
6,6,6,1640,1829,1940,1,1,7,0,1872.0,-6760.0,988.0
7,7,7,1607,1910,1910,1,1,8,0,-1716.0,4212.0,-1560.0
8,8,8,1546,2045,1910,1,1,9,0,-3172.0,7020.0,0.0
9,9,9,1529,2049,1972,1,1,10,0,-884.0,208.0,3224.0


Using these windows we can compute some summary statistics on the observations to use as features for our model.  For each of the 6 variables (jerk, acceleration in the 3 directions: x,y,z) we will compute the variance of the variable, the mean and the range (max - min) over each window sample.  These will serve as the 18 features for our model.

In [22]:
variables = ['x_acceleration','y_acceleration','z_acceleration','x_jerk','y_jerk','z_jerk']
data = df.groupby(['participant_id','label','window'])[variables].agg([(lambda x: max(x)-min(x)),np.mean,np.var])
data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x_acceleration,x_acceleration,x_acceleration,y_acceleration,y_acceleration,y_acceleration,z_acceleration,z_acceleration,z_acceleration,x_jerk,x_jerk,x_jerk,y_jerk,y_jerk,y_jerk,z_jerk,z_jerk,z_jerk
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,<lambda>,mean,var,<lambda>,mean,var,<lambda>,mean,var,<lambda>,mean,var,<lambda>,mean,var,<lambda>,mean,var
participant_id,label,window,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1,1,0,323,1612.349515,2899.190367,771,2048.466019,13979.859128,478,2035.932039,6592.495336,,70.862745,4967798.93147,,19.882353,42716188.025626,,-81.568627,16198486.544749
1,1,1,797,1991.836538,36940.875934,498,2292.211538,6039.488798,1095,1924.913462,26570.778846,20124.0,51.5,12710243.631068,44928.0,75.5,28758110.038835,91000.0,112.5,62299010.699029
1,1,2,437,1960.144231,5258.978996,355,2358.528846,3092.1545,428,2156.576923,10580.65422,24024.0,119.0,15179503.76699,22360.0,4.0,8804522.873786,21736.0,-102.5,14369184.990291
1,1,3,175,1952.740385,768.796023,172,2385.548077,643.764656,153,2078.317308,2285.403193,11960.0,-12.5,2132510.660194,10660.0,-17.0,2330503.68932,4680.0,63.0,523771.262136
1,1,4,52,1960.278846,67.950616,50,2373.836538,52.138069,43,2126.557692,77.0646,2236.0,2.0,108050.951456,2028.0,-1.0,100808.31068,1976.0,-3.0,98700.038835


In [25]:
#clean up data frame by reseting to a simple index
cleaned_data = data.dropna()
cleaned_data = cleaned_data.reset_index()
cleaned_data.head(5)

Unnamed: 0_level_0,participant_id,label,window,x_acceleration,x_acceleration,x_acceleration,y_acceleration,y_acceleration,y_acceleration,z_acceleration,z_acceleration,z_acceleration,x_jerk,x_jerk,x_jerk,y_jerk,y_jerk,y_jerk,z_jerk,z_jerk,z_jerk
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,<lambda>,mean,var,<lambda>,mean,var,<lambda>,...,var,<lambda>,mean,var,<lambda>,mean,var,<lambda>,mean,var
0,1,1,1,797,1991.836538,36940.875934,498,2292.211538,6039.488798,1095,...,26570.778846,20124,51.5,12710243.631068,44928,75.5,28758110.038835,91000,112.5,62299010.699029
1,1,1,2,437,1960.144231,5258.978996,355,2358.528846,3092.1545,428,...,10580.65422,24024,119.0,15179503.76699,22360,4.0,8804522.873786,21736,-102.5,14369184.990291
2,1,1,3,175,1952.740385,768.796023,172,2385.548077,643.764656,153,...,2285.403193,11960,-12.5,2132510.660194,10660,-17.0,2330503.68932,4680,63.0,523771.262136
3,1,1,4,52,1960.278846,67.950616,50,2373.836538,52.138069,43,...,77.0646,2236,2.0,108050.951456,2028,-1.0,100808.31068,1976,-3.0,98700.038835
4,1,1,5,40,1963.625,39.401699,29,2376.442308,31.76363,44,...,103.984223,1820,-1.0,95452.815534,1352,0.0,66891.184466,1872,-11.5,89886.038835


In [26]:
#rename columns and use simple index.
features =  ['x_accel_peaks','x_accel_mean','x_accel_var',
             'y_accel_peaks','y_accel_mean','y_accel_var',
             'z_accel_peaks','z_accel_mean','z_accel_var',
             'x_jerk_peaks','x_jerk_mean','x_jerk_var',
             'y_jerk_peaks','y_jerk_mean','y_jerk_var',
             'z_jerk_peaks','z_jerk_mean','z_jerk_var'
                     ]
columns = ['participant_id','label','window'] + features
cleaned_data.columns = columns
cleaned_data.head(5)

Unnamed: 0,participant_id,label,window,x_accel_peaks,x_accel_mean,x_accel_var,y_accel_peaks,y_accel_mean,y_accel_var,z_accel_peaks,...,z_accel_var,x_jerk_peaks,x_jerk_mean,x_jerk_var,y_jerk_peaks,y_jerk_mean,y_jerk_var,z_jerk_peaks,z_jerk_mean,z_jerk_var
0,1,1,1,797,1991.836538,36940.875934,498,2292.211538,6039.488798,1095,...,26570.778846,20124,51.5,12710243.631068,44928,75.5,28758110.038835,91000,112.5,62299010.699029
1,1,1,2,437,1960.144231,5258.978996,355,2358.528846,3092.1545,428,...,10580.65422,24024,119.0,15179503.76699,22360,4.0,8804522.873786,21736,-102.5,14369184.990291
2,1,1,3,175,1952.740385,768.796023,172,2385.548077,643.764656,153,...,2285.403193,11960,-12.5,2132510.660194,10660,-17.0,2330503.68932,4680,63.0,523771.262136
3,1,1,4,52,1960.278846,67.950616,50,2373.836538,52.138069,43,...,77.0646,2236,2.0,108050.951456,2028,-1.0,100808.31068,1976,-3.0,98700.038835
4,1,1,5,40,1963.625,39.401699,29,2376.442308,31.76363,44,...,103.984223,1820,-1.0,95452.815534,1352,0.0,66891.184466,1872,-11.5,89886.038835


Next we will train the data and check the accuracy with cross validation using a leave one participant out.  That is, we'll loop through the 15 different participant.  At each step we will save the data for one participant and train on the data for the remaining 14.  We then test out of sample on the 15th participant and record the accuracy.  If our features are meaningful, we would expect to get a higher accuracy than guessing.  However, we might also expect to get lower accuracy than if we did a random 15-fold cross validation because in this model, we do not train on the left out participant, so when we test it's the first time data for that particpant has been observed.  If the activity patterns are significantly different for each participant, than the model could have a harder time predicting the labels for the new participant.  On the other hand, for the random cross-validation, the model is trained on all 15 participants, so the model has already learned the activity patterns of each individual.
   The logistic regression provides a relatively simple and flexible model that is not too complicated to fit.  It is useful for preliminary data analysis and for examining which features are useful to select before possibly choosing a more advanced model.

In [27]:
X = cleaned_data[features]
y = cleaned_data['label']
model = LogisticRegression()
accuracy_scores = []
label_counts = []
for test_id in range(1,16):
    train_indices = cleaned_data.index[cleaned_data['participant_id'] !=test_id]
    test_indices =  cleaned_data.index[cleaned_data['participant_id'] ==test_id]
    X_train = X.loc[train_indices,:]
    X_test = X.loc[test_indices,:]
    y_train = y.loc[train_indices]
    y_test = y.loc[test_indices]
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test,predictions)
    accuracy_scores.append(accuracy)
    label_counts.append(Counter(y_test))

In [29]:
print accuracy_scores

[0.45221295702373315, 0.59727479182437548, 0.69622833843017329, 0.4578005115089514, 0.41107491856677525, 0.59198813056379818, 0.38835572616762637, 0.56245268735806209, 0.37555697008274985, 0.562962962962963, 0.58541458541458546, 0.5868971792538672, 0.54489164086687303, 0.65827338129496404, 0.57661290322580649]


In [34]:
accuracy_scores = np.array(accuracy_scores)
print "mean accuracy: "+ str(accuracy_scores.mean())
print "variance of the accuracy: "+str(accuracy_scores.var())

mean accuracy: 0.53653317897
variance of the accuracy: 0.0087985965473


In [45]:
#check the confusion matrix when testing on participant 15 for the model trained on 1-14
C=  metrics.confusion_matrix(y_test,predictions)
print C
print "Number of predictions for each label: \n" + str(C.sum(axis= 0))
print "Number of data points for each label: \n" + str(label_counts[14])
print metrics.classification_report(y_test, predictions)

[[354   0   0  15   0   0 127]
 [ 17   0   0   3   0   0  24]
 [  5   0   0  37   0   0  34]
 [ 19   0   1 129   0   0  19]
 [  2   0   0  17   0   0  12]
 [  0   0   0   9   0   0   0]
 [ 45   0   0  34   0   0  89]]
Number of predictions for each label: 
[442   0   1 244   0   0 305]
Number of data points for each label: 
Counter({1: 496, 4: 168, 7: 168, 3: 76, 2: 44, 5: 31, 6: 9})
             precision    recall  f1-score   support

          1       0.80      0.71      0.75       496
          2       0.00      0.00      0.00        44
          3       0.00      0.00      0.00        76
          4       0.53      0.77      0.63       168
          5       0.00      0.00      0.00        31
          6       0.00      0.00      0.00         9
          7       0.29      0.53      0.38       168

avg / total       0.54      0.58      0.55       992



From the above confusion matrix (  C_{i, j} is equal to the number of observations known to be in group i but predicted to be in group j), we observe that again most of the observations are classified either as label 1 (working at a computer) or label 7 (talking while standing), but now we have also a good portion classified as state 4 ( walking).  However, no data points (save 1) are predicted to belong in classes 2,3,5 and 6.  It is also, interesting to note that the precision for label 1 (correct label 1 predicitions/ total # of label 1 predictions) is very high at 80%, but only 29% for label 7.

In [64]:
coefficientDF = pd.DataFrame(np.transpose(model.coef_),index =X.columns )
coefficientDF

Unnamed: 0,0,1,2,3,4,5,6
x_accel_peaks,-3.3e-05,1.004028e-05,-8.685992e-07,0.007469256,-0.0001600595,7.333645e-05,4.77103e-05
x_accel_mean,-2e-05,-0.0006084537,-0.0003095777,-0.004389287,-0.0001826498,-0.0002669642,-4.811946e-05
x_accel_var,-0.000211,0.000137945,1.214333e-05,1.457178e-05,-3.503694e-05,0.0001472086,4.480283e-05
y_accel_peaks,-3.5e-05,2.638802e-06,1.173925e-06,0.008597318,0.0003367239,5.10432e-05,3.477941e-05
y_accel_mean,5e-06,-0.0006853135,-0.000390913,0.003782417,-0.00140734,-0.0007539034,-0.0001153178
y_accel_var,-0.000393,-0.0001585969,-6.827102e-05,0.0001297311,0.0003266384,2.77506e-05,-0.0005432287
z_accel_peaks,-2.7e-05,1.474185e-06,2.808337e-06,0.004402738,-0.0001446679,6.343948e-05,5.034281e-05
z_accel_mean,3.7e-05,-0.0006009011,-0.0003274423,-0.002415806,-0.0002976586,-0.000743082,-0.0001222782
z_accel_var,-0.000108,-5.608652e-05,7.434234e-05,-0.0001295255,0.0001677517,-5.762908e-06,0.0002518509
x_jerk_peaks,-4e-05,8.61306e-05,-1.042191e-05,-5.707369e-05,6.691327e-05,6.956292e-05,-1.726091e-05


Now again, we will fit our data to a logistic regression model, but this time we will use a random 15-fold cross validation so that the model is trained and tested on data involving all 15 participants.

In [49]:
#try random 15-fold cross validation.  See if the results are improved
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=15)


In [50]:
print scores
print "Mean of the scores: " + str(scores.mean())
print "Variance of the scores: " + str(scores.var())

[ 0.54987835  0.52311436  0.62956946  0.5264013   0.5264013   0.47684809
  0.59886086  0.58469055  0.52198697  0.57131214  0.47514262  0.37326813
  0.57294214  0.58516707  0.51507742]
Mean of the scores: 0.535377384622
Variance of the scores: 0.00366413485135


Interestingly, the accuracy in this cross-validation is similar to the leave one participant out validation done above, indicating that the model likely is not detecting significant variation in the activities of the different participants.

For a second model, we will train a random forest a validate it using one participant left out cross-validation.  Random forests are an effective prediction tool that benefits from the ability of random trees to capture the effects of complex structures in the data.  Trees tend to be low bias but have high variance.  Random forests are effective at reducing this variance by predicting using a majority vote from an ensemble of random trees.  

In [61]:
X = cleaned_data[features]
y = cleaned_data['label']
forest = RandomForestClassifier()
forest_accuracy_scores = []
forest_label_counts = []
y_test = None
predictions = None
for test_id in range(1,16):
    train_indices = cleaned_data.index[cleaned_data['participant_id'] !=test_id]
    test_indices =  cleaned_data.index[cleaned_data['participant_id'] ==test_id]
    X_train = X.loc[train_indices,:]
    X_test = X.loc[test_indices,:]
    y_train = y.loc[train_indices]
    y_test = y.loc[test_indices]
    forest.fit(X_train,y_train)
    predictions = forest.predict(X_test)
    accuracy = metrics.accuracy_score(y_test,predictions)
    forest_accuracy_scores.append(accuracy)
    forest_label_counts.append(Counter(y_test))

In [62]:
forest_accuracy_scores = np.array(forest_accuracy_scores)
print forest_accuracy_scores
print "mean accuracy: "+ str(forest_accuracy_scores.mean())
print "variance of the accuracy: "+str(forest_accuracy_scores.var())

[ 0.38037203  0.666919    0.66258919  0.62489344  0.36286645  0.68026706
  0.56174024  0.43830431  0.32399745  0.47572016  0.4995005   0.64422202
  0.42414861  0.6807554   0.48487903]
mean accuracy: 0.527411660469
variance of the accuracy: 0.0148584473449


In [63]:
#check the confusion matrix for the random forest when testing on participant 15 for the model trained on 1-14
C=  metrics.confusion_matrix(y_test,predictions)
print C
print "Number of predictions for each label: \n" + str(C.sum(axis= 0))
print "Number of data points for each label: \n" + str(label_counts[14])
print metrics.classification_report(y_test, predictions)

[[310   2  37   8   0   1 138]
 [  9   0   7   2   3   2  21]
 [ 13   0  19  29   0   0  15]
 [ 15   1  25  95   3   0  29]
 [  6   0   5  13   0   0   7]
 [  0   0   5   4   0   0   0]
 [ 61   1  38  10   0   1  57]]
Number of predictions for each label: 
[414   4 136 161   6   4 267]
Number of data points for each label: 
Counter({1: 496, 4: 168, 7: 168, 3: 76, 2: 44, 5: 31, 6: 9})
             precision    recall  f1-score   support

          1       0.75      0.62      0.68       496
          2       0.00      0.00      0.00        44
          3       0.14      0.25      0.18        76
          4       0.59      0.57      0.58       168
          5       0.00      0.00      0.00        31
          6       0.00      0.00      0.00         9
          7       0.21      0.34      0.26       168

avg / total       0.52      0.48      0.50       992



In [73]:
#check feature importances (higher the value, the more important the feature)
feature_scores = zip(features,forest.feature_importances_)
for x in feature_scores:
    print x[0] + ": "+ str(x[1])

x_accel_peaks: 0.0436406648664
x_accel_mean: 0.130897482573
x_accel_var: 0.0424317282689
y_accel_peaks: 0.0808949259656
y_accel_mean: 0.119754465254
y_accel_var: 0.0997967662228
z_accel_peaks: 0.0229688216811
z_accel_mean: 0.162821791487
z_accel_var: 0.0393893480595
x_jerk_peaks: 0.033697340586
x_jerk_mean: 0.0194047738919
x_jerk_var: 0.0475627005238
y_jerk_peaks: 0.0218903718842
y_jerk_mean: 0.0173413394158
y_jerk_var: 0.0316018321705
z_jerk_peaks: 0.0195097200995
z_jerk_mean: 0.0190427823123
z_jerk_var: 0.0473531447383


So the mean acceleration in the z direction, x direction and y direction respecively rank as the three most importand features.

The accuracy scores on the forest tested on participant 15 are similar to those for the logistic regression model.  For improved performance, we likely need to use more advanced features to describe the jerk and acceleration time series for each window.  It might be worthwhile to plot a few of the time series to see if any distinct differences show up that could be incorporated as features.  Perhaps some features involving the shape of the acceleration trajectories could be informative.  It also might be worthwhile to use more advanced model, such as one using neural networks.