# Human Activity Recognition using Smartphones Dataset
### This dataset ref:
[1] Davide Anguita, Alessandro Ghio, Luca Oneto, Xavier Parra and Jorge L. Reyes-Ortiz. Human Activity Recognition on Smartphones using a Multiclass Hardware-Friendly Support Vector Machine. International Workshop of Ambient Assisted Living (IWAAL 2012). Vitoria-Gasteiz, Spain. Dec 2012

### The following code:
- Merges the train and test data, and adds subject and activity features.
- Extracts only the measurements on the mean and standard deviation for each measurement.
- Features for ex.[tBodyAcc-mean()X] are broken up into signal(tBodyAcc) and statistic(meanX).
  There are NAs present, but this is a consequence of some signals having an meanXYZ and not a
  general mean and std, and visa versa. This takes a "wide" dataset and converts to "long"
- Activites that were ID coded were converted to descriptive strings 
- Creates an independent dataframe, saved to averages.csv, that reports the averages of each
  statistic grouped by (subject,activity,signal). Again, there are NAs present, due to some
  statistics not applicable to certain signals.

In [2]:
## Load all relevent libraries
library(tidyr)
library(tidyverse) ## contains dplyr

In [14]:
## Some functions used to format the feature names
firstElement <- function(x){x[1]}
secthirdElement <- function(x){paste0(x[2],x[3])}

## Load the Data

In [3]:
## Load all the relevent data
X_train <- read.csv("./train/X_train.txt",sep="",header=F,nrows=7352)
train_subject <- readLines("./train/subject_train.txt")
train_activity <- readLines("./train/y_train.txt")

X_test <- read.csv("./test/X_test.txt",sep="",header=F,nrows=2947)
test_subject <- readLines("./test/subject_test.txt")
test_activity <- readLines("./test/y_test.txt")

X_features <- readLines("features.txt")

## Merge all the data into a cohesive dataframe

In [19]:
## Merge train/test data.
X <- rbind(X_train,X_test)

## Merge subject and activity sets
subject <- c(train_subject,test_subject)
activity <- c(train_activity,test_activity)
subActivity <- cbind(subject,activity)
names(subActivity) <- c("subject","activity")

## Add features to X
names(X) <- gsub("[0-9]+","",X_features)

In [13]:
## Look at a sample of the merged data
head(X,n=3)

tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,⋯,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,-0.9951121,-0.9831846,-0.923527,-0.9347238,⋯,-0.07432303,-0.2986764,-0.7103041,-0.11275434,0.030400372,-0.4647614,-0.01844588,-0.8412468,0.1799406,-0.05862692
0.2784188,-0.01641057,-0.1235202,-0.9982453,-0.9753002,-0.960322,-0.9988072,-0.9749144,-0.9576862,-0.9430675,⋯,0.15807454,-0.5950509,-0.8614993,0.05347695,-0.007434566,-0.7326262,0.70351059,-0.8447876,0.1802889,-0.05431672
0.2796531,-0.01946716,-0.1134617,-0.9953796,-0.967187,-0.978944,-0.9965199,-0.9636684,-0.9774686,-0.9386916,⋯,0.41450281,-0.3907482,-0.7601037,-0.11855926,0.17789948,0.1006992,0.80852908,-0.8489335,0.1806373,-0.04911782


In [10]:
## Select only features with mean,meanXYZ,std, and stdXYZ
statsFeatures <- grep("mean\\(\\)|std", X_features) # get indexes of relevent stat features
statsData <- X[,statsFeatures]

In [18]:
## Look at Data
head(statsData,n=1)

tBodyAcc-meanX,tBodyAcc-meanY,tBodyAcc-meanZ,tBodyAcc-stdX,tBodyAcc-stdY,tBodyAcc-stdZ,tGravityAcc-meanX,tGravityAcc-meanY,tGravityAcc-meanZ,tGravityAcc-stdX,⋯,fBodyGyro-stdY,fBodyGyro-stdZ,fBodyAccMag-meanNA,fBodyAccMag-stdNA,fBodyBodyAccJerkMag-meanNA,fBodyBodyAccJerkMag-stdNA,fBodyBodyGyroMag-meanNA,fBodyBodyGyroMag-stdNA,fBodyBodyGyroJerkMag-meanNA,fBodyBodyGyroJerkMag-stdNA
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.1153749,-0.9852497,⋯,-0.9738861,-0.9940349,-0.9521547,-0.956134,-0.9937257,-0.993755,-0.9801349,-0.9613094,-0.9919904,-0.9906975


In [15]:
## clean up feature names
signal_stat_names <- strsplit(names(statsData),"-")

signal <-sapply(signal_stat_names,firstElement) ## retrieve signal
stats <- sapply(signal_stat_names,secthirdElement) ## get statistic
stats <- gsub("\\(\\)","",stats) ## remove ()
signal_stats <- paste(signal,stats,sep="-") ## reformat
names(statsData) <- signal_stats

In [17]:
## Check data
head(statsData,n=1)

tBodyAcc-meanX,tBodyAcc-meanY,tBodyAcc-meanZ,tBodyAcc-stdX,tBodyAcc-stdY,tBodyAcc-stdZ,tGravityAcc-meanX,tGravityAcc-meanY,tGravityAcc-meanZ,tGravityAcc-stdX,⋯,fBodyGyro-stdY,fBodyGyro-stdZ,fBodyAccMag-meanNA,fBodyAccMag-stdNA,fBodyBodyAccJerkMag-meanNA,fBodyBodyAccJerkMag-stdNA,fBodyBodyGyroMag-meanNA,fBodyBodyGyroMag-stdNA,fBodyBodyGyroJerkMag-meanNA,fBodyBodyGyroJerkMag-stdNA
0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,0.1153749,-0.9852497,⋯,-0.9738861,-0.9940349,-0.9521547,-0.956134,-0.9937257,-0.993755,-0.9801349,-0.9613094,-0.9919904,-0.9906975


In [20]:
## Add the subject and activity columns
statsData <- cbind(subActivity,statsData)

In [21]:
head(statsData,n=1)

subject,activity,tBodyAcc-meanX,tBodyAcc-meanY,tBodyAcc-meanZ,tBodyAcc-stdX,tBodyAcc-stdY,tBodyAcc-stdZ,tGravityAcc-meanX,tGravityAcc-meanY,⋯,fBodyGyro-stdY,fBodyGyro-stdZ,fBodyAccMag-meanNA,fBodyAccMag-stdNA,fBodyBodyAccJerkMag-meanNA,fBodyBodyAccJerkMag-stdNA,fBodyBodyGyroMag-meanNA,fBodyBodyGyroMag-stdNA,fBodyBodyGyroJerkMag-meanNA,fBodyBodyGyroJerkMag-stdNA
1,5,0.2885845,-0.02029417,-0.1329051,-0.9952786,-0.9831106,-0.9135264,0.9633961,-0.1408397,⋯,-0.9738861,-0.9940349,-0.9521547,-0.956134,-0.9937257,-0.993755,-0.9801349,-0.9613094,-0.9919904,-0.9906975


In [22]:
dim(statsData)

## Tidy the Data

In [26]:
statsData %>% 
  gather(key="signal_stats",value="value",-subject,-activity) %>% # gather colunmns
  separate(col=signal_stats,into=c("signal","statistic"),sep="-") %>% # sep gathered cols
  group_by_at(vars(-value)) %>%  # group by everything other than the value column. 
  mutate(row_id=1:n()) %>% ungroup() %>%  # build group index
  spread(key=statistic, value=value) %>%    # spread
  select(-row_id) -> #removes group index. assigns final "result" to result
  result

In [27]:
head(result)

subject,activity,signal,meanNA,meanX,meanY,meanZ,stdNA,stdX,stdY,stdZ
1,1,fBodyAcc,,-0.2609049,-0.1225668,-0.331216,,-0.356707,-0.19956719,-0.1777802
1,1,fBodyAcc,,-0.1511153,-0.02904997,-0.2573071,,-0.2621973,-0.02385785,-0.3221639
1,1,fBodyAcc,,-0.2304074,0.02542685,-0.3773113,,-0.2935223,-0.05769317,-0.2900854
1,1,fBodyAcc,,-0.1513229,0.1952672,-0.3212387,,-0.2631256,0.08785532,-0.216975
1,1,fBodyAcc,,-0.2258036,0.11028848,-0.2048832,,-0.2268023,0.11880106,-0.1463515
1,1,fBodyAcc,,-0.2904287,0.05782228,-0.2483574,,-0.1999707,-0.06209912,-0.1106583


In [28]:
# Remove the NAs in feature names
names(result)[4] <- "mean"; names(result)[8] <- "std"

# Change activity values from ID codes to strings
result$activity <- revalue(result$activity, c("1"="walking","2"="walkingUpstairs",
                                              "3"="walkingDownstairs","4"="sitting",
                                              "5"="standing","6"="laying"))

In [29]:
sample_n(result,6)

subject,activity,signal,mean,meanX,meanY,meanZ,std,stdX,stdY,stdZ
20,standing,tBodyAccJerk,,0.07549118,0.002051514,-0.03674039,,-0.9323461,-0.8533053,-0.9741196
6,laying,fBodyAccMag,-0.978044,,,,-0.9596557,,,
12,walking,tBodyGyroMag,-0.3438419,,,,-0.299706,,,
21,laying,fBodyAccMag,-0.9834632,,,,-0.9531258,,,
2,sitting,fBodyBodyGyroMag,-0.9859797,,,,-0.9846586,,,
25,sitting,fBodyAccJerk,,-0.99210286,-0.97630319,-0.98369014,,-0.9935039,-0.9725965,-0.9860911


In [30]:
head(result)

subject,activity,signal,mean,meanX,meanY,meanZ,std,stdX,stdY,stdZ
1,walking,fBodyAcc,,-0.2609049,-0.1225668,-0.331216,,-0.356707,-0.19956719,-0.1777802
1,walking,fBodyAcc,,-0.1511153,-0.02904997,-0.2573071,,-0.2621973,-0.02385785,-0.3221639
1,walking,fBodyAcc,,-0.2304074,0.02542685,-0.3773113,,-0.2935223,-0.05769317,-0.2900854
1,walking,fBodyAcc,,-0.1513229,0.1952672,-0.3212387,,-0.2631256,0.08785532,-0.216975
1,walking,fBodyAcc,,-0.2258036,0.11028848,-0.2048832,,-0.2268023,0.11880106,-0.1463515
1,walking,fBodyAcc,,-0.2904287,0.05782228,-0.2483574,,-0.1999707,-0.06209912,-0.1106583


In [31]:
tail(result)

subject,activity,signal,mean,meanX,meanY,meanZ,std,stdX,stdY,stdZ
9,laying,tGravityAccMag,-0.9476551,,,,-0.9165117,,,
9,laying,tGravityAccMag,-0.9738158,,,,-0.9765451,,,
9,laying,tGravityAccMag,-0.9667412,,,,-0.9752216,,,
9,laying,tGravityAccMag,-0.9575159,,,,-0.9555869,,,
9,laying,tGravityAccMag,-0.9185906,,,,-0.9105931,,,
9,laying,tGravityAccMag,-0.9149584,,,,-0.9056572,,,


## Create a new independent dataset that reports averages 

In [33]:
result %>%
    group_by(subject,activity,signal) %>%
    summarise_each(funs(mean), avg_mean = mean, avg_meanX = meanX, avg_meanY = meanY,
                  avg_meanZ = meanZ, avg_std = std, avg_stdX = stdX, avg_stdY = stdY,
                  avg_stdZ = stdZ) ->
    average_result

`summarise_each()` is deprecated.
Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
To map `funs` over a selection of variables, use `summarise_at()`


In [35]:
head(average_result,10)

subject,activity,signal,avg_mean,avg_meanX,avg_meanY,avg_meanZ,avg_std,avg_stdX,avg_stdY,avg_stdZ
1,walking,fBodyAcc,,-0.20279431,0.08971273,-0.331560118,,-0.3191347,0.05604001,-0.2796868
1,walking,fBodyAccJerk,,-0.17054696,-0.03522552,-0.468999225,,-0.1335866,0.10673986,-0.5347134
1,walking,fBodyAccMag,-0.1286235,,,,-0.39803259,,,
1,walking,fBodyBodyAccJerkMag,-0.0571194,,,,-0.1034924,,,
1,walking,fBodyBodyGyroJerkMag,-0.3193086,,,,-0.38160191,,,
1,walking,fBodyBodyGyroMag,-0.1992526,,,,-0.32101795,,,
1,walking,fBodyGyro,,-0.3390322,-0.10305942,-0.25594094,,-0.5166919,-0.03350816,-0.4365622
1,walking,tBodyAcc,,0.27733076,-0.01738382,-0.111148104,,-0.2837403,0.11446134,-0.2600279
1,walking,tBodyAccJerk,,0.07404163,0.02827211,-0.004168406,,-0.1136156,0.0670025,-0.5026998
1,walking,tBodyAccJerkMag,-0.1414288,,,,-0.07447175,,,


In [36]:
dim(average_result)

## Write to .csv file

In [39]:
write.csv(average_result, file = "averages.csv",row.names=FALSE)
write.csv(result, file = "results.csv",row.names=FALSE)