# Analyzing physical activity monitor data

In [ ]:
// Bind the session under the conventional name `spark` used by the rest of the notebook.
// NOTE(review): `sparkSession` is not defined in this notebook; presumably it is supplied
// by the notebook runtime — confirm.
val spark = sparkSession

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6efab1e2


In [ ]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

// Fixed random seed passed to `sample` so that the sampled subset is the same on every run.
val SEED = 181

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
SEED: Int = 181


### Physical activity monitor data

In [ ]:
// Load the raw physical activity monitor readings from their Parquet file.
val PaxDF = spark.read.parquet(
  "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/paxraw.parquet")

// Print summary statistics for the respondent id (SEQN) and intensity (PAXINTEN) columns.
PaxDF.describe("SEQN", "PAXINTEN").show()

+-------+------------------+------------------+
|summary|              SEQN|          PAXINTEN|
+-------+------------------+------------------+
|  count|         147124122|         147124122|
|   mean|31278.131119443486|275.77466697813156|
| stddev| 5901.679060963683|1908.7407319232314|
|    min|             21005|                 0|
|    max|             41474|             32767|
+-------+------------------+------------------+

PaxDF: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 6 more fields]


### Demographics Data

In [ ]:
// Load the demographics table (columns SEQN and RIDAGEYR) from its Parquet file.
// RIDAGEYR is later exposed under the alias `age`.
val DemoDF = spark.read.parquet(
  "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/demo.parquet")

DemoDF: org.apache.spark.sql.DataFrame = [SEQN: int, RIDAGEYR: int]


In [ ]:
// Print the first three demographics rows as a quick sanity check.
DemoDF.limit(3).show

+-----+--------+
| SEQN|RIDAGEYR|
+-----+--------+
|31127|       0|
|31128|      11|
|31129|      15|
+-----+--------+



## Cleaning physical activity monitor data

The physical activity monitors (PAMs) used in NHANES were programmed to detect and record the magnitude of acceleration 
or “intensity” of movement. Intensity readings were summed over each 1-minute epoch.

The dataset has some abnormally high "intensity" values stored by several devices. We can plot the distribution of intensity values.

In [ ]:
// Histogram (log-scaled y axis) of the high intensity readings, i.e. PAXINTEN > 1000.
// Only a 5% sample is plotted; the sample is seeded with SEED so that the chart is
// reproducible between runs, and at most 5000 points are sent to the widget.
CustomPlotlyChart(PaxDF.where($"PAXINTEN" > 1000)
                    .sample(withReplacement = false, fraction = 0.05, seed = SEED),
                  layout="""{title: 'Intensity value distribution', 
                             yaxis: {type: 'log'},
                             xaxis: {title: 'Intensity'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'PAXINTEN'}",
                  maxPoints=5000)

res8: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


Count the number of devices which recorded abnormally high intensity values

In [ ]:
// Number of distinct respondents (SEQN) with at least one intensity reading above 27000.
PaxDF.where($"PAXINTEN" > 27000).select($"SEQN").distinct.count

res10: Long = 449


Let's remove those devices from the dataset.

We will create a broadcast variable containing the set of respondent sequence numbers (`SEQN`) with abnormally high intensity values.

In [ ]:
// Respondent ids (SEQN) that have at least one intensity reading above 27000. The ids are
// collected to the driver and broadcast as an immutable Set for membership lookups.
val broadcastedBlackList = spark.sparkContext.broadcast(
  PaxDF.where($"PAXINTEN" > 27000).select($"SEQN").distinct
    .collect
    // Typed accessor instead of `asInstanceOf[Int]`: the unchecked cast silently turns a
    // null SEQN into 0, whereas `getInt` fails loudly on a null or a non-int value.
    .map(_.getInt(0))
    .toSet
)

// Builds a Column-level predicate that is true when the given SEQN is in the broadcast
// blacklist. Declared as `def`, so each reference creates a fresh UserDefinedFunction.
def inBlacklistUDF = udf((seqNum: Int) => {
  broadcastedBlackList.value.contains(seqNum)
})

broadcastedBlackList: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Set[Int]] = Broadcast(16)
inBlacklistUDF: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Readings from respondents whose SEQN is on the blacklist.
val PaxUnreliable = PaxDF.filter(inBlacklistUDF($"SEQN"))

// Readings from all remaining respondents.
val PaxReliable = PaxDF.filter(not(inBlacklistUDF($"SEQN")))

// Report how many distinct respondents ended up on each side of the split.
println("Number of reliable devices: " + PaxReliable.select("SEQN").distinct().count())
println("Number of unreliable devices " + PaxUnreliable.select("SEQN").distinct().count())

Number of reliable devices: 14182
Number of unreliable devices 449
PaxUnreliable: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, PAXSTAT: int ... 6 more fields]
PaxReliable: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, PAXSTAT: int ... 6 more fields]


Now that we have respondent sequence numbers for the reasonably reliable and the unreliable data, we can have a look at
the raw intensity values over the tracking period. To make a time series chart we need to add a `datetime` column. 

In [ ]:
// Pick up to ten reliable respondents for plotting. The 1% sample is seeded with SEED so
// the same respondents are chosen on every run.
val reliableSeqNumSample = PaxReliable.select($"SEQN").distinct
  .sample(withReplacement = false, fraction = 0.01, seed = SEED)
  .limit(10)
  .collect
  // Typed accessor instead of `asInstanceOf[Int]`: the unchecked cast silently turns a
  // null into 0, whereas `getInt` fails loudly on a null or a non-int value.
  .map(_.getInt(0))
  .toList

// Attach a synthetic timestamp to every reading: PAXDAYSAS is used as the day of January
// 2005, PAXHOUR and PAXMINUT as the time of day. `time` holds the epoch seconds and
// `datetime` is then overwritten with the formatted timestamp derived from `time`.
val PaxReliableWithDT = PaxReliable
  .withColumn("datetime", concat($"PAXDAYSAS", lit(".01.2005 "), $"PAXHOUR", lit(":"), $"PAXMINUT"))
  // Single-letter `H:m`: the concatenated hour and minute are presumably not zero-padded
  // (they look like integer columns — TODO confirm). One-letter fields accept one or two
  // digits, while `HH:mm` only works when the parser is lenient about field width.
  .withColumn("time", unix_timestamp($"datetime", "d.MM.yyyy H:m"))
  .withColumn("datetime", from_unixtime($"time"))

reliableSeqNumSample: List[Int] = List(31763, 39582, 28343, 30809, 23682, 29746, 28953, 35211, 32672, 26273)
PaxReliableWithDT: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 8 more fields]


In [ ]:
// Intensity over time for a single respondent: the fourth id of `reliableSeqNumSample`
// (index 3; this throws if the sample holds fewer than four ids), restricted to rows with
// 1 < PAXDAYSAS < 4. At most 3000 points are sent to the widget.
CustomPlotlyChart(PaxReliableWithDT
                    .where($"SEQN" === reliableSeqNumSample(3))
                    .where($"PAXDAYSAS" > 1 && $"PAXDAYSAS" < 4), // showing only two days
                  layout="""{title: 'Physical activity monitor data', 
                           yaxis: {title: 'Device Intensity Value'},
                           showlegend: false}""",
                  dataOptions="""{
                    colorscale: 'Electric',
                    autocolorscale: true
                  }""",
                  dataSources="""{
                    x: 'datetime',
                    y: 'PAXINTEN'
                  }""",
                 maxPoints=3000)

res30: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


This is locomotor activity data. What can we do with it? What can it tell us about human health or age?
It's hard to directly compare different locomotor activity tracks because much of an individual's social activity habits is mixed into the data.
But we're interested in extracting physiological information from noisy locomotor activity.
  For that we need to do some feature engineering.

## Transition Matrix

One way to describe a locomotor activity track is to model it as a [Markov process](https://en.wikipedia.org/wiki/Markov_chain),
which is described by the probabilities of transitions from one state to another. We can express these probabilities in the form of a [transition matrix](https://en.wikipedia.org/wiki/Stochastic_matrix).

First, we need to define a finite state space for the locomotor activity process, for example: low, medium, high and very high levels of activity.

After that we need to go through the track of an individual person and compute the probabilities of transitions from one level of activity to another
for that person. As a result we'll get a locomotor activity transition matrix for a single person, which can be treated as that person's locomotor fingerprint.

### Spark ML Bucketize

To define finite state space of locomotor activities we can use [Bucketizer](https://spark.apache.org/docs/latest/ml-features.html#bucketizer) transformer from Spark ML library.

We can look at the intensity value distribution recorded by reliable trackers to choose the desired intensity levels.

In [ ]:
// Histogram (log-scaled y axis) of the intensity readings below 18000 from reliable
// trackers. Only a 5% sample is plotted; the sample is seeded with SEED so that the chart
// is reproducible between runs, and at most 5000 points are sent to the widget.
CustomPlotlyChart(PaxReliableWithDT.where($"PAXINTEN" < 18000)
                    .sample(withReplacement = false, fraction = 0.05, seed = SEED),
                  layout="""{title: 'Intensity value distribution recorded from reliable trackers', 
                             yaxis: {type: 'log'},
                             xaxis: {title: 'Intensity'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'PAXINTEN'}",
                  maxPoints=5000)

res18: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


In [ ]:
import org.apache.spark.ml.feature.Bucketizer

// Boundaries of the ten intensity buckets; the last bucket is unbounded above.
val splits: Array[Double] =
  Array(0.0, 30.0, 100.0, 300.0, 600.0, 900.0, 1400.0, 2000.0, 3500.0, 5000.0, Double.PositiveInfinity)

// Maps each PAXINTEN value to the index of its bucket, written to `activityLevel`.
val bucketizer = new Bucketizer()
  .setSplits(splits)
  .setInputCol("PAXINTEN")
  .setOutputCol("activityLevel")

import org.apache.spark.ml.feature.Bucketizer
splits: Array[Double] = Array(0.0, 30.0, 100.0, 300.0, 600.0, 900.0, 1400.0, 2000.0, 3500.0, 5000.0, Infinity)
bucketizer: org.apache.spark.ml.feature.Bucketizer = bucketizer_f28c3c3f98ce


In [ ]:
// Bucket every reading into an integer activity level.
val bucketedPax = {
  // Keep an integral copy of the intensity in `totalInten`, then cast PAXINTEN to Double
  // before handing it to the bucketizer.
  val prepared = PaxReliableWithDT
    .withColumn("totalInten", col("PAXINTEN").cast(LongType))
    .withColumn("PAXINTEN", col("PAXINTEN").cast(DoubleType))

  // The bucket index is cast to an integer so it can later be used as an array index.
  bucketizer
    .transform(prepared)
    .withColumn("activityLevel", col("activityLevel").cast(IntegerType))
}

// List the activity levels that actually occur in the data.
bucketedPax.select("activityLevel").distinct().orderBy("activityLevel").show()

+-------------+
|activityLevel|
+-------------+
|            0|
|            1|
|            2|
|            3|
|            4|
|            5|
|            6|
|            7|
|            8|
|            9|
+-------------+

bucketedPax: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 10 more fields]


### Computing Transition Matrix with Spark SQL Window Functions

To compute a transition matrix we need, for each minute of a person's activity, the activity level of the same person in the previous minute.

That's where Spark SQL window functions come in handy.

In [ ]:
import org.apache.spark.sql.expressions.Window

// One window per respondent, with rows ordered by the `time` column.
val windowSpec = Window.partitionBy($"SEQN").orderBy($"time")

// Pair every reading with the activity level of the preceding row of the same respondent.
// NOTE(review): `lag` returns the previous *record*; it is the previous minute only if the
// track has no gaps — confirm.
val withLastMinuteDF = bucketedPax
  .select("SEQN", "totalInten", "activityLevel", "time")
  .withColumn(
    "previousMinuteActivity",
    // A row without a predecessor yields null from `lag`; it is replaced by the sentinel -1.
    coalesce(lag($"activityLevel", 1).over(windowSpec), lit(-1)))

import org.apache.spark.sql.expressions.Window
windowSpec: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@297570cc
withLastMinuteDF: org.apache.spark.sql.DataFrame = [SEQN: int, totalInten: bigint ... 3 more fields]


Here we specified a window that contains all records of one respondent (partitioned by `SEQN`), ordered by `time`. 

We use the `lag` window function to access the previous record in that window, which in this case is the previous-minute activity level of the given person.

Now that we have the previous-minute activity level we can start to build a transition matrix of the desired size.
We can store the transition matrix `W` in the form of an `Array[Array[Double]]` where `W(i)(j)` holds the probability of a transition from state `j` to state `i`.
 
First we will store in `W(i)(j)` the number of transitions from level `j` to level `i` for a given person, and after that we will divide this value by the total number of transitions in the recorded track for that person.

In [ ]:
// Builds a UDF that returns a `size` x `size` matrix of zeros with a single 1.0 at
// (currentActivityLevel)(previousActivityLevel). When the previous level is negative (the
// "no previous record" sentinel) the matrix stays all zeros.
def initTransitionMatrix = udf { (currentActivityLevel: Int, previousActivityLevel: Int, size: Int) =>
  val counts = Array.fill(size, size)(0.0)
  if (previousActivityLevel >= 0) {
    // Row = level we moved to, column = level we came from.
    counts(currentActivityLevel)(previousActivityLevel) = 1.0
  }
  counts
}

// Attach a one-transition 10 x 10 matrix to every record.
val dfW = withLastMinuteDF.withColumn(
  "W",
  initTransitionMatrix(col("activityLevel"), col("previousMinuteActivity"), lit(10)))

initTransitionMatrix: org.apache.spark.sql.expressions.UserDefinedFunction
dfW: org.apache.spark.sql.DataFrame = [SEQN: int, totalInten: bigint ... 4 more fields]


For each record we created a matrix with a single transition count: from the previous-minute activity level to the current one.

In [ ]:
/** Transition-matrix record for one respondent.
  *
  * @param seqn       respondent sequence number (SEQN)
  * @param totalInten intensity value; summed across records when records are merged
  * @param totalCount number of records folded into this value (1 for a single record)
  * @param W          matrix indexed as W(toLevel)(fromLevel)
  */
case class RespondentTrMatrix(seqn: Int, totalInten: Long, totalCount: Long, W: Array[Array[Double]])

defined class RespondentTrMatrix


In [ ]:
// Typed view of the per-record matrices; every record starts with totalCount = 1.
val initTrMatrixDS = dfW.select($"SEQN", $"totalInten", lit(1L).as("totalCount"), $"W").as[RespondentTrMatrix]

initTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [SEQN: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Persist the per-record matrices as Parquet, overwriting any existing output at this path.
initTrMatrixDS.write.format("parquet").mode("overwrite")
.save("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_init.parquet")

In [ ]:
// Reload the persisted per-record matrices so later cells read from the Parquet file.
val initTrMatrixDS = spark.read
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_init.parquet")
  .as[RespondentTrMatrix]


initTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [SEQN: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Total number of per-record matrices loaded.
initTrMatrixDS.count

res33: Long = 142607456


In [ ]:

// Fold all records of a respondent into one: intensities and record counts are added up,
// matrices are summed cell by cell, and finally every cell is divided by the record count.
// NOTE(review): totalCount also counts the first record of each respondent, which
// contributes no transition — confirm this is the intended denominator.
val sumTrMatrixDS = initTrMatrixDS.rdd
  .keyBy(_.seqn)
  .reduceByKey { (left, right) =>
    // Cell-by-cell sum of the two matrices.
    val summedW = left.W.zip(right.W).map { case (rowL, rowR) =>
      rowL.zip(rowR).map { case (x, y) => x + y }
    }
    RespondentTrMatrix(
      left.seqn,
      left.totalInten + right.totalInten,
      left.totalCount + right.totalCount,
      summedW)
  }
  .values
  .map { total =>
    // Turn the transition counts into relative frequencies.
    total.copy(W = total.W.map(_.map(_ / total.totalCount)))
  }
  .toDS

sumTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Persist the per-respondent matrices as Parquet, overwriting any existing output at this path.
sumTrMatrixDS.write.format("parquet").mode("overwrite")
.save("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix.parquet")

In [ ]:
// Reload the persisted per-respondent transition matrices as a typed Dataset.
val computedTrMatrixDS = spark.read
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix.parquet")
  .as[RespondentTrMatrix]


computedTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Histogram of the summed intensity per respondent, keeping only totals below 10e6.
// 10e6 is the same value as the 1e7 upper bound used by the later filters.
// At most 8000 points are sent to the widget.
CustomPlotlyChart(computedTrMatrixDS.where($"totalInten" < 10e6).toDF,
                  layout="""{title: 'Cumulative intensity value distribution', 
                             xaxis: {title: 'Cumulative intensity value per week'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'totalInten'}",
                  maxPoints=8000)

res55: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.DataFrame] = <CustomPlotlyChart widget>


In [ ]:
// Transition matrix of one sampled respondent whose total intensity lies strictly between
// 1e5 and 1e7. `head` throws if the sample turns out to be empty.
val trMatrix = computedTrMatrixDS
  .filter($"totalInten" > 1e5 && $"totalInten" < 1e7)
  .sample(withReplacement = false, fraction = 0.1, seed = 851)
  .limit(1)
  .collect
  .head
  .W

trMatrix: Array[Array[Double]] = Array(Array(0.6725198412698413, 0.01904761904761905, 0.01369047619047619, 0.006845238095238095, 0.002281746031746032, 0.0011904761904761906, 3.968253968253968E-4, 2.9761904761904765E-4, 9.92063492063492E-5, 0.0), Array(0.01755952380952381, 0.015376984126984126, 0.015476190476190477, 0.005257936507936508, 0.0017857142857142857, 0.001488095238095238, 8.928571428571428E-4, 4.96031746031746E-4, 0.0, 0.0), Array(0.01378968253968254, 0.013888888888888888, 0.034027777777777775, 0.01488095238095238, 0.007043650793650794, 0.003869047619047619, 0.001388888888888889, 5.952380952380953E-4, 0.0, 0.0), Array(0.007043650793650794, 0.005952380952380952, 0.014781746031746031, 0.015476190476190477, 0.008630952380952382, 0.0036706349206349206, 0.0016865079365079366, 9.9206...

In [ ]:
// One row per matrix row: the row's values plus its index. The index is stored twice,
// under `toActivityLevel` and `fromActivityLevel`, which the heatmap uses as y and x.
val trMatrixPlotData = trMatrix.zipWithIndex.toSeq
  .toDF("transitions", "toActivityLevel")
  .withColumn("fromActivityLevel", col("toActivityLevel"))

trMatrixPlotData: org.apache.spark.sql.DataFrame = [transitions: array<double>, toActivityLevel: int ... 1 more field]


In [ ]:
// Heatmap of the sampled respondent's transition matrix. The colour range is fixed to
// [0, 0.10], and the top colour-bar tick is labelled '>0.1'.
CustomPlotlyChart(trMatrixPlotData,
                  layout="""{title: 'Physical activity Transition matrix',
                             xaxis: {title: 'from physical activity level'}, 
                             yaxis: {title: 'to physical activity level'},
                             width: 600, height: 600}""",
                  dataOptions="""{type: 'heatmap', 
                                  colorscale: 'Viridis',
                                  reversescale: false,
                                  colorbar: {
                                    title: 'Probability',
                                    tickmode: 'array',
                                    tickvals: [0, 0.02, 0.04, 0.06, 0.08, 0.1],
                                    ticktext: ['0', '0.02', '0.04', '0.06', '0.08', '>0.1']
                                  },
                                  zmin: 0.0, zmax: 0.10}""",
                  dataSources="{x: 'fromActivityLevel', y: 'toActivityLevel', z: 'transitions'}")

res12: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.DataFrame] = <CustomPlotlyChart widget>


## PCA (unsupervised learning)

In [ ]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.PCA

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.PCA


In [ ]:
// Builds a UDF mapping every component v of a vector to -log(1e-7 + v). The small offset
// keeps the logarithm finite when a component is exactly zero.
def logScaleFeaturesUDF = udf { (vec: Vector) =>
  val scaled = vec.toArray.map(p => -scala.math.log(1e-7 + p))
  Vectors.dense(scaled)
}

logScaleFeaturesUDF: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Feature table for PCA: each respondent's matrix flattened row by row into a dense vector,
// plus its log-scaled version, joined with demographics and restricted to RIDAGEYR >= 35.
// Only respondents with a total intensity strictly between 1e5 and 1e7 are kept.
val flattenTrMatrixDF = computedTrMatrixDS
  .filter($"totalInten" > 1e5 && $"totalInten" < 1e7)
  .rdd
  .map(m => m.seqn -> Vectors.dense(m.W.flatten))
  .toDF("SEQN", "features")
  .withColumn("logFeatures", logScaleFeaturesUDF(col("features")))
  .join(DemoDF, "SEQN")
  .filter(col("RIDAGEYR") >= 35)

flattenTrMatrixDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, features: vector ... 2 more fields]


In [ ]:
// Fit a 3-component PCA on the log-scaled features; `pca` holds the fitted model.
val pca = {
  val estimator = new PCA()
    .setK(3)
    .setInputCol("logFeatures")
    .setOutputCol("pcaFeatures")
  estimator.fit(flattenTrMatrixDF)
}

// Project every respondent onto the principal components and keep id, projection and age.
val withLocomotorPCA = pca
  .transform(flattenTrMatrixDF)
  .select($"SEQN", $"pcaFeatures", $"RIDAGEYR")

pca: org.apache.spark.ml.feature.PCAModel = pca_eee8fef73ad1
withLocomotorPCA: org.apache.spark.sql.DataFrame = [SEQN: int, pcaFeatures: vector ... 1 more field]


In [ ]:
// Print the first three projected rows without truncating the vector column.
withLocomotorPCA.limit(3).show(false)

+-----+----------------------------------------------------------+--------+
|SEQN |pcaFeatures                                               |RIDAGEYR|
+-----+----------------------------------------------------------+--------+
|27456|[81.40489097419712,23.07153802344512,-33.73299800624375]  |75      |
|22384|[78.43394961167441,26.906499751389653,-36.994347277280106]|51      |
|28944|[61.72525161105295,3.9433827485319677,-35.16986563891355] |40      |
+-----+----------------------------------------------------------+--------+



In [ ]:
// Builds a UDF returning the component of `vec` at position `idx`.
def getItemUDF = udf { (vec: Vector, idx: Int) =>
  vec.apply(idx)
}

getItemUDF: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Unpack the PCA vector into scalar columns PC1..PC3 and rename RIDAGEYR to `age`.
val locomotorPCvsAge = {
  // Component i of the vector becomes column "PC<i + 1>".
  val pcColumns = (0 until 3).map { i =>
    getItemUDF(col("pcaFeatures"), lit(i)).as(s"PC${i + 1}")
  }
  val idAndAge = Seq(col("SEQN"), col("RIDAGEYR").as("age"))
  withLocomotorPCA.select(idAndAge ++ pcColumns: _*)
}

locomotorPCvsAge: org.apache.spark.sql.DataFrame = [SEQN: int, age: int ... 3 more fields]


In [ ]:
// Per-age mean and sample standard deviation of each principal-component projection.
val matrixPCvsAge = locomotorPCvsAge
  .groupBy("age")
  .agg(
    avg("PC1").as("meanPC1"), stddev_samp("PC1").as("stdPC1"),
    avg("PC2").as("meanPC2"), stddev_samp("PC2").as("stdPC2"),
    avg("PC3").as("meanPC3"), stddev_samp("PC3").as("stdPC3"))

matrixPCvsAge: org.apache.spark.sql.DataFrame = [age: int, meanPC1: double ... 5 more fields]


In [ ]:
// Mean PC1 projection per age, with the per-age standard deviation as error bars.
// NOTE(review): the rows derive from flattenTrMatrixDF, which already keeps only
// RIDAGEYR >= 35, so the `age >= 15` filter appears to remove nothing — confirm.
CustomPlotlyChart(matrixPCvsAge.where($"age" >= 15).orderBy($"age"),
                  layout="""{title: 'PC projections vs Age', 
                           xaxis: {title: 'Chronological age'},
                           yaxis: {title: 'PC projection'},
                           showlegend: false}""",
                  dataOptions="""{
                    type: 'scatter',
                    line: {width: 2},
                    error_y: {type: 'data', visible: true, thickness: 0.5, width: 0}
                  }""",
                  dataSources="""{
                    x: 'age',
                    y: 'meanPC1',
                    error_y: {array: 'stdPC1'}
                  }""")

res30: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


In [ ]:
// Scatter plot of PC1 against PC2, one marker per respondent, coloured by age.
// At most 6000 points are sent to the widget.
// NOTE(review): the rows derive from flattenTrMatrixDF, which already keeps only
// RIDAGEYR >= 35, so the `age >= 35` filter appears to be redundant — confirm.
CustomPlotlyChart(
  locomotorPCvsAge.where($"age" >= 35), 
  layout="""{
        title: 'PCA of the Locomotor Transition Matrix',
        height: 900,
        xaxis: {title: 'PC1'},
        yaxis: {title: 'PC2'},
        hovermode: 'closest'
    }""",
  dataOptions="""{
    mode: 'markers',
    type: 'scatter',
    marker: {
        sizemode: 'area',
        size: 12,
        opacity: 0.75,
        colorscale: 'Jet',
        reversescale: true,
        colorbar: {
          title: 'Age',
          thickness: 8.0
        }
    }}""",
  dataSources="""{
    x: 'PC1',
    y: 'PC2',
    marker: {color: 'age'}}""",
  maxPoints=6000)

res34: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>
