# Analyzing physical activity monitor data

TODO intro

## Data

TODO

To convert from the SAS transport file format (extension `.xpt`) to CSV, we can use the `xport` Python package from PyPI:
```bash
pip install xport
```
Then use the `xport` module as a command-line tool to convert an XPT file to a CSV file:

```bash
python -m xport paxraw_d.xpt > paxraw_d.csv
```

## Reading the data

In [ ]:
// Expose the session under the conventional name `spark` used by the rest of the notebook.
// NOTE(review): `sparkSession` is not defined in this notebook; presumably the notebook
// runtime provides it — confirm.
val spark = sparkSession

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@536cdacc


In [ ]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._


### Physical activity monitor data

In [ ]:
// Schema applied when reading the raw monitor CSV: eight non-nullable float columns
// followed by PAXSTEP, which is read as a nullable string.
val PaxSchema = {
  val floatColumns = Seq(
    "SEQN", "PAXSTAT", "PAXCAL", "PAXDAYSAS", "PAXN", "PAXHOUR", "PAXMINUT", "PAXINTEN")
  val floatFields = floatColumns.map(name => StructField(name, FloatType, nullable = false))
  StructType(floatFields :+ StructField("PAXSTEP", StringType, nullable = true))
}

PaxSchema: org.apache.spark.sql.types.StructType = StructType(StructField(SEQN,FloatType,false), StructField(PAXSTAT,FloatType,false), StructField(PAXCAL,FloatType,false), StructField(PAXDAYSAS,FloatType,false), StructField(PAXN,FloatType,false), StructField(PAXHOUR,FloatType,false), StructField(PAXMINUT,FloatType,false), StructField(PAXINTEN,FloatType,false), StructField(PAXSTEP,StringType,true))


TODO note on `nan` values in `PAXSTEP` column

In [ ]:
// Load the raw monitor CSV with the explicit schema, cast the numeric columns to Int and
// derive two time columns:
//   - `time`:     unix timestamp parsed from "<PAXDAYSAS>.01.2005 <PAXHOUR>:<PAXMINUT>",
//                 i.e. PAXDAYSAS is used as the day-of-month of a synthetic January 2005 date;
//   - `datetime`: `time` rendered back through from_unixtime.
// PAXSTEP is not part of the projection. An earlier draft of this cell mapped the literal
// string "nan" in PAXSTEP to null, cast the rest to a number and filtered out the nulls.
val PaxDF = {
  val paxCsvPath =
    "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/paxraw_d.csv"
  val intColumns = Seq(
    "SEQN", "PAXSTAT", "PAXCAL", "PAXDAYSAS", "PAXN", "PAXHOUR", "PAXMINUT", "PAXINTEN")

  val rawPax = spark.read
    .format("csv")
    .schema(PaxSchema)
    .option("header", true)
    .load(paxCsvPath)

  val castedPax = rawPax.select(intColumns.map(name => col(name).cast(IntegerType)): _*)

  // Textual timestamp assembled from the day, hour and minute columns.
  val stampText =
    concat(col("PAXDAYSAS"), lit(".01.2005 "), col("PAXHOUR"), lit(":"), col("PAXMINUT"))

  castedPax
    .withColumn("datetime", stampText)
    .withColumn("time", unix_timestamp(col("datetime"), "d.MM.yyyy HH:mm"))
    .withColumn("datetime", from_unixtime(col("time")))
}

PaxDF: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 8 more fields]


In [ ]:
// Persist the parsed monitor data as Parquet, replacing any existing copy at this path.
PaxDF.write
  .mode("overwrite")
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/paxraw_d.parquet")

In [ ]:
// Re-bind PaxDF to the Parquet copy written by the previous cell.
val PaxDF = spark.read.parquet(
  "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/paxraw_d.parquet")
  

PaxDF: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 8 more fields]


In [ ]:
// Summary statistics (count, mean, stddev, min, max) for respondent id and intensity.
PaxDF
  .describe("SEQN", "PAXINTEN")
  .show()

+-------+------------------+------------------+
|summary|              SEQN|          PAXINTEN|
+-------+------------------+------------------+
|  count|          74874095|          74874095|
|   mean| 36296.25028003343| 327.4901439943949|
| stddev|2989.7024533075023|2301.0512668916217|
|    min|             31128|                 0|
|    max|             41474|             32767|
+-------+------------------+------------------+



### Demographics Data

In [ ]:
// Demographics from DEMO_D.csv, reduced to the respondent id (SEQN) and RIDAGEYR, both as Int.
val DemoDF_D = {
  val demoPath =
    "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/DEMO_D.csv"
  val keptColumns = Seq("SEQN", "RIDAGEYR").map(name => col(name).cast(IntegerType))
  spark.read
    .option("header", true)
    .csv(demoPath)
    .select(keptColumns: _*)
}

DemoDF_D: org.apache.spark.sql.DataFrame = [SEQN: int, RIDAGEYR: int]


In [ ]:
// Demographics from DEMO_C.csv, reduced to the respondent id (SEQN) and RIDAGEYR, both as Int.
val DemoDF_C = {
  val demoPath =
    "./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/DEMO_C.csv"
  val keptColumns = Seq("SEQN", "RIDAGEYR").map(name => col(name).cast(IntegerType))
  spark.read
    .option("header", true)
    .csv(demoPath)
    .select(keptColumns: _*)
}

DemoDF_C: org.apache.spark.sql.DataFrame = [SEQN: int, RIDAGEYR: int]


In [ ]:
// Stack both demographics extracts (C first, then D); the columns line up by position.
val DemoDF = Seq(DemoDF_C, DemoDF_D).reduce(_.union(_))

DemoDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, RIDAGEYR: int]


In [ ]:
// Peek at the first three demographic rows.
DemoDF
  .limit(3)
  .show()

+-----+--------+
| SEQN|RIDAGEYR|
+-----+--------+
|21005|      19|
|21006|      16|
|21007|      14|
+-----+--------+



`PAXSTEP` is the step count per minute recorded by the physical activity monitor.
Values over `500` steps per minute seem unreliable to me.
Let's exclude those devices that have recorded more than `500` steps per minute.

In [ ]:
// Histogram (log-scaled y axis) of PAXINTEN readings above 1000, drawn from a 5% sample
// taken without replacement. The chart title previously read 'Intencity'.
CustomPlotlyChart(PaxDF.where($"PAXINTEN" > 1000).sample(withReplacement=false, 0.05),
                  layout="""{title: 'Intensity value distribution', 
                             yaxis: {type: 'log'},
                             xaxis: {title: 'Intensity'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'PAXINTEN'}",
                  maxPoints=5000)

res46: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


In [ ]:
// Number of distinct respondents with at least one PAXINTEN reading above 27000.
PaxDF
  .filter(col("PAXINTEN") > 27000)
  .select(col("SEQN"))
  .distinct()
  .count()

res13: Long = 262


In [ ]:
// Collect the SEQN of every respondent with a PAXINTEN reading above 27000 into a Set on
// the driver and broadcast it to the executors.
val broadcastedBlackList = {
  val flaggedSeqns = PaxDF
    .filter(col("PAXINTEN") > 27000)
    .select(col("SEQN"))
    .distinct()
    .collect()
    .map(row => row.getAs[Int](0))
    .toSet
  spark.sparkContext.broadcast(flaggedSeqns)
}

broadcastedBlackList: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Set[Int]] = Broadcast(19)


In [ ]:
// Column-level predicate: true when the given SEQN is a member of the broadcast blacklist.
def inBlacklistUDF =
  udf[Boolean, Int](seqNum => broadcastedBlackList.value.contains(seqNum))

inBlacklistUDF: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Readings that belong to blacklisted respondents.
val PaxUnreliable = PaxDF.filter(inBlacklistUDF(col("SEQN")))

PaxUnreliable: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, PAXSTAT: int ... 8 more fields]


In [ ]:
// Readings that belong to respondents NOT on the blacklist.
val PaxReliable = PaxDF.filter(not(inBlacklistUDF(col("SEQN"))))

PaxReliable: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, PAXSTAT: int ... 8 more fields]


In [ ]:
// Total number of distinct respondents in the monitor data.
PaxDF
  .select(col("SEQN"))
  .distinct()
  .count()

res19: Long = 7455


In [ ]:
// Number of distinct respondents in the unreliable subset.
PaxUnreliable
  .select(col("SEQN"))
  .distinct()
  .count()

res21: Long = 262


In [ ]:
// Up to ten respondent ids picked from a 1% unseeded random sample of the reliable SEQNs,
// so the result differs between runs.
val seqNums = {
  val sampledRows = PaxReliable
    .select(col("SEQN"))
    .distinct()
    .sample(withReplacement = false, fraction = 0.01)
    .limit(10)
    .collect()
  sampledRows.map(_.getAs[Int](0)).toList
}

seqNums: List[Int] = List(28905, 26165, 31047, 24253, 26376, 21629, 23129, 28671, 29849, 25325)


TODO show for both Reliable and Unreliable data.

In [ ]:
// Plot PAXINTEN against `datetime` for a single reliable respondent: the second id in
// `seqNums` (index 1).
// NOTE(review): `seqNums` comes from a random sample and may contain fewer than two ids,
// in which case `seqNums(1)` throws — confirm this is acceptable for the notebook.
CustomPlotlyChart(PaxReliable.where($"SEQN" === seqNums(1)),
                  layout="""{title: 'Physical activity monitor data', 
                           yaxis: {title: 'Device Intensity Value'},
                           showlegend: false}""",
                  dataOptions="""{
                    colorscale: 'Electric',
                    autocolorscale: true
                  }""",
                  dataSources="""{
                    x: 'datetime',
                    y: 'PAXINTEN'
                  }""",
                 maxPoints=5000)

res29: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


Let's look at the step count distribution

TODO show for both Reliable and Unreliable data.

In [ ]:
// Histogram (log-scaled y axis) of PAXINTEN over a 0.1% sample, taken without replacement,
// of the reliable readings. The chart title previously read 'intencity'.
CustomPlotlyChart(PaxReliable.sample(withReplacement=false, 0.001),
                  layout="""{title: 'Device intensity value distribution', 
                             yaxis: {type: 'log'},
                             xaxis: {title: 'intensity value'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'PAXINTEN'}",
                  maxPoints=5000)

res31: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


## Bucketize

*REPEAT for intervals between array([ 31128,  31672,  32217.05263158,  32761.57894737,
        33306.10526316,  33850.63157895,  34395.15789474,  34939.68421053,
        35484.21052632,  36028.73684211,  36573.26315789,  37117.78947368,
        37662.31578947,  38206.84210526,  38751.36842105,  39295.89473684,
        39840.42105263,  40384.94736842,  40929.47368421,  41474.        ])*

In [ ]:
import org.apache.spark.ml.feature.Bucketizer

// Bucket boundaries for PAXINTEN; eleven boundaries give ten activity levels (0 to 9),
// the last one open-ended.
val splits = Array[Double](
  0.0, 30.0, 100.0, 300.0, 600.0, 900.0, 1400.0, 2000.0, 3500.0, 5000.0,
  Double.PositiveInfinity)

// Maps PAXINTEN to the index of its bucket, written to `activityLevel`.
val bucketizer = new Bucketizer()
  .setSplits(splits)
  .setInputCol("PAXINTEN")
  .setOutputCol("activityLevel")

import org.apache.spark.ml.feature.Bucketizer
splits: Array[Double] = Array(0.0, 30.0, 100.0, 300.0, 600.0, 900.0, 1400.0, 2000.0, 3500.0, 5000.0, Infinity)
bucketizer: org.apache.spark.ml.feature.Bucketizer = bucketizer_2c620728d17d


In [ ]:
// Bucketize the reliable readings:
//   - `totalInten` keeps the raw intensity as Long (summed per respondent later on);
//   - PAXINTEN is cast to Double before being passed to the Bucketizer;
//   - the resulting `activityLevel` is cast to Int.
val bucketedPax = {
  val prepared = PaxReliable
    .withColumn("totalInten", col("PAXINTEN").cast(LongType))
    .withColumn("PAXINTEN", col("PAXINTEN").cast(DoubleType))
  bucketizer
    .transform(prepared)
    .withColumn("activityLevel", col("activityLevel").cast(IntegerType))
}

bucketedPax: org.apache.spark.sql.DataFrame = [SEQN: int, PAXSTAT: int ... 10 more fields]


In [ ]:
// List the activity levels that actually occur, in ascending order.
bucketedPax
  .select(col("activityLevel"))
  .distinct()
  .orderBy(col("activityLevel"))
  .show()

+-------------+
|activityLevel|
+-------------+
|            0|
|            1|
|            2|
|            3|
|            4|
|            5|
|            6|
|            7|
|            8|
|            9|
+-------------+



## Building a Transition Matrix

In [ ]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


1. Use a window function to collect the previous minute's activity level

In [ ]:
// One window per respondent, rows ordered by the `time` column.
val windowSpec = Window
  .partitionBy(col("SEQN"))
  .orderBy(col("time"))

windowSpec: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@530c08bf


In [ ]:
// Attach to every reading the activity level of the preceding row of the same respondent.
// A respondent's first row has no predecessor and gets the sentinel -1.
val df = {
  val previousLevel = lag("activityLevel", 1).over(windowSpec)
  bucketedPax
    .select(col("SEQN"), col("totalInten"), col("activityLevel"), col("time"))
    .withColumn("previousMinuteActivity", when(previousLevel.isNull, -1).otherwise(previousLevel))
}

df: org.apache.spark.sql.DataFrame = [SEQN: int, totalInten: bigint ... 3 more fields]


In [ ]:
// UDF building a size x size matrix of zeros for one reading. When the previous level is
// known (>= 0), the cell [currentActivityLevel][previousActivityLevel] is set to 1.0;
// otherwise the matrix stays all zeros.
def initTransitionMatrix = udf { (currentActivityLevel: Int, previousActivityLevel: Int, size: Int) =>
  val matrix = Array.ofDim[Double](size, size)
  if (previousActivityLevel >= 0) {
    matrix(currentActivityLevel)(previousActivityLevel) = 1.0
  }
  matrix
}

initTransitionMatrix: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Add the per-reading 10 x 10 one-hot transition matrix as column `W`.
val dfW = df.withColumn(
  "W",
  initTransitionMatrix(col("activityLevel"), col("previousMinuteActivity"), lit(10)))

dfW: org.apache.spark.sql.DataFrame = [SEQN: int, totalInten: bigint ... 4 more fields]


In [ ]:
/** Transition-matrix record for one respondent.
  *
  * @param seqn       respondent id (the SEQN column)
  * @param totalInten intensity value; summed per respondent when records are reduced
  * @param totalCount number of readings folded into this record (starts at 1 per reading)
  * @param W          transition matrix, indexed as W(currentLevel)(previousLevel)
  */
case class RespondentTrMatrix(
  seqn: Int,
  totalInten: Long,
  totalCount: Long,
  W: Array[Array[Double]]
)

defined class RespondentTrMatrix


In [ ]:
// Typed view of the per-reading matrices; every reading starts with a count of 1.
val trMatrixDS = dfW
  .select(col("SEQN"), col("totalInten"), lit(1L).as("totalCount"), col("W"))
  .as[RespondentTrMatrix]

trMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [SEQN: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Checkpoint the per-reading matrices to Parquet, replacing any previous copy.
trMatrixDS.write
  .mode("overwrite")
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_init.parquet")

In [ ]:
// Load the per-reading matrices back from the Parquet checkpoint as a typed Dataset.
val initTrMatrixDS = spark.read
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_init.parquet")
  .as[RespondentTrMatrix]


initTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [SEQN: int, totalInten: bigint ... 2 more fields]


In [ ]:

// Aggregate the per-reading records into one record per respondent:
//   1. key every record by its respondent id;
//   2. reduce per key by summing totalInten, totalCount and the matrices element-wise;
//   3. divide every matrix cell by the respondent's totalCount.
val sumTrMatrixDS = initTrMatrixDS.rdd
  .keyBy(_.seqn)
  .reduceByKey { (left, right) =>
    // Element-wise sum of the two matrices, row by row.
    val summedW = left.W.zip(right.W).map { case (leftRow, rightRow) =>
      leftRow.zip(rightRow).map { case (x, y) => x + y }
    }
    RespondentTrMatrix(
      left.seqn,
      left.totalInten + right.totalInten,
      left.totalCount + right.totalCount,
      summedW)
  }
  .values
  .map(total => total.copy(W = total.W.map(_.map(_ / total.totalCount))))
  .toDS

sumTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Persist the per-respondent normalized matrices as Parquet.
// mode("overwrite") matches the other write cells in this notebook. Without it the
// writer's default save mode makes the cell fail on re-run once the path exists.
sumTrMatrixDS
  .write
  .format("parquet")
  .mode("overwrite")
  .save("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_d.parquet")

In [ ]:
// Per-respondent matrices from the `_d` Parquet output, as a typed Dataset.
val computedTrMatrixDS_D = spark.read
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_d.parquet")
  .as[RespondentTrMatrix]


computedTrMatrixDS_D: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Per-respondent matrices from the `_c` Parquet output, as a typed Dataset.
// NOTE(review): nothing in this notebook writes the `_c` file; presumably it comes from
// an earlier run of the same pipeline on another input — confirm.
val computedTrMatrixDS_C = spark.read
  .parquet("./notebooks/spark-notebooks-gallery/gallery/physical-activity-monitor/data/10_inten_tr_matrix_c.parquet")
  .as[RespondentTrMatrix]


computedTrMatrixDS_C: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Combine the matrices of both inputs (C first, then D) into a single Dataset.
val computedTrMatrixDS = Seq(computedTrMatrixDS_C, computedTrMatrixDS_D).reduce(_.union(_))

computedTrMatrixDS: org.apache.spark.sql.Dataset[RespondentTrMatrix] = [seqn: int, totalInten: bigint ... 2 more fields]


In [ ]:
// Histogram of `totalInten` for respondents whose total is below 10e6 (the same value as
// the 1e7 bound used in later cells).
// NOTE(review): maxPoints presumably caps the number of rows sent to the chart — confirm
// against the widget documentation.
CustomPlotlyChart(computedTrMatrixDS.where($"totalInten" < 10e6).toDF,
                  layout="""{title: 'Cumulative intensity value distribution', 
                             xaxis: {title: 'Cumulative intensity value per week'},
                             bargap: 0.02}""",
                  dataOptions="{type: 'histogram', opacity: 0.7}",
                  dataSources="{x: 'totalInten'}",
                  maxPoints=8000)

res55: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.DataFrame] = <CustomPlotlyChart widget>


In [ ]:
// Pick the matrix of one respondent whose totalInten lies strictly between 1e5 and 1e7,
// using a seeded (123) 10% sample. `head` throws if the sample turns out empty.
val trMatrix = {
  val totalInten = col("totalInten")
  val pickedRows = computedTrMatrixDS
    .where(totalInten < 1e7 && totalInten > 1e5)
    .sample(withReplacement = false, fraction = 0.1, seed = 123)
    .limit(1)
    .collect()
  pickedRows.head.W
}

trMatrix: Array[Array[Double]] = Array(Array(0.6031746031746031, 0.0314484126984127, 0.019543650793650794, 0.010416666666666666, 0.002777777777777778, 0.0016865079365079366, 7.936507936507937E-4, 5.952380952380953E-4, 0.0, 0.0), Array(0.02976190476190476, 0.023015873015873017, 0.018154761904761906, 0.00744047619047619, 0.0025793650793650793, 0.0017857142857142857, 3.968253968253968E-4, 9.92063492063492E-5, 0.0, 0.0), Array(0.01974206349206349, 0.018353174603174604, 0.025396825396825397, 0.013591269841269842, 0.006746031746031746, 0.004662698412698413, 9.92063492063492E-4, 2.9761904761904765E-4, 0.0, 0.0), Array(0.008928571428571428, 0.006448412698412698, 0.016666666666666666, 0.01765873015873016, 0.009623015873015874, 0.006944444444444444, 0.0017857142857142857, 7.936507936507937E-4, 9....

In [ ]:
// One row per matrix row: the row's values (`transitions`) and its index
// (`toActivityLevel`). The index is duplicated as `fromActivityLevel`, which the heatmap
// uses as its x axis.
val trMatrixPlotData = {
  val indexedRows = trMatrix.zipWithIndex.toSeq
  indexedRows
    .toDF("transitions", "toActivityLevel")
    .withColumn("fromActivityLevel", col("toActivityLevel"))
}

trMatrixPlotData: org.apache.spark.sql.DataFrame = [transitions: array<double>, toActivityLevel: int ... 1 more field]


In [ ]:
// Heatmap of the sampled respondent's transition matrix: x = fromActivityLevel,
// y = toActivityLevel, z = the `transitions` arrays. The color scale is clamped to
// [0, 0.10], so larger values share the top color (labelled '>0.1' on the color bar).
CustomPlotlyChart(trMatrixPlotData,
                  layout="""{title: 'Physical activity Transition matrix',
                             xaxis: {title: 'from physical activity level'}, 
                             yaxis: {title: 'to physical activity level'},
                             width: 600, height: 600}""",
                  dataOptions="""{type: 'heatmap', 
                                  colorscale: 'Viridis',
                                  reversescale: false,
                                  colorbar: {
                                    title: 'Probability',
                                    tickmode: 'array',
                                    tickvals: [0, 0.02, 0.04, 0.06, 0.08, 0.1],
                                    ticktext: ['0', '0.02', '0.04', '0.06', '0.08', '>0.1']
                                  },
                                  zmin: 0.0, zmax: 0.10}""",
                  dataSources="{x: 'fromActivityLevel', y: 'toActivityLevel', z: 'transitions'}")

res19: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.DataFrame] = <CustomPlotlyChart widget>


## PCA (unsupervised learning)

In [ ]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.PCA

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.PCA


In [ ]:
// Feature table for PCA: each respondent's matrix flattened into a dense vector, joined
// with the demographics on SEQN and restricted to RIDAGEYR >= 35. Only respondents with
// totalInten strictly between 1e5 and 1e7 are kept.
val flattenTrMatrixDF = {
  val totalInten = col("totalInten")
  val featuresBySeqn = computedTrMatrixDS
    .where(totalInten < 1e7 && totalInten > 1e5)
    .rdd
    .map(m => (m.seqn, Vectors.dense(m.W.flatten)))
    .toDF("SEQN", "features")
  featuresBySeqn
    .join(DemoDF, "SEQN")
    .where(col("RIDAGEYR") >= 35)
}

flattenTrMatrixDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SEQN: int, features: vector ... 1 more field]


In [ ]:
// Fit a 3-component PCA on the flattened transition matrices.
val pca = {
  val estimator = new PCA()
    .setK(3)
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
  estimator.fit(flattenTrMatrixDF)
}

// Project every respondent onto the principal components, keeping id and age alongside.
val withLocomotorPCA = pca
  .transform(flattenTrMatrixDF)
  .select(col("SEQN"), col("pcaFeatures"), col("RIDAGEYR"))

pca: org.apache.spark.ml.feature.PCAModel = pca_a66288ee5aa8
withLocomotorPCA: org.apache.spark.sql.DataFrame = [SEQN: int, pcaFeatures: vector ... 1 more field]


In [ ]:
// Show the first three projections without truncating the vector column.
withLocomotorPCA
  .limit(3)
  .show(truncate = false)

+-----+--------------------------------------------------------------+--------+
|SEQN |pcaFeatures                                                   |RIDAGEYR|
+-----+--------------------------------------------------------------+--------+
|37025|[-0.7409268884525191,-0.10169521657770558,0.07318224422409748]|83      |
|33073|[-0.7456233830269862,-0.06963077833015394,0.07415099416883301]|41      |
|38393|[-0.6148335022192626,-0.08044929219570623,0.06454239938324055]|51      |
+-----+--------------------------------------------------------------+--------+



In [ ]:
// UDF returning the element at position `idx` of an ML vector.
def getItemUDF = udf[Double, Vector, Int]((vec, idx) => vec(idx))

getItemUDF: org.apache.spark.sql.expressions.UserDefinedFunction


In [ ]:
// Unpack the three principal components into scalar columns PC1..PC3 next to the
// respondent id and age (RIDAGEYR renamed to `age`).
val locomotorPCvsAge = {
  val idAndAge = Seq(col("SEQN"), col("RIDAGEYR").as("age"))
  val components = (0 until 3).map { i =>
    getItemUDF(col("pcaFeatures"), lit(i)).as(s"PC${i + 1}")
  }
  val projection = idAndAge ++ components
  withLocomotorPCA.select(projection: _*)
}

locomotorPCvsAge: org.apache.spark.sql.DataFrame = [SEQN: int, age: int ... 3 more fields]


In [ ]:
// Per-age mean and standard deviation of each principal component
// (columns meanPC1, stdPC1, meanPC2, stdPC2, meanPC3, stdPC3).
val matrixPCvsAge = {
  val stats = Seq("PC1", "PC2", "PC3").flatMap { pc =>
    Seq(mean(col(pc)).as(s"mean$pc"), stddev(col(pc)).as(s"std$pc"))
  }
  locomotorPCvsAge
    .groupBy(col("age"))
    .agg(stats.head, stats.tail: _*)
}

matrixPCvsAge: org.apache.spark.sql.DataFrame = [age: int, meanPC1: double ... 5 more fields]


In [ ]:
// Mean of PC3 per age, ordered by age, with the per-age standard deviation (stdPC3) as
// error bars. Only the third component is plotted here.
// NOTE(review): the `age >= 15` filter looks redundant, since the upstream feature table
// is already restricted to RIDAGEYR >= 35 — confirm.
CustomPlotlyChart(matrixPCvsAge.where($"age" >= 15).orderBy($"age"),
                  layout="""{title: 'PC projections vs Age', 
                           xaxis: {title: 'Chronological age'},
                           yaxis: {title: 'PC projection'},
                           showlegend: false}""",
                  dataOptions="""{
                    type: 'scatter',
                    line: {width: 2},
                    error_y: {type: 'data', visible: true, thickness: 0.5, width: 0}
                  }""",
                  dataSources="""{
                    x: 'age',
                    y: 'meanPC3',
                    error_y: {array: 'stdPC3'}
                  }""")

res34: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>


In [ ]:
// Scatter plot of PC1 (x) against PC2 (y), one marker per respondent aged 35 or older,
// with markers colored by age on a reversed 'Jet' color scale.
CustomPlotlyChart(
  locomotorPCvsAge.where($"age" >= 35), 
  layout="""{
        title: 'PCA of the Locomotor Transition Matrix',
        height: 900,
        xaxis: {title: 'PC1'},
        yaxis: {title: 'PC2'},
        hovermode: 'closest'
    }""",
  dataOptions="""{
    mode: 'markers',
    type: 'scatter',
    marker: {
        sizemode: 'area',
        size: 12,
        opacity: 0.75,
        colorscale: 'Jet',
        reversescale: true,
        colorbar: {
          title: 'Age',
          thickness: 8.0
        }
    }}""",
  dataSources="""{
    x: 'PC1',
    y: 'PC2',
    marker: {color: 'age'}}""",
  maxPoints=3000)

res30: notebook.front.widgets.charts.CustomPlotlyChart[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = <CustomPlotlyChart widget>
