In [1]:
import scala.collection.mutable.ListBuffer

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.rdd.RDD

import org.ucsd.dse.capstone.traffic.Fields
import org.ucsd.dse.capstone.traffic.MLibUtils
import org.ucsd.dse.capstone.traffic.PivotHandler
import org.ucsd.dse.capstone.traffic.StandardPivotHandler

In [2]:
/**
 * Runs the PCA pipeline for one pivot field and hands the results to the MLibUtils writers.
 *
 * Steps, in order:
 *  1. pivot `m_string_rdd` into an `RDD[Vector]` using a [[StandardPivotHandler]] for `pivot_field`;
 *  2. write the mean vector taken from `MLibUtils.summary_stats`;
 *  3. run `MLibUtils.execute_pca` and write the returned eigenvectors and eigenvalues;
 *  4. write a fixed-seed sample (without replacement) of the pivoted vectors;
 *  5. print the eigenvectors, the eigenvalues and the running sum of the eigenvalues.
 *
 * Every output name is built as `file_dir_prefix + <kind> + "." + fid + ".csv"`, where `<kind>`
 * is one of `mean_vector`, `eigenvectors`, `eigenvalues`, `samples`.
 *
 * @param sc              Spark context, passed to the pivot handler
 * @param m_string_rdd    raw input rows
 * @param fid             identifier embedded in every output file name
 * @param file_dir_prefix prefix prepended verbatim to every output file name
 * @param pivot_field     field selector given to [[StandardPivotHandler]]
 * @param k               value passed as the second argument of `MLibUtils.execute_pca`
 *                        (default 30, the previously hard-coded value)
 * @param sample_size     number of vectors drawn by `takeSample` (default 10)
 * @param sample_seed     seed given to `takeSample` so the sample is repeatable (default 47)
 */
def do_run(sc: SparkContext, m_string_rdd: RDD[String], fid: String, file_dir_prefix: String, pivot_field: Int,
           k: Int = 30, sample_size: Int = 10, sample_seed: Long = 47L): Unit = {
    // All four outputs share the same naming scheme; build it in one place.
    def output_filename(kind: String): String = file_dir_prefix + kind + "." + fid + ".csv"

    val handler: PivotHandler = new StandardPivotHandler(sc, pivot_field)
    val m_vector_rdd: RDD[Vector] = MLibUtils.pivot(m_string_rdd, handler)
    //
    // obtain mean vector
    //
    val m_summary_stats: MultivariateStatisticalSummary = MLibUtils.summary_stats(m_vector_rdd)
    val mean_vector: Array[Double] = m_summary_stats.mean.toArray
    MLibUtils.write_vectors(output_filename("mean_vector"), List[Vector](Vectors.dense(mean_vector)))
    //
    // execute PCA
    //
    val (eigenvectors, eigenvalues) = MLibUtils.execute_pca(m_vector_rdd, k)
    //
    // eigenvectors written out as column-major matrix
    //
    MLibUtils.write_matrix(output_filename("eigenvectors"), eigenvectors)
    //
    // eigenvalues written out as one row
    //
    MLibUtils.write_vectors(output_filename("eigenvalues"), List[Vector](eigenvalues))
    //
    // take a repeatable sample (no replacement) of sample_size vectors
    //
    val sample_arr: Array[Vector] = m_vector_rdd.takeSample(false, sample_size, sample_seed)
    MLibUtils.write_vectors(output_filename("samples"), sample_arr)
    //
    // print statements to verify
    //
    println("eigenvectors= " + eigenvectors)
    println("eigenvalues= " + eigenvalues)
    // Running sum of the eigenvalues in the order execute_pca returned them. scanLeft emits the
    // seed 0.0 first, so drop it with tail; the result is wrapped in a ListBuffer so the printed
    // representation is unchanged.
    // NOTE(review): these sums are not divided by any total, so despite the label below they are
    // only percentages if execute_pca already returns normalized values -- confirm.
    val cumulative_sums: ListBuffer[Double] =
        ListBuffer(eigenvalues.toArray.scanLeft(0.0)(_ + _).tail: _*)
    println("perc variance explained= " + cumulative_sums)
}

In [4]:
// Input paths handed to MLibUtils.new_rdd.
val files: List[String] = List(
  "/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010",
  "/home/dyerke/Documents/DSE/capstone_project/traffic/data/01_2010_first_seven_days")
// NOTE(review): the trailing 4 is presumably a partition count -- confirm against MLibUtils.new_rdd.
val m_string_rdd: RDD[String] = MLibUtils.new_rdd(sc, files, 4)

//
// One (output prefix, pivot field) pair per PCA run
//
val m_fields_pca: List[(String, Int)] = List(
  ("/tmp/total_flow.", Fields.TotalFlow),
  ("/tmp/occupancy.", Fields.Occupancy),
  ("/tmp/speed.", Fields.Speed))
val fid = "01_2010" // hardcode id for now

//
// Execute PCA for each field, in list order
//
for ((file_dir_prefix, pivot_field) <- m_fields_pca) {
  do_run(sc, m_string_rdd, fid, file_dir_prefix, pivot_field)
}

eigenvectors= -0.009336523289587895   -0.025622414638707736   ... (30 total)
-0.009297540500599277   -0.02599018919669005    ...
-0.00902688036426008    -0.026546568639204404   ...
-0.008278388799454351   -0.02624671522129898    ...
-0.0072284032592879344  -0.024999445213452917   ...
-0.006386781947056375   -0.023760399946579944   ...
-0.005889673518323518   -0.022961005844406968   ...
-0.005806747751176736   -0.022523291377820515   ...
-0.0057911321193332005  -0.022489334456774875   ...
-0.005425139484011222   -0.021885599060931783   ...
-0.004797250144877089   -0.021000812259084862   ...
-0.004219050048464443   -0.020022375362309837   ...
-0.003947223318689387   -0.01946265384348214    ...
-0.0037661308159812543  -0.01905756690203121    ...
-0.003621856626874191   -0.018789897809615874   ...
-0.0035102869357698362  -0.017885526247962087   ...
-0.003320897245021156   -0.017176742425238263   ...
-0.003130835486593521   -0.016512298622650155   ...
-0.0032681070983289224  -0.016045347331