## Dependencies

This notebook assumes you have a running Accumulo cluster and a shaded JAR file containing the Accumulo dependencies.
- To set up a simple Accumulo cluster: https://github.com/apache/fluo-uno
- To build a shaded JAR with the Accumulo dependencies: https://github.com/apache/accumulo-examples/tree/master/spark

Adding the shaded JAR after the notebook is running does not work; it must be added before starting Jupyter:

```
SPARK_OPTS="--jars ~/repos/accumulo-examples/spark/target/accumulo-spark-shaded.jar" jupyter notebook
```

In [12]:
// Unlike the shaded Accumulo JAR, the MMLSpark JAR can be added dynamically after the kernel has started.
// NOTE(review): `~` is normally not expanded inside a file:// URL — confirm this path resolves, or use an absolute path.
%AddJar file:///~/repos/mmlspark/target/scala-2.11/mmlspark_2.11-0.17+108-eef7302b+20190818-2038.jar

Using cached version of mmlspark_2.11-0.17+108-eef7302b+20190818-2038.jar


In [2]:
import scala.collection.JavaConverters._
import scala.collection.JavaConversions._

import org.apache.accumulo.core.client.{Accumulo, AccumuloClient, BatchWriter}
import org.apache.accumulo.core.data.{Key, Mutation, Value}
import org.apache.accumulo.core.security.Authorizations

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType, StructType, StructField}

In [3]:
// Path to the accumulo-client.properties file used to build the Accumulo client.
// NOTE(review): machine-specific absolute path — adjust to your own installation.
def accumuloClientProperties: String =
  "/home/scott/repos/fluo-uno/install/accumulo-2.0.0/conf/accumulo-client.properties"

accumuloClientProperties: String


## Setup Accumulo Tables

In [4]:
// Accumulo table that holds the example input rows.
val inputTable: String = "spark_example_input"
// HDFS directory reserved for this example; it is wiped by the cleanup cell.
val rootPath: Path = new Path("/spark_example/")

inputTable = spark_example_input
rootPath = /spark_example


/spark_example

In [5]:
// Load the client properties from disk, then build the Accumulo client from them.
val props = Accumulo
  .newClientProperties()
  .from(accumuloClientProperties)
  .build()
val client = Accumulo
  .newClient()
  .from(props)
  .build()

// Filesystem handle obtained from a default Hadoop Configuration.
val hdfs = FileSystem.get(new Configuration())

props = {auth.type=password, auth.principal=root, instance.zookeepers=localhost:2181, instance.name=uno, auth.token=secret}
client = org.apache.accumulo.core.clientImpl.ClientContext@7500f36f
hdfs = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1688114584_96, ugi=scott (auth:SIMPLE)]]


DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1688114584_96, ugi=scott (auth:SIMPLE)]]

In [10]:
// Start from a clean slate: recursively drop the example's HDFS directory if it exists...
if (hdfs.exists(rootPath))
  hdfs.delete(rootPath, true) // second argument = recursive

// ...and drop the Accumulo input table if a previous run left it behind.
if (client.tableOperations().exists(inputTable))
  client.tableOperations().delete(inputTable)

In [11]:
// (Re)create the empty input table; the cleanup cell must have removed any previous copy.
client.tableOperations().create(inputTable)

In [12]:
// Write 100 rows to the input table. Row ids are the zero-padded indices "000".."099";
// each row stores its index (as a string) under column family "cf1", qualifier "cq1".
val batchWriter = client.createBatchWriter(inputTable)
try {
  (0 until 100).foreach { i =>
    val mutation = new Mutation(f"$i%03d")
    mutation.at().family("cf1").qualifier("cq1").put(i.toString)
    batchWriter.addMutation(mutation)
  }
} finally {
  // Always release the writer, even when adding a mutation throws.
  batchWriter.close()
}

batchWriter = org.apache.accumulo.core.clientImpl.BatchWriterImpl@76640258


org.apache.accumulo.core.clientImpl.BatchWriterImpl@76640258

In [9]:
// Open a scanner over the input table with empty authorizations,
// restricted to the "cf1" column family written above.
val scanner = client.createScanner(inputTable, Authorizations.EMPTY)
scanner.fetchColumnFamily("cf1")

scanner = org.apache.accumulo.core.clientImpl.ScannerImpl@3689a34f


org.apache.accumulo.core.clientImpl.ScannerImpl@3689a34f

In [10]:
// Print the first 10 entries. The Java iterator is converted explicitly (JavaConverters)
// instead of relying on the deprecated implicit JavaConversions, and take(10) stops
// reading after ten entries rather than walking the whole table.
scanner.iterator().asScala.take(10).foreach(println)

000 cf1:cq1 [] 1566146448794 false=0
001 cf1:cq1 [] 1566146448794 false=1
002 cf1:cq1 [] 1566146448794 false=2
003 cf1:cq1 [] 1566146448794 false=3
004 cf1:cq1 [] 1566146448794 false=4
005 cf1:cq1 [] 1566146448794 false=5
006 cf1:cq1 [] 1566146448794 false=6
007 cf1:cq1 [] 1566146448794 false=7
008 cf1:cq1 [] 1566146448794 false=8
009 cf1:cq1 [] 1566146448794 false=9


## Spark - Accumulo Reader

In [6]:
// Build the Spark configuration for this example and obtain a session that uses it.
// NOTE(review): getOrCreate() may hand back a session the kernel already started — confirm
// that the serializer setting below actually takes effect in that case.

val conf = new SparkConf()
conf.setAppName("Test")
// KryoSerializer is needed for serializing Accumulo Key when partitioning data for bulk import
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// conf.set("spark.sql.legacy.replaceDatabricksSparkAvro.enabled", "false")
// java.util.Properties is fully qualified because no import in this notebook brings
// `Properties` into scope.
conf.registerKryoClasses(Array(classOf[Key], classOf[Value], classOf[java.util.Properties]))

val sparkSession = SparkSession.builder().config(conf).getOrCreate()

conf = org.apache.spark.SparkConf@3a37feb6
sparkSession = org.apache.spark.sql.SparkSession@796ecf79


org.apache.spark.sql.SparkSession@796ecf79

In [7]:
// Add the table name to the client properties, then expose them as a Scala map
// so they can be passed to the DataFrame reader as options.
props.setProperty("table", inputTable)
val properties = props.asScala

properties = Map(auth.type -> password, auth.principal -> root, table -> spark_example_input, instance.zookeepers -> localhost:2181, instance.name -> uno, auth.token -> secret)


Map(auth.type -> password, auth.principal -> root, table -> spark_example_input, instance.zookeepers -> localhost:2181, instance.name -> uno, auth.token -> secret)

In [8]:
/**
 * Creates a StructField annotated with an Accumulo column.
 *
 * The column family and qualifier are stored in the field metadata under the
 * keys "cf" and "cq" (presumably read by the Accumulo data source — verify).
 *
 * @param fieldName       name of the DataFrame column
 * @param columnFamily    value stored under metadata key "cf"
 * @param columnQualifier value stored under metadata key "cq"
 * @param dataType        Spark SQL type of the column
 * @param nullable        whether the column may contain nulls
 * @return the field with the metadata attached
 */
def makeField(fieldName: String, columnFamily: String, columnQualifier: String, dataType: DataType, nullable: Boolean): StructField = {
    val columnMetadata = new MetadataBuilder()
        .putString("cf", columnFamily)
        .putString("cq", columnQualifier)
        .build()

    StructField(fieldName, dataType, nullable, columnMetadata)
}

// Single-column schema: non-nullable string field "f1" tagged with cf1:cq1.
val schema = StructType(Seq(
    makeField("f1", "cf1", "cq1", StringType, nullable = false)
))

// Show the JSON form so the attached metadata is visible.
schema.json

schema = StructType(StructField(f1,StringType,false))


makeField: (fieldName: String, columnFamily: String, columnQualifier: String, dataType: org.apache.spark.sql.types.DataType, nullable: Boolean)org.apache.spark.sql.types.StructField


{"type":"struct","fields":[{"name":"f1","type":"string","nullable":false,"metadata":{"cf":"cf1","cq":"cq1"}}]}

In [9]:
// Read the Accumulo table as a DataFrame through the MMLSpark Accumulo data source,
// using the connection properties and schema defined in the previous cells.
// NOTE(review): the recorded run of this cell failed with a NoSuchMethodError on
// org.apache.avro.Schema$Field.<init> — looks like an Avro version conflict between
// the JARs on the classpath; confirm before relying on this cell.
val df = sparkSession.read
    .format("com.microsoft.ml.spark.accumulo")
    .schema(schema)
    .options(properties)
    .load()

// Preview the first 10 rows.
df.show(10)

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoSuchMethodError: org.apache.avro.Schema$Field.<init>(Ljava/lang/String;Lorg/apache/avro/Schema;Ljava/lang/String;Ljava/lang/Object;)V
	at com.microsoft.ml.spark.accumulo.AccumuloInputPartitionReader$$anonfun$2.apply(AccumuloInputPartitionReader.scala:50)
	at com.microsoft.ml.spark.accumulo.AccumuloInputPartitionReader$$anonfun$2.apply(AccumuloInputPartitionReader.scala:49)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala

In [10]:
// Shut the Spark session down; parentheses mark the call as side-effecting.
sparkSession.stop()

lastException: Throwable = null
