Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPARKC-554: Add support for custom rate limiters #1202

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ OSS Cassandra this should never be used.</td>
<td>true</td>
<td>Sets whether to record connector specific metrics on write</td>
</tr>
<tr>
<td><code>input.ratelimiterprovider</code></td>
<td>main.scala.com.datastax.spark.connector.writer.LeakyBucketRateLimiterProvider</td>
<td>Determines which rate limiter provider to use in reads</td>
</tr>
<tr>
<td><code>input.reads_per_sec</code></td>
<td>2147483647</td>
Expand Down Expand Up @@ -314,6 +319,11 @@ finer control see the CassandraOption class</td>
<td>true</td>
<td>Sets whether to record connector specific metrics on write</td>
</tr>
<tr>
<td><code>output.ratelimiterprovider</code></td>
<td>main.scala.com.datastax.spark.connector.writer.LeakyBucketRateLimiterProvider</td>
<td>Determines which rate limiter provider to use in writes</td>
</tr>
<tr>
<td><code>output.throughput_mb_per_sec</code></td>
<td>2.147483647E9</td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ public WriterBuilder withBatchSize(BatchSize batchSize) {
new WriteConf(batchSize, writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(),
writeConf.parallelismLevel(), writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(),
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -197,7 +197,7 @@ public WriterBuilder withBatchGroupingBufferSize(int batchGroupingBufferSize) {
new WriteConf(writeConf.batchSize(), batchGroupingBufferSize, writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(),
writeConf.parallelismLevel(), writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(),
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -216,7 +216,7 @@ public WriterBuilder withBatchGroupingKey(BatchGroupingKey batchGroupingKey) {
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), batchGroupingKey,
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(),
writeConf.parallelismLevel(), writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(),
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -234,7 +234,8 @@ public WriterBuilder withConsistencyLevel(ConsistencyLevel consistencyLevel) {
return withWriteConf(
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
consistencyLevel, writeConf.ifNotExists(), writeConf.ignoreNulls(), writeConf.parallelismLevel(),
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled()));
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled(),
writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -252,7 +253,8 @@ public WriterBuilder withParallelismLevel(int parallelismLevel) {
return withWriteConf(
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(), parallelismLevel,
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled()));
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled(),
writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -271,7 +273,7 @@ public WriterBuilder withThroughputMBPS(int throughputMBPS) {
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(),
writeConf.parallelismLevel(), throughputMBPS, writeConf.ttl(), writeConf.timestamp(),
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -290,7 +292,7 @@ public WriterBuilder withTaskMetricsEnabled(boolean taskMetricsEnabled) {
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), writeConf.ignoreNulls(),
writeConf.parallelismLevel(), writeConf.throughputMiBPS(), writeConf.ttl(),
writeConf.timestamp(), taskMetricsEnabled));
writeConf.timestamp(), taskMetricsEnabled, writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -307,7 +309,7 @@ public WriterBuilder withIfNotExists(boolean ifNotExists) {
return withWriteConf(
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), ifNotExists, writeConf.ignoreNulls(), writeConf.parallelismLevel(),
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled()));
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -324,7 +326,7 @@ public WriterBuilder withIgnoreNulls(boolean ignoreNulls) {
return withWriteConf(
new WriteConf(writeConf.batchSize(), writeConf.batchGroupingBufferSize(), writeConf.batchGroupingKey(),
writeConf.consistencyLevel(), writeConf.ifNotExists(), ignoreNulls, writeConf.parallelismLevel(),
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled()));
writeConf.throughputMiBPS(), writeConf.ttl(), writeConf.timestamp(), writeConf.taskMetricsEnabled(), writeConf.rateLimiterProvider()));
else
return this;
}
Expand All @@ -343,7 +345,8 @@ private WriterBuilder withTimestamp(TimestampOption timestamp) {
writeConf.throughputMiBPS(),
writeConf.ttl(),
timestamp,
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(),
writeConf.rateLimiterProvider()));
}


Expand Down Expand Up @@ -424,7 +427,8 @@ private WriterBuilder withTTL(TTLOption ttl) {
writeConf.throughputMiBPS(),
ttl,
writeConf.timestamp(),
writeConf.taskMetricsEnabled()));
writeConf.taskMetricsEnabled(),
writeConf.rateLimiterProvider()));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class RDDFunctions[T](rdd: RDD[T]) extends WritableToCassandra[T] with Serializa
implicit
connector: CassandraConnector = CassandraConnector(sparkContext),
rwf: RowWriterFactory[T]): Unit = {

val writer = TableWriter(connector, keyspaceName, tableName, columns, writeConf)
rdd.sparkContext.runJob(rdd, writer.write _)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import com.datastax.spark.connector._
import com.datastax.spark.connector.cql._
import com.datastax.spark.connector.rdd.reader._
import com.datastax.spark.connector.writer._
import com.datastax.spark.connector.util.RateLimiterUtil
import com.google.common.util.concurrent.{FutureCallback, Futures, SettableFuture}
import org.apache.spark.rdd.RDD

Expand Down Expand Up @@ -118,10 +119,11 @@ class CassandraJoinRDD[L, R] private[connector](
rowMetadata: CassandraRowMetadata,
leftIterator: Iterator[L]
): Iterator[(L, R)] = {
val rateLimiter = new RateLimiter(
readConf.readsPerSec, readConf.readsPerSec
val rateLimiter = RateLimiterUtil.getRateLimiter(
readConf.rateLimiterProvider,
readConf.readsPerSec,
readConf.readsPerSec
)

val queryExecutor = QueryExecutor(session, readConf.parallelismLevel, None, None)

def pairWithRight(left: L): SettableFuture[Iterator[(L, R)]] = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.datastax.driver.core.{ResultSet, Session}
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql._
import com.datastax.spark.connector.rdd.reader._
import com.datastax.spark.connector.util.RateLimiterUtil
import com.datastax.spark.connector.writer._
import com.google.common.util.concurrent.{FutureCallback, Futures, SettableFuture}
import org.apache.spark.rdd.RDD
Expand Down Expand Up @@ -141,8 +142,10 @@ class CassandraLeftJoinRDD[L, R] private[connector](
rowMetadata: CassandraRowMetadata,
leftIterator: Iterator[L]
): Iterator[(L, Option[R])] = {
val rateLimiter = new RateLimiter(
readConf.readsPerSec, readConf.readsPerSec
val rateLimiter = RateLimiterUtil.getRateLimiter(
readConf.rateLimiterProvider,
readConf.readsPerSec,
readConf.readsPerSec
)

val queryExecutor = QueryExecutor(session, readConf.parallelismLevel,None, None)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.datastax.spark.connector.rdd

import com.datastax.driver.core.HostDistance
import com.datastax.spark.connector.cql._
import com.datastax.spark.connector.rdd.ClusteringOrder.{Ascending, Descending}
import com.datastax.spark.connector.rdd.reader._
Expand All @@ -27,7 +26,6 @@ abstract class CassandraRDD[R : ClassTag](

ConfigCheck.checkConfig(sc.getConf)


protected[connector] def keyspaceName: String

protected[connector] def tableName: String
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package com.datastax.spark.connector.rdd

import com.datastax.driver.core.ConsistencyLevel
import com.datastax.spark.connector.util.{ConfigParameter, ConfigCheck, Logging}
import com.datastax.spark.connector.util.{ConfigCheck, ConfigParameter, Logging}
import main.scala.com.datastax.spark.connector.writer.LeakyBucketRateLimiterProvider
import org.apache.spark.SparkConf

/** Read settings for RDD
Expand All @@ -15,15 +16,18 @@ import org.apache.spark.SparkConf
* @param taskMetricsEnabled whether or not enable task metrics updates (requires Spark 1.2+)
* @param readsPerSec maximum read throughput allowed per single core in requests/s while
* joining an RDD with C* table (joinWithCassandraTable operation)
* also used by enterprise integrations*/
* also used by enterprise integrations
* @param rateLimiterProvider fully qualified class name of the rate limiter provider used to create the rate limiter for reads
*/
case class ReadConf(
splitCount: Option[Int] = None,
splitSizeInMB: Int = ReadConf.SplitSizeInMBParam.default,
fetchSizeInRows: Int = ReadConf.FetchSizeInRowsParam.default,
consistencyLevel: ConsistencyLevel = ReadConf.ConsistencyLevelParam.default,
taskMetricsEnabled: Boolean = ReadConf.TaskMetricParam.default,
parallelismLevel: Int = ReadConf.ParallelismLevelParam.default,
readsPerSec: Int = ReadConf.ReadsPerSecParam.default
readsPerSec: Int = ReadConf.ReadsPerSecParam.default,
rateLimiterProvider: String = ReadConf.RateLimiterProviderParam.default
)


Expand Down Expand Up @@ -93,6 +97,13 @@ object ReadConf extends Logging {
"""Sets max requests per core per second for joinWithCassandraTable and some Enterprise integrations"""
)

val RateLimiterProviderParam = ConfigParameter[String] (
name = "spark.cassandra.input.ratelimiterprovider",
section = ReferenceSection,
default = new LeakyBucketRateLimiterProvider().getClass.getName,
description = """Determines which rate limiter provider to use in reads"""
)

// Whitelist for allowed Read environment variables
val Properties = Set(
SplitCountParam,
Expand All @@ -102,7 +113,8 @@ object ReadConf extends Logging {
SplitSizeInMBParam,
TaskMetricParam,
ThroughputJoinQueryPerSecParam,
ParallelismLevelParam
ParallelismLevelParam,
RateLimiterProviderParam
)

def fromSparkConf(conf: SparkConf): ReadConf = {
Expand Down Expand Up @@ -136,7 +148,8 @@ object ReadConf extends Logging {
readsPerSec = conf.getInt(ReadsPerSecParam.name,
throughtputJoinQueryPerSec.getOrElse(ReadsPerSecParam.default)),
parallelismLevel = conf.getInt(ParallelismLevelParam.name, ParallelismLevelParam.default),
splitCount = conf.getOption(SplitCountParam.name).map(_.toInt)
splitCount = conf.getOption(SplitCountParam.name).map(_.toInt),
rateLimiterProvider = conf.get(RateLimiterProviderParam.name, RateLimiterProviderParam.default)
)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.datastax.spark.connector.util

import com.datastax.spark.connector.writer.{BaseRateLimiter, RateLimiterProvider}

/**
 * Exports a method to retrieve a custom rate limiter based on dynamic configuration.
 */
object RateLimiterUtil extends Logging {

  /**
   * The provider instantiated by the most recent successful call to `getRateLimiter`.
   *
   * Kept only so that existing code reading this field keeps compiling. `getRateLimiter`
   * itself never reads it back: this object is a JVM-wide singleton, and concurrent callers
   * that pass different class names would otherwise be able to pick up each other's provider.
   */
  var provider: RateLimiterProvider = _

  /**
   * Instantiates the rate limiter provider identified by its fully qualified class name
   * (through its no-argument constructor) and asks it to build a rate limiter.
   *
   * There is no fallback: if the provider cannot be loaded, instantiated or cast to
   * [[RateLimiterProvider]], the failure is logged and the original exception is rethrown.
   *
   * @param className fully qualified classname of the rate limiter provider to instantiate
   * @param args      optional sequence of arguments passed on to the provider
   * @return the rate limiter returned by the provider's `getRateLimiterWithConf`
   * @throws ClassNotFoundException if no class named `className` can be found
   * @throws InstantiationException if the class cannot be instantiated
   */
  def getRateLimiter(className: String, args: Any*): BaseRateLimiter = {
    // Hold the new provider in a local value so the limiter is always built by the provider
    // this call created, regardless of what other threads store in `provider`.
    val instantiated: RateLimiterProvider =
      try {
        Class.forName(className).newInstance.asInstanceOf[RateLimiterProvider]
      } catch {
        case e: ClassNotFoundException =>
          logError("Could not find custom rate limiter provider. Error: " + e)
          throw e
        case e: InstantiationException =>
          logError("Could not instantiate custom rate limiter provider. Error: " + e)
          throw e
        // Deliberately broad: the error is only logged and then rethrown unchanged, which
        // also surfaces linkage errors raised while loading a user-supplied class.
        case e: Throwable =>
          logError("Error: " + e)
          throw e
      }

    provider = instantiated
    instantiated.getRateLimiterWithConf(args: _*)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package com.datastax.spark.connector.writer

/**
* Represents a rate limiter: an object that may pause the calling thread as packets are
* reported to it.
*
* `LeakyBucketRateLimiter` in this package is one implementation.
*/
trait BaseRateLimiter {

/**
* Processes a single packet. It is up to the implementing class to determine whether
* or not the calling thread should sleep before this method returns.
*
* @param packetSize the size of the packet currently being processed; this trait does not
* fix the unit, so callers and the chosen implementation must agree on it
*/
def maybeSleep(packetSize: Long): Unit

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ import scala.annotation.tailrec
* @param sleep a function to call to slow down the calling thread;
* must use the same time units as `time`
*/
class RateLimiter(
rate: Long,
bucketSize: Long,
time: () => Long = System.currentTimeMillis,
sleep: Long => Any = Thread.sleep) {
class LeakyBucketRateLimiter(
rate: Long,
bucketSize: Long,
time: () => Long = System.currentTimeMillis,
sleep: Long => Any = Thread.sleep) extends BaseRateLimiter {

require(rate > 0, "A positive rate is required")
require(bucketSize > 0, "A positive bucket size is required")
Expand Down Expand Up @@ -62,5 +62,4 @@ class RateLimiter(
if (delay > 0L)
sleep(delay)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package main.scala.com.datastax.spark.connector.writer

import com.datastax.spark.connector.util.Logging
import com.datastax.spark.connector.writer.{BaseRateLimiter, LeakyBucketRateLimiter, RateLimiterProvider}

/**
 * Instantiates a leaky bucket rate limiter based on the supplied configuration.
 */
class LeakyBucketRateLimiterProvider extends RateLimiterProvider with Logging {

  /**
   * Builds a [[LeakyBucketRateLimiter]] from positional arguments.
   *
   * Expected layout of `args`:
   *  - index 0: the rate, any `Number` (required)
   *  - index 1: the bucket size, any `Number` (required)
   *  - index 2: a `() => Long` time source (optional)
   *  - index 3: a `Long => Any` sleep function (optional, only read when index 2 is present)
   *
   * Arguments past index 3 are ignored. If optional arguments are present but are not
   * functions of the expected arity, an error is logged and the limiter is created from
   * the rate and bucket size alone.
   *
   * @param args positional configuration as described above
   * @return the configured leaky bucket rate limiter
   */
  override def getRateLimiterWithConf(args: Any*): BaseRateLimiter = {
    val rate = args(0).asInstanceOf[Number].longValue
    val bucketSize = args(1).asInstanceOf[Number].longValue

    // The match is the method's result, so the limiter built from the optional arguments is
    // actually returned. Only the function arity can be checked here: type arguments are
    // erased, so a function with the wrong parameter or result type fails when first invoked.
    args.drop(2) match {
      case Seq() =>
        new LeakyBucketRateLimiter(rate, bucketSize)
      case Seq(time: Function0[_]) =>
        new LeakyBucketRateLimiter(rate, bucketSize, time.asInstanceOf[() => Long])
      case Seq(time: Function0[_], sleep: Function1[_, _], _*) =>
        new LeakyBucketRateLimiter(
          rate,
          bucketSize,
          time.asInstanceOf[() => Long],
          sleep.asInstanceOf[Long => Any])
      case _ =>
        logError("Invalid optional arguments when instantiating leaky bucket rate limiter")
        new LeakyBucketRateLimiter(rate, bucketSize)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.datastax.spark.connector.writer

/**
* Represents a provider that creates and returns a rate limiter with possible configuration.
*
* `RateLimiterUtil.getRateLimiter` creates providers reflectively from a fully qualified
* class name using `Class.forName(className).newInstance`, so an implementation needs an
* accessible no-argument constructor.
*/
trait RateLimiterProvider {
/**
* Given a set of arguments, instantiates and returns a rate limiter.
*
* The arguments are untyped and positional; each provider defines which positions it reads
* and which types it expects. `LeakyBucketRateLimiterProvider`, for example, reads a numeric
* rate at index 0 and a numeric bucket size at index 1.
*
* @param args sequence of arguments that can customize the returned rate limiter
* @return the created rate limiter
*/
def getRateLimiterWithConf(args: Any*): BaseRateLimiter
}
Loading