Skip to content
This repository has been archived by the owner on Mar 24, 2021. It is now read-only.

Commit

Permalink
introduce requestTimeout and concurrentSave for large volume of data …
Browse files Browse the repository at this point in the history
…save
  • Loading branch information
Yang Lei committed Sep 3, 2015
1 parent 51678c4 commit 9095e23
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 32 deletions.
4 changes: 3 additions & 1 deletion README.md
Expand Up @@ -35,7 +35,7 @@ Spark Version | Release # | Binary Location
1.3.0 | v0.1 | [Location](https://github.com/cloudant/spark-cloudant/releases/download/v0.1/cloudant-spark.jar)
1.3.1 | v1.3.1.2 | [Location](https://github.com/cloudant/spark-cloudant/releases/download/v1.3.1.2/cloudant-spark.jar)
1.4.0 | v1.4.0.0 | [Location](https://github.com/cloudant/spark-cloudant/releases/download/1.4.0.0/cloudant-spark.jar)
1.4.1 | v1.4.1.1 | [Location](https://github.com/cloudant/spark-cloudant/releases/download/v1.4.1.1/cloudant-spark.jar)
1.4.1 | v1.4.1.2 | [Location](https://github.com/cloudant/spark-cloudant/releases/download/v1.4.1.2/cloudant-spark.jar)


### Build from source:
Expand Down Expand Up @@ -187,6 +187,8 @@ riak.port|| riak port
jsonstore.rdd.partitions|5|the number of partitions intended to be used to drive JsonStoreRDD loading of query results in parallel. The actual number is calculated based on the total rows returned, while satisfying maxInPartition and minInPartition
jsonstore.rdd.maxInPartition|-1|the max rows in a partition. -1 means unlimited
jsonstore.rdd.minInPartition|10|the min rows in a partition.
jsonstore.rdd.requestTimeout|100000| the request timeout in milliseconds
jsonstore.rdd.concurrentSave|-1| the number of documents to save concurrently in one batch. -1 means unlimited (save all rows in a single batch)


Default values are defined in [here](cloudant-spark-sql/src/main/resources/application.conf)
Expand Down
4 changes: 3 additions & 1 deletion cloudant-spark-sql/src/main/resources/application.conf
Expand Up @@ -3,13 +3,15 @@ akka {
}

spray.can.server {
request-timeout = 30s
request-timeout = 100s
}

spark-sql {
jsonstore.rdd = {
partitions = 5
maxInPartition = -1
minInPartition = 10
requestTimeout = 100000
concurrentSave = -1
}
}
Expand Up @@ -29,7 +29,7 @@ import play.api.libs.json.JsNumber
/**
* @author yanglei
*/
@serializable case class CloudantConfig(val host: String, val dbName: String, val indexName: String = null)(implicit val username: String, val password: String, val partitions:Int, val maxInPartition: Int, val minInPartition:Int) extends JsonStoreConfig{
@serializable case class CloudantConfig(val host: String, val dbName: String, val indexName: String = null)(implicit val username: String, val password: String, val partitions:Int, val maxInPartition: Int, val minInPartition:Int, val requestTimeout:Long,val concurrentSave:Int) extends JsonStoreConfig{

private lazy val dbUrl = {"http://"+ host+"/"+dbName}

Expand Down
Expand Up @@ -37,6 +37,8 @@ trait JsonStoreConfig {
implicit val partitions: Int
implicit val maxInPartition: Int
implicit val minInPartition: Int
implicit val requestTimeout: Long
implicit val concurrentSave: Int
def allowPartition(): Boolean = {true}
def getOneUrl(): String
def getRangeUrl(field: String, start: Any, startInclusive:Boolean=false, end:Any, endInclusive:Boolean=false, includeDoc: Boolean = true): (String, Boolean)
Expand Down
Expand Up @@ -39,35 +39,40 @@ import com.cloudant.spark.CloudantConfig
val PARTITION_CONFIG = "jsonstore.rdd.partitions"
val MAX_IN_PARTITION_CONFIG = "jsonstore.rdd.maxInPartition"
val MIN_IN_PARTITION_CONFIG = "jsonstore.rdd.minInPartition"
val REQUEST_TIMEOUT_CONFIG = "jsonstore.rdd.requestTimeout"
val CONCURRENT_SAVE_CONFIG = "jsonstore.rdd.concurrentSave"

val configFactory = ConfigFactory.load()
import java.util.concurrent.TimeUnit._

val timeoutInMillis = Duration(configFactory.getDuration("spray.can.server.request-timeout", SECONDS),SECONDS).toMillis

val ROOT_CONFIG_NAME = "spark-sql"
val rootConfig = configFactory.getConfig(ROOT_CONFIG_NAME)
val defaultPartitions = rootConfig.getInt(PARTITION_CONFIG)
val defaultMaxInPartition = rootConfig.getInt(MAX_IN_PARTITION_CONFIG)
val defaultMinInPartition = rootConfig.getInt(MIN_IN_PARTITION_CONFIG)
val defaultRequestTimeout = rootConfig.getLong(REQUEST_TIMEOUT_CONFIG)
val defaultConcurrentSave = rootConfig.getInt(CONCURRENT_SAVE_CONFIG)

def getConfig(context: SQLContext, dbName: String, indexName:String = null): JsonStoreConfig = {
val sparkConf = context.sparkContext.getConf
implicit val total = sparkConf.getInt(PARTITION_CONFIG,defaultPartitions)
implicit val max = sparkConf.getInt(MAX_IN_PARTITION_CONFIG,defaultMaxInPartition)
implicit val min =sparkConf.getInt(MIN_IN_PARTITION_CONFIG,defaultMinInPartition)
implicit val requestTimeout =sparkConf.getLong(REQUEST_TIMEOUT_CONFIG,defaultRequestTimeout)
implicit val concurrentSave =sparkConf.getInt(CONCURRENT_SAVE_CONFIG,defaultConcurrentSave)

if (sparkConf.contains(CLOUDANT_HOST_CONFIG))
{
val host = sparkConf.get(CLOUDANT_HOST_CONFIG)
val user = sparkConf.get(CLOUDANT_USERNAME_CONFIG)
val passwd = sparkConf.get(CLOUDANT_PASSWORD_CONFIG)
return CloudantConfig(host, dbName, indexName)(user, passwd, total, max, min)
return CloudantConfig(host, dbName, indexName)(user, passwd, total, max, min,requestTimeout,concurrentSave)
}
if (sparkConf.contains(RIAK_HOST_CONFIG))
{
val host = sparkConf.get(RIAK_HOST_CONFIG)
val port = sparkConf.get(RIAK_PORT_CONFIG)
return RiakConfig(host, port, dbName)(partitions=total, maxInPartition=max, minInPartition=min)
return RiakConfig(host, port, dbName)(partitions=total, maxInPartition=max, minInPartition=min,requestTimeout=requestTimeout,concurrentSave=concurrentSave)
}
null
}
Expand All @@ -87,6 +92,9 @@ import com.cloudant.spark.CloudantConfig
val minS = parameters.getOrElse(MIN_IN_PARTITION_CONFIG,null)
implicit val min = if (minS ==null) sparkConf.getInt(MIN_IN_PARTITION_CONFIG,defaultMinInPartition) else minS.toInt

implicit val requestTimeout =sparkConf.getLong(REQUEST_TIMEOUT_CONFIG,defaultRequestTimeout)
implicit val concurrentSave =sparkConf.getInt(CONCURRENT_SAVE_CONFIG,defaultConcurrentSave)

val dbName = parameters.getOrElse("database", parameters.getOrElse("path",null))
val indexName = parameters.getOrElse("index",null)

Expand All @@ -98,13 +106,13 @@ import com.cloudant.spark.CloudantConfig
val host = parameters.getOrElse(CLOUDANT_HOST_CONFIG,sparkConf.get(CLOUDANT_HOST_CONFIG))
val user = parameters.getOrElse(CLOUDANT_USERNAME_CONFIG,sparkConf.get(CLOUDANT_USERNAME_CONFIG))
val passwd = parameters.getOrElse(CLOUDANT_PASSWORD_CONFIG,sparkConf.get(CLOUDANT_PASSWORD_CONFIG))
return CloudantConfig(host, dbName, indexName)(user, passwd, total, max, min)
return CloudantConfig(host, dbName, indexName)(user, passwd, total, max, min,requestTimeout,concurrentSave)
}
if (sparkConf.contains(RIAK_HOST_CONFIG) || parameters.contains(RIAK_HOST_CONFIG))
{
val host = parameters.getOrElse(RIAK_HOST_CONFIG,sparkConf.get(RIAK_HOST_CONFIG))
val port = parameters.getOrElse(RIAK_PORT_CONFIG,sparkConf.get(RIAK_PORT_CONFIG))
return RiakConfig(host, port, dbName)(partitions=total, maxInPartition=max, minInPartition=min)
return RiakConfig(host, port, dbName)(partitions=total, maxInPartition=max, minInPartition=min,requestTimeout=requestTimeout,concurrentSave=concurrentSave)
}
null
}
Expand Down
Expand Up @@ -45,7 +45,8 @@ import scala.util.Random
*/
class JsonStoreDataAccess (config: JsonStoreConfig) {

implicit lazy val timeout = {Timeout(JsonStoreConfigManager.timeoutInMillis)}
implicit lazy val timeout = {Timeout(config.requestTimeout)}
lazy val concurrentSave = config.concurrentSave
lazy val envSystem = {SparkEnv.get.actorSystem}

lazy val logger = {Logging(envSystem, getClass)}
Expand Down Expand Up @@ -168,30 +169,42 @@ class JsonStoreDataAccess (config: JsonStoreConfig) {
implicit val stringMarshaller = Marshaller.of[String](`application/json`) {
(value, ct, ctx) => ctx.marshalTo(HttpEntity(ct, value))
}
val allFutures = data.map { x =>
var pipeline: HttpRequest => Future[HttpResponse] = null
if (validCredentials!=null)
{
pipeline = (
addCredentials(validCredentials)
~> sendReceive
)
}else
val parallelSize = if (concurrentSave>0) concurrentSave else data.size
val blocks = data.size/parallelSize + (if ( data.size % parallelSize != 0) 1 else 0)

for (i <- 0 until blocks){
val start = parallelSize*i
val end = if (parallelSize+start<data.size) parallelSize+start else data.size
logger.info(s"Save from $start to $end for block size $blocks at $i/$blocks")
val allFutures = {
for ( j <- start until end) yield
{
val x = data(j)
var pipeline: HttpRequest => Future[HttpResponse] = null
if (validCredentials!=null)
{
pipeline = (
addCredentials(validCredentials)
~> sendReceive
)
}else
{
pipeline = sendReceive
}
val request = Post(url,x)
val response: Future[HttpResponse] = pipeline(request)
response
}
}
val f= Future.sequence(allFutures.toList)
val result = Await.result(f, timeout.duration)
if(!existing)
{
pipeline = sendReceive
logger.info("shutdown newly created ActorSystem")
system.shutdown()
}
val request = Post(url,x)
val response: Future[HttpResponse] = pipeline(request)
response
}
val f= Future.sequence(allFutures.toList)
val result = Await.result(f, timeout.duration)
if(!existing)
{
logger.info("shutdown newly created ActorSystem")
system.shutdown()
logger.info("Save result "+result.length +" rows is full:"+((end-start)==result.length))
}
logger.info("Save result "+result.length +"rows is full:"+(data.length==result.length))
}
}

Expand Up @@ -34,7 +34,7 @@ import play.api.libs.json.JsNumber
*/


@serializable case class RiakConfig(val host: String, val port: String, val dbName: String)(implicit val username: String=null, val password: String=null,val partitions:Int, val maxInPartition: Int, val minInPartition:Int) extends JsonStoreConfig{
@serializable case class RiakConfig(val host: String, val port: String, val dbName: String)(implicit val username: String=null, val password: String=null,val partitions:Int, val maxInPartition: Int, val minInPartition:Int,val requestTimeout:Long,val concurrentSave:Int) extends JsonStoreConfig{

private lazy val dbUrl = {"http://"+ host+":"+port+"/search/query/"+dbName+"?wt=json&q="}

Expand Down

0 comments on commit 9095e23

Please sign in to comment.