In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, \
    StringType, TimestampType, DecimalType, IntegerType
from parsers import parse_line
import glob

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.addPyFile("parsers.py")

In [2]:
from configreader import ConfigReader
from azure.storage.blob import BlobServiceClient

reader = ConfigReader("config.cfg", "azure-storage")
config = reader.get_config()

# Get Azure storage info from config
storage_acct_name = config["account_name"]
storage_acct_access_key = config["access_key"]
storage_container = config["container_name"]

# Set Spark Azure storage account and key
storage_acct_key_str = f"fs.azure.account.key.{storage_acct_name}.blob.core.windows.net"
spark.conf.set(storage_acct_key_str, storage_acct_access_key)

# Set base Spark filepath for container
container_base_path = f"​wasbs://{storage_container}@{storage_acct_name}.blob.core.windows.net"
mount_base_path 

# Set up container client
blob_service_client = BlobServiceClient(account_url=f"https://{storage_acct_name}.blob.core.windows.net", \
    credential=storage_acct_access_key)
container_client = blob_service_client.get_container_client(storage_container)
ingest_data_dir = "data"

# Set filetype
file_type = "txt"
file_suffix = f".{file_type}"
suffix_len = len(file_suffix)

# Get list of file names
blob_list = container_client.list_blobs(name_starts_with=ingest_data_dir)
cont_filepaths = [ blob.name for blob in blob_list if blob.name[-suffix_len:] == file_suffix ]

spark_filepath_list = [ f"{container_base_path}/{file}" for file in cont_filepaths ]
spark_filepath_str = ",".join(spark_filepath_list)
print(spark_filepath_str)

​wasbs://stock-proj-data@cdgreggstockproj.blob.core.windows.net/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt,​wasbs://stock-proj-data@cdgreggstockproj.blob.core.windows.net/data/csv/2020-08-06/NYSE/part-00000-214fff0a-f408-466c-bb15-095cd8b648dc-c000.txt,​wasbs://stock-proj-data@cdgreggstockproj.blob.core.windows.net/data/json/2020-08-05/NASDAQ/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt,​wasbs://stock-proj-data@cdgreggstockproj.blob.core.windows.net/data/json/2020-08-06/NASDAQ/part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt


In [8]:
raw = spark.sparkContext.textFile(f'{spark_filepath_str}')
parsed = raw.map(lambda line: parse_line(line)) 

IllegalArgumentException: java.net.URISyntaxException: Illegal character in scheme name at index 0: ​wasbs://stock-proj-data@cdgreggstockproj.blob.core.windows.net/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt

In [5]:
# Establish common event schema
schema = StructType([ \
    StructField('trade_dt', DateType(), True), \
    StructField('rec_type', StringType(), True), \
    StructField('symbol', StringType(), True), \
    StructField('exchange', StringType(), True), \
    StructField('event_tm', TimestampType(), True), \
    StructField('event_seq_nb', IntegerType(), True), \
    StructField('arrival_tm', TimestampType(), True), \
    StructField('trade_pr', DecimalType(17,14), True), \
    StructField('bid_pr', DecimalType(17,14), True), \
    StructField('bid_size', IntegerType(), True), \
    StructField('ask_pr', DecimalType(17,14), True), \
    StructField('ask_size', IntegerType(), True), \
    StructField('partition', StringType(), True) \
])      

# Create dataframe with parsed data and schema
df = spark.createDataFrame(parsed, schema)

In [6]:
df.show(10)

Py4JJavaError: An error occurred while calling o46.showString.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: /"spark_filepath_str"
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:213)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:57)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:57)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:57)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:57)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:57)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:307)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:303)
	at org.apache.spark.rdd.RDD.getNumPartitions(RDD.scala:323)
	at org.apache.spark.sql.execution.collect.Collector.<init>(Collector.scala:199)
	at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:82)
	at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:88)
	at org.apache.spark.sql.execution.collect.InternalRowFormat$.collect(cachedSparkResults.scala:75)
	at org.apache.spark.sql.execution.collect.InternalRowFormat$.collect(cachedSparkResults.scala:62)
	at org.apache.spark.sql.execution.ResultCacheManager.$anonfun$getOrComputeResultInternal$1(ResultCacheManager.scala:496)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResultInternal(ResultCacheManager.scala:495)
	at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:399)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:58)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.collect.Collector$.callExecuteCollect(Collector.scala:118)
	at com.databricks.service.SparkServiceImpl$.$anonfun$executePlan$3(SparkServiceImpl.scala:122)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:126)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:267)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
	at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:217)
	at com.databricks.service.SparkServiceImpl$.$anonfun$executePlan$1(SparkServiceImpl.scala:115)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$4(UsageLogging.scala:432)
	at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:240)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:235)
	at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:232)
	at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:18)
	at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:277)
	at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:270)
	at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:18)
	at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:413)
	at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:339)
	at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:18)
	at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:55)
	at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:98)
	at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:70)
	at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:57)
	at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:67)
	at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:347)
	at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:326)
	at com.databricks.service.SparkServiceImpl$.recordOperation(SparkServiceImpl.scala:92)
	at com.databricks.service.SparkServiceImpl$.executePlan(SparkServiceImpl.scala:111)
	at com.databricks.service.SparkServiceRPCHandler.execute0(SparkServiceRPCHandler.scala:668)
	at com.databricks.service.SparkServiceRPCHandler.$anonfun$executeRPC0$1(SparkServiceRPCHandler.scala:474)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.service.SparkServiceRPCHandler.executeRPC0(SparkServiceRPCHandler.scala:370)
	at com.databricks.service.SparkServiceRPCHandler$$anon$2.call(SparkServiceRPCHandler.scala:321)
	at com.databricks.service.SparkServiceRPCHandler$$anon$2.call(SparkServiceRPCHandler.scala:307)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at com.databricks.service.SparkServiceRPCHandler.$anonfun$executeRPC$1(SparkServiceRPCHandler.scala:357)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
	at com.databricks.service.SparkServiceRPCHandler.executeRPC(SparkServiceRPCHandler.scala:334)
	at com.databricks.service.SparkServiceRPCServlet.doPost(SparkServiceRPCServer.scala:152)
	at javax.servlet.http.HttpServlet.service(HttpServlet.java:707)
	at javax.servlet.http.HttpServlet.service(HttpServlet.java:790)
	at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:791)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:550)
	at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:501)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
	at org.eclipse.jetty.server.Server.handle(Server.java:516)
	at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:388)
	at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:633)
	at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:380)
	at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)
	at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
	at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
	at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
	at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:773)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:905)
	at java.lang.Thread.run(Thread.java:748)


In [22]:
df.write.partitionBy("partition").mode("overwrite").parquet("ingest-data")