In [1]:
import glob
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, \
    StringType, TimestampType, DecimalType, IntegerType

from parsers import parse_line, parse_csv, parse_json

# Get the active Spark session, or start one in local mode ("local[*]")
# under the application name 'Data ingest'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Data ingest')
    .getOrCreate()
)

In [2]:
# Directory the final cell writes its parquet output to. Spark names its own
# output files "part-*" as well, so anything beneath this directory has to be
# kept out of the input glob; otherwise a second run of the notebook would try
# to re-ingest its own parquet output as raw text.
OUTPUT_DIR = "ingest_data"

# Collect every regular file matching "part-*" below the working directory,
# skipping directories and the notebook's own output. Sorting makes the input
# order (and therefore the run) deterministic across platforms.
filenames = sorted(
    path
    for path in glob.glob("**/part-*", recursive=True)
    if os.path.isfile(path)
    and OUTPUT_DIR not in os.path.normpath(path).split(os.sep)
)
if not filenames:
    # Fail here with a clear message instead of letting Spark fail later on
    # an empty path string.
    raise FileNotFoundError(
        'No input files matching "**/part-*" found under ' + os.getcwd()
    )

# textFile accepts a comma-separated list of paths, so all files are loaded
# at the same time.
# NOTE(review): a path containing a comma would be split incorrectly - confirm
# input file names never contain one.
filename_string = ",".join(filenames)

# Ingest raw lines and parse each one with parse_line.
# NOTE(review): parse_line is assumed to return one record per line whose
# fields line up with the schema below - confirm in parsers.py.
raw = spark.sparkContext.textFile(filename_string)
parsed = raw.map(parse_line)

# Establish common event schema (every column is nullable).
# NOTE(review): DecimalType(17, 14) leaves only 3 digits before the decimal
# point, so prices >= 1000 would not fit - confirm this matches the data.
schema = StructType([
    StructField('trade_dt', DateType(), True),
    StructField('rec_type', StringType(), True),
    StructField('symbol', StringType(), True),
    StructField('exchange', StringType(), True),
    StructField('event_tm', TimestampType(), True),
    StructField('event_seq_nb', IntegerType(), True),
    StructField('arrival_tm', TimestampType(), True),
    StructField('trade_pr', DecimalType(17, 14), True),
    StructField('bid_pr', DecimalType(17, 14), True),
    StructField('bid_size', IntegerType(), True),
    StructField('ask_pr', DecimalType(17, 14), True),
    StructField('ask_size', IntegerType(), True),
    StructField('partition', StringType(), True),
])

# Create dataframe with parsed data and schema
df = spark.createDataFrame(parsed, schema)

In [3]:
# Preview the first 10 rows of the ingested dataframe.
df.show(n=10)

+----------+--------+------+--------+--------------------+------------+-------------------+-----------------+-----------------+--------+-----------------+--------+---------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|         trade_pr|           bid_pr|bid_size|           ask_pr|ask_size|partition|
+----------+--------+------+--------+--------------------+------------+-------------------+-----------------+-----------------+--------+-----------------+--------+---------+
|2020-08-06|       Q|  SYMA|  NASDAQ|2020-08-06 09:38:...|           1|2020-08-06 09:30:00|             null|78.13370587077013|     100|79.82516338248990|     100|        Q|
|2020-08-06|       Q|  SYMA|  NASDAQ|2020-08-06 09:46:...|           2|2020-08-06 09:30:00|             null|76.52304470696788|     100|76.57240785476783|     100|        Q|
|2020-08-06|       Q|  SYMA|  NASDAQ|2020-08-06 09:52:...|           3|2020-08-06 09:30:00|             null|78.74535582037817|   

In [4]:
# Write the dataframe as parquet under "ingest_data", partitioned by the
# "partition" column; "overwrite" mode replaces any existing output there.
(
    df.write
    .partitionBy("partition")
    .mode("overwrite")
    .parquet("ingest_data")
)