# In [41]:
from datetime import datetime
import json
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
#from pyspark.sql import sparkContext

def parse_csv(line):
    """Parse one comma-separated event line into a common-event tuple.

    Columns are read by position: 0 trade date, 1 arrival timestamp,
    2 record type, 3 symbol, 4 event timestamp, 5 event sequence number,
    6 exchange, 7 trade price ('T') or bid price ('Q'), and for quotes
    8 bid size, 9 ask price, 10 ask size.

    Args:
        line: One raw text line of the CSV input.

    Returns:
        A 13-field tuple ordered as (trade_dt, rec_type, symbol, exchange,
        event_tm, event_seq_nb, arrival_tm, trade_pr, bid_pr, bid_size,
        ask_pr, ask_size, partition). The partition field repeats the record
        type. Any line that cannot be parsed, or whose record type is neither
        'T' nor 'Q', yields twelve ``None`` values followed by 'B'.
    """
    bad_record = (None,) * 12 + ('B',)
    record_type_pos = 2
    date_fmt = '%Y-%m-%d'
    timestamp_fmt = '%Y-%m-%d %H:%M:%S.%f'

    record = line.split(",")
    try:
        rec_type = record[record_type_pos]
        if rec_type not in ('T', 'Q'):
            # Unknown record types used to fall through and return None,
            # which is not a valid row; route them to the bad partition.
            return bad_record

        # Fields shared by trades and quotes, already in output order.
        common = (datetime.strptime(record[0], date_fmt).date(),
                  rec_type,
                  record[3],
                  record[6],
                  datetime.strptime(record[4], timestamp_fmt),
                  int(record[5]),
                  datetime.strptime(record[1], timestamp_fmt))

        if rec_type == 'T':
            return common + (float(record[7]), None, None, None, None, rec_type)

        return common + (None,
                         float(record[7]),
                         int(record[8]),
                         float(record[9]),
                         int(record[10]),
                         rec_type)
    except (IndexError, ValueError):
        # IndexError: too few columns; ValueError: bad number or date text.
        return bad_record


def parse_json(line):
    """Parse one JSON event line into a common-event tuple.

    The JSON object is read through the keys ``event_type``, ``trade_dt``,
    ``symbol``, ``exchange``, ``event_tm``, ``event_seq_nb``, ``file_tm``
    and, depending on the event type, ``price`` ('T') or ``bid_pr``,
    ``bid_size``, ``ask_pr``, ``ask_size`` ('Q'). Numeric values are passed
    through exactly as decoded from JSON.

    Args:
        line: One raw text line holding a JSON document.

    Returns:
        A 13-field tuple ordered as (trade_dt, rec_type, symbol, exchange,
        event_tm, event_seq_nb, arrival_tm, trade_pr, bid_pr, bid_size,
        ask_pr, ask_size, partition). The partition field repeats the event
        type. Malformed JSON, a missing key, an unparsable date, or an event
        type other than 'T'/'Q' yields twelve ``None`` values followed by 'B'.
    """
    bad_record = (None,) * 12 + ('B',)
    date_fmt = '%Y-%m-%d'
    timestamp_fmt = '%Y-%m-%d %H:%M:%S.%f'

    try:
        # Decoding and the event_type lookup are inside the try so that a
        # single corrupt line becomes a bad record instead of raising.
        event = json.loads(line)
        record_type = event['event_type']
        if record_type not in ('T', 'Q'):
            # Unknown event types used to fall through and return None,
            # which is not a valid row; route them to the bad partition.
            return bad_record

        # Fields shared by trades and quotes, already in output order.
        common = (datetime.strptime(event['trade_dt'], date_fmt).date(),
                  record_type,
                  event['symbol'],
                  event['exchange'],
                  datetime.strptime(event['event_tm'], timestamp_fmt),
                  event['event_seq_nb'],
                  datetime.strptime(event['file_tm'], timestamp_fmt))

        if record_type == 'T':
            return common + (event['price'], None, None, None, None, record_type)

        return common + (None,
                         event['bid_pr'],
                         event['bid_size'],
                         event['ask_pr'],
                         event['ask_size'],
                         record_type)
    except (KeyError, TypeError, ValueError):
        # KeyError: missing field; TypeError: document is not an object or a
        # date field is not a string; ValueError: invalid JSON or date text.
        return bad_record

# Schema applied to the tuples returned by parse_csv / parse_json.
# Each entry is (column name, Spark type); every column is nullable, and the
# order matches the positional order of those tuples.
# NOTE(review): the price columns use FloatType; confirm that precision is
# sufficient for prices before relying on exact values downstream.
common_event_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('trade_dt', DateType()),
        ('rec_type', StringType()),
        ('symbol', StringType()),
        ('exchange', StringType()),
        ('event_tm', TimestampType()),
        ('event_seq_nb', IntegerType()),
        ('arrival_tm', TimestampType()),
        ('trade_pr', FloatType()),
        ('bid_pr', FloatType()),
        ('bid_size', IntegerType()),
        ('ask_pr', FloatType()),
        ('ask_size', IntegerType()),
        ('partition', StringType()),
    )
])

# Trade dates whose raw event files are ingested; one DataFrame per date and
# source format is collected in csvlist / jsonlist.
dates = ['2020-08-05','2020-08-06']
csvlist = []
jsonlist = []
# For a local run use:
#   SparkSession.builder.master('local').appName('app').getOrCreate()
spark = SparkSession.builder.getOrCreate()

# SECURITY: the storage account key was previously hard-coded on this line.
# It is now read from the environment so it never lives in source control;
# the key that was committed should be treated as exposed and rotated.
storage_account_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_account_key:
    raise RuntimeError(
        "Set the AZURE_STORAGE_KEY environment variable to the access key "
        "of the 'asastorewin' storage account before running this script."
    )
spark.conf.set("fs.azure.account.key.asastorewin.blob.core.windows.net", storage_account_key)

for dt in dates:
    # Read every .txt file under the per-date exchange folders as raw lines.
    rawcsv = spark.sparkContext.textFile(f"wasbs://newestcontainer@asastorewin.blob.core.windows.net/data/csv/{dt}/NYSE/*.txt")
    rawjson = spark.sparkContext.textFile(f"wasbs://newestcontainer@asastorewin.blob.core.windows.net/data/json/{dt}/NASDAQ/*.txt")
    # Turn each line into a tuple matching common_event_schema.
    parsedcsv = rawcsv.map(parse_csv)
    parsedjson = rawjson.map(parse_json)
    datacsv = spark.createDataFrame(parsedcsv, common_event_schema)
    datajson = spark.createDataFrame(parsedjson, common_event_schema)
    csvlist.append(datacsv)
    jsonlist.append(datajson)

# Union the per-date frames for each format. Folding over the whole list keeps
# this correct when more (or fewer than two) dates are configured above.
csv_data = csvlist[0]
for next_frame in csvlist[1:]:
    csv_data = csv_data.union(next_frame)

json_data = jsonlist[0]
for next_frame in jsonlist[1:]:
    json_data = json_data.union(next_frame)

all_data = csv_data.union(json_data)

print('csv data: ')
csv_data.show()

print('\n\njson data: ')
json_data.show()

# Output step, currently disabled:
#json_data.write.partitionBy("partition").mode("overwrite").parquet("wasbs://newestcontainer@asastorewin.blob.core.windows.net/output_dir")

# StatementMeta(SparkPool01, 2, 43, Finished, Available)

#subset of data from printing (ran out of compute after making a change and can no longer output)
#+----------+--------------------+--------+------+--------------------+------------+--------+------+--------+------+--------+---------+
#|  trade_dt|          arrival_tm|rec_type|symbol|            event_tm|event_seq_nb|trade_pr|bid_pr|bid_size|ask_pr|ask_size|partition|
#+----------+--------------------+--------+------+--------------------+------------+--------+------+--------+------+--------+---------+
#|2020-08-06|2020-08-06 09:30:...|       Q|  SYMA|2020-08-06 09:39:...|           1|    NYSE|    78|     100|    78|     100|        Q|
#|2020-08-06|2020-08-06 09:30:...|       Q|  SYMA|2020-08-06 09:47:...|           2|    NYSE|    77|     100|    77|     100|        Q|
#|2020-08-06|2020-08-06 09:30:...|       Q|  SYMA|2020-08-06 09:56:...|           3|    NYSE|    75|     100|    75|     100|        Q|
#+----------+--------------------+--------+------+--------------------+------------+--------+------+--------+------+--------+---------+