In [34]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook (getOrCreate reuses a session
# that is already running instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("XML Parse")
    # Number of partitions used for shuffles in Spark SQL.
    .config("spark.sql.shuffle.partitions", "10")
    # Register the console sink for every metrics instance ("*").
    .config("spark.metrics.conf.*.sink.console.class", "org.apache.spark.metrics.sink.ConsoleSink")
    .getOrCreate()
)

In [35]:
from pyspark.sql import SQLContext
# Build the SQLContext from the SparkSession created above rather than from a
# kernel-injected `sc` global, so the notebook runs on a fresh kernel that does
# not predefine `sc`. Passing the session explicitly ties the context to it.
# NOTE(review): SQLContext is a legacy entry point; `spark.read` offers the
# same reader API directly.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [36]:
# Reader for the spark-xml data source: every <XML_FILE> element becomes one
# row, and schema inference is disabled.
xml_reader = sqlContext.read.format("com.databricks.spark.xml").options(
    rowTag="XML_FILE",
    inferSchema="false",
)

# Load every raw example file matching the glob pattern from HDFS.
lines = xml_reader.load('hdfs://localhost:8020/user/danielqueiroz/datalake/raw/example_*.xml')

In [37]:
# Print the schema tree of the loaded XML DataFrame to inspect its nested structure.
lines.printSchema()

root
 |-- LIMIT: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- AVAILABLE: string (nullable = true)
 |    |    |-- AVAILABLE_AMOUNT: string (nullable = true)
 |    |    |-- AVAILABLE_AMOUNT_USD: string (nullable = true)
 |    |    |-- BLOCKED: string (nullable = true)
 |    |    |-- CAP_LIST: struct (nullable = true)
 |    |    |    |-- CAP: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- AVAILABLE: string (nullable = true)
 |    |    |    |    |    |-- AVAILABLE_AMOUNT: string (nullable = true)
 |    |    |    |    |    |-- AVAILABLE_AMOUNT_USD: string (nullable = true)
 |    |    |    |    |    |-- BLOCKED: string (nullable = true)
 |    |    |    |    |    |-- CATEGORY: string (nullable = true)
 |    |    |    |    |    |-- CURRENCY: string (nullable = true)
 |    |    |    |    |    |-- END_DATE: string (nullable = true)
 |    |    |    |    |    |-- EXTENSION_DATE: string (

In [38]:
# Apply the transformations needed to parse the XML: flatten the nested
# LIMIT -> FACILITY -> UTILIZATION arrays into one row per utilization.

# NOTE(review): nothing from this wildcard import is used in this cell; it is
# kept in case later cells rely on the names it brings in.
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Load the initial structure of the XML: one row per element of the LIMIT
# array, carrying the _BASE_DATE column along as base_date.
lines_transf_001 = lines.select(
    F.col("_BASE_DATE").alias("base_date"),
    F.explode("LIMIT").alias("limit"),
)

# Unnest one level per projection. explode() emits no row for a null or empty
# array, so limits without facilities and facilities without utilizations do
# not appear in the result.
result = (
    lines_transf_001
    # limit -> one row per facility
    .selectExpr(
        "base_date",
        "limit._ID as limit_id",
        "explode(limit.FACILITY_LIST.FACILITY) as facility",
    )
    # facility -> one row per utilization, keeping the facility identifiers
    .selectExpr(
        "base_date",
        "limit_id",
        "facility._ID as facility_id",
        "facility.PARENT_ID as facility_parent_id",
        "facility.LOCAL_FACILITY_ID as facility_local_id",
        "facility.GLOBAL_FACILITY_ID as facility_global_id",
        "explode(facility.UTILIZATION_LIST.UTILIZATION) as utilization",
    )
    # Final projection: keep only the trade reference of each utilization.
    # This already yields exactly the output columns, so no further select
    # is needed afterwards.
    .selectExpr(
        "base_date",
        "limit_id",
        "facility_id",
        "facility_parent_id",
        "facility_local_id",
        "facility_global_id",
        "utilization._TRADE_REF_ID AS trade_ref_id",
    )
)


In [40]:
# Print the physical plan Spark generated for the `result` query.
result.explain()

== Physical Plan ==
*(2) Project [base_date#740, limit_id#745, _gen_alias_787#787 AS facility_id#753, _gen_alias_788#788 AS facility_parent_id#754, _gen_alias_789#789 AS facility_local_id#755, _gen_alias_790#790 AS facility_global_id#756, utilization#763._TRADE_REF_ID AS trade_ref_id#771]
+- Generate explode(_gen_alias_791#791), [base_date#740, limit_id#745, _gen_alias_787#787, _gen_alias_788#788, _gen_alias_789#789, _gen_alias_790#790], false, [utilization#763]
   +- *(1) Project [base_date#740, _gen_alias_792#792 AS limit_id#745, facility#749._ID AS _gen_alias_787#787, facility#749.PARENT_ID AS _gen_alias_788#788, facility#749.LOCAL_FACILITY_ID AS _gen_alias_789#789, facility#749.GLOBAL_FACILITY_ID AS _gen_alias_790#790, facility#749.UTILIZATION_LIST.UTILIZATION AS _gen_alias_791#791]
      +- Generate explode(_gen_alias_793#793), [base_date#740, _gen_alias_792#792], false, [facility#749]
         +- Project [_BASE_DATE#737 AS base_date#740, limit#742._ID AS _gen_alias_792#792, limit

In [30]:
# Print the schema of `result` to confirm the output column names and types.
result.printSchema()

root
 |-- base_date: string (nullable = true)
 |-- limit_id: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- facility_parent_id: string (nullable = true)
 |-- facility_local_id: string (nullable = true)
 |-- facility_global_id: string (nullable = true)
 |-- trade_ref_id: string (nullable = true)

