In [8]:
import json
from collections import namedtuple

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType


In [9]:
# Build the Spark session used by every cell below (getOrCreate returns the
# existing session if one is already active).
spark = (
    SparkSession.builder
    .appName('teste')
    .getOrCreate()
)

In [10]:
# Explicit schema for the input file: two string columns plus
# `customdimensions`, an array of (index: int, value: string) structs.
# Every field is declared nullable.
schema = StructType([
    StructField("visitnumber", StringType(), True),
    StructField("visitid", StringType(), True),
    StructField("customdimensions", ArrayType(StructType([
        StructField("index", IntegerType(), True),
        StructField("value", StringType(), True)
    ])), True)
])

# Load the multi-line JSON file into a DataFrame, applying the schema above
# instead of letting the reader infer one.
df = spark.read.json("dados_para_teste/array_of_structs.json", multiLine=True, schema=schema)

# Display the rows without truncating cell contents, then the schema.
df.show(truncate=False)
df.printSchema()


+-----------+-------+---------------------------------------------------+
|visitnumber|visitid|customdimensions                                   |
+-----------+-------+---------------------------------------------------+
|3          |id-3   |[{1, example1}, {2, example2}, {3, example3}]      |
|4          |id-4   |[{1, example4}, {2, example5}, {3, example6}]      |
|5          |id-5   |[{1, example7}, {2, }, {3, example9}]              |
|6          |id-6   |[{15, example10}, {10, example11}, {4, example12}] |
|7          |id-7   |[{16, example13}, {10, example14}, {14, example15}]|
|8          |id-8   |[{16, example16}, {1, example17}, {3, }]           |
+-----------+-------+---------------------------------------------------+

root
 |-- visitnumber: string (nullable = true)
 |-- visitid: string (nullable = true)
 |-- customdimensions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- index: integer (nullable = true)
 |    |    |-- value: string (nullable = true)

In [11]:
# Convert the array of (index, value) structs into a map keyed by `index`,
# replacing the original `customdimensions` column.
# NOTE(review): presumably every entry has a non-null index that is unique
# within its row, since the indexes become map keys — confirm against the data.
dims_map = map_from_entries(col("customdimensions"))
df_new = df.withColumn("customdimensions", dims_map)

# Display the converted rows (untruncated) and the resulting schema.
df_new.show(n=20, truncate=False)
df_new.printSchema()

+-----------+-------+---------------------------------------------------+
|visitnumber|visitid|customdimensions                                   |
+-----------+-------+---------------------------------------------------+
|3          |id-3   |{1 -> example1, 2 -> example2, 3 -> example3}      |
|4          |id-4   |{1 -> example4, 2 -> example5, 3 -> example6}      |
|5          |id-5   |{1 -> example7, 2 -> , 3 -> example9}              |
|6          |id-6   |{15 -> example10, 10 -> example11, 4 -> example12} |
|7          |id-7   |{16 -> example13, 10 -> example14, 14 -> example15}|
|8          |id-8   |{16 -> example16, 1 -> example17, 3 -> }           |
+-----------+-------+---------------------------------------------------+

root
 |-- visitnumber: string (nullable = true)
 |-- visitid: string (nullable = true)
 |-- customdimensions: map (nullable = true)
 |    |-- key: integer
 |    |-- value: string (valueContainsNull = true)



In [12]:
# Pairs a key of the `customdimensions` map (`index`) with the suffix used to
# name the column extracted for that key (`nome`).
indices_selecionados = namedtuple('indices_selecionados', ['index', 'nome'])
lista_indices_selecionados = [
    indices_selecionados(1, "nome1"),
    indices_selecionados(3, "nome2"),
    indices_selecionados(14, "nome3"),
    indices_selecionados(16, "nome4"),
]

# One map lookup per selected index, keyed by its output column name
# ("index_<nome>"). `getItem` is the Column accessor for map keys (`getField`
# is meant for struct fields).
colunas_extraidas = {
    "index_" + str(item.nome): col("customdimensions").getItem(item.index)
    for item in lista_indices_selecionados
}

# Add all extracted columns in a single projection instead of calling
# withColumn once per index, so the query plan does not grow with the list.
# Any existing column with one of the target names is dropped first, so
# re-running this cell replaces those columns rather than duplicating them.
df_new = df_new.drop(*colunas_extraidas).select(
    "*",
    *[coluna.alias(nome) for nome, coluna in colunas_extraidas.items()],
)
    

In [13]:
# Display the DataFrame with the extracted index_* columns (first 20 rows,
# cell contents untruncated), followed by its schema.
df_new.show(n=20, truncate=False)
df_new.printSchema()

+-----------+-------+---------------------------------------------------+-----------+-----------+-----------+-----------+
|visitnumber|visitid|customdimensions                                   |index_nome1|index_nome2|index_nome3|index_nome4|
+-----------+-------+---------------------------------------------------+-----------+-----------+-----------+-----------+
|3          |id-3   |{1 -> example1, 2 -> example2, 3 -> example3}      |example1   |example3   |null       |null       |
|4          |id-4   |{1 -> example4, 2 -> example5, 3 -> example6}      |example4   |example6   |null       |null       |
|5          |id-5   |{1 -> example7, 2 -> , 3 -> example9}              |example7   |example9   |null       |null       |
|6          |id-6   |{15 -> example10, 10 -> example11, 4 -> example12} |null       |null       |null       |null       |
|7          |id-7   |{16 -> example13, 10 -> example14, 14 -> example15}|null       |null       |example15  |example13  |
|8          |id-8   |{16