## JSON Plano to CSV in Spark

Nos llegó un archivo JSON en texto plano (es decir, todo en una sola línea) y deseamos convertirlo en un CSV.

In [1]:
# findspark.init() runs before the pyspark imports below.
import findspark
findspark.init()

# Create (or reuse) the Spark session.
# NOTE(review): the wildcard imports below bring every public name of both
# modules into the notebook namespace; pyspark.sql.functions exports names such
# as sum/min/max that shadow the Python builtins -- keep that in mind later on.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

# Report the version of the Spark runtime behind this session.
spark_version = spark.sparkContext.version
print('Apache Spark Version :' + spark_version)

Apache Spark Version :3.3.1


### Formato

Si el archivo JSON tiene saltos de línea, hay que especificar la opción `multiLine` en `True`.

Por defecto, todos los campos del archivo JSON tienen que ser de tipo string. (Esto es para no entrar en conflictos y mantenerlo de forma generalizada; si es necesario algún otro tipo de dato, hay que tener presentes los distintos tipos de datos que maneja Spark.)

In [11]:
# Sample JSON file read so Spark can infer its schema; the multiLine option
# allows a single JSON document to span several lines.
path = "struct_1.json"
peopleDF = spark.read.option("multiLine", True).json(path)

# Print the inferred schema as a tree.
peopleDF.printSchema()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- importe: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



### Schema format Pyspark

In [12]:
# Keep the schema Spark inferred for peopleDF as a StructType object so it can
# be passed explicitly to other readers/parsers.
json_schema = peopleDF.schema
# Bare expression: the notebook displays the StructType repr as the cell output.
json_schema

StructType([StructField('records', ArrayType(StructType([StructField('email', StringType(), True), StructField('first_name', StringType(), True), StructField('gender', StringType(), True), StructField('id', StringType(), True), StructField('importe', StringType(), True), StructField('last_name', StringType(), True)]), True), True)])

### Ejemplo de seguimiento

Una vez importado el schema, solo hay que pasarlo como variable, como lo hacíamos en los otros ejemplos.

In [13]:
# Load the file as plain text; the raw content lands in a single `value` column.
df_json = spark.read.text("filejsonplano.csv")

# Parse the JSON text with the previously captured schema, then promote the
# fields of the parsed struct to top-level columns.
# NOTE(review): the recorded output of this flow shows `importe` as null on
# every row, while the schema inferred from this same file further down has
# `ip_address` instead of `importe` -- the supplied schema apparently does not
# match the data; confirm this is intended.
df = (
    df_json
    .withColumn("parsed_data", from_json(col("value"), json_schema))
    .select("parsed_data.*")
)

df.printSchema()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- importe: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



In [14]:
# Turn each element of the `records` array into its own row, stored as a struct
# in the new `rec_exp` column.
df = df.withColumn("rec_exp", explode_outer(col("records")))

# Flatten that struct: every field of `rec_exp` becomes a top-level column.
df_final = df.select("rec_exp.*")

In [15]:
# Preview the flattened rows as a text table ...
df_final.show()
# ... and print the resulting flat schema.
df_final.printSchema()

+--------------------+----------+------+---+-------+---------+
|               email|first_name|gender| id|importe|last_name|
+--------------------+----------+------+---+-------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   null|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2|   null| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|   null|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|   null|    Valek|
+--------------------+----------+------+---+-------+---------+

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: string (nullable = true)
 |-- importe: string (nullable = true)
 |-- last_name: string (nullable = true)



### JSON


In [2]:
# Despite the .csv extension, this file is read as JSON; no schema is supplied,
# so Spark infers it from the data.
path = "filejsonplano.csv"
df = spark.read.format("json").load(path)

In [3]:
# Inspect the inferred schema, then preview the data as a text table.
df.printSchema()
df.show()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip_address: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)

+--------------------+
|             records|
+--------------------+
|[{jpenddreth0@cen...|
+--------------------+



In [4]:
# Turn each element of the `records` array into its own row, stored as a struct
# in the new `rec_exp` column.
df = df.withColumn("rec_exp", explode_outer(col("records")))

# Flatten that struct: every field of `rec_exp` becomes a top-level column.
df_final = df.select("rec_exp.*")

In [5]:
# Print the flat schema produced by the explode/select step, then preview the rows.
df_final.printSchema()
df_final.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [6]:
# Second sample: also JSON content behind a .csv extension, read with an
# inferred schema.
path = "struct_2.csv"
df = spark.read.format("json").load(path)

In [7]:
# Inspect the inferred schema, then preview the data as a text table.
df.printSchema()
df.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [14]:
# Read a JSON file with the multiLine option enabled, which allows a JSON
# document to span several lines.
path = "s3.json"
df = spark.read.option("multiLine", True).json(path)

In [15]:
# Inspect the inferred schema, then preview the data as a text table.
df.printSchema()
df.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [16]:
# Read a JSON file with the multiLine option enabled, which allows a JSON
# document to span several lines.
path = "xx.json"
df = spark.read.option("multiLine", True).json(path)

In [17]:
# Inspect the inferred schema, then preview the data as a text table.
df.printSchema()
df.show()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip_address: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)

+--------------------+
|             records|
+--------------------+
|[{jpenddreth0@cen...|
+--------------------+



In [18]:
# Turn each element of the `records` array into its own row, stored as a struct
# in the new `rec_exp` column.
df = df.withColumn("rec_exp", explode_outer(col("records")))

# Flatten that struct: every field of `rec_exp` becomes a top-level column.
df_final = df.select("rec_exp.*")

In [19]:
# Preview the flattened rows as a text table ...
df_final.show()
# ... and print the resulting flat schema.
df_final.printSchema()

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

