## JSON Plano to CSV in Spark

Nos llegó un archivo JSON en texto plano (es decir, todo en una sola línea) y deseamos convertirlo en un CSV.

In [2]:
# import findspark
# findspark.init()

# Creamos la session de Spark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import os

# Reuse the active Spark session (or start one) and report its version.
spark = SparkSession.builder.getOrCreate()
print(f'Apache Spark Version :{spark.sparkContext.version}')

Apache Spark Version :3.5.0


### Formato

Si el archivo JSON tiene saltos de línea, hay que especificar la opción `multiLine` en `True`.

Por defecto, todos los campos del archivo JSON deben ser de tipo string. (Esto es para no entrar en conflictos y mantenerlo de forma generalizada; si se necesita algún otro tipo de dato, hay que tener presentes los distintos tipos de datos que maneja Spark.)

In [11]:
# Read a JSON document that spans several lines, so multiLine must be enabled.
path = "struct_1.json"
peopleDF = spark.read.option("multiLine", True).json(path)

# Inspect the schema Spark inferred for the file.
peopleDF.printSchema()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- importe: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



### Schema format Pyspark

In [12]:
# Keep the inferred schema in a variable so it can be handed to from_json later;
# the bare name on the last line displays it as the cell output.
json_schema = peopleDF.schema
json_schema

StructType([StructField('records', ArrayType(StructType([StructField('email', StringType(), True), StructField('first_name', StringType(), True), StructField('gender', StringType(), True), StructField('id', StringType(), True), StructField('importe', StringType(), True), StructField('last_name', StringType(), True)]), True), True)])

### Ejemplo de seguimiento

Ya importado el schema, solo hay que pasarlo como variable, como lo hacíamos en los otros ejemplos.

In [13]:
# Load the file as plain text: each line lands in a single string column "value".
df_json = spark.read.format("text").load("filejsonplano.csv")

# Parse that text with the previously captured json_schema and promote the
# parsed struct's fields to top-level columns.
df = (
    df_json
    .select("*", from_json(col("value"), json_schema).alias("parsed_data"))
    .select("parsed_data.*")
)

df.printSchema()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- importe: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



In [14]:
# One output row per element of the "records" array (explode_outer keeps rows
# whose array is null or empty), then flatten the struct into columns.
df = df.withColumn("rec_exp", explode_outer(col("records")))
df_final = df.select("rec_exp.*")

In [15]:
# Preview the flattened records, then print their schema.
df_final.show()
df_final.printSchema()

+--------------------+----------+------+---+-------+---------+
|               email|first_name|gender| id|importe|last_name|
+--------------------+----------+------+---+-------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   null|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2|   null| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|   null|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|   null|    Valek|
+--------------------+----------+------+---+-------+---------+

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: string (nullable = true)
 |-- importe: string (nullable = true)
 |-- last_name: string (nullable = true)



### JSON


In [2]:
# Single-line JSON: the default reader settings are enough and types are inferred.
path = "filejsonplano.csv"
df = spark.read.format("json").load(path)

In [3]:
# Print the inferred schema, then preview the rows.
df.printSchema()
df.show()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip_address: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)

+--------------------+
|             records|
+--------------------+
|[{jpenddreth0@cen...|
+--------------------+



In [4]:
# One output row per element of the "records" array (explode_outer keeps rows
# whose array is null or empty), then flatten the struct into columns.
df = df.withColumn("rec_exp", explode_outer(col("records")))
df_final = df.select("rec_exp.*")

In [5]:
# Print the schema of the flattened records, then preview them.
df_final.printSchema()
df_final.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [6]:
# Read the file with the default JSON reader settings.
path = "struct_2.csv"
df = spark.read.format("json").load(path)

In [7]:
# Print the inferred schema, then preview the rows.
df.printSchema()
df.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [14]:
# Read the file with multiLine enabled, i.e. as JSON that may span several lines.
path = "s3.json"
df = spark.read.option("multiLine", True).json(path)

In [15]:
# Print the inferred schema, then preview the rows.
df.printSchema()
df.show()

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+



In [16]:
# Read the file with multiLine enabled, i.e. as JSON that may span several lines.
path = "xx.json"
df = spark.read.option("multiLine", True).json(path)

In [17]:
# Print the inferred schema, then preview the rows.
df.printSchema()
df.show()

root
 |-- records: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip_address: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)

+--------------------+
|             records|
+--------------------+
|[{jpenddreth0@cen...|
+--------------------+



In [18]:
# One output row per element of the "records" array (explode_outer keeps rows
# whose array is null or empty), then flatten the struct into columns.
df = df.withColumn("rec_exp", explode_outer(col("records")))
df_final = df.select("rec_exp.*")

In [19]:
# Preview the flattened records, then print their schema.
df_final.show()
df_final.printSchema()

+--------------------+----------+------+---+--------------+---------+
|               email|first_name|gender| id|    ip_address|last_name|
+--------------------+----------+------+---+--------------+---------+
|jpenddreth0@censu...|  Jeanette|Female|  1|   26.58.193.2|Penddreth|
|gfrediani1@senate...|   Giavani|  Male|  2| 229.179.4.212| Frediani|
| nbea2@imageshack.us|     Noell|Female|  3|180.66.162.255|      Bea|
|      wvalek3@vk.com|   Willard|  Male|  4|  67.76.188.26|    Valek|
+--------------------+----------+------+---+--------------+---------+

root
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- last_name: string (nullable = true)



In [19]:
# Load the raw file as text; every line becomes one row in the "value" column.
df = spark.read.format("text").load("s4.csv")

In [20]:
# spark.read.text names its only column "value"; here it holds JSON objects
# joined by ';'. Only the ';' that sits between two objects ("};{") is swapped
# for '|', so a ';' inside a field value is left alone.
# The pattern is a raw string so the regex escapes reach Spark verbatim instead
# of being (invalid) Python escape sequences; the replacement needs no escaping.
df_replace = df.withColumn('value', regexp_replace('value', r'\};\{', '}|{'))

In [24]:
# Show up to 20 rows with truncation disabled so the whole line is visible.
df_replace.show(20, False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [29]:
# Split the line on the '|' separator and emit one row per JSON object.
# split() takes a regex, so the pipe is escaped; the raw string keeps the
# backslash from being read as an (invalid) Python escape sequence.
df_final = df_replace.withColumn('value_explode', explode(split(df_replace.value, r'\|')))

In [30]:
# Show only the exploded column, up to 20 rows, with truncation disabled.
df_final.select(df_final.value_explode).show(20, False)

+--------------------------------------------------------------------------------------------------------------------------------------+
|value_explode                                                                                                                         |
+--------------------------------------------------------------------------------------------------------------------------------------+
|{"id":1,"first_name":"Jeanette","last_name":"Penddreth","email":"jpenddreth0@census.gov","gender":"Female","ip_address":"26.58.193.2"}|
|{"id":2,"first_name":"Giavani","last_name":"Frediani","email":"gfrediani1@senate.gov","gender":"Male","ip_address":"229.179.4.212"}   |
|{"id":3,"first_name":"Noell","last_name":"Bea","email":"nbea2@imageshack.us","gender":"Female","ip_address":"180.66.162.255"}         |
|{"id":4,"first_name":"Willard;howard","last_name":"Valek","email":"wvalek3@vk.com","gender":"Male","ip_address":"67.76.188.26"}       |
+----------------------------------------

<hr>

## Issues

No importa si se usa comilla simple o doble; lo que hay que parsear son los literales `True`, `False` y `None`, porque corrompen el JSON.

Básicamente porque no son palabras reservadas de JSON, sino más bien de Python (JSON usa `true`, `false` y `null`).

In [5]:
import os

# NOTE(review): absolute, environment-specific base directory — consider moving
# it to a configuration cell.
base_path = "/home/jovyan/work"
csv_path = os.path.join(base_path, 'data/struct_3.csv')

# Load the file as raw text (one row per line, single "value" column) and show
# up to 20 rows with truncation disabled.
df_json = spark.read.format("text").load(csv_path)
df_json.show(n=20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{'_id': ObjectId('507f191e810c19729de860ea'),  'id':1,'first_name':'Jeanette','last_name':'Penddreth','email':'jpenddreth0@census.gov','gender':'Female','ip_address':'26.58.193.2', 'eschd': True}|
|{'_id': ObjectId('507f191e810c-19729de860ea'), 'id':2,'first_name':'Giavani','last_name':'Frediani','email':'gfrediani1@senate.gov','gender':'Male','ip_address':'229.179.4.212'}                  |
|{'_id': O

In [6]:
# Case 1: make the "value" column valid JSON by quoting the Python-only
# literals True / False / None.
# The pattern only matches a bare literal in value position: preceded by ':',
# ',' or '[' and followed by ',', '}' or ']'. This keeps words that merely
# contain "True"/"None" untouched, covers False as well, and makes the cell
# safe to re-run (an already quoted "True" no longer matches).
# '$1' restores the leading delimiter and '$2' is the literal being quoted.
df_json = df_json.select(
    regexp_replace(
        df_json.value,
        r'([:,\[]\s*)(True|False|None)(?=\s*[,}\]])',
        '$1"$2"',
    ).alias('value')
)
df_json.show(20, False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{'_id': ObjectId('507f191e810c19729de860ea'),  'id':1,'first_name':'Jeanette','last_name':'Penddreth','email':'jpenddreth0@census.gov','gender':'Female','ip_address':'26.58.193.2', 'eschd': "True"}|
|{'_id': ObjectId('507f191e810c-19729de860ea'), 'id':2,'first_name':'Giavani','last_name':'Frediani','email':'gfrediani1@senate.gov','gender':'Male','ip_address':'229.179.4.212'}                    |


In [7]:
# Case 2: unwrap ObjectId(...) so only its quoted argument remains and the row
# becomes valid JSON.
# The '$1' backreference substitutes each match with its own captured argument,
# so it works for every ObjectId in the row and does not depend on the call
# being followed by a comma.
df_json = df_json.select(
    regexp_replace(df_json.value, r'ObjectId\((.+?)\)', '$1').alias('value')
)
df_json.show(20, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                      |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{'_id': '507f191e810c19729de860ea',  'id':1,'first_name':'Jeanette','last_name':'Penddreth','email':'jpenddreth0@census.gov','gender':'Female','ip_address':'26.58.193.2', 'eschd': "True"}|
|{'_id': '507f191e810c-19729de860ea', 'id':2,'first_name':'Giavani','last_name':'Frediani','email':'gfrediani1@senate.gov','gender':'Male','ip_address':'229.179.4.212'}                    |
|{'_id': '507f19-1e810c19729de860ea', 'id':3,'firs

In [8]:
# Explicit schema: every expected key is declared as a nullable string.
json_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        '_id',
        'id',
        'first_name',
        'last_name',
        'email',
        'gender',
        'ip_address',
        'eschd',
    )
])

# Parse the cleaned text with that schema and promote the parsed struct's
# fields to top-level columns.
df = (
    df_json
    .select("*", from_json(col("value"), json_schema).alias("parsed_data"))
    .select("parsed_data.*")
)

df.show(n=20, truncate=False)

+-------------------------+---+----------+---------+----------------------+------+--------------+-----+
|_id                      |id |first_name|last_name|email                 |gender|ip_address    |eschd|
+-------------------------+---+----------+---------+----------------------+------+--------------+-----+
|507f191e810c19729de860ea |1  |Jeanette  |Penddreth|jpenddreth0@census.gov|Female|26.58.193.2   |True |
|507f191e810c-19729de860ea|2  |Giavani   |Frediani |gfrediani1@senate.gov |Male  |229.179.4.212 |NULL |
|507f19-1e810c19729de860ea|3  |Noell     |Bea      |nbea2@imageshack.us   |Female|180.66.162.255|None |
|507f191e810c19729de-860ea|4  |Willard   |Valek    |wvalek3@vk.com        |Male  |67.76.188.26  |NULL |
+-------------------------+---+----------+---------+----------------------+------+--------------+-----+

