# spark-json-to-parquet

Converts a JSON file to Parquet using Apache Spark.

In [None]:
!pip install pyspark==3.3.1

In [None]:
import glob
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import shutil
import sys
import logging
import re

In [None]:
# Name of the input JSON file inside data_dir (default: data.json)
data_json = os.getenv('data_json', 'data.json')

# Name of the parquet output inside data_dir (default: data.parquet)
output_data_parquet = os.getenv('output_data_parquet', 'data.parquet')

# Spark master URL (default: local mode)
master = os.getenv('master', 'local[*]')

# Directory used for temporary data storage during local execution
data_dir = os.getenv('data_dir', '../../data/')

In [None]:
# Override notebook variables with ``name=value`` command-line arguments.
# The name must be a valid identifier; the value is everything after the
# first '=' and is always stored as a plain string.
_PARAMETER_PATTERN = re.compile(r'([A-Za-z_][A-Za-z0-9_]*)=(.*)', re.DOTALL)


def parse_parameters(argv):
    """Return the ``(name, value)`` pairs found in *argv*, in order.

    Arguments that are not of the form ``name=value`` (for example
    ``--flag=1``, ``=value`` or plain words) are skipped. The whole argument
    is matched, so nothing after the name can bypass validation.
    """
    pairs = []
    for argument in argv:
        matched = _PARAMETER_PATTERN.fullmatch(argument)
        if matched:
            pairs.append(matched.groups())
    return pairs


parameter_pairs = parse_parameters(sys.argv)

# Kept in the historical ``name="value"`` form, which is what gets logged.
parameters = [name + '="' + value + '"' for name, value in parameter_pairs]

for parameter in parameters:
    logging.warning('Parameter: ' + parameter)

# Bind the values directly instead of exec()-ing text built from the
# arguments: a value containing a quote can no longer inject code, and values
# containing '=' or a backslash are stored verbatim. Later duplicates win.
globals().update(dict(parameter_pairs))

In [None]:
# Get or create the SparkContext for the configured master, then get or
# create the SparkSession used by the cells below.
spark_conf = SparkConf().setMaster(master)
sc = SparkContext.getOrCreate(spark_conf)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the JSON input (data_dir followed by the file name) into a DataFrame.
json_path = data_dir + data_json
df = spark.read.json(json_path)

In [None]:
# Remove whatever a previous run left at the output path, whether it is a
# plain file or a directory, so the parquet write starts from a clean path.
destination = data_dir + output_data_parquet

if os.path.isfile(destination):
    os.remove(destination)
elif os.path.isdir(destination):
    shutil.rmtree(destination)

In [None]:
# Reduce to a single partition before writing the parquet output.
single_partition_df = df.coalesce(1)
single_partition_df.write.parquet(destination)

In [None]:
# Spark wrote a directory at `destination`; replace that directory with the
# parquet part file it contains so the output is one plain file.
# glob.escape keeps characters such as '[' or '*' in the path from being
# interpreted as wildcards.
part_files = glob.glob(glob.escape(destination) + '/*.parquet')
if not part_files:
    raise FileNotFoundError('no parquet part file found in ' + destination)
source = part_files[0]

# Park the part file next to the directory, drop the directory, then move the
# file into the directory's former place.
temporary_path = data_dir + output_data_parquet + '.tmp'
shutil.move(source, temporary_path)
shutil.rmtree(destination)
shutil.move(temporary_path, destination)