# Condense PARQUET
Condenses a partitioned folder of PARQUET files into a single PARQUET file by repartitioning the data into
one partition and extracting the resulting single PARQUET file

In [None]:
%%bash
# Installs the pinned pyspark / wget / pyspark2pmml packages matching the
# running Python interpreter. All pip output goes to install.log, which is
# printed only if the installation fails.

# Reduce "Python 3.8.10" to the compact major+minor token "38".
# stderr is merged in because interpreters older than 3.4 print the version
# banner there; without it $version would be empty for them.
version=$(python --version 2>&1 | awk '{print $2}' | awk -F"." '{print $1$2}')
export version

echo "$version"

# Pick the pyspark release pinned for this interpreter version.
case "$version" in
    36|37) pyspark_version='2.4.8' ;;
    38|39) pyspark_version='3.1.2' ;;
    *)
        echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'
        # Any non-zero status marks the cell as failed; 1 stays inside the
        # portable 0-255 range (the previous "exit -1" did not).
        exit 1
        ;;
esac

echo 'Starting installation...'
# "2>&1" makes stderr share stdout's file descriptor. Opening install.log
# twice ("> install.log 2> install.log") gives each stream its own offset,
# so they overwrite each other's output.
if pip3 install "pyspark==${pyspark_version}" wget==3.2 pyspark2pmml==0.5.1 > install.log 2>&1; then
    echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
else
    echo 'Installation failed, please check log:'
    cat install.log
fi

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import sys
import re

In [None]:
# target parquet file name, relative to data_dir (default: data_condensed.parquet)
# NOTE: read from its own environment variable; reading 'data_parquet' here
# would make the output path identical to the input path whenever the input
# name is overridden.
data_parquet_condensed = os.environ.get('data_parquet_condensed', 'data_condensed.parquet')

# source parquet folder name, relative to data_dir (default: data.parquet)
data_parquet = os.environ.get('data_parquet', 'data.parquet')

# url of the Spark master (default: local mode)
master = os.environ.get('master', "local[*]")

# data_dir temporary data storage for local execution
data_dir = os.environ.get('data_dir', '../../data/')

In [None]:
# override parameters received from a potential call using %run magic
def parse_parameter_overrides(argv):
    """Extract ``name=value`` overrides from a list of command-line arguments.

    Each argument is split on its FIRST ``=`` only, so the value may itself
    contain ``=`` or quote characters. Arguments without ``=`` and arguments
    whose left-hand side is not a valid Python identifier (for example
    ``--opt=1`` or a path) are ignored.

    Parameters:
        argv: iterable of argument strings, typically ``sys.argv``.

    Returns:
        A list of ``(name, value)`` string pairs in their original order.
    """
    overrides = []
    for argument in argv:
        name, separator, value = argument.partition('=')
        if separator and name.isidentifier():
            overrides.append((name, value))
    return overrides


parameters = parse_parameter_overrides(sys.argv)

# Assign the values directly instead of building source text for exec():
# the value is stored verbatim as a string and is never evaluated as code.
# Later occurrences of the same name win, as assignments run in order.
for parameter_name, parameter_value in parameters:
    globals()[parameter_name] = parameter_value

In [None]:
# Obtain the SparkContext for the configured master (an already running
# context is reused), then the SparkSession via the builder.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster(master)
)
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Load the partitioned parquet folder. os.path.join inserts the separator only
# when data_dir lacks a trailing one, so the path matches the
# "{data_dir}/{name}" form used by the shell cells of this notebook.
df = spark.read.parquet(os.path.join(data_dir, data_parquet))

In [None]:
# Collapse the data into one partition so the write emits a single part file.
df = df.repartition(numPartitions=1)

In [None]:
# Write the single-partition frame as a parquet folder. os.path.join inserts
# the separator only when data_dir lacks a trailing one, so the folder lands
# where the following shell cells ("{data_dir}/{data_parquet_condensed}")
# look for it.
df.write.parquet(os.path.join(data_dir, data_parquet_condensed))

In [None]:
# Move the parquet part file out of the output folder to a temporary name.
# A quoted path plus a shell glob replaces parsing `ls | grep .parquet`:
# the glob matches only names ending in ".parquet" (the unquoted grep regex
# treated "." as "any character") and the quotes keep paths with spaces intact.
!mv -- "{data_dir}/{data_parquet_condensed}"/*.parquet "{data_dir}/{data_parquet_condensed}_tmp"

In [None]:
# Delete the output folder. The path is quoted so whitespace cannot split it
# into several rm targets, and "--" stops a leading "-" from being read as
# an option.
!rm -Rf -- "{data_dir}/{data_parquet_condensed}"

In [None]:
# Give the temporary file the final target name. Paths are quoted so
# whitespace cannot split them, and "--" ends option parsing.
!mv -- "{data_dir}/{data_parquet_condensed}_tmp" "{data_dir}/{data_parquet_condensed}"