# Samples from a parquet source using Apache Spark

In [None]:
%%bash
# Install the PySpark stack that matches the running Python version.
#
# The interpreter's major/minor version is collapsed into a short tag
# (e.g. "3.8.10" -> "38") and used to pick the pyspark release to install.
# All pip output is captured in install.log and only shown on failure.

# 2>&1: some interpreters print the version banner on stderr.
version=$(python --version 2>&1 | awk '{print $2}' | awk -F"." '{print $1$2}')
export version

echo "$version"

#######################################
# Run pip3 install for the given requirement specifiers and report the
# outcome to the user.
# Arguments: requirement specifiers passed straight to `pip3 install`
# Outputs:   status message on stdout; the log content on failure
#######################################
install_packages() {
    echo 'Starting installation...'
    # "2>&1" makes stderr share stdout's file description. Opening
    # install.log twice (> file 2> file) lets the two streams overwrite
    # each other and garbles the log.
    if pip3 install "$@" > install.log 2>&1; then
        echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
    else
        echo 'Installation failed, please check log:'
        cat install.log
    fi
}

case "$version" in
    36 | 37)
        install_packages pyspark==2.4.8 wget==3.2 pyspark2pmml==0.5.1
        ;;
    38 | 39)
        install_packages pyspark==3.1.2 wget==3.2 pyspark2pmml==0.5.1
        ;;
    *)
        echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'
        # 255 is the status bash reports for the former "exit -1".
        exit 255
        ;;
esac

In [None]:
# @param data_dir temporary data storage for local execution
# @param data_parquet path and parquet file name (default: data.parquet)
# @param data_parquet_target path and parquet file name
# (default: data_sample.parquet)
# @param master url of master (default: local mode)
# @param sampling_rate fraction of rows to sample (default: 0.1, i.e. 10%)
# @param sampling_seed (default: 42)

In [None]:
import os
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
import shutil

In [None]:
# Runtime parameters: each one is read from the environment variable of the
# same name and falls back to the default shown here when it is not set.
data_parquet = os.environ.get('data_parquet', 'data.parquet')
data_parquet_target = os.environ.get(
    'data_parquet_target', 'data_sample.parquet')
master = os.environ.get('master', "local[*]")
# Used as a plain string prefix for both parquet paths, so it must end with
# a path separator.
data_dir = os.environ.get('data_dir', '../../data/')
# Fraction of rows handed to DataFrame.sample.
sampling_rate = float(os.environ.get('sampling_rate', 0.1))
# The seed has its own variable. Reading 'sampling_rate' here made the seed
# unconfigurable and raised ValueError on int('0.1') once a rate was set.
sampling_seed = int(os.environ.get('sampling_seed', 42))

In [None]:
# Nothing has to be done when the sampled output is already on disk; every
# later step is guarded by this flag.
skip = os.path.exists(data_dir + data_parquet_target)

In [None]:
if not skip:
    # Create (or reuse) the SparkContext for the configured master, then
    # obtain a SparkSession.
    spark_conf = SparkConf().setMaster(master)
    sc = SparkContext.getOrCreate(spark_conf)
    spark = SparkSession.builder.getOrCreate()

In [None]:
if not skip:
    # Load the source parquet data set from the data directory.
    df = spark.read.parquet(f'{data_dir}{data_parquet}')

In [None]:
if not skip:
    # Keep a random subset of the rows; the seed is passed explicitly so it
    # can be controlled through the sampling_seed parameter.
    df = df.sample(fraction=sampling_rate, seed=sampling_seed)

In [None]:
if not skip:
    # Clear any leftover at the target path before writing; errors from the
    # removal are ignored (ignore_errors=True).
    shutil.rmtree(f'{data_dir}{data_parquet_target}', ignore_errors=True)

In [None]:
if not skip:
    # Persist the sampled rows as parquet at the target path.
    df.write.parquet(f'{data_dir}{data_parquet_target}')