In [1]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Intended to be registered with ``atexit`` so the cluster is released when
    the script exits or crashes.

    Parameters
    ----------
    sj : object exposing ``stop()``
        The Spark job handle; stopped last.
    sc : object exposing ``stop()``
        The Spark context; stopped first.

    Ordinary errors raised by either ``stop()`` call are reported on stdout
    and suppressed so that one failed cleanup step never prevents the other.
    ``KeyboardInterrupt`` / ``SystemExit`` are not swallowed, but the job stop
    is still attempted before they propagate.
    """
    try:
        print('Trapped Exit cleaning up Spark Context')
        try:
            sc.stop()
        except Exception as err:
            # Report instead of hiding the failure; cleanup stays best-effort.
            print('Failed to stop Spark Context: {0!r}'.format(err))
    finally:
        # Runs even if stopping the context was interrupted, so the job is
        # always asked to stop.
        print('Trapped Exit cleaning up Spark Job')
        try:
            sj.stop()
        except Exception as err:
            print('Failed to stop Spark Job: {0!r}'.format(err))

findspark.init()

# ---- Spark cluster request ------------------------------------------------
# Be realistic about the walltime: an idle Spark cluster holds resources that
# other users could be running on.
nodes = 3
tasks_per_node = 8
memory_per_task = 1024  # original note: "1 gig per process"; tune as needed
walltime = "1:00"       # original note: "1 hour" -- NOTE(review): confirm the format sparkhpc expects
os.environ.update({'SBATCH_PARTITION': 'single'})  # ARC partition to submit to

# Describe the job: total cores, cores per executor, memory per core, walltime.
sj = sparkjob.sparkjob(
    walltime=walltime,
    memory_per_core=memory_per_task,
    cores_per_executor=tasks_per_node,
    ncores=nodes * tasks_per_node,
)

# Block until the job is running, then obtain a SparkContext from it.
sj.wait_to_start()
sc = sj.start_spark()

# Make sure the context and the job are stopped when the interpreter exits.
atexit.register(exitHandler, sj, sc)

# SparkSQL entry point; required for the DataFrame operations used later.
sqlCtx = SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 610059

INFO:sparkhpc.sparkjob:Submitted cluster 0


In [31]:
from pyspark.sql.functions import rand

In [32]:
# Input: one RDD element per line of the filtered JSON file.
filtered_file = sc.textFile('allFiltered2.json')

# Outputs: three files that will receive the shuffled lines. They are opened
# for writing here and are written to and closed by later cells.
shard_paths = (
    'shuffled_filtered1.json',
    'shuffled_filtered2.json',
    'shuffled_filtered3.json',
)
handle1 = open(shard_paths[0], 'w')
handle2 = open(shard_paths[1], 'w')
handle3 = open(shard_paths[2], 'w')

In [33]:
# Wrap every input line in a 1-tuple so the RDD becomes a single-column
# DataFrame whose column is named 'row'.
df = filtered_file.map(lambda x: (x,)).toDF(['row'])

# Ordering by rand() yields a random permutation of the rows. No seed is
# passed, so the permutation differs from run to run.
df = df.orderBy(rand())

# Output files in the order rows are dealt to them.
shard_handles = (handle1, handle2, handle3)

# Stream the rows to the driver one partition at a time rather than calling
# collect(), which would hold the entire shuffled dataset in driver memory.
# Row i goes to file (i mod 3), i.e. the same round-robin assignment as a
# 0/1/2 counter that wraps around.
for row_index, record in enumerate(df.rdd.toLocalIterator()):
    shard_handles[row_index % len(shard_handles)].write(str(record['row']) + '\n')

In [34]:
# Flush and release the three output files, in the order they were opened.
for shard_handle in (handle1, handle2, handle3):
    shard_handle.close()