# Exemplo Spark.
Exemplo de dataflow especificado no Spark e instrumentado com o wrapper desenvolvido em Python.

## Composição:
Duas atividades de mapeamento que apenas recebem os dados de entrada e os repassam, sem alterações, para a saída.

In [1]:
import uuid
from subprocess import Popen, PIPE, STDOUT
from datetime import datetime
from random import randint
import findspark
findspark.init('/opt/spark-2.2.0-bin-hadoop2.7')
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import DataFrameWriter, Row
import sys
sys.path.append('../')
from PDE import PDE

In [2]:
# Configure Spark: local master using all available cores ("local[*]").
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName('pyspark')
)
# getOrCreate reuses a SparkContext that is already running, so re-executing
# this cell does not fail with "Cannot run multiple SparkContexts at once".
# When a context already exists, it is returned as-is and `conf` is not
# re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
# Not referenced anywhere else in the visible cells.
# NOTE(review): presumably the endpoint of the provenance service — confirm.
base_url = "http://localhost:22000/"

In [3]:
class MapTransformation(object):
    """Wrap an RDD-like object and expose a pass-through map over it."""

    def __init__(self, rdd):
        # Any object with a `map(function)` method works here.
        self._rdd = rdd

    def do_nothing(self):
        """Return the result of mapping the identity function over the RDD."""
        def identity(record):
            return record

        return self._rdd.map(identity)

In [4]:
# Prospective provenance: describe the dataflow structure to the PDG tool.
# Each step launches `java -jar PDG-1.0.jar <args>` and blocks until the
# process exits before the next one starts. Exit codes are not inspected.


def _run_pdg(*pdg_args):
    """Launch PDG-1.0.jar with ``pdg_args``, wait for it, return the Popen."""
    process = Popen(['java', '-jar', 'PDG-1.0.jar'] + list(pdg_args))
    process.wait()
    return process


# Dataflow.
p1 = _run_pdg('-dataflow', '-tag', 'spark-example')

# Transformations dt1 and dt2.
p2 = _run_pdg('-transformation', '-dataflow', 'spark-example',
              '-tag', 'dt1')
p3 = _run_pdg('-transformation', '-dataflow', 'spark-example',
              '-tag', 'dt2')

# Program attached to dt1.
p4 = _run_pdg('-program', '-dataflow', 'spark-example',
              '-transformation', 'dt1',
              '-name', 'testando', '-filepath', 'path')

# Datasets: ds1 (output of dt1), ds2 (input of dt2), ds3 (output of dt2).
p5 = _run_pdg('-set', '-dataflow', 'spark-example',
              '-transformation', 'dt1',
              '-tag', 'ds1', '-type', 'output')
p6 = _run_pdg('-set', '-dataflow', 'spark-example',
              '-transformation', 'dt2',
              '-tag', 'ds2', '-type', 'input')
p7 = _run_pdg('-set', '-dataflow', 'spark-example',
              '-transformation', 'dt2',
              '-tag', 'ds3', '-type', 'output')

# Extractors: one per dataset.
p8 = _run_pdg('-extractor', '-dataflow', 'spark-example',
              '-transformation', 'dt1', '-set', 'ds1',
              '-tag', 'ext1', '-algorithm', 'extraction', 'csv')
p9 = _run_pdg('-extractor', '-dataflow', 'spark-example',
              '-transformation', 'dt2', '-set', 'ds2',
              '-tag', 'ext2', '-algorithm', 'extraction', 'csv')
p10 = _run_pdg('-extractor', '-dataflow', 'spark-example',
               '-transformation', 'dt2', '-set', 'ds3',
               '-tag', 'ext3', '-algorithm', 'extraction', 'csv')

# Attributes: a text and a numeric attribute for each dataset.
p11 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt1', '-set', 'ds1',
               '-name', 'att1', '-type', 'text',
               '-extractor', 'ext1')
p12 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt1', '-set', 'ds1',
               '-name', 'att2', '-type', 'numeric',
               '-extractor', 'ext1')
p13 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt2', '-set', 'ds2',
               '-name', 'att3', '-type', 'text',
               '-extractor', 'ext2')
p14 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt2', '-set', 'ds2',
               '-name', 'att4', '-type', 'numeric',
               '-extractor', 'ext2')
p15 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt2', '-set', 'ds3',
               '-name', 'att5', '-type', 'text',
               '-extractor', 'ext3')
p16 = _run_pdg('-attribute', '-dataflow', 'spark-example',
               '-transformation', 'dt2', '-set', 'ds3',
               '-name', 'att6', '-type', 'numeric',
               '-extractor', 'ext3')

0

In [5]:
# Retrospective provenance and execution of the dataflow.
#
# Each PDG call launches `java -jar PDG-1.0.jar <args>` and blocks until it
# exits; exit codes are not inspected.
#
# Fix: the -task and -file calls below used to pass 'transformation' without
# its leading dash, unlike every other PDG invocation in this file; they now
# pass '-transformation' consistently.
#
# NOTE(review): '-resouce' is spelled this way in every -task call. It looks
# like a typo for '-resource', but it is left untouched until the flag name
# is confirmed against the PDG command-line interface.

# Register task t1 (transformation dt1) as running.
p17 = Popen(['java', '-jar', 'PDG-1.0.jar', '-task',
             '-dataflow', 'spark-example',
             '-transformation', 'dt1',
             '-id', 't1', '-subid',
             '1', '-resouce', 'resource',
             '-workspace', 'workspace',
             '-status', 'running'])
p17.wait()

# Record the start-time performance entry pf1 for t1.
p18 = Popen(['java', '-jar', 'PDG-1.0.jar', '-performance',
             '-dataflow', 'spark-example',
             '-transformation', 'dt1', '-task', 't1',
             '-starttime', '-description',
             'pf1', '-computation', 'description'])
p18.wait()

# Build the input: five rows with a random UUID string (att1) and a random
# integer in [0, 100] (att2).
rdd = sc.parallelize([Row(att1=str(uuid.uuid4()),
                          att2=randint(0, 100)) for _ in range(5)])

m1 = MapTransformation(rdd)

r2 = m1.do_nothing()

# Convert the mapped RDD to a DataFrame so it can be written as CSV.
df1 = sqlContext.createDataFrame(r2)

# Register the output file of t1.
p19 = Popen(['java', '-jar', 'PDG-1.0.jar', '-file', '-dataflow',
             'spark-example', '-transformation', 'dt1',
             '-id', 't1', '-name', 'dt1-output.csv',
             '-path', '.'])
p19.wait()

# Write the data as CSV from a single partition; mode='append' adds to any
# existing output at this path instead of failing.
df1.coalesce(1).write.save(path='dt1-output.csv', format='csv',
                           mode='append', sep=',')

# Record the end-time performance entry pf1 for t1.
p20 = Popen(['java', '-jar', 'PDG-1.0.jar', '-performance',
             '-dataflow', 'spark-example',
             '-transformation', 'dt1', '-task', 't1',
             '-endtime', '-description', 'pf1',
             '-computation', 'description'])
p20.wait()

# Mark task t1 as finished.
p21 = Popen(['java', '-jar', 'PDG-1.0.jar', '-task',
             '-dataflow', 'spark-example', '-transformation',
             'dt1', '-id', 't1', '-resouce', 'resource',
             '-workspace', 'workspace', '-status',
             'finished'])
p21.wait()

# Register task t2 (transformation dt2) as running.
p22 = Popen(['java', '-jar', 'PDG-1.0.jar', '-task', '-dataflow',
             'spark-example', '-transformation', 'dt2',
             '-id', 't2', '-resouce', 'resource', '-workspace',
             'workspace', '-status', 'running'])
p22.wait()

# Register t1's output as the input file of t2.
p23 = Popen(['java', '-jar', 'PDG-1.0.jar', '-file',
             '-dataflow', 'spark-example',
             '-transformation', 'dt2', '-id', 't2',
             '-name', 'dt1-output.csv', '-path', '.'])
p23.wait()

# Record the start-time performance entry pf2 for t2.
p24 = Popen(['java', '-jar', 'PDG-1.0.jar', '-performance',
             '-dataflow', 'spark-example',
             '-transformation', 'dt2', '-task', 't2',
             '-starttime', '-description',
             'pf2', '-computation', 'description'])
p24.wait()

# Read t1's CSV output back (written without a header row).
df2 = (sqlContext.read
       .format("com.databricks.spark.csv")
       .option("header", "false")
       .load('dt1-output.csv'))

# Feed the loaded rows into the second map transformation.
m2 = MapTransformation(df2.rdd)

# NOTE(review): no action is invoked on r3 in the visible code, so this map
# is presumably never evaluated and no ds3 output is written — confirm.
r3 = m2.do_nothing()

# Record the end-time performance entry pf2 for t2.
p25 = Popen(['java', '-jar', 'PDG-1.0.jar', '-performance',
             '-dataflow', 'spark-example',
             '-transformation', 'dt2', '-task', 't2',
             '-endtime', '-description', 'pf2',
             '-computation', 'description'])
p25.wait()

# Mark task t2 as finished.
p26 = Popen(['java', '-jar', 'PDG-1.0.jar',
             '-task', '-dataflow', 'spark-example',
             '-transformation', 'dt2', '-id',
             't2', '-resouce', 'resource', '-workspace',
             'workspace', '-status', 'finished'])
p26.wait()

0