# PySpark - Operações de Transformação e Ação

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

## Operações de Transformação

In [0]:
# Create the SparkSession for this notebook (getOrCreate reuses an active session if there is one)
spark = SparkSession.builder.appName("Map").getOrCreate()

In [0]:
# Load the text file that the transformation examples below operate on
data = spark.sparkContext.textFile(
    "dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/README.md"
)

## map(func)

In [0]:
# Pair every line of the file with its length in characters
mapFile = data.map(lambda text: (text, len(text)))

# foreach(print) writes on the workers, so nothing shows up on screen here
mapFile.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_mapFile = mapFile.collect()

# Print one (line, length) tuple per output row
for pair in out_mapFile:
    print(pair)

('# Apache Spark', 14)
('', 0)
('Spark is a unified analytics engine for large-scale data processing. It provides', 80)
('high-level APIs in Scala, Java, Python, and R, and an optimized engine that', 75)
('supports general computation graphs for data analysis. It also supports a', 73)
('rich set of higher-level tools including Spark SQL for SQL and DataFrames,', 74)
('pandas API on Spark for pandas workloads, MLlib for machine learning, GraphX for graph processing,', 98)
('and Structured Streaming for stream processing.', 47)
('', 0)
('<https://spark.apache.org/>', 27)
('', 0)
('[![GitHub Action Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master&event=push)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster+event%3Apush)', 234)
('[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwa

## flatMap(func)

In [0]:
# Split each line on whitespace and flatten the pieces into a single RDD of words
flatFile = data.flatMap(lambda text: text.split())

# foreach(print) writes on the workers, so nothing shows up on screen here
flatFile.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_flatFile = flatFile.collect()

# Print one word per output row
for word in out_flatFile:
    print(word)

#
Apache
Spark
Spark
is
a
unified
analytics
engine
for
large-scale
data
processing.
It
provides
high-level
APIs
in
Scala,
Java,
Python,
and
R,
and
an
optimized
engine
that
supports
general
computation
graphs
for
data
analysis.
It
also
supports
a
rich
set
of
higher-level
tools
including
Spark
SQL
for
SQL
and
DataFrames,
pandas
API
on
Spark
for
pandas
workloads,
MLlib
for
machine
learning,
GraphX
for
graph
processing,
and
Structured
Streaming
for
stream
processing.
<https://spark.apache.org/>
[![GitHub
Action
Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master&event=push)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster+event%3Apush)
[![AppVeyor
Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
[![PySpark
Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/bad

## filter(func)

In [0]:
# Split the file into words and keep only those that start with a lowercase "a"
flatFile = data.flatMap(lambda text: text.split()).filter(
    lambda token: token.startswith("a")
)

# foreach(print) writes on the workers, so nothing shows up on screen here
flatFile.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_flatFile = flatFile.collect()

# Print one matching word per output row
for word in out_flatFile:
    print(word)

a
analytics
and
and
an
analysis.
also
a
and
and
a
and
a
available
at
an
also
also
a
a
and
also
an
abbreviated
are
a
also
a
and
against
at
and
a
and
an


## reduceByKey(func)

In [0]:
# Sample input. Named `words` (not `list`) so the built-in `list` type is not
# shadowed for every cell that runs afterwards in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]

rdd = spark.sparkContext.parallelize(words)

# Emit (word, 1) for every word, then add up the ones belonging to the same word
rdd2 = (rdd.map(lambda w: (w, 1))
        .reduceByKey(lambda a, b: a + b))

# foreach(print) writes on the workers, so nothing shows up on screen here
rdd2.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rdd2.collect()

# Print one (word, count) tuple per output row
for pair in out_rdd:
    print(pair)

('tres', 1)
('um', 2)
('dois', 2)


## sortByKey(ascending)

In [0]:
# Sample input. Named `words` (not `list`) so the built-in `list` type is not
# shadowed for every cell that runs afterwards in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]

rdd = spark.sparkContext.parallelize(words)

# Count the occurrences of each word, then order the (word, count) pairs by key.
# sortByKey expects a boolean `ascending` flag: a string such as "asc" only
# appeared to work because any non-empty string is truthy ("desc" would have
# sorted ascending as well).
rdd2 = (rdd.map(lambda w: (w, 1))
        .reduceByKey(lambda a, b: a + b)
        .sortByKey(ascending=True))

# foreach(print) writes on the workers, so nothing shows up on screen here
rdd2.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rdd2.collect()

# Print one (word, count) tuple per output row, in key order
for pair in out_rdd:
    print(pair)

('dois', 2)
('tres', 1)
('um', 2)


## union(rdd)

In [0]:
# Sample inputs. Named `words` / `more_words` (not `list` / `list2`) so the
# built-in `list` type is not shadowed for later cells in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]
more_words = ["quatro", "cinco"]

rdd = spark.sparkContext.parallelize(words)
rdd2 = spark.sparkContext.parallelize(more_words)

# union keeps every element of both RDDs, duplicates included
rddUnion = rdd.union(rdd2)

# foreach(print) writes on the workers, so nothing shows up on screen here
rddUnion.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rddUnion.collect()

# Print one element per output row
for word in out_rdd:
    print(word)

um
um
dois
dois
tres
quatro
cinco


## intersection(rdd)

In [0]:
# Sample inputs. Named `words` / `other_words` (not `list` / `list2`) so the
# built-in `list` type is not shadowed for later cells in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]
other_words = ["um", "cinco"]

rdd = spark.sparkContext.parallelize(words)
rdd2 = spark.sparkContext.parallelize(other_words)

# intersection keeps only the elements present in both RDDs
rddIntersection = rdd.intersection(rdd2)

# foreach(print) writes on the workers, so nothing shows up on screen here
rddIntersection.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rddIntersection.collect()

# Print one element per output row
for word in out_rdd:
    print(word)

um


## distinct()

In [0]:
# Sample input. Named `words` (not `list`) so the built-in `list` type is not
# shadowed for every cell that runs afterwards in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]

rdd = spark.sparkContext.parallelize(words)

# distinct drops the repeated elements, leaving each value once
rddDistinct = rdd.distinct()

# foreach(print) writes on the workers, so nothing shows up on screen here
rddDistinct.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rddDistinct.collect()

# Print one element per output row
for word in out_rdd:
    print(word)

tres
um
dois


## join(rdd)

In [0]:
# Two key/value datasets sharing the person's name as key. Named `ages` /
# `cities` (not `list` / `list2`) so the built-in `list` type is not shadowed
# for later cells in the same kernel.
ages = [("Pedro", 38), ("Maria", 21)]
cities = [("Pedro", "BH"), ("Maria", "SP")]

rdd = spark.sparkContext.parallelize(ages)
rdd2 = spark.sparkContext.parallelize(cities)

# join matches the pairs by key, yielding (name, (age, city))
rddJoin = rdd.join(rdd2)

# foreach(print) writes on the workers, so nothing shows up on screen here
rddJoin.foreach(print)

# collect() pulls the data from the workers back to the master (driver)
out_rdd = rddJoin.collect()

# Print one joined record per output row
for record in out_rdd:
    print(record)

('Maria', (21, 'SP'))
('Pedro', (38, 'BH'))


## Ações

In [0]:
# Sample inputs. Named `words` / `other_words` (not `list` / `list2`) so the
# built-in `list` type is not shadowed for later cells in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]
other_words = ["um", "quatro", "cinco"]

rdd = spark.sparkContext.parallelize(words)
rdd2 = spark.sparkContext.parallelize(other_words)

# Despite its name, rddJoin holds the union of both RDDs; the name is kept
# because the action cells that follow read it.
rddJoin = rdd.union(rdd2)

# foreach -> runs an operation on each element of the RDD
rddJoin.foreach(print)

# collect() -> returns the whole content of the RDD to the driver, in memory
print(rddJoin.collect())

# count() -> number of elements in the RDD
print(rddJoin.count())

['um', 'um', 'dois', 'dois', 'tres', 'um', 'quatro', 'cinco']
8


In [0]:
# take(n) -> returns the first n elements of the RDD as a list (here n = 1)
rddJoin.take(1)

Out[42]: ['um']

In [0]:
# top(k) -> returns the k largest elements as a list (here k = 3);
# the elements are strings, so "largest" means last in string ordering
rddJoin.top(3)

Out[43]: ['um', 'um', 'um']

In [0]:
# countByValue() -> returns a dict mapping each distinct element to how many times it occurs
rddJoin.countByValue()

Out[44]: defaultdict(int, {'um': 3, 'dois': 2, 'tres': 1, 'quatro': 1, 'cinco': 1})

In [0]:
# reduce(func) -> folds all elements into one value by applying func pairwise;
# here the words are concatenated with a single space between them.
# NOTE(review): string concatenation is not commutative, so the word order in
# the result depends on how the partitions get combined — confirm it is not relied upon.
rddJoin.reduce(lambda left, right: left + ' ' + right)

Out[45]: 'um um dois dois tres um quatro cinco'

In [0]:
# saveAsTextFile(path) -> writes the RDD as text files under the given directory;
# the listing in the next cell shows several part-* files plus a _SUCCESS marker.
# NOTE(review): Spark normally refuses to write into an output directory that
# already exists, so re-running this cell may fail until the directory is removed — confirm.
rddJoin.saveAsTextFile("dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/")

In [0]:
%fs
ls "dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/"

path,name,size,modificationTime
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/_SUCCESS,_SUCCESS,0,1654990640000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00000,part-00000,0,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00001,part-00001,3,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00002,part-00002,0,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00003,part-00003,3,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00004,part-00004,5,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00005,part-00005,0,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00006,part-00006,5,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00007,part-00007,5,1654990639000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao/part-00008,part-00008,0,1654990640000


In [0]:
# Sample inputs. Named `words` / `other_words` (not `list` / `list2`) so the
# built-in `list` type is not shadowed for later cells in the same kernel.
words = ["um", "um", "dois", "dois", "tres"]
other_words = ["um", "quatro", "cinco"]

rdd = spark.sparkContext.parallelize(words)
rdd2 = spark.sparkContext.parallelize(other_words)

# Despite its name, rddJoin holds the union of both RDDs
rddJoin = rdd.union(rdd2)

# Collapse everything into one partition before saving, so the output
# directory ends up with a single part-00000 file instead of one per partition
rddJoin.repartition(1).saveAsTextFile("dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao-OnePartition/")

In [0]:
%fs
ls "dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao-OnePartition/"

path,name,size,modificationTime
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao-OnePartition/_SUCCESS,_SUCCESS,0,1655302033000
dbfs:/FileStore/shared_uploads/ericscunha@hotmail.com/TesteAcao-OnePartition/part-00000,part-00000,37,1655302032000
