In [2]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

In [3]:
# Estabilish connection and load data into memory

spark = (SparkSession.builder
        .appName("Map")
        .getOrCreate())

# Get data
data = spark.sparkContext.textFile("/opt/homebrew/Cellar/apache-spark/3.3.1/README.md")

23/01/31 02:32:53 WARN Utils: Your hostname, Deboras-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.68.104 instead (on interface en0)
23/01/31 02:32:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/31 02:32:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/31 02:32:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Transformation

### map()

In [4]:
# Use map()
#For every line in the doc, create a tuple containing its content and lenght (content, lenght)
mapFile = data.map(lambda line : (line, len(line)))

# Print tuples
# foreach() is a action that starts the execution
mapFile.foreach(print)

('# Apache Spark', 14)
('', 0)
('Spark is a unified analytics engine for large-scale data processing. It provides', 80)
('high-level APIs in Scala, Java, Python, and R, and an optimized engine that', 75)
('supports general computation graphs for data analysis. It also supports a', 73)
('rich set of higher-level tools including Spark SQL for SQL and DataFrames,', 74)
('pandas API on Spark for pandas workloads, MLlib for machine learning, GraphX for graph processing,', 98)
('and Structured Streaming for stream processing.', 47)
('', 0)
('<https://spark.apache.org/>', 27)
('', 0)
('', 0)
('[![GitHub Action Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master&event=push)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster+event%3Apush)', 234)
('```python', 9)
('>>> spark.range(1000 * 1000 * 1000).count()', 43)
('[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.s

### flatMap()

In [5]:
# Use flatMap()
# Split line into words
flatFile = data.flatMap(lambda line : line.split())

# Print words
flatFile.foreach(print)

#
Apache
Spark
Spark
is
a
unified
analytics
engine
for
large-scale
data
processing.
It
provides
high-level
APIs
in
Scala,
Java,
Python,
and
R,
and
an
optimized
engine
that
supports
general
computation
graphs
for
data
analysis.
It
also
supports
a
rich
set
of
higher-level
tools
including
Spark
SQL
for
SQL
and
DataFrames,
pandas
API
on
Spark
for
pandas
workloads,
MLlib
for
machine
learning,
GraphX
for
graph
processing,
and
Structured
Streaming
for
stream
processing.
<https://spark.apache.org/>
[![GitHub
Action
Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master&event=push)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster+event%3Apush)
[![AppVeyor
Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
[![PySpark
Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/bad

### filter(func)

In [6]:
# Use filter()
# Only words starts with "a"
filterFile = flatFile.filter(lambda word : word.startswith("a"))

# Print words
filterFile.foreach(print)

a
analytics
and
and
an
analysis.
also
a
and
and
a
and
a
available
at
an
also
also
a
a
and
also
an
abbreviated
are
a
also
a
and
against
at
and
a
and
an


### reduceByKey(func)

In [7]:
list = ["um", "um", "dois", "dois", "três"]

rdd = spark.sparkContext.parallelize(list) #transform list in RDD

# map list into tuple and use reduceByKey to count frequency of every word
rdd2 = rdd.map(lambda x: (x, 1)).reduceByKey(lambda a,b : a+b)

rdd2.foreach(print)

('três', 1)
('dois', 2)
('um', 2)


### sortByKey(func)

In [8]:
# map list into tuple, use reduceByKey to count frequency of every word and sort by key
rdd2 = rdd.map(lambda x: (x, 1)).reduceByKey(lambda a,b : a+b).sortByKey("asc")

rdd2.foreach(print)

('um', 2)
('dois', 2)
('três', 1)


### union(rdd)

In [9]:
list2 = ["um", "quatro", "cinco"]

rdd2 = spark.sparkContext.parallelize(list2)

rddUnion = rdd.union(rdd2)

rddUnion.foreach(print)

dois
três
um
um
dois
um
quatro
cinco


### intersection(rdd)

In [10]:
rddIntersection = rdd.intersection(rdd2)
rddIntersection.foreach(print)

um


### distinct(rdd)

In [11]:
rddDistinct = rdd.distinct()
rddDistinct.foreach(print)

dois
um
três


### join(rdd)

In [12]:
list = [("Pedro", 38), ("Maria", 42), ("João", 12)]
list2 = [("Pedro", "BH"), ("Maria", "DF")]

rdd = spark.sparkContext.parallelize(list)
rdd2 = spark.sparkContext.parallelize(list2)

rddJoin = rdd.join(rdd2)

rddJoin.foreach(print)

('Maria', (42, 'DF'))
('Pedro', (38, 'BH'))


## Action

### foreach(func)

In [13]:
list = [("Pedro", 38), ("Maria", 42), ("João", 12)]
list2 = [("Pedro", "BH"), ("Maria", "DF")]

rdd = spark.sparkContext.parallelize(list)
rdd2 = spark.sparkContext.parallelize(list2)

rddJoin = rdd.join(rdd2)
#rddJoin.foreach(print)

### collect()

In [14]:
print(rddJoin.collect())

[('Maria', (42, 'DF')), ('Pedro', (38, 'BH'))]


### count()

In [15]:
print(rddJoin.count())

2


### take(n)

In [16]:
# rddUnion.foreach(print)
rddUnion.take(4)

['um', 'um', 'dois', 'dois']

### top(k)

In [17]:
# rddUnion.foreach(print)
rddUnion.top(4)

['um', 'um', 'um', 'três']

### countByValue()

In [18]:
rddUnion.countByValue()

defaultdict(int, {'um': 3, 'dois': 2, 'três': 1, 'quatro': 1, 'cinco': 1})

### reduce(func)

In [19]:
rddUnion.reduce(lambda a,b : a + ' ' + b)

'um um dois dois três um quatro cinco'

In [20]:
rddUnion

UnionRDD[23] at union at NativeMethodAccessorImpl.java:0

### saveAsTextFile(path)

In [21]:
rddUnion.saveAsTextFile('./MeuRDD')