<a href="https://colab.research.google.com/github/elbyvaz/data_engineering/blob/main/spark/spark_readme_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# %pip install -q pyspark==3.5.0  # uncomment when pyspark is not already installed

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=cafece1c186eb5f5c61ec4564af05f984644df005dd71d51318281f1a81cce4d
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [3]:
# Start (or reuse) a Spark session and check which Spark version is running
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Readme exercise")
    .getOrCreate()
)

spark.version

'3.5.0'

In [5]:
# Read the README into an RDD (one element per line of text) and count the lines
filename = "/content/sample_data/README.md"
linesRdd = spark.sparkContext.textFile(name=filename)
linesRdd.count()

125

In [6]:
# map: turn every line into a (line, length) tuple
mapRdd = linesRdd.map(lambda text: (text, len(text)))
mapRdd.collect()

[('# Apache Spark', 14),
 ('', 0),
 ('Spark is a unified analytics engine for large-scale data processing. It provides',
  80),
 ('high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
  75),
 ('supports general computation graphs for data analysis. It also supports a',
  73),
 ('rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
  74),
 ('pandas API on Spark for pandas workloads, MLlib for machine learning, GraphX for graph processing,',
  98),
 ('and Structured Streaming for stream processing.', 47),
 ('', 0),
 ('<https://spark.apache.org/>', 27),
 ('', 0),
 ('[![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml)',
  167),
 ('[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)',
  189),


In [8]:
# flatMap: split each line on whitespace and flatten the pieces into a single RDD of words
flatMapRdd = linesRdd.flatMap(lambda text: text.split())
flatMapRdd.collect()

['#',
 'Apache',
 'Spark',
 'Spark',
 'is',
 'a',
 'unified',
 'analytics',
 'engine',
 'for',
 'large-scale',
 'data',
 'processing.',
 'It',
 'provides',
 'high-level',
 'APIs',
 'in',
 'Scala,',
 'Java,',
 'Python,',
 'and',
 'R,',
 'and',
 'an',
 'optimized',
 'engine',
 'that',
 'supports',
 'general',
 'computation',
 'graphs',
 'for',
 'data',
 'analysis.',
 'It',
 'also',
 'supports',
 'a',
 'rich',
 'set',
 'of',
 'higher-level',
 'tools',
 'including',
 'Spark',
 'SQL',
 'for',
 'SQL',
 'and',
 'DataFrames,',
 'pandas',
 'API',
 'on',
 'Spark',
 'for',
 'pandas',
 'workloads,',
 'MLlib',
 'for',
 'machine',
 'learning,',
 'GraphX',
 'for',
 'graph',
 'processing,',
 'and',
 'Structured',
 'Streaming',
 'for',
 'stream',
 'processing.',
 '<https://spark.apache.org/>',
 '[![GitHub',
 'Actions',
 'Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml)',
 '[![AppVeyor',
 'Build](https:/

In [9]:
# filter: keep only the words that begin with a lowercase 'a'
filterRdd = (
    linesRdd
    .flatMap(lambda text: text.split())
    .filter(lambda token: token.startswith('a'))
)

filterRdd.collect()

['a',
 'analytics',
 'and',
 'and',
 'an',
 'analysis.',
 'also',
 'a',
 'and',
 'and',
 'a',
 'and',
 'a',
 'available',
 'at',
 'an',
 'also',
 'also',
 'a',
 'a',
 'and',
 'also',
 'an',
 'abbreviated',
 'are',
 'a',
 'also',
 'a',
 'and',
 'against',
 'at',
 'and',
 'a',
 'and',
 'an']

In [10]:
# reduceByKey: count how many times each word occurs
lista = ['um', 'um', 'dois', 'tres']
rdd = spark.sparkContext.parallelize(lista)
# pair every word with 1, then add up the 1s that share the same word
rdd2 = (
    rdd
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

rdd2.collect()

[('um', 2), ('dois', 1), ('tres', 1)]

In [11]:
# sortByKey: count the words, then order the (word, count) pairs by word
lista = ['um', 'um', 'dois', 'tres']
rdd = spark.sparkContext.parallelize(lista)
# sortByKey takes a boolean `ascending` flag, not an 'asc'/'desc' string:
# any non-empty string is truthy, so even 'desc' would sort ascending.
rdd2 = rdd.map(lambda w: (w, 1)) \
          .reduceByKey(lambda a,b: a+b) \
          .sortByKey(ascending=True)

rdd2.collect()

[('dois', 1), ('tres', 1), ('um', 2)]

In [12]:
# sortBy: count the words, then order the (word, count) pairs by their count
lista = ['um', 'um', 'dois', 'tres']
rdd = spark.sparkContext.parallelize(lista)
# the key function picks the value (occurrence count) out of each pair
rdd2 = (
    rdd
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1])
)

rdd2.collect()

[('dois', 1), ('tres', 1), ('um', 2)]

In [13]:
# union: concatenate two RDDs; repeated elements are kept
lista1 = ['um', 'um', 'dois', 'tres']
rdd1 = spark.sparkContext.parallelize(lista1)

lista2 = ['quatro', 'cinco']
rdd2 = spark.sparkContext.parallelize(lista2)

rddUnion = rdd1.union(rdd2)
rddUnion.collect()

['um', 'um', 'dois', 'tres', 'quatro', 'cinco']

In [15]:
# intersection: keep only the elements present in both RDDs (each one once)
lista1 = ['um', 'um', 'dois', 'tres']
rdd1 = spark.sparkContext.parallelize(lista1)

lista2 = ['um', 'quatro', 'cinco']
rdd2 = spark.sparkContext.parallelize(lista2)

rddIntersection = rdd1.intersection(rdd2)
rddIntersection.collect()

['um']

In [16]:
# distinct: drop repeated elements so that each value appears only once
lista1 = ['um', 'um', 'dois', 'tres']

rdd1 = spark.sparkContext.parallelize(lista1)

rddDistinct = rdd1.distinct()
rddDistinct.collect()

['um', 'dois', 'tres']

In [19]:
# join: inner join of two (key, value) RDDs on the key.
# 'João' has no match in the first RDD, so it does not appear in the result.
lista1 = [('Pedro', 39), ('Maria', 30)]
rdd1 = spark.sparkContext.parallelize(lista1)

lista2 = [('Pedro', 'BH'), ('Maria', 'SP'), ('João', 'RJ')]
rdd2 = spark.sparkContext.parallelize(lista2)

rddJoin = rdd1.join(rdd2)
rddJoin.collect()

[('Pedro', (39, 'BH')), ('Maria', (30, 'SP'))]

In [20]:
# collect: an action; it runs the job and returns every element of the RDD
# to the driver as a Python list, so use it only on small results
rddJoin.collect()

[('Pedro', (39, 'BH')), ('Maria', (30, 'SP'))]

In [21]:
# count: action that returns the number of elements in the RDD
rddJoin.count()

2

In [24]:
# take(n): action that returns the first n elements of the RDD (not a random sample)
rddJoin.take(1)

[('Pedro', (39, 'BH'))]

In [25]:
# top(n): action that returns the n largest elements, sorted in descending order.
# Uses rddUnion, the RDD built in the union cell earlier in the notebook.
rddUnion.top(3)

[('Pedro', (39, 'BH')), ('Maria', (30, 'SP'))]

In [26]:
# countByValue: action that returns a dict-like mapping of each distinct
# element to the number of times it occurs.
# Uses rddUnion, the RDD built in the union cell earlier in the notebook.
rddUnion.countByValue()

defaultdict(int, {('Pedro', (39, 'BH')): 1, ('Maria', (30, 'SP')): 1})

In [27]:
# saveAsTextFile: write the RDD as text, one element per line.
# Spark creates a *directory* named 'result.txt' holding part-* files, and
# refuses to write if that path already exists, so clear any previous output
# first to keep this cell re-runnable.
# NOTE(review): shutil only touches the local filesystem; this assumes Spark is
# writing locally (as on Colab). Confirm before using with HDFS/S3 paths.
import shutil

shutil.rmtree('result.txt', ignore_errors=True)
rddUnion.saveAsTextFile('result.txt')