<a href="https://colab.research.google.com/github/eder1985/igti-bootcamp-eng-dados-cloud/blob/main/modulo_2/igti_edc_mod2_aulas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center>IGTI/XP Educação/Bootcamp Engenharia de Dados Cloud - Módulo 2 - Aulas
</center></h1>

### Installing Spark

Install Dependencies:


1.   Java 8
2.   Apache Spark with Hadoop
3.   Findspark (used to locate the Spark installation on the system)


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

Set Environment Variables:

In [None]:
import os

# Tell findspark/Spark where Java and the unpacked Spark distribution live.
# NOTE(review): SPARK_HOME assumes the archive was unpacked under /content (the Colab working directory).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
# List the working directory to confirm the Spark archive was downloaded and unpacked.
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz


In [None]:
import findspark
findspark.init()  # locate the Spark installation so that pyspark can be imported

from pyspark.sql import SparkSession

# Get the existing session, or create one with a local master ("local[*]").
sessionBuilder = SparkSession.builder.master("local[*]")
spark = sessionBuilder.getOrCreate()

# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Last expression of the cell: display the session.
spark

### Spark RDD and NumberCount

In [None]:
# Generate a text file of random numbers to use as Spark input.

from random import randint

# 100M numbers
count = 100 * 1000 * 1000

# The context manager guarantees the file is flushed and closed even if the
# loop is interrupted or a write fails part-way through.
with open("sample_data/random.txt", "w") as random_file:
  for _ in range(count):
    # Random integer from 0 to 10, both ends inclusive.
    value = randint(0, 10)
    # A 5 ends the current line; every other value is followed by a space.
    separator = "\n" if value == 5 else " "
    # One write per value instead of two.
    random_file.write(str(value) + separator)


In [None]:
# Preview the first 10 lines of the generated file.
!head -n 10 sample_data/random.txt

7 6 0 1 6 3 7 2 7 3 2 9 8 6 10 6 2 9 8 3 0 1 1 0 2 2 3 0 3 3 0 10 4 6 3 7 10 9 5
1 7 1 6 6 10 5
1 3 2 9 10 5
3 5
4 8 2 5
9 6 2 9 5
9 2 3 9 6 10 9 4 8 6 6 10 6 2 5
2 10 7 1 9 5
6 3 10 2 6 10 4 7 2 10 9 6 5
8 1 10 2 10 9 6 3 2 4 2 5


In [None]:
# Count the lines (newline characters) in the generated file.
!wc -l sample_data/random.txt

9093414 sample_data/random.txt


In [None]:
# Read the generated file and convert it to an RDD of line strings.
filename = 'sample_data/random.txt'
textDf = spark.read.text(filename)
linesRdd = textDf.rdd.map(lambda row: row[0])  # keep only the first field of each Row

numPartitions = linesRdd.getNumPartitions()
print("Number of partitions: {}".format(numPartitions))

numLines = linesRdd.count()
print("Number of lines: {}".format(numLines))

Number of partitions: 2
Number of lines: 9093415


In [None]:
from operator import add

# Count how often each even number occurs in a 1% sample (without replacement)
# of the lines.
#
# split() with no argument splits on runs of whitespace and never yields empty
# tokens. split(' ') returns a trailing '' for a line that ends with a space,
# and int('') then raises ValueError whenever such a line is sampled.
countsRdd = linesRdd.sample(False, 0.01) \
  .flatMap(lambda line: line.split()) \
  .filter(lambda number: int(number) % 2 == 0) \
  .map(lambda number: (number, 1)) \
  .reduceByKey(add)

In [None]:
# collect() is the action that triggers the computation and brings the
# (number, occurrences) pairs back to the driver.
output = countsRdd.collect()
for entry in output:
    number, count = entry
    print(number, count)

10 90873
4 90491
8 90918
0 90564
2 90834
6 90659


### Spark actions and transformations

In [None]:
# Load the README as an RDD with one element per line, then count the lines.
sc = spark.sparkContext
linesRdd = sc.textFile("sample_data/README.md")
linesRdd.count()

19

In [None]:
def withLength(line):
    """Return a (line, number of characters) tuple for one line of text."""
    return (line, len(line))

# map is a transformation: one output element per input element.
mapRdd = linesRdd.map(withLength)
mapRdd.collect()

[('This directory includes a few sample datasets to get you started.', 65),
 ('', 0),
 ('*   `california_housing_data*.csv` is California housing data from the 1990 US',
  78),
 ('    Census; more information is available at:', 45),
 ('    https://developers.google.com/machine-learning/crash-course/california-housing-data-description',
  99),
 ('', 0),
 ('*   `mnist_*.csv` is a small sample of the', 42),
 ('    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is',
  76),
 ('    described at: http://yann.lecun.com/exdb/mnist/', 51),
 ('', 0),
 ('*   `anscombe.json` contains a copy of', 38),
 ("    [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it",
  80),
 ('    was originally described in', 31),
 ('', 0),
 ("    Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American",
  70),
 ('    Statistician. 27 (1): 17-21. JSTOR 2682899.', 47),
 ('', 0),
 ('    and our copy was prepared by the', 36),
 ('    [vega_datasets library](https://

In [None]:
# Break every line into whitespace-separated words, then keep the words that start with "a".
wordsRdd = linesRdd.flatMap(lambda line: line.split())
filterRdd = wordsRdd.filter(lambda word: word.startswith("a"))
filterRdd.collect()

['a', 'available', 'at:', 'a', 'at:', 'a', 'and']

In [None]:
lista = ["um", "um", "dois", "tres"]

# Word count: emit a (word, 1) pair per element, then sum the ones for each word.
rdd = spark.sparkContext.parallelize(lista)
pairsRdd = rdd.map(lambda word: (word, 1))
rdd2 = pairsRdd.reduceByKey(lambda left, right: left + right)
rdd2.collect()

[('um', 2), ('dois', 1), ('tres', 1)]

In [None]:
lista = ["um", "um", "dois", "tres"]

# Word count, with the result ordered alphabetically by word.
rdd = spark.sparkContext.parallelize(lista)
# sortByKey takes a boolean `ascending` flag. The string "asc" only worked
# because any non-empty string is truthy ("desc" would also have sorted ascending).
rdd2 = rdd.map(lambda w: (w, 1)) \
          .reduceByKey(lambda a,b: a+b) \
          .sortByKey(ascending=True)
rdd2.collect()

[('dois', 1), ('tres', 1), ('um', 2)]

In [None]:
lista = ["um", "um", "dois", "tres"]

# Word count, ordered by the number of occurrences (second tuple field).
rdd = spark.sparkContext.parallelize(lista)
pairsRdd = rdd.map(lambda word: (word, 1))
totalsRdd = pairsRdd.reduceByKey(lambda left, right: left + right)
rdd2 = totalsRdd.sortBy(lambda pair: pair[1])
rdd2.collect()

[('dois', 1), ('tres', 1), ('um', 2)]

In [None]:
lista1 = ["um", "um", "dois", "tres"]
lista2 = ["quatro", "cinco"]

sc = spark.sparkContext
rdd1 = sc.parallelize(lista1)
rdd2 = sc.parallelize(lista2)

# union concatenates both RDDs and keeps duplicates.
rddUnion = rdd1.union(rdd2)
rddUnion.collect()

['um', 'um', 'dois', 'tres', 'quatro', 'cinco']

In [None]:
lista1 = ["um", "um", "dois", "tres"]
lista2 = ["um", "quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)
# Elements present in both RDDs. Named after the operation actually performed;
# the previous name `rddUnion` was misleading.
rddIntersection = rdd1.intersection(rdd2)
rddIntersection.collect()

['um']

In [None]:
lista1 = ["um", "um", "dois", "tres"]

sc = spark.sparkContext
rdd1 = sc.parallelize(lista1)

# distinct drops repeated elements.
rddDistinct = rdd1.distinct()
rddDistinct.collect()

['um', 'dois', 'tres']

In [None]:
lista1 = [("Pedro", 39), ("Maria", 30)]
lista2 = [("Pedro", "BH"), ("Maria", "SP"), ("João", "RJ")]

sc = spark.sparkContext
rdd1 = sc.parallelize(lista1)
rdd2 = sc.parallelize(lista2)

# Join on the key (first tuple field); only keys present in both RDDs appear in the result.
rddJoin = rdd1.join(rdd2)
rddJoin.collect()

[('Pedro', (39, 'BH')), ('Maria', (30, 'SP'))]

In [None]:
# Action: number of elements in the joined RDD.
rddJoin.count()

2

In [None]:
# Action: fetch just one element of the joined RDD.
rddJoin.take(1)

[('Pedro', (39, 'BH'))]

In [None]:
lista1 = ["um", "um", "dois", "tres"]
lista2 = ["quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)
rddUnion = rdd1.union(rdd2)
# The extra collect() was removed: its result was discarded, so it only ran a
# needless Spark job. top(3) returns the three largest elements in descending order.
rddUnion.top(3)

['um', 'um', 'tres']

In [None]:
lista1 = ["um", "um", "dois", "tres"]
lista2 = ["quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)
rddUnion = rdd1.union(rdd2)
# The extra collect() was removed: its result was discarded, so it only ran a
# needless Spark job. countByValue returns a dict of element -> occurrences.
rddUnion.countByValue()

defaultdict(int, {'um': 2, 'dois': 1, 'tres': 1, 'quatro': 1, 'cinco': 1})

In [None]:
import shutil

# saveAsTextFile fails if the target directory already exists, so remove the
# output of any previous run first to keep this cell re-runnable.
# NOTE(review): assumes the path resolves to the local filesystem, as it does
# with the local[*] master used in this notebook.
output_dir = 'sample_data/rdd/out'
shutil.rmtree(output_dir, ignore_errors=True)
rddUnion.saveAsTextFile(output_dir)

### Spark cache and optimization strategies

In [None]:
filename = 'sample_data/random.txt'
data = spark.read.text(filename).rdd.map(lambda r: r[0])
# Keep every token equal to 1 from a 1% sample (without replacement) of the lines.
# split() with no argument never yields empty tokens. split(' ') returns a
# trailing '' for a line that ends with a space, and int('') then raises
# ValueError whenever such a line is sampled.
ones = data.sample(False,0.01) \
  .flatMap(lambda line: line.split()) \
  .filter(lambda number: int(number) == 1)

# Without caching, each count() action re-runs the whole pipeline.
print("Counting numbers ...")
print(ones.count())
print("Counting numbers again...")
print(ones.count())

Counting numbers ...
Counting numbers again...


91090

In [None]:
print("Counting numbers ...")
print(ones.count())
# cache() marks the RDD for caching and returns the RDD itself, so printing its
# return value only shows the RDD repr. Call it for its effect and report the
# cache flag instead.
ones.cache()
print(ones.is_cached)
print("Counting numbers again...")
print(ones.count())

Counting numbers ...
91090
False
Counting numbers again...
91090
