In [1]:
# Let Jupyter locate the Spark installation so that `pyspark` can be imported in the notebook.
import os

import findspark

# Prefer an explicitly configured SPARK_HOME; when it is unset or empty, fall back to the
# default Homebrew install location on macOS, which is what this notebook used originally.
findspark.init(os.environ.get('SPARK_HOME') or '/usr/local/Cellar/apache-spark/2.4.1/libexec')

In [2]:
# Give the executors more memory so they do not fail for lack of resources.
# The variable is set before the SparkContext is created below.
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'

# From here on this is regular Spark code, normally launched with a command similar to:
# spark-submit --executor-memory 1G nome_do_script.py
from pyspark import SparkConf, SparkContext

conf = SparkConf()
# getOrCreate() hands back the already-running context when this cell is executed again,
# instead of failing because only one SparkContext may be active per process.
sc = SparkContext.getOrCreate(conf=conf)

### Understanding SparkContext
* SparkContext is an entry point into the world of Spark
* An entry point is a way of connecting to Spark cluster
* An entry point is like a key to the house
* The PySpark shell provides a default SparkContext called `sc`; in this notebook it is created explicitly in the cell above

In [3]:
# Version: the Spark version reported by this SparkContext
sc.version

'2.4.1'

In [4]:
# Python version: the Python version reported by this SparkContext
sc.pythonVer

'3.7'

In [5]:
# Master: the URL of the cluster this SparkContext is connected to, or a "local" string when running in local mode
sc.master

'local[*]'

### Carregando dados no PySpark

Download dos arquivos para o diretório local. Assim não precisaremos configurar as credenciais do S3.

In [None]:
# List the files contained in the bucket
from s3fs import S3FileSystem
# anon=True: presumably an anonymous (credential-free) connection — confirm against the s3fs docs
s3 = S3FileSystem(anon=True)

details = s3.ls('cesarschool-data-samples/')
print(details)

In [23]:
#!wget --no-check-certificate --no-proxy 'https://s3.us-east-2.amazonaws.com/cesarschool-data-samples/ml-100k/u.data'
#!wget --no-check-certificate --no-proxy 'https://s3.us-east-2.amazonaws.com/cesarschool-data-samples/ml-100k/u.item'
!wget --no-check-certificate --no-proxy 'https://s3.us-east-2.amazonaws.com/cesarschool-data-samples/Shakespeare.txt'


--2019-05-18 06:52:36--  https://s3.us-east-2.amazonaws.com/cesarschool-data-samples/Shakespeare.txt
Resolvendo s3.us-east-2.amazonaws.com (s3.us-east-2.amazonaws.com)... 52.219.96.122
Conectando-se a s3.us-east-2.amazonaws.com (s3.us-east-2.amazonaws.com)|52.219.96.122|:443... conectado.
A requisic~ao HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 5784591 (5.5M) [text/plain]
Salvando em: "Shakespeare.txt"


2019-05-18 06:52:38 (3.32 MB/s) - "Shakespeare.txt" salvo [5784591/5784591]



In [20]:
# Create one RDD of text lines per downloaded MovieLens file
ratings = sc.textFile("u.data")
movies = sc.textFile("u.item")

In [21]:
# Peek at the first five raw lines of the ratings RDD
ratings.take(5)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

In [7]:
# SparkContext's parallelize() method: build an RDD from an in-memory Python collection
rdd = sc.parallelize([1,2,3,4,5])
# A string is itself an iterable, so it is accepted as well
# (presumably yielding one element per character — TODO confirm with helloRDD.collect())
helloRDD = sc.parallelize("Hello world")

print(type(helloRDD))

<class 'pyspark.rdd.RDD'>


In [24]:
# SparkContext's textFile() method: build an RDD from a text file,
# here asking for a minimum of 6 partitions
rdd2 = sc.textFile("Shakespeare.txt", minPartitions = 6)
print(type(rdd2))
print(rdd2.getNumPartitions())

<class 'pyspark.rdd.RDD'>
6


### Operações

In [16]:
# map(): apply a function to every element of the RDD (here: square each number)
RDD = sc.parallelize([1,2,3,4])
RDD_map = RDD.map(lambda x: x * x)
# collect() brings all the elements back to the driver as a Python list
RDD_map.collect()

[1, 4, 9, 16]

In [34]:
# Example: count by value — how many times each rating value occurs

import collections

# The rating is the third whitespace-separated field of each line (kept as a string)
rating_values = ratings.map(lambda x: x.split()[2])
# countByValue() returns the value -> count mapping to the driver
result = rating_values.countByValue()

# Sort by rating value so the report below is printed in ascending key order
sortedResults = collections.OrderedDict(sorted(result.items()))
for rating, number_of_ratings in sortedResults.items():
    print("{} usuários colocaram a nota {}".format(number_of_ratings, rating))

6110 usuários colocaram a nota 1
11370 usuários colocaram a nota 2
27145 usuários colocaram a nota 3
34174 usuários colocaram a nota 4
21201 usuários colocaram a nota 5


In [37]:
# Show the raw mapping returned by countByValue()
# (uncomment the next line to peek at the parsed rating values instead)
#rating_values.take(10)
result

defaultdict(int, {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201})

In [17]:
# filter(): keep only the elements for which the predicate is true (here: values greater than 2)
RDD = sc.parallelize([1,2,3,4])
RDD_filter = RDD.filter(lambda x: x > 2)
RDD_filter.collect()

[3, 4]

In [39]:
# Ex: Filtro

def parse_movie_and_rating(line):
    """Extract the movie and rating columns from one whitespace-separated line.

    Args:
        line: a text line whose second field is the movie and whose third
            field is the rating.

    Returns:
        A ``(movie, rating)`` tuple; both items are the raw string tokens.

    Raises:
        IndexError: if the line has fewer than three fields.
    """
    tokens = line.split()
    return (tokens[1], tokens[2])

# Turn every line into a (movie, rating) pair and keep only the pairs of movie "50"
# (the parser returns strings, hence the comparison against the string "50").
ratings_by_movie = ratings.map(parse_movie_and_rating)
star_wars_ratings = ratings_by_movie.filter(lambda x: "50" == x[0])
# The ratings are strings, so compare them by numeric value (key=int) instead of
# lexicographically; the value that is kept is still the original string.
min_start_wars_rating = star_wars_ratings.reduceByKey(lambda x, y: min(x, y, key=int))
results = min_start_wars_rating.collect()

for movie, rating in results:
    print("A pior nota do filme {} foi {}".format(movie, rating))

A pior nota do filme 50 foi 1


In [18]:
# flatMap()
# The flatMap() transformation can return several values for each element of the original RDD;
# here every sentence is split into its words and the words end up in one flat RDD
RDD = sc.parallelize(["hello world", "how are you"])
RDD_flatmap = RDD.flatMap(lambda x: x.split(" "))
RDD_flatmap.collect()

['hello', 'world', 'how', 'are', 'you']

### Transformations on pair RDDs
* All regular transformations work on pair RDD
* Have to pass functions that operate on key value pairs rather than on individual elements
* Examples of paired RDD Transformations
    * `reduceByKey(func)`: Combine values with the same key
    * `groupByKey()`: Group values with the same key
    * `sortByKey()`: Return an RDD sorted by the key
    * `join()`: Join two pair RDDs based on their key

In [None]:
#Ex: Pares RDD - Tuplas

def parse_ratings_as_key(line):
    """Turn one whitespace-separated line into a ``(rating, 1)`` pair.

    The rating is the third field of the line, converted to ``int``. The
    constant ``1`` lets the pairs be summed per rating afterwards.

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if the third field is not an integer.
    """
    rating = int(line.split()[2])
    return rating, 1

# Map every line to a (rating, 1) pair, then add up the ones per rating
ratings_count = ratings.map(parse_ratings_as_key)
ratings_sum = ratings_count.reduceByKey(lambda x, y: x + y)
# NOTE(review): the pairs are printed in whatever order collect() returns them; nothing here sorts them
results = ratings_sum.collect()
for rating, number_of_ratings in results:
    print("{} usuários colocaram a nota {}".format(number_of_ratings, rating))

## Exemplo: Palavras mais comuns em um texto

In [27]:
# Create an RDD from the file path
baseRDD = sc.textFile('Shakespeare.txt')

# Break the lines of baseRDD into words (split on any whitespace)
splitRDD = baseRDD.flatMap(lambda x: x.split())

# Count the total number of words
print("Número total de palavras em splitRDD:", splitRDD.count())

Número total de palavras em splitRDD: 961054


In [29]:
# Peek at the first five words of the split RDD
# (uncomment the next line to look at the first raw lines instead)
#baseRDD.take(5)
splitRDD.take(5)

['Project', 'Gutenberg’s', 'The', 'Complete', 'Works']

In [30]:
# Stop words are the words that are uninteresting for the analysis, such as connectives, pronouns, etc.
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
              'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
              'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
              'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
              'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is',
              'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
              'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
              'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
              'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
              'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
              'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
              'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
              'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will',
              'just', 'don', 'should', 'now']

# A list membership test scans the list for every single word; a frozenset gives
# constant-time lookups while `stop_words` itself stays available as a list.
stop_words_lookup = frozenset(stop_words)

# Remove the stop words. The comparison is case-insensitive (each word is lowercased
# only for the test); the words that survive keep their original casing.
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_words_lookup)

# Create a pair RDD (tuple) with the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))

# Count the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)

In [31]:
# Peek at ten (word, count) pairs; they are not sorted by count yet
resultRDD.take(10)

[('Project', 85),
 ('Gutenberg’s', 2),
 ('Shakespeare', 6),
 ('use', 288),
 ('anyone', 8),
 ('anywhere', 4),
 ('United', 15),
 ('States', 8),
 ('world', 376),
 ('restrictions', 2)]

In [32]:
# We need to sort by the count

# Swap keys and values, so that the count becomes the key
resultRDD_swap = resultRDD.map(lambda x: (x[1], x[0]))

# Sort by the new keys in descending order
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

# Show the 10 most frequent words and their frequencies
# (each element is a (count, word) pair, hence word[1] is the word and word[0] the count)
for word in resultRDD_swap_sort.take(10):
    print("{} aparece {} vezes". format(word[1], word[0]))

thou aparece 4518 vezes
thy aparece 3919 vezes
shall aparece 3248 vezes
good aparece 2171 vezes
would aparece 2132 vezes
Enter aparece 1997 vezes
thee aparece 1886 vezes
hath aparece 1719 vezes
like aparece 1642 vezes
make aparece 1564 vezes


# Spark SQL

In [41]:
from pyspark.sql import SparkSession, Row

def parse_ratings(line):
    """Build a Row from one whitespace-separated ratings line.

    The first four fields are converted to ``int`` and stored, in order, as
    ``user_id``, ``movie_id``, ``rating`` and ``timestamp``.
    """
    tokens = line.split()
    column_names = ("user_id", "movie_id", "rating", "timestamp")
    # Convert the fields left to right, pairing each one with its column name.
    values = {name: int(tokens[position]) for position, name in enumerate(column_names)}
    return Row(**values)

def parse_movies(line):
    """Build a Row from one pipe-separated movies line.

    The first field becomes the integer ``movie_id`` and the second field is
    stored unchanged as ``name``; any further fields are ignored.
    """
    tokens = line.split("|")
    movie_id = int(tokens[0])
    title = tokens[1]
    return Row(movie_id=movie_id, name=title)

# NOTE(review): getOrCreate() presumably reuses the Spark context already running in this
# kernel rather than starting a new one — confirm against the SparkSession docs
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

# Ratings: raw lines -> Row objects -> cached DataFrame -> temp view "ratings" for SQL queries.
# NOTE: this rebinds `ratings`, which earlier cells use as the RDD of raw text lines;
# those cells need that RDD to be recreated before they are run again.
data = spark.sparkContext.textFile("u.data")
ratings = data.map(parse_ratings)
ratings_df = spark.createDataFrame(ratings).cache()
ratings_df.createOrReplaceTempView("ratings")

# Movies: same pipeline, registered as the temp view "movies" (`data` and `movies` are rebound here)
data = spark.sparkContext.textFile("u.item")
movies = data.map(parse_movies)
movies_df = spark.createDataFrame(movies).cache()
movies_df.createOrReplaceTempView("movies")

# WARNING: remember to run spark.stop() in the last code cell when you are done

# SELECT

In [42]:
# Query the "ratings" temp view: at most ten (movie_id, rating) rows whose rating is 5.
# NOTE: this rebinds `result`, which an earlier cell uses for the countByValue() mapping.
result = spark.sql("SELECT movie_id, rating FROM ratings WHERE rating = 5 LIMIT 10")

# collect() brings the rows to the driver; each one is printed as a Row object
for r in result.collect():
    print(r)

Row(movie_id=465, rating=5)
Row(movie_id=1014, rating=5)
Row(movie_id=222, rating=5)
Row(movie_id=387, rating=5)
Row(movie_id=95, rating=5)
Row(movie_id=234, rating=5)
Row(movie_id=603, rating=5)
Row(movie_id=327, rating=5)
Row(movie_id=201, rating=5)
Row(movie_id=1137, rating=5)


In [44]:
# Render up to 10 rows of the query result as a text table
result.show(10)

+--------+------+
|movie_id|rating|
+--------+------+
|     465|     5|
|    1014|     5|
|     222|     5|
|     387|     5|
|      95|     5|
|     234|     5|
|     603|     5|
|     327|     5|
|     201|     5|
|    1137|     5|
+--------+------+

