# NOTEBOOK 3.1 PySpark RDDs

In [None]:
# Sample corpus: each entry becomes one line of weather.txt.
lines = [
    "whether the weather be fine",
    "or whether the weather be not",
    "whether the weather be cold",
    "or whether the weather be hot",
    "well whether the weather",
    "whatever the weather",
    "whether we like it or not",
]

# Write the corpus newline-separated, with no trailing newline.
# The explicit encoding keeps the bytes on disk independent of the platform's
# default locale encoding.
with open("weather.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by this notebook, named "PySpark RDDs".
# The builder chain is wrapped in parentheses instead of backslash
# continuations.
spark = (
    SparkSession.builder
    .appName("PySpark RDDs")
    .getOrCreate()
)

In [None]:
# Keep a handle on the session's SparkContext; the RDD examples in this
# notebook create their RDDs through it (sc.parallelize, sc.textFile).
sc = spark.sparkContext

In [None]:
import numpy as np

# Draw 20 random integers from the half-open range [0, 10).
# .tolist() turns the NumPy array into a list of built-in Python ints, so the
# RDD does not carry NumPy scalar objects and the printed output does not
# depend on how the installed NumPy version renders its scalars.
num_list = np.random.randint(0, 10, 20).tolist()
numbers_rdd = sc.parallelize(num_list)
print(numbers_rdd)            # prints the RDD object itself, not its contents
print(numbers_rdd.collect())  # collect() brings the elements back as a list

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289
[3, 3, 7, 4, 9, 4, 0, 5, 3, 9, 4, 9, 0, 0, 9, 5, 0, 3, 7, 0]


In [None]:
# Fixed input so the outputs of the following cells are reproducible.
num_list = [
    5, 2, 3, 5, 6, 9, 5, 0, 8, 3,
    8, 6, 5, 2, 2, 3, 5, 4, 3, 1,
]
numbers_rdd = sc.parallelize(num_list)
collected_numbers = numbers_rdd.collect()
print(collected_numbers)

[5, 2, 3, 5, 6, 9, 5, 0, 8, 3, 8, 6, 5, 2, 2, 3, 5, 4, 3, 1]


In [None]:
# distinct() keeps one copy of each value; as the recorded output shows, the
# result does not follow the input order.
distinct_numbers_rdd = numbers_rdd.distinct()
distinct_values = distinct_numbers_rdd.collect()
print(distinct_values)

[2, 6, 0, 8, 4, 5, 3, 9, 1]


In [None]:
# Two small RDDs that share the elements 0, 1, 5, 7 and 9.
rdd1 = sc.parallelize([3, 1, 0, 8, 9, 5, 7])
rdd2 = sc.parallelize([1, 6, 0, 9, 7, 2, 5])
print(f"rdd1: {rdd1.collect()}")
print(f"rdd2: {rdd2.collect()}")
# intersection: the elements that appear in both RDDs.
print(f"rdd1.intersection(rdd2): {rdd1.intersection(rdd2).collect()}")
# subtract: the elements of rdd1 that do not appear in rdd2.
print(f"rdd1.subtract(rdd2): {rdd1.subtract(rdd2).collect()}")

rdd1: [3, 1, 0, 8, 9, 5, 7]
rdd2: [1, 6, 0, 9, 7, 2, 5]
rdd1.intersection(rdd2): [0, 1, 9, 5, 7]
rdd1.subtract(rdd2): [8, 3]


In [None]:
# filter() keeps only the elements for which the predicate is true; here,
# the multiples of 3 (0 included).
multiples_rdd = numbers_rdd.filter(lambda value: value % 3 == 0)
multiples_of_three = multiples_rdd.collect()
print(multiples_of_three)

[3, 6, 9, 0, 3, 6, 3, 3]


In [None]:
# map() applies the function to every element, producing one output element
# per input element.
squares_rdd = numbers_rdd.map(lambda value: value ** 2)
squares_rdd.collect()

[25, 4, 9, 25, 36, 81, 25, 0, 64, 9, 64, 36, 25, 4, 4, 9, 25, 16, 9, 1]

In [None]:
def square_if_odd(x):
    """Return ``x`` squared when ``x`` is odd; otherwise return ``x`` unchanged."""
    return x * x if x % 2 == 1 else x

# map() accepts a named function as well as a lambda: odd values are squared,
# even values pass through unchanged.
numbers_rdd.map(square_if_odd).collect()

[25, 2, 9, 25, 6, 81, 25, 0, 8, 9, 8, 6, 25, 2, 2, 9, 25, 4, 9, 1]

In [None]:
import pprint

# Load weather.txt as an RDD of strings, one element per line of the file.
text_rdd = sc.textFile("weather.txt")
text_lines = text_rdd.collect()
pprint.pprint(text_lines)

['whether the weather be fine',
 'or whether the weather be not',
 'whether the weather be cold',
 'or whether the weather be hot',
 'well whether the weather',
 'whatever the weather',
 'whether we like it or not']


In [None]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
# map() yields exactly one output element per input line, so each element of
# words_rdd is the list of tokens for that line (a list of lists overall).
words_rdd = text_rdd.map(tokenize)
words_rdd.collect()

[['whether', 'the', 'weather', 'be', 'fine'],
 ['or', 'whether', 'the', 'weather', 'be', 'not'],
 ['whether', 'the', 'weather', 'be', 'cold'],
 ['or', 'whether', 'the', 'weather', 'be', 'hot'],
 ['well', 'whether', 'the', 'weather'],
 ['whatever', 'the', 'weather'],
 ['whether', 'we', 'like', 'it', 'or', 'not']]

In [None]:
# flatMap() applies tokenize to each line and then flattens the per-line
# lists, giving a single RDD of individual words.
words_flattened_rdd = text_rdd.flatMap(tokenize)
words_flattened_rdd.collect()

['whether',
 'the',
 'weather',
 'be',
 'fine',
 'or',
 'whether',
 'the',
 'weather',
 'be',
 'not',
 'whether',
 'the',
 'weather',
 'be',
 'cold',
 'or',
 'whether',
 'the',
 'weather',
 'be',
 'hot',
 'well',
 'whether',
 'the',
 'weather',
 'whatever',
 'the',
 'weather',
 'whether',
 'we',
 'like',
 'it',
 'or',
 'not']

In [None]:
# Each order is an (item name, quantity) pair.
orders = [
    ('pen', 2),
    ('ruler', 3),
    ('eraser', 1),
    ('ruler', 2),
    ('pen', 5),
    ('notepad', 4),
    ('ruler', 3),
]
orders_rdd = sc.parallelize(orders)
orders_rdd.collect()

[('pen', 2),
 ('ruler', 3),
 ('eraser', 1),
 ('ruler', 2),
 ('pen', 5),
 ('notepad', 4),
 ('ruler', 3)]

In [None]:
# The list is repeated here for reference only; the sort below runs on the
# existing orders_rdd.
orders = [
    ('pen', 2),
    ('ruler', 3),
    ('eraser', 1),
    ('ruler', 2),
    ('pen', 5),
    ('notepad', 4),
    ('ruler', 3),
]
# Sort by the first tuple field (the item name).
orders_rdd.sortBy(lambda order: order[0]).collect()

[('eraser', 1),
 ('notepad', 4),
 ('pen', 2),
 ('pen', 5),
 ('ruler', 3),
 ('ruler', 2),
 ('ruler', 3)]

In [None]:
# The list is repeated here for reference only; the sort below runs on the
# existing orders_rdd.
orders = [
    ('pen', 2),
    ('ruler', 3),
    ('eraser', 1),
    ('ruler', 2),
    ('pen', 5),
    ('notepad', 4),
    ('ruler', 3),
]
# Sort by the second tuple field (the quantity).
orders_rdd.sortBy(lambda order: order[1]).collect()

[('eraser', 1),
 ('pen', 2),
 ('ruler', 2),
 ('ruler', 3),
 ('ruler', 3),
 ('notepad', 4),
 ('pen', 5)]

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()