In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
sc = spark.sparkContext

In [4]:
print(spark.version)

2.4.0


In [5]:
lst1 = [1,4,9,16,25,36,100,49,64,81]

In [6]:
lst1

[1, 4, 9, 16, 25, 36, 100, 49, 64, 81]

In [7]:
rdd1 = sc.parallelize(lst1)

In [8]:
rdd1

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [9]:
rdd1.getNumPartitions()

32

In [None]:
rdd2 = rdd1

In [45]:
rdd1.collect()

[1, 4, 9, 16, 25, 36, 100, 49, 64, 81]

In [46]:
rdd2 = rdd1.map(lambda x: x+1)

In [47]:
rdd1 = rdd1.map(lambda x: x**2)

In [48]:
rdd1.collect()

[1, 16, 81, 256, 625, 1296, 10000, 2401, 4096, 6561]

In [11]:
rdd1.count()

10

In [12]:
rddDistinct = rdd1.distinct()

In [13]:
rddDistinct.collect()

[64, 1, 4, 36, 100, 9, 16, 49, 81, 25]

In [15]:
def fiftyPercentile(x):
    return x > 50

In [16]:
rddFilter = rdd1.filter(fiftyPercentile)

In [17]:
rddFilter.collect()

[100, 64, 81]

In [18]:
rddFilter2 = rdd1.filter(lambda x: x>50)

In [19]:
rddFilter2.collect()

[100, 64, 81]

In [20]:
rddSorted = rdd1.sortBy(lambda x: x, True)

In [44]:
print(rddSorted.collect())

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [21]:
rddSorted.collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [16]:
rdd1.sortBy(lambda x: x, True).collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [30]:
rddOriginal = rdd1.map(lambda x: x**0.5)

In [31]:
rddCubes = rddOriginal.map(lambda x: x**3)

In [32]:
rddCubes.collect()

[1.0, 8.0, 27.0, 64.0, 125.0, 216.0, 1000.0, 343.0, 512.0, 729.0]

In [38]:
print(rddOriginal.collect())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 7.0, 8.0, 9.0]


In [28]:
rddStrings = sc.parallelize(["abc", "def", "ghihihih"])

In [29]:
rddStrings.map(lambda x: len(x)).collect()

[3, 3, 8]

In [33]:
rdd1.map(lambda x: (x**0.5)**3).collect()

[1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]

In [33]:
rddStrings.first()

'abc'

In [42]:
rddStrings.collect()

['abc', 'def', 'ghihihih']

In [49]:
rddLarge = sc.textFile("/user/arnavmoutl12edu/module5/sales_cut.csv")

In [50]:
rddLarge.take(5)

['Region Country Item Type',
 'Australia and Oceania Palau Office Supplies',
 'Europe Poland Beverages',
 'North America Canada Cereal',
 'Europe Belarus Snacks']

In [None]:
rddSplit = rddLarge.flatMap(lambda x: x.split())

In [37]:
wordCount = rdd.flatMap(lambda x: x.split()).map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y)

In [43]:
wordCount.take(5)

[('Region', 1),
 ('Country', 1),
 ('Australia', 432582),
 ('Palau', 27026),
 ('Office', 416822)]

In [38]:
wordCount.saveAsTextFile("/user/arnavmoutl12edu/module5/sales_out.csv")

# Shared Variables

In [51]:
rdd1.cache()

PythonRDD[44] at collect at <ipython-input-48-75ecb6e3744c>:1

In [52]:
rdd.unpersist()

/user/arnavmoutl12edu/module5/sales_cut.csv MapPartitionsRDD[34] at textFile at NativeMethodAccessorImpl.java:0

In [53]:
dict1 = {"A": "1", "B": "2"}

In [54]:
sc.broadcast(dict1)

<pyspark.broadcast.Broadcast at 0x7f1f39d31390>

In [55]:
rdd2 = sc.parallelize(["A", "B"])

In [56]:
rdd2Broadcast = rdd2.map(lambda x: dict1[x])

In [57]:
rdd2Broadcast.collect()

['1', '2']

In [75]:
accum=sc.accumulator(0)

In [76]:
rdd3=spark.sparkContext.parallelize([1,2,3,4,5])

In [77]:
rdd3.foreach(lambda x:accum.add(x))

In [78]:
accum.value

15

# Joins in PySpark RDD

In [58]:
d1 = [('a', 10), ('a', 11), ('a', 12), ('b', 100), ('b', 200), ('c', 80)]
d2 = [('a', 40), ('a', 50), ('b', 300), ('b', 400), ('d', 90)]
T1 = spark.sparkContext.parallelize(d1)
T2 = spark.sparkContext.parallelize(d2)

In [59]:
#Next, we map these RDDs to include the name of the relation:
t1_mapped = T1.map(lambda x: (x[0], ("T1", x[1])))
t2_mapped = T2.map(lambda x: (x[0], ("T2", x[1])))

In [56]:
combined = t1_mapped.union(t2_mapped)

In [60]:
def cartesian_product(entry):
 T1 = []
 T2 = []
 key = entry[0]
 values = entry[1]
 for tuple in values:
  relation = tuple[0]
  attributes = tuple[1]
  if relation == "T1":
    T1.append(attributes)
  else:
    T2.append(attributes)
  #end-for
 if (len(T1) == 0) or (len(T2) == 0):
  # no common key
  return []
  # len(T1) > 0 AND len(T2) > 0
  joined_elements = []
  # perform cartesian product of T1 and T2
  for v in T1:
   for w in T2:
    joined_elements.append((key, (v, w)))
   #end-for
  #end-for
 return joined_elements
#end-def

In [61]:
combined = t1_mapped.union(t2_mapped)

In [62]:
grouped = combined.groupByKey()

In [63]:
joined = grouped.flatMap(cartesian_product)

In [64]:
joined.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 36 in stage 38.0 failed 1 times, most recent failure: Lost task 36.0 in stage 38.0 (TID 914, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
    process()
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 390, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-60-43be08a84999>", line 25, in cartesian_product
UnboundLocalError: local variable 'joined_elements' referenced before assignment

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor92.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
    process()
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 390, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-60-43be08a84999>", line 25, in cartesian_product
UnboundLocalError: local variable 'joined_elements' referenced before assignment

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [50]:
sc.stop()