In [3]:
from pyspark.sql import SparkSession

In [4]:
MAX_NUM_CORES = 10

In [5]:
spark = SparkSession.builder \
    .master("spark://IMCHLT276:7077") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.cores.max", f"{MAX_NUM_CORES}") \
    .config("spark.local.dir", "/opt/tmp/spark-temp/") \
    .appName("DataSkewness") \
    .getOrCreate()

sc = spark.sparkContext
spark

In [6]:
!ls -als


total 1348
  4 drwxrwxr-x  8 mageswarand mageswarand   4096 May  5 22:52 .
  4 drwxr-xr-x 29 mageswarand openvpn       4096 May  1 10:51 ..
  4 drwxrwxr-x  8 mageswarand mageswarand   4096 Mar 18 12:19 .git
  4 drwxrwxr-x  2 mageswarand mageswarand   4096 May  5 22:37 .ipynb_checkpoints
 24 -rw-rw-r--  1 mageswarand mageswarand  24187 Mar  1 00:03 Classification.ipynb
  4 -rw-rw-r--  1 mageswarand mageswarand   1840 May  5 22:52 Joins.ipynb
  4 -rw-rw-r--  1 mageswarand mageswarand    816 Feb 18 18:20 PandasUDF.ipynb
 88 -rw-rw-r--  1 mageswarand mageswarand  88508 May  5 22:12 Partition.ipynb
680 -rw-rw-r--  1 mageswarand mageswarand 694537 Apr  1 19:33 PySpark_SQL_Cheat_Sheet_Python.pdf
 12 -rw-rw-r--  1 mageswarand mageswarand  11084 May  5 22:51 RDDBasics.ipynb
  4 -rw-rw-r--  1 mageswarand mageswarand    916 Feb  3 16:10 README.md
 16 -rw-rw-r--  1 mageswarand mageswarand  14870 Apr  1 19:46 UDF.ipynb
 16 -rw-rw-r--  1 mageswarand mageswarand  15940 Feb  3 13:41 WindowFunction.ipy

In [7]:
!rm -rf data/out/

In [8]:
# Create RDD from text file
rdd = sc.textFile('training.txt') # file:///

# To View the data
print(rdd.take(20))

# Split lines into words
lines_rdd = rdd.flatMap(lambda line: line.split(" "))

# Create Pair RDD by giving 1 to each word
# transformed_rdd = transformed_rdd.map(lambda x: x.strip('[').strip(']'))
words_rdd = lines_rdd.map(lambda word: (word, 1))

# GroupByWord
word_count_rdd = words_rdd.reduceByKey(lambda a, b: a+b)

# To View the data
print(word_count_rdd.take(20))

# Write data to out path
word_count_rdd.saveAsTextFile('data/out/')

['1\tThe Da Vinci Code book is just awesome.', "1\tthis was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.", '1\ti liked the Da Vinci Code a lot.', '1\ti liked the Da Vinci Code a lot.', "1\tI liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.", "1\tthat's not even an exaggeration ) and at midnight we went to Wal-Mart to buy the Da Vinci Code, which is amazing of course.", '1\tI loved the Da Vinci Code, but now I want something better and different!..', '1\ti thought da vinci code was great, same with kite runner.', '1\tThe Da Vinci Code is actually a good movie...', '1\tI thought the Da Vinci Code was a pretty good book.', '1\tThe Da Vinci Code is one of the most beautiful movies ive ever seen.', '1\tThe Da Vinci Code is an * amazing * book, do not get me wrong.', '1\tthen I turn on the light and the radio and enjoy my Da Vinci Code.', '1\tThe Da Vinci Code was REALLY good.', '1\ti love da vinci c

In [9]:
rdd.flatMap(lambda line: line.split()).take(10)

['1', 'The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome.', '1']

In [10]:
data = [1,3,5,67,8,4,43,4,43,4,3,3,32,78,7,9,2,4,22,6,2]
rdd1 = sc.parallelize(data)
rdd1.collect()
rdd1.getNumPartitions() # equals to number of cores in the cluster

8

In [11]:
# [(word, count)...]
word_count_rdd.filter(lambda wc: wc[1]>1000).collect()

[('was', 1164),
 ('is', 1346),
 ('love', 1253),
 ('Da', 1122),
 ('Vinci', 1474),
 ('Code', 1201),
 ('the', 2080),
 ('and', 2004),
 ('a', 1209),
 ('1\tI', 1273),
 ('I', 1983),
 ('Harry', 1876)]

In [12]:
print(word_count_rdd.sortBy(lambda wc: wc[1]).toDebugString().decode("utf-8"))

(2) PythonRDD[20] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[19] at mapPartitions at PythonRDD.scala:133 []
 |  ShuffledRDD[18] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(2) PairwiseRDD[17] at sortBy at <ipython-input-12-abbb684fb72a>:1 []
    |  PythonRDD[16] at sortBy at <ipython-input-12-abbb684fb72a>:1 []
    |  MapPartitionsRDD[6] at mapPartitions at PythonRDD.scala:133 []
    |  ShuffledRDD[5] at partitionBy at NativeMethodAccessorImpl.java:0 []
    +-(2) PairwiseRDD[4] at reduceByKey at <ipython-input-8-754b9e2c7702>:15 []
       |  PythonRDD[3] at reduceByKey at <ipython-input-8-754b9e2c7702>:15 []
       |  training.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []
       |  training.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []


----------------------
For example, when we perform reduceByKey() operation, PySpark does the following
- PySpark first runs map tasks on all partitions which groups all values for a single key.
- The results of the map tasks are kept in memory.
- When results do not fit in memory, PySpark stores the data into a disk.
- PySpark shuffles the mapped data across partitions, some times it also stores the shuffled data into a disk for reuse when it needs to recalculate.
- Run the garbage collection
- Finally runs reduce tasks on each partition based on key.

-------------------

Broadcast and accumulators

In [14]:
broadcastVar = sc.broadcast([0, 1, 2, 3])
broadcastVar.value

[0, 1, 2, 3]

In [19]:
accum = sc.accumulator(0)
sc.parallelize([1, 2, 3]).foreach(lambda x: accum.add(x))
accum

Accumulator<id=1, value=6>