In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_NUM_CORES = 10

In [3]:
spark = SparkSession.builder \
    .master("spark://IMCHLT276:7077") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.cores.max", f"{MAX_NUM_CORES}") \
    .config("spark.local.dir", "/opt/tmp/spark-temp/") \
    .appName("DataSkewness") \
    .getOrCreate()

sc = spark.sparkContext

In [11]:
# Create RDD from text file
rdd = sc.textFile('data/simple_text.txt') # file:///

# Split lines into words
lines_rdd = rdd.flatMap(lambda line: line.split(" "))

# Create Pair RDD by giving 1 to each word
# transformed_rdd = transformed_rdd.map(lambda x: x.strip('[').strip(']'))
words_rdd = lines_rdd.map(lambda word: (word, 1))

# GroupByWord
word_count_rdd = words_rdd.reduceByKey(lambda a, b: a+b)

# To View the data
print(word_count_rdd.take(20))

# Write data to out path
word_count_rdd.saveAsTextFile('data/out/')

[('18:35:34', 13), ('SampleClass6', 1), ('[INFO]', 1), ('everything', 1), ('[FATAL]', 1), ('at', 1), ('SampleClass3', 2), ('detail', 6), ('1304807656', 1), ('verbose', 4), ('[ERROR]', 1), ('incorrect', 1), ('', 1), ('SampleClass8', 1), ('1903854437', 1), ('SampleClass7', 3), ('is', 3), ('test', 3), ('case1', 2), ('biggest', 1)]


In [4]:
spark

**Test how partition size affects the output file numbers**

**Test 1** : Number of partition is equal to the cores

In [21]:
df = spark.range(100000)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [11]:
df.rdd.getNumPartitions() == MAX_NUM_CORES

True

In [9]:
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/

**Test 2** : Repartition will affect the number ouput files

In [23]:
df = spark.range(100000)
df = df.repartition(20)
df.rdd.getNumPartitions()

20

In [14]:
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/

_SUCCESS
part-00000-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00001-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00002-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00003-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00004-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00005-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00006-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00007-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00008-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00009-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00010-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00011-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00012-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00013-1876d46c-f683-4ebd-81a7-e99f246916a8-c000.snappy.parquet
part-00014-1876d46c-f683-4ebd-81a7-e99f

**Test 3** : Repartition to 1 and see waht happens?

In [50]:
df = spark.range(10000000)
df = df.repartition(1)
df.rdd.getNumPartitions()

1

In [52]:
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/ -alh

total 39M
drwxrwxr-x  2 mageswarand mageswarand 4.0K Feb 18 13:26 .
drwxrwxrwt 36 root        root        4.0K Feb 18 13:26 ..
-rw-r--r--  1 mageswarand mageswarand    8 Feb 18 13:26 ._SUCCESS.crc
-rw-r--r--  1 mageswarand mageswarand 306K Feb 18 13:26 .part-00000-40d5a79d-70bf-4762-bddb-22a29d64db6e-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand    0 Feb 18 13:26 _SUCCESS
-rw-r--r--  1 mageswarand mageswarand  39M Feb 18 13:26 part-00000-40d5a79d-70bf-4762-bddb-22a29d64db6e-c000.snappy.parquet


**Test 4** : coalesce

In [27]:
df = spark.range(100000)
df = df.coalesce(1)
df.rdd.getNumPartitions()

1

In [28]:
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/ -alh

_SUCCESS  part-00000-2b4e54a3-e11e-4a5a-b53d-96f0e42d9c51-c000.snappy.parquet


**Test 5** : Add a text column and repartition to 1 and see waht happens? Size on local disk doesn't matter. On HDFS this may change

In [6]:
import string, random
import pyspark.sql.functions as F
from pyspark.sql.types import *

letters = string.ascii_lowercase
letters_upper = string.ascii_uppercase

for _i in range(0, 10):
    letters += letters

for _i in range(0, 10):
    letters += letters_upper

print("Number of chars to choose from", len(letters))
sample_string = random.sample(letters, 500)
# print("sample_string", ''.join(sample_string))

def random_string(stringLength=200):
    """Generate a random string of fixed length """
    return ''.join(random.sample(letters, stringLength))

random_string_udf = F.udf(random_string,StringType())

Number of chars to choose from 26884


In [7]:
df = spark.range(1000000)
df = df.withColumn("data", random_string_udf())

In [8]:
df = df.repartition(1, F.col("data"))
df = df.select("data")

In [9]:
df.rdd.getNumPartitions()

1

In [10]:
%time
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/ -alh

CPU times: user 6 µs, sys: 4 µs, total: 10 µs
Wall time: 18.1 µs
total 197M
drwxrwxr-x  2 mageswarand mageswarand 4.0K Feb 18 14:32 .
drwxrwxrwt 36 root        root        4.0K Feb 18 14:32 ..
-rw-r--r--  1 mageswarand mageswarand    8 Feb 18 14:32 ._SUCCESS.crc
-rw-r--r--  1 mageswarand mageswarand 1.6M Feb 18 14:32 .part-00000-90cd6cf3-8eb1-445d-a85e-c9a464f4a094-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand    0 Feb 18 14:32 _SUCCESS
-rw-r--r--  1 mageswarand mageswarand 195M Feb 18 14:32 part-00000-90cd6cf3-8eb1-445d-a85e-c9a464f4a094-c000.snappy.parquet


In [17]:
from pyspark.sql.functions import spark_partition_id

df.groupBy(spark_partition_id()).count().show()

+--------------------+-------+
|SPARK_PARTITION_ID()|  count|
+--------------------+-------+
|                   0|1000000|
+--------------------+-------+



**Test 6** : Read back the stored DF with 1 partition and see how many partitions are there? Equals to number of cores

In [20]:
df = spark.read.parquet("/tmp/df_tes/")
df.rdd.getNumPartitions()

10

**Test 7** Store as many paritions and read it back

In [21]:
df = spark.range(1000000)
df = df.withColumn("data", random_string_udf())
df = df.repartition(32, F.col("data"))
df = df.select("data")

In [22]:
%time
! rm -rf /tmp/df_tes/
df.write.parquet("/tmp/df_tes/")
!ls /tmp/df_tes/ -alh

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 8.82 µs
total 197M
drwxrwxr-x  2 mageswarand mageswarand  12K Feb 18 15:38 .
drwxrwxrwt 36 root        root        4.0K Feb 18 15:38 ..
-rw-r--r--  1 mageswarand mageswarand    8 Feb 18 15:38 ._SUCCESS.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00000-87fc8316-4176-4e93-aa56-2b347163f81a-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00001-87fc8316-4176-4e93-aa56-2b347163f81a-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00002-87fc8316-4176-4e93-aa56-2b347163f81a-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00003-87fc8316-4176-4e93-aa56-2b347163f81a-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00004-87fc8316-4176-4e93-aa56-2b347163f81a-c000.snappy.parquet.crc
-rw-r--r--  1 mageswarand mageswarand  49K Feb 18 15:38 .part-00005-87fc8316-4176-4e93-aa56-2

In [23]:
df = spark.read.parquet("/tmp/df_tes/")
df.rdd.getNumPartitions()

11

In [26]:
df.groupBy(spark_partition_id()).count().show()

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   1|94124|
|                   6|93649|
|                   3|93967|
|                   5|93696|
|                   9|93054|
|                   4|93792|
|                   8|93361|
|                   7|93577|
|                  10|61894|
|                   2|94036|
|                   0|94850|
+--------------------+-----+

