In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Row
from numpy.random import rand
from pyspark.sql.types import IntegerType, StringType

My machine has following configuration...
- 6 cores with 12vCores
- 32GB RAM

Spark Standalone server:
```
cd /opt/softwares/spark-3.0.1-bin-hadoop3.2/

export PYSPARK_PYTHON=/opt/envs/ai4e/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/envs/ai4e/bin/python

sbin/start-all.sh
sbin/stop-all.sh
```
Spark UI: [http://localhost:8080](http://localhost:8080)   
Spark Master URL : spark://IMCHLT276:7077

In [2]:
spark = SparkSession.builder \
    .master("spark://IMCHLT276:7077") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.cores.max", "10") \
    .config("spark.local.dir", "/opt/tmp/spark-temp/") \
    .appName("DataSkewness") \
    .getOrCreate()

# https://stackoverflow.com/questions/27774443/is-it-safe-to-temporarily-rename-tmp-and-then-create-a-tmp-symlink-to-a-differe
# sudo mount --bind /path/to/dir/with/plenty/of/space /tmp
# sudo lsof /tmp # check for apps
# sudo umount /tmp

In [3]:
spark

## Dataset

Products  
--------
: product_id - The product ID  
: product_name  - The product name   
: price - The product price   

Sellers  
-------
: seller_id  - The seller ID   
: seller_name  - The seller name    
: daily_target - The number of items (regardless of the product type) that the seller needs to hit his/her quota. For example, if the daily target is 100,000, the employee needs to sell 100,000 products he can hit the quota by selling 100,000 units of product_0, but also selling 30,000 units of product_1 and 70,000 units of product_2   

Sales  
-----
: order_id  - The order ID   
: product_id  - The single product sold in the order. All orders have exactly one product)   
: seller_id  - The selling employee ID that sold the product   
: date - The date of the order.    
: num_pieces_sold - The number of units sold for the specific product in the order    
: bill_raw_text  -  A string that represents the raw text of the bill associated with the order   


In [4]:
def read_df(path, is_print=False):
    if is_print: print(f"\nReading {path}")
    df = spark.read.parquet(path)
    if is_print: df.show()
    if is_print: print(f"Number of records in {path} is {df.count()}" )
    if is_print: print("_"*80 + "\n")
    return df


In [5]:
! ls

DataGenerator.ipynb	 products.csv	   sales.csv	  sellers_parquet
DataSkewExercises.ipynb  products_parquet  sales_parquet
data			 requirements.txt  sellers.csv


In [6]:
products_df, sales_df, sellers_df = None, None, None
def reload(is_print=False):
    global products_df, sales_df, sellers_df
    products_df = read_df("data/products_parquet", is_print=is_print)
    sellers_df = read_df("data/sellers_parquet", is_print=is_print)
    sales_df = read_df("data/sales_parquet/", is_print=is_print)
#     return products_df, sales_df, sellers_df

In [7]:
reload(True)


Reading data/products_parquet
+----------+----------------+-------------+
|product_id|    product_name|product_price|
+----------+----------------+-------------+
|   7958964| product_7958964|          101|
|   6315293| product_6315293|          122|
|  22900640|product_22900640|           97|
|  27724440|product_27724440|           23|
|  32428960|product_32428960|           21|
|  14157374|product_14157374|          141|
|  27120261|product_27120261|          100|
|   8052753| product_8052753|           26|
|  11731791|product_11731791|           12|
|   5418200| product_5418200|           72|
|  21935231|product_21935231|            7|
|  18203831|product_18203831|          128|
|  20165155|product_20165155|           97|
|  34717159|product_34717159|          102|
|   4666041| product_4666041|            2|
|  31736408|product_31736408|           65|
|  22269340|product_22269340|           77|
|   7909585| product_7909585|          104|
|  14193727|product_14193727|           86|
|

In [8]:
sales_df.filter(F.col('order_id') == 1).show()

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-04|              6|zlecsnopmizucbnyn...|
+--------+----------+---------+----------+---------------+--------------------+



**1. Find out how many orders, how many products and how many sellers are in the data.**

In [9]:
%%time
print(f"Sellers no: {sellers_df.count()}")

Sellers no: 10


In [10]:
%%time
print(f"Products no: {products_df.count()}")

Products no: 75000000


In [11]:
%%time
print(f"Sales no: {sales_df.count()}")

Sales no: 20000000


**2. How many products have been sold at least once?**

In [13]:
%%time
print("Number of products sold at least once")
sales_df.agg(F.countDistinct(F.col("product_id"))).show()
# sales_df.agg({"product_id" : "countDistinct"}).show()

Number of products sold at least once
+-----------------+
|count(product_id)|
+-----------------+
|           993420|
+-----------------+



In [14]:
%%time
sales_df.select(F.countDistinct(F.col("product_id"))).show()

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                    993420|
+--------------------------+



**3. Which is the product contained in more orders?**

In [12]:
%%time
sales_df.groupBy("product_id")\
    .agg(F.count("*").alias("cnt"))\
    .orderBy(F.col("cnt").desc())\
    .limit(1).show()

+----------+--------+
|product_id|     cnt|
+----------+--------+
|         0|19000000|
+----------+--------+

CPU times: user 4.14 ms, sys: 7.11 ms, total: 11.2 ms
Wall time: 5.49 s


**4. How many distinct products have been sold in each day?**

In [13]:
%%time
sales_df.groupBy("date") \
        .agg(F.countDistinct(F.col("product_id")).alias("cnt"))\
        .orderBy(F.col("cnt").desc()) \
        .show()

+----------+------+
|      date|   cnt|
+----------+------+
|2020-07-01|100319|
|2020-07-02|100227|
|2020-07-09|100146|
|2020-07-08|100059|
|2020-07-04| 99905|
|2020-07-07| 99894|
|2020-07-03| 99865|
|2020-07-10| 99801|
|2020-07-06| 99692|
|2020-07-05| 99427|
+----------+------+

CPU times: user 10.4 ms, sys: 0 ns, total: 10.4 ms
Wall time: 8.42 s


**5.What is the average revenue of the orders?**

In [14]:
%%time
reload()

CPU times: user 6.18 ms, sys: 1.33 ms, total: 7.51 ms
Wall time: 208 ms


Check the DAG visualization of following join...

In [15]:
%%time
products_df.join(sales_df, on="product_id", how="inner").show()

+----------+--------------+-------------+--------+---------+----------+---------------+--------------------+
|product_id|  product_name|product_price|order_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+----------+--------------+-------------+--------+---------+----------+---------------+--------------------+
|     15237| product_15237|           74|19568472|        8|2020-07-06|             74|kefitmblrbpzetkhe...|
|     32954| product_32954|           62|19572695|        3|2020-07-02|             57|iftmxvmheuruxjaju...|
|     50219| product_50219|          149|19256164|        1|2020-07-02|              1|koygorokoebfchlzr...|
|     59861| product_59861|           40|19369311|        6|2020-07-09|             19|aosqakuirnhvhzjhe...|
|     62273| product_62273|           57|19510383|        1|2020-07-06|             25|qpvfgdggcfsbgruxt...|
|     74783| product_74783|           11|19381052|        7|2020-07-08|              2|bpmqrdmvjyrhuihjl...|
|     75232| produc

In [20]:
products_df = products_df.repartition(128)
sales_df = sales_df.repartition(256)
products_df.join(sales_df, on="product_id").count() ### Will end up as timeout error

20000000

In [16]:
products_df = products_df.withColumnRenamed("product_id", "repartition_id").repartition(512, F.col("repartition_id"))
sales_df = sales_df.withColumnRenamed("product_id", "repartition_id").repartition(512, F.col("repartition_id"))     
products_df.join(sales_df, on="repartition_id").count() ### Will end up as timeout error

19999999

In [24]:
def saltify(df, col_name, number_of_partition):
    """
    Adds a new column names `col_name`_salted, which has concatenated values of `col_name` and number in the range of 0 to number_of_partition
    Note: import pyspark.sql.functions as F
    """
    salted_col = col_name + "_salted"  
    return df.withColumn("dummy", F.monotonically_increasing_id() % number_of_partition)\
            .withColumn(salted_col, F.concat(F.col(col_name), F.lit("-"),F.col("dummy")))\
            .drop(F.col("dummy")).repartition(number_of_partition, F.col(salted_col)) 

In [25]:
products_df = saltify(df=products_df, col_name="product_id", number_of_partition=1024)

In [26]:
# products_df.show()

In [27]:
sales_df = saltify(df=sales_df, col_name="product_id", number_of_partition=1024)

In [28]:
# sales_df.groupBy("product_id_salted")\
#     .agg(F.count("*").alias("cnt"))\
#     .orderBy(F.col("cnt").desc())\
#     .limit(1).show()

In [29]:
%timeit
products_df.join(sales_df, on="product_id_salted", how="inner").agg(F.avg(products_df["price"] * sales_df["num_pieces_sold"])).show()

+------------------------------+
|avg((price * num_pieces_sold))|
+------------------------------+
|             1242.751587464154|
+------------------------------+



In [13]:
# Step 1 - Check and select the skewed keys 
# In this case we are retrieving the top 100 keys: these will be the only salted keys.
results = sales_df.groupby(sales_df["product_id"]).count().sort(F.col("count").desc()).limit(100).collect()


In [14]:
# Step 2 - What we want to do is:
#  a. Duplicate the entries that we have in the dimension table for the most common products, e.g.
#       product_0 will become: product_0-1, product_0-2, product_0-3 and so on
#  b. On the sales table, we are going to replace "product_0" with a random duplicate (e.g. some of them 
#     will be replaced with product_0-1, others with product_0-2, etc.)
# Using the new "salted" key will unskew the join

# Let's create a dataset to do the trick
REPLICATION_FACTOR = 101
l = []
replicated_products = []
for _r in results:
    replicated_products.append(_r["product_id"])
    for _rep in range(0, REPLICATION_FACTOR):
        l.append((_r["product_id"], _rep))
        
rdd = spark.sparkContext.parallelize(l)
replicated_df = rdd.map(lambda x: Row(product_id=x[0], replication=int(x[1])))
replicated_df = spark.createDataFrame(replicated_df)
# replicated_df.show()

In [15]:
#   Step 3: Generate the salted key
products_df = products_df.join(F.broadcast(replicated_df), products_df["product_id"] == replicated_df["product_id"], "left"). \
    withColumn("salted_join_key", F.when(replicated_df["replication"].isNull(), products_df["product_id"]).otherwise(
    F.concat(replicated_df["product_id"], F.lit("-"), replicated_df["replication"])))

# products_df.show()


In [17]:
sales_df = sales_df.withColumn("salted_join_key", F.when(sales_df["product_id"].isin(replicated_products),
                                                             F.concat(sales_df["product_id"], F.lit("-"),
                                                                    F.lit(round(rand() * (REPLICATION_FACTOR - 1), 0)).cast(
                                                                        IntegerType()))).otherwise(sales_df["product_id"]))
# sales_df.show()


In [18]:
#   Step 4: Finally let's do the join
print(sales_df.join(products_df, sales_df["salted_join_key"] == products_df["salted_join_key"],
                       "inner").
      agg(F.avg(products_df["price"] * sales_df["num_pieces_sold"])).show())

print("Ok")

+------------------------------+
|avg((price * num_pieces_sold))|
+------------------------------+
|            1246.1338560822878|
+------------------------------+

None
Ok


**6. For each seller, what is the average % contribution of an order to the seller's daily quota?**

Example   
If Seller_0 with `quota=250` has 3 orders:Order 1: 10 products sold    
Order 2: 8 products sold    
Order 3: 7 products soldThe average % contribution of orders to the seller's quota would be:    
Order 1: 10/105 = 0.04   
Order 2: 8/105 = 0.032     
Order 3: 7/105 = 0.028Average % Contribution = (0.04+0.032+0.028)/3 = 0.03333    

In [121]:
reload()

In [122]:
sellers_df.show()

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
|        3|   seller_3|      310462|
|        4|   seller_4|     1532808|
|        5|   seller_5|     1199693|
|        6|   seller_6|     1055915|
|        7|   seller_7|     1946998|
|        8|   seller_8|      547320|
|        9|   seller_9|     1318051|
+---------+-----------+------------+



In [123]:
sales_df.show()

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|       9|         0|        0|2020-07-10|             25|jnykelwjjebgkwgmu...|
|      10|         0|        0|2020-07-0

In [134]:
sales_df.join(F.broadcast(sellers_df), on="seller_id", how="inner")\
.withColumn("ratio", sales_df["num_pieces_sold"]/sellers_df["daily_target"])\
.groupBy("seller_id").agg(F.avg("ratio")).alias("percent").show()

+---------+--------------------+
|seller_id|          avg(ratio)|
+---------+--------------------+
|        7|2.595228787788171E-5|
|        3|1.628885370565939...|
|        8| 9.21303037540886E-5|
|        0|2.019885898946922...|
|        5|4.211073965904021E-5|
|        6|4.782147194369122E-5|
|        9|3.837913136180238E-5|
|        1|1.964233366461015E-4|
|        4|3.296428039825816E-5|
|        2|6.690408001060484E-5|
+---------+--------------------+



**6.Who are the second most selling and the least selling persons (sellers) for each product? Who are those for product with `product_id = 0`**

**7.**
Create a new column called "hashed_bill" defined as follows:

- if the order_id is even: apply MD5 hashing iteratively to the bill_raw_text field, once for each 'A' (capital 'A') present in the text. E.g. if the bill text is 'nbAAnllA', you would apply hashing three times iteratively (only if the order number is even)

- if the order_id is odd: apply SHA256 hashing to the bill textFinally, check if there are any duplicate on the new column

In [152]:
reload()
sales_df.show()
sales_df.printSchema()

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|       9|         0|        0|2020-07-10|             25|jnykelwjjebgkwgmu...|
|      10|         0|        0|2020-07-0

In [184]:
import hashlib 
def get_hash(order_id, text):
    res = text
    if int(order_id) % 2 == 0:
        res = hashlib.md5(bytes(res, 'utf-8')).hexdigest()
        for c in text:
            if c == 'A':
                res = hashlib.md5(bytes(res, 'utf-8')).hexdigest()
    else:
        res = hashlib.sha256(bytes(res, 'utf-8')).hexdigest()
    return res
        
get_hash_udf = F.udf(get_hash, StringType())   

In [185]:
get_hash("24", "jfyuoyfkAAeyqkckwbu")

'67d25a783609ce62d5456dc297c05dfd'

In [178]:
# sales_df.withColumn("hashed_bill", get_hash_udf("order_id", "bill_raw_text")).show()
hashed_df = sales_df.withColumn("hashed_bill", get_hash_udf(F.col("order_id"), F.col("bill_raw_text")))

In [179]:
hashed_df.show()

+--------+----------+---------+----------+---------------+--------------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|         hashed_bill|
+--------+----------+---------+----------+---------------+--------------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|f6fa2a8be04a4ead6...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|416376a64cd652e7b...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|787d361b162a6aa1a...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|4540f452a7c4

In [180]:
hashed_df.groupby(F.col("hashed_bill")).agg(F.count("*").alias("cnt")).where(F.col("cnt") > 1).show()

+-----------+---+
|hashed_bill|cnt|
+-----------+---+
+-----------+---+



In [181]:
hashed_df.select("hashed_bill").count()

20000040

In [183]:
hashed_df.agg(F.countDistinct(F.col("hashed_bill"))).show()

+---------------------------+
|count(DISTINCT hashed_bill)|
+---------------------------+
|                   20000040|
+---------------------------+



In [35]:
df = spark.range(1000000).select(F.col("id"))

In [36]:
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows

