# 0. Abstract

The purpose of this notebook is to explore on arrayType filtering to replace wildcard filtering. We desire this because this might enhance the performance of our algorithms against a large set of cutomer table and offer table.

# 1. Random Table Generator

In [1]:
import scala.util.Random
import math.{ round, min, max }

trait Dummy_Data_Generator {
    
    /**
     * Simulate a double of range 0 (inclusive) to `value` (exclusive).
     */
    def random_double(
        rand: Random = new Random, 
        value: Double = 1): Double = {
        rand.nextDouble * value
    }
    
    /**
     * Randomly select some items (size less than or equal to `max_item` but greater than or
     * equal to `min_item`) from the given array and output a string with items seperated
     * by `sep`.
     */
    def random_array_to_string[T](
        rand: Random = new Random, 
        array: Array[T],
        min_item: Int = 1,
        max_item: Int = 3,
        sep: String = ",",
        duplicate: Boolean = false,
        prob_array: Array[Double] = Array()): String = {
        
      if(array.isEmpty)
        throw new Exception("Invalid configuration: simulate from empty array")
      else if (prob_array.length != array.length && !prob_array.isEmpty)
        throw new Exception("Invalid configuration: different length of prob_array and array.")
      else {     
        val len: Int = array.length
        
        // Declare output variable.
        var output = "" + sep  
          
        // If prob_array is not provided, then assume equal probability for each item.
        lazy val prob_each: Double = 1.0 / len 
        val probArray: Array[Double] = 
          if (prob_array.isEmpty) array.map(x => prob_each) else prob_array
        
        // Number of items that will be in the list
        var num = max(min(rand.nextInt(len), max_item), min_item)
          
        // Tail recursive method of simulating from `array`.
        def gen[T](array: Array[T], probArray: Array[Double], sim: Double): T = {
          if (array.length == 1 || sim <= probArray.head)
            array.head
          else
            gen(array.tail, probArray.tail, sim - probArray.head)        
        }
        
        // Simulate a random double from 0 (inclusive) to `probArray.sum` (exclusive).
        var sim = 0.5
          
        // Generate the output list  
        while (num > 0) {
            sim = random_double(rand, probArray.sum)
            output = output + 
                gen(array.tail, probArray.tail, sim - probArray.head).toString + sep
            num = num - 1
        }
        
        // Remove the first and last element of the output, which are `sep`'s.
        output.drop(1).dropRight(1)
      }
    }
    
}

defined trait Dummy_Data_Generator


In [2]:
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql.{ SparkSession, SQLContext, Row }
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import scala.util.Random 

class Dummy_Table_Generator(spark: SparkSession, rand: Random) extends Dummy_Data_Generator {
    
  // Generate sparkContext from sparkSession.
  val sc = spark.sparkContext
    
  /**
   * Simulate a random table with `num_row` rows according to `col_schema` and `col_map`.
   */
  def random_table(num_row: Int, col_schema: Array[StructField], col_map: Int => Row) = {
    import spark.implicits._
      
    val rdd = sc.makeRDD(Range(1, num_row + 1).map(col_map))
    spark.createDataFrame(rdd, StructType(col_schema))
    }

}

defined class Dummy_Table_Generator


In [3]:
// Set up a SparkSession object.
val spark = SparkSession.builder
  .master("local[*]")
  .appName("Dummy Table Generator Example 1")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

import spark.implicits._

// Set seed for random number generator.
val rand = new Random(588)

// Table generator
val table_gen = new Dummy_Table_Generator(spark, rand)

spark = org.apache.spark.sql.SparkSession@16ccc45b
rand = scala.util.Random@5eada09e
table_gen = Dummy_Table_Generator@7b2694f


Dummy_Table_Generator@7b2694f

# 2. Customer Table

In [4]:
// List of customer descriptions
val descList = Array("L1", "L2", "L3", "L4", "L5", "L6", "L7", "L8", "L9", "L10",
                     "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20",
                     "L21", "L22", "L23", "L24", "L25", "L26", "L27", "L28", "L29", "L30")

// Column schema for the customer table.
val custColSchema = Array(
    StructField("Cust_ID", IntegerType, true),
    StructField("Cust_Desc", StringType, true))

// Column mapping for the customer table.
val custColMap = (x: Int) => Row(
    100000 + x,
    table_gen.random_array_to_string(rand, descList, 0, 3))

// The customer table.
val customer = table_gen.random_table(10, custColSchema, custColMap)

customer.show(false)

+-------+-----------+
|Cust_ID|Cust_Desc  |
+-------+-----------+
|100001 |L2,L20,L27 |
|100002 |L9,L30,L17 |
|100003 |L27,L3,L4  |
|100004 |L21,L26,L6 |
|100005 |L21,L17,L13|
|100006 |L8,L15,L3  |
|100007 |L16,L10,L5 |
|100008 |L3,L3,L9   |
|100009 |L28,L30,L22|
|100010 |L17,L19,L29|
+-------+-----------+



descList = Array(L1, L2, L3, L4, L5, L6, L7, L8, L9, L10, L11, L12, L13, L14, L15, L16, L17, L18, L19, L20, L21, L22, L23, L24, L25, L26, L27, L28, L29, L30)
custColSchema = Array(StructField(Cust_ID,IntegerType,true), StructField(Cust_Desc,StringType,true))
custColMap = > org.apache.spark.sql.Row = <function1>
customer = [Cust_ID: int, Cust_Desc: string]


[Cust_ID: int, Cust_Desc: string]

# 3. Offer Table

In [5]:
// Offer schema for the offer table.
val offerColSchema = Array(
    StructField("Offer_ID", IntegerType, true),
    StructField("Cust_Desc_from_Offer", StringType, true))

// Offer mapping for the offer table.
val offerColMap = (x: Int) => Row(
    1000 + x,
    table_gen.random_array_to_string(rand, descList, 1, 2))

// The offer table.
val offer = table_gen.random_table(10, offerColSchema, offerColMap)

offer.show(false)

+--------+--------------------+
|Offer_ID|Cust_Desc_from_Offer|
+--------+--------------------+
|1001    |L30,L25             |
|1002    |L4,L16              |
|1003    |L16,L26             |
|1004    |L15                 |
|1005    |L29,L29             |
|1006    |L9,L2               |
|1007    |L28,L4              |
|1008    |L8,L8               |
|1009    |L27,L2              |
|1010    |L21                 |
+--------+--------------------+



offerColSchema = Array(StructField(Offer_ID,IntegerType,true), StructField(Cust_Desc_from_Offer,StringType,true))
offerColMap = > org.apache.spark.sql.Row = <function1>
offer = [Offer_ID: int, Cust_Desc_from_Offer: string]


[Offer_ID: int, Cust_Desc_from_Offer: string]

# 4. ArrayType Filter

In [6]:
// Demonstration of arrayType in Spark Dataframe
customer.withColumn("Cust_Desc_Array", split(col("Cust_Desc"), "\\,")).show(false)

+-------+-----------+---------------+
|Cust_ID|Cust_Desc  |Cust_Desc_Array|
+-------+-----------+---------------+
|100001 |L2,L20,L27 |[L2, L20, L27] |
|100002 |L9,L30,L17 |[L9, L30, L17] |
|100003 |L27,L3,L4  |[L27, L3, L4]  |
|100004 |L21,L26,L6 |[L21, L26, L6] |
|100005 |L21,L17,L13|[L21, L17, L13]|
|100006 |L8,L15,L3  |[L8, L15, L3]  |
|100007 |L16,L10,L5 |[L16, L10, L5] |
|100008 |L3,L3,L9   |[L3, L3, L9]   |
|100009 |L28,L30,L22|[L28, L30, L22]|
|100010 |L17,L19,L29|[L17, L19, L29]|
+-------+-----------+---------------+



In [7]:
import scala.collection.mutable.WrappedArray

// User defined function to check intersection.
val custDescIntersect = udf {
    (dfArray1: WrappedArray[String], dfArray2: WrappedArray[String]) => 
        ((dfArray1.toList.intersect(dfArray2.toList)).size > 0)
}

custDescIntersect = UserDefinedFunction(<function2>,BooleanType,Some(List(ArrayType(StringType,true), ArrayType(StringType,true))))


UserDefinedFunction(<function2>,BooleanType,Some(List(ArrayType(StringType,true), ArrayType(StringType,true))))

In [8]:
// Find intersection.
val tgt = 
  customer.withColumn("Cust_Desc_Array", split(col("Cust_Desc"), "\\,"))
    .crossJoin(offer.withColumn("Cust_Desc_Offer_Array",
                                split(col("Cust_Desc_from_Offer"), "\\,")))
    .withColumn("Intersect", custDescIntersect($"Cust_Desc_Array", $"Cust_Desc_Offer_Array"))

tgt.selectExpr(
    "Cust_ID",
    "Cust_Desc",
    "Offer_ID",
    "Cust_Desc_from_Offer",
    "Intersect").show(20, false)

+-------+----------+--------+--------------------+---------+
|Cust_ID|Cust_Desc |Offer_ID|Cust_Desc_from_Offer|Intersect|
+-------+----------+--------+--------------------+---------+
|100001 |L2,L20,L27|1001    |L30,L25             |false    |
|100001 |L2,L20,L27|1002    |L4,L16              |false    |
|100002 |L9,L30,L17|1001    |L30,L25             |true     |
|100002 |L9,L30,L17|1002    |L4,L16              |false    |
|100001 |L2,L20,L27|1003    |L16,L26             |false    |
|100001 |L2,L20,L27|1004    |L15                 |false    |
|100001 |L2,L20,L27|1005    |L29,L29             |false    |
|100002 |L9,L30,L17|1003    |L16,L26             |false    |
|100002 |L9,L30,L17|1004    |L15                 |false    |
|100002 |L9,L30,L17|1005    |L29,L29             |false    |
|100001 |L2,L20,L27|1006    |L9,L2               |true     |
|100001 |L2,L20,L27|1007    |L28,L4              |false    |
|100002 |L9,L30,L17|1006    |L9,L2               |true     |
|100002 |L9,L30,L17|1007

tgt = [Cust_ID: int, Cust_Desc: string ... 5 more fields]


[Cust_ID: int, Cust_Desc: string ... 5 more fields]