# Low Level API: Distributed Shared Variables
Lookup table is an array or matrix of data that contains items that are searched. Broadcast Variable is commonly used as a lookup table.

In [1]:
import findspark 
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
pyspark = SparkSession.builder \
.master("local[4]")\
.appName("PairRDD")\
.config("spark.executer.memory","2g")\
.config("spark.driver.memory","2g")\
.getOrCreate()

## ---> Reading products

In [4]:
def read_product():
    products_text_wrapper = open("data/retail_db/products.csv", "r", encoding="utf-8")
    products = products_text_wrapper.readlines()
    product_id_name = {}
    
    for line in products:
        if "productCategoryId" not in line:
            product_id = int(line.split(",")[0])
            product_category = int(line.split(",")[1])
            product_name = line.split(",")[2]
            product_id_name.update({product_id: (product_name, product_category)})
            
    return product_id_name

In [5]:
products = read_product()

In [6]:
broadcast_products = sc.broadcast(products)

In [7]:
broadcast_products.value

{1: ('Quest Q64 10 FT. x 10 FT. Slant Leg Instant U', 2),
 2: ("Under Armour Men's Highlight MC Football Clea", 2),
 3: ("Under Armour Men's Renegade D Mid Football Cl", 2),
 4: ("Under Armour Men's Renegade D Mid Football Cl", 2),
 5: ('Riddell Youth Revolution Speed Custom Footbal', 2),
 6: ("Jordan Men's VI Retro TD Football Cleat", 2),
 7: ('Schutt Youth Recruit Hybrid Custom Football H', 2),
 8: ("Nike Men's Vapor Carbon Elite TD Football Cle", 2),
 9: ('Nike Adult Vapor Jet 3.0 Receiver Gloves', 2),
 10: ("Under Armour Men's Highlight MC Football Clea", 2),
 11: ('Fitness Gear 300 lb Olympic Weight Set', 2),
 12: ("Under Armour Men's Highlight MC Alter Ego Fla", 2),
 13: ("Under Armour Men's Renegade D Mid Football Cl", 2),
 14: ('Quik Shade Summit SX170 10 FT. x 10 FT. Canop', 2),
 15: ("Under Armour Kids' Highlight RM Alter Ego Sup", 2),
 16: ('Riddell Youth 360 Custom Football Helmet', 2),
 17: ("Under Armour Men's Highlight MC Football Clea", 2),
 18: ("Reebok Men's Full Zip 

In [8]:
broadcast_products.value.get(218)

('Elevation Training Mask 2.0', 11)

## ---> Reading Categories

In [9]:
def read_categories():
    categories_text_wrapper = open("data/retail_db/categories.csv", "r", encoding="utf-8")
    categories = categories_text_wrapper.readlines()
    category_id_name = {}
    
    for line in categories:
        if "categoryName" not in line:
            category_id = int(line.split(",")[0])
            category_name = line.split(",")[2]
            category_id_name.update({category_id: category_name.rstrip()})
            
    return category_id_name

In [10]:
categories = read_categories()

In [11]:
broadcast_categories = sc.broadcast(categories)

In [12]:
broadcast_categories.value

{1: 'Football',
 2: 'Soccer',
 3: 'Baseball & Softball',
 4: 'Basketball',
 5: 'Lacrosse',
 6: 'Tennis & Racquet',
 7: 'Hockey',
 8: 'More Sports',
 9: 'Cardio Equipment',
 10: 'Strength Training',
 11: 'Fitness Accessories',
 12: 'Boxing & MMA',
 13: 'Electronics',
 14: 'Yoga & Pilates',
 15: 'Training by Sport',
 16: 'As Seen on  TV!',
 17: 'Cleats',
 18: "Men's Footwear",
 19: "Women's Footwear",
 20: "Kids' Footwear",
 21: 'Featured Shops',
 22: 'Accessories',
 23: "Men's Apparel",
 24: "Women's Apparel",
 25: "Boys' Apparel",
 26: "Girls' Apparel",
 27: 'Accessories',
 28: 'Top Brands',
 29: 'Shop By Sport',
 30: "Men's Golf Clubs",
 31: "Women's Golf Clubs",
 32: 'Golf Apparel',
 33: 'Golf Shoes',
 34: 'Golf Bags & Carts',
 35: 'Golf Gloves',
 36: 'Golf Balls',
 37: 'Electronics',
 38: "Kids' Golf Clubs",
 39: 'Team Shop',
 40: 'Accessories',
 41: 'Trade-In',
 42: 'Bike & Skate Shop',
 43: 'Camping & Hiking',
 44: 'Hunting & Shooting',
 45: 'Fishing',
 46: 'Indoor/Outdoor Games',

In [13]:
broadcast_categories.value.get(58)

'NFL Players'

In [14]:
for i in range(1):
    print(broadcast_categories.value)

{1: 'Football', 2: 'Soccer', 3: 'Baseball & Softball', 4: 'Basketball', 5: 'Lacrosse', 6: 'Tennis & Racquet', 7: 'Hockey', 8: 'More Sports', 9: 'Cardio Equipment', 10: 'Strength Training', 11: 'Fitness Accessories', 12: 'Boxing & MMA', 13: 'Electronics', 14: 'Yoga & Pilates', 15: 'Training by Sport', 16: 'As Seen on  TV!', 17: 'Cleats', 18: "Men's Footwear", 19: "Women's Footwear", 20: "Kids' Footwear", 21: 'Featured Shops', 22: 'Accessories', 23: "Men's Apparel", 24: "Women's Apparel", 25: "Boys' Apparel", 26: "Girls' Apparel", 27: 'Accessories', 28: 'Top Brands', 29: 'Shop By Sport', 30: "Men's Golf Clubs", 31: "Women's Golf Clubs", 32: 'Golf Apparel', 33: 'Golf Shoes', 34: 'Golf Bags & Carts', 35: 'Golf Gloves', 36: 'Golf Balls', 37: 'Electronics', 38: "Kids' Golf Clubs", 39: 'Team Shop', 40: 'Accessories', 41: 'Trade-In', 42: 'Bike & Skate Shop', 43: 'Camping & Hiking', 44: 'Hunting & Shooting', 45: 'Fishing', 46: 'Indoor/Outdoor Games', 47: 'Boating', 48: 'Water Sports', 49: 'MLB'

## ---> Reading Orders
#### Taking only completed order and (orderId, customerId)

In [15]:
def read_orders():
    orders_text_wrapper = open("data/retail_db/orders.csv", "r", encoding="utf-8")
    orders = orders_text_wrapper.readlines()
    orders_orderId_customerId = {}
    
    for line in orders:
        if "orderId" not in line:
            if "COMPLETE" in line:
                order_id = int(line.split(",")[0])
                customer_id = int(line.split(",")[2])
                orders_orderId_customerId.update({order_id: customer_id})
            
    return orders_orderId_customerId

In [16]:
orders = read_orders()

In [17]:
broadcast_orders = sc.broadcast(orders)

In [18]:
print("(orderId, customerId)")
broadcast_orders.value

(orderId, customerId)


{3: 12111,
 5: 11318,
 6: 7130,
 7: 4530,
 15: 2568,
 17: 2667,
 22: 333,
 26: 7562,
 28: 656,
 32: 3960,
 35: 4840,
 45: 2636,
 56: 10519,
 63: 1148,
 65: 5903,
 67: 1406,
 71: 8646,
 72: 4349,
 76: 6898,
 80: 3007,
 83: 1265,
 88: 3809,
 91: 8912,
 92: 6932,
 95: 9032,
 98: 5243,
 102: 8027,
 105: 8220,
 107: 1845,
 110: 2746,
 114: 2091,
 118: 1737,
 124: 2374,
 126: 610,
 135: 7738,
 137: 4211,
 139: 7933,
 141: 12128,
 143: 10495,
 146: 8177,
 149: 11431,
 152: 3611,
 153: 8876,
 160: 6762,
 163: 3348,
 171: 1980,
 172: 11382,
 174: 3358,
 175: 384,
 178: 8503,
 179: 6506,
 182: 10984,
 183: 12105,
 184: 210,
 186: 1104,
 207: 11643,
 210: 10102,
 214: 3925,
 218: 3935,
 220: 1383,
 225: 11275,
 231: 3960,
 233: 12316,
 236: 9785,
 239: 5058,
 247: 173,
 248: 11707,
 250: 11668,
 252: 9708,
 253: 45,
 254: 9395,
 255: 10346,
 258: 1137,
 259: 3454,
 261: 5128,
 263: 8026,
 264: 230,
 266: 11085,
 270: 5689,
 271: 815,
 279: 10372,
 283: 11466,
 287: 8122,
 288: 8667,
 290: 6446,
 

In [19]:
len(broadcast_orders.value)

22899

## ---> Reading Customers

In [20]:
def read_customers():
    customer_text_wrapper = open("data/retail_db/customers.csv", "r", encoding="utf-8")
    customers = customer_text_wrapper.readlines()
    customerId_Dictionary= {}
    
    for line in customers:
        if "customerId" not in line:
            customerId = int(line.split(",")[0])
            customerFirstName = line.split(",")[1]
            customerLastName = line.split(",")[2]
            customerCity = line.split(",")[6]
            customerId_Dictionary.update({customerId: (customerFirstName + ' ' + customerLastName, customerCity)})
            
    return customerId_Dictionary

In [21]:
customers = read_customers()

In [22]:
broadcast_customers = sc.broadcast(customers)

In [23]:
broadcast_customers.value

{1: ('Richard Hernandez', 'Brownsville'),
 2: ('Mary Barrett', 'Littleton'),
 3: ('Ann Smith', 'Caguas'),
 4: ('Mary Jones', 'San Marcos'),
 5: ('Robert Hudson', 'Caguas'),
 6: ('Mary Smith', 'Passaic'),
 7: ('Melissa Wilcox', 'Caguas'),
 8: ('Megan Smith', 'Lawrence'),
 9: ('Mary Perez', 'Caguas'),
 10: ('Melissa Smith', 'Stafford'),
 11: ('Mary Huffman', 'Caguas'),
 12: ('Christopher Smith', 'San Antonio'),
 13: ('Mary Baldwin', 'Caguas'),
 14: ('Katherine Smith', 'Pico Rivera'),
 15: ('Jane Luna', 'Fontana'),
 16: ('Tiffany Smith', 'Caguas'),
 17: ('Mary Robinson', 'Taylor'),
 18: ('Robert Smith', 'Martinez'),
 19: ('Stephanie Mitchell', 'Caguas'),
 20: ('Mary Ellis', 'West New York'),
 21: ('William Zimmerman', 'Caguas'),
 22: ('Joseph Smith', 'North Bergen'),
 23: ('Benjamin Duarte', 'San Juan'),
 24: ('Mary Smith', 'Caguas'),
 25: ('Paul Richardson', 'Peoria'),
 26: ('Johnny Hood', 'Glenview'),
 27: ('Mary Vincent', 'Caguas'),
 28: ('Timothy Smith', 'Longview'),
 29: ('Mary Humph

## --> Reading order_items 

In [24]:
order_item_rdd = sc.textFile("data/retail_db/order_items.csv")

In [25]:
order_item_rdd = order_item_rdd.filter(lambda x: "orderItemOrderId" not in x)

In [26]:
order_item_rdd.take(5)

['1,1,957,1,299.98,299.98',
 '2,2,1073,1,199.99,199.99',
 '3,2,502,5,250.0,50.0',
 '4,2,403,1,129.99,129.99',
 '5,4,897,2,49.98,24.99']

## Creating Pair RDD from Order_Items table

In [27]:
def make_orderItems_pairRdd(line):
    orderItems_productId = int(line.split(",")[2])
    orderItems_subTotal = float(line.split(",")[4])
    
    return (orderItems_productId, orderItems_subTotal)

In [28]:
orderItem_pairRdd = order_item_rdd.map(make_orderItems_pairRdd)

In [29]:
print("(orderItems_ProductId, orderItems_subTotal)")
orderItem_pairRdd.take(5)

(orderItems_ProductId, orderItems_subTotal)


[(957, 299.98), (1073, 199.99), (502, 250.0), (403, 129.99), (897, 49.98)]

### [Example]: Total Subtotal by using multiple map()

In [30]:
print("(orderItems_ProductId, orderItems_subTotal)")
orderItem_pairRdd.reduceByKey(lambda x,y: (x+y))\
.map(lambda x: (x[1],x[0]))\
.sortByKey(False)\
.map(lambda x: (x[1],x[0]))\
.take(10)

(orderItems_ProductId, orderItems_subTotal)


[(1004, 6929653.499999708),
 (365, 4421143.019999639),
 (957, 4118425.419999785),
 (191, 3667633.1999997487),
 (502, 3147800.0),
 (1073, 3099844.999999871),
 (403, 2891757.5399998166),
 (1014, 2888993.9399996493),
 (627, 1269082.649999932),
 (565, 67830.0)]

### [Example]: Total SubTotal by using sortBy(lambda)

In [31]:
print("(ProductId, Total_subTotal)")
orderItem_pairRdd.reduceByKey(lambda x,y: (x+y))\
.sortBy(lambda x: x[1], ascending = False)\
.take(10)

(ProductId, Total_subTotal)


[(1004, 6929653.499999708),
 (365, 4421143.019999639),
 (957, 4118425.419999785),
 (191, 3667633.1999997487),
 (502, 3147800.0),
 (1073, 3099844.999999871),
 (403, 2891757.5399998166),
 (1014, 2888993.9399996493),
 (627, 1269082.649999932),
 (565, 67830.0)]

In [32]:
sorted_orders = orderItem_pairRdd.reduceByKey(lambda x,y: (x+y)).sortBy(lambda x: x[1], ascending = False)

In [33]:
sorted_orders.take(10)

[(1004, 6929653.499999708),
 (365, 4421143.019999639),
 (957, 4118425.419999785),
 (191, 3667633.1999997487),
 (502, 3147800.0),
 (1073, 3099844.999999871),
 (403, 2891757.5399998166),
 (1014, 2888993.9399996493),
 (627, 1269082.649999932),
 (565, 67830.0)]

### [Example]: Best-Seller (Most sold) 10 products

In [34]:
sorted_orders_with_productName = sorted_orders.map(lambda x: (broadcast_products.value.get(x[0]),x[1]))

In [35]:
sorted_orders_with_productName.map(lambda x: (x[0][0],x[1])).take(10)

[('Field & Stream Sportsman 16 Gun Fire Safe', 6929653.499999708),
 ('Perfect Fitness Perfect Rip Deck', 4421143.019999639),
 ("Diamondback Women's Serene Classic Comfort Bi", 4118425.419999785),
 ("Nike Men's Free 5.0+ Running Shoe", 3667633.1999997487),
 ("Nike Men's Dri-FIT Victory Golf Polo", 3147800.0),
 ('Pelican Sunstream 100 Kayak', 3099844.999999871),
 ("Nike Men's CJ Elite 2 TD Football Cleat", 2891757.5399998166),
 ("O'Brien Men's Neoprene Life Vest", 2888993.9399996493),
 ("Under Armour Girls' Toddler Spine Surge Runni", 1269082.649999932),
 ('adidas Youth Germany Black/Red Away Match Soc', 67830.0)]

In [36]:
sortedOrders_category_price = sorted_orders_with_productName.map(lambda x: (x[0][1],x[1]))

In [37]:
sortedOrders_category_price.take(10)

[(45, 6929653.499999708),
 (17, 4421143.019999639),
 (43, 4118425.419999785),
 (9, 3667633.1999997487),
 (24, 3147800.0),
 (48, 3099844.999999871),
 (18, 2891757.5399998166),
 (46, 2888993.9399996493),
 (29, 1269082.649999932),
 (26, 67830.0)]

### [Example]: Most and least profitable Categories

In [38]:
print("(categoryId, SubTotal)")
sortedOrders_category_price.reduceByKey(lambda x,y: (x+y)).take(10)

(categoryId, SubTotal)


[(24, 3147800.0),
 (48, 3113844.599999871),
 (18, 2891757.5399998166),
 (46, 2888993.9399996493),
 (26, 151706.19999999998),
 (12, 85205.40999999995),
 (6, 44585.09000000002),
 (10, 54895.53),
 (44, 56848.419999999955),
 (40, 133671.50999999978)]

In [39]:
sorted_category_subtotal = sortedOrders_category_price.map(lambda x: (broadcast_categories.value.get(x[0]),x[1]))

#### [Example]: Most profitable 10 categories 

In [40]:
print("(categoryName, Subtotal)")
sorted_category_subtotal.sortBy(lambda x: x[1], ascending = False).take(10)

(categoryName, Subtotal)


[('Fishing', 6929653.499999708),
 ('Cleats', 4421143.019999639),
 ('Camping & Hiking', 4118425.419999785),
 ('Cardio Equipment', 3667633.1999997487),
 ("Women's Apparel", 3147800.0),
 ('Water Sports', 3099844.999999871),
 ("Men's Footwear", 2891757.5399998166),
 ('Indoor/Outdoor Games', 2888993.9399996493),
 ('Shop By Sport', 1269082.649999932),
 ("Girls' Apparel", 67830.0)]

#### [Example]: Least profitable 10 categories 

In [41]:
sorted_category_subtotal.sortBy(lambda x: x[1], ascending = True).take(10)

[("Kids' Golf Clubs", 5937.299999999995),
 ('Fitness Accessories', 5999.899999999999),
 ("Kids' Golf Clubs", 6599.8899999999985),
 ("Men's Golf Clubs", 7539.419999999991),
 ('Soccer', 7999.35999999999),
 ('Golf Shoes', 8208.0),
 ('Golf Apparel', 8399.299999999988),
 ('Basketball', 8399.719999999996),
 ('Basketball', 8699.709999999995),
 ('Golf Apparel', 8839.319999999989)]

### [Example]: Highest and Lowest SubTotals by Shopping Cart (OrderId)

In [42]:
def make_orderItems_pairRdd(line):
    orderItems_orderId = int(line.split(",")[1])
    orderItems_subTotal = float(line.split(",")[4])
    
    return (orderItems_orderId, orderItems_subTotal)

In [43]:
orderItem_SubTotal = order_item_rdd.map(make_orderItems_pairRdd)

In [44]:
print("(OrderId, SubTotal)")
orderItem_SubTotal.take(10)

(OrderId, SubTotal)


[(1, 299.98),
 (2, 199.99),
 (2, 250.0),
 (2, 129.99),
 (4, 49.98),
 (4, 299.95),
 (4, 150.0),
 (4, 199.92),
 (5, 299.98),
 (5, 299.95)]

#### [Example]: Highest 10 SubTotals in shopping cart

In [45]:
print("(orderId, Total_SubTotal)")
orderItem_SubTotal.reduceByKey(lambda x,y: (x+y))\
.map(lambda x: (x[1],x[0]))\
.sortByKey(False)\
.map(lambda x: (x[1],x[0]))\
.take(10)

(orderId, Total_SubTotal)


[(68703, 3449.9100000000003),
 (68724, 2859.8900000000003),
 (68858, 2839.91),
 (68809, 2779.86),
 (68766, 2699.9),
 (68806, 2629.92),
 (68821, 2629.92),
 (68778, 2629.9),
 (68848, 2399.96),
 (68875, 2399.95)]

#### [Example]: Lowest  10 SubTotals in shopping cart

In [46]:
print("(orderId, Total_SubTotal)")
orderItem_SubTotal.reduceByKey(lambda x,y: (x+y))\
.map(lambda x: (x[1],x[0]))\
.sortByKey(True)\
.map(lambda x: (x[1],x[0]))\
.take(10)

(orderId, Total_SubTotal)


[(1944, 9.99),
 (11102, 9.99),
 (12380, 9.99),
 (21816, 9.99),
 (23322, 9.99),
 (41098, 9.99),
 (5557, 9.99),
 (7530, 14.99),
 (18772, 14.99),
 (26612, 14.99)]

### [Example]: Finding Most and Least profitable customers and cities

In [47]:
customerId_SubTotal = orderItem_SubTotal.map(lambda x: (broadcast_orders.value.get(x[0]),x[1])).filter(lambda x: x[0] != None)

In [48]:
print("(customerId, subTotal)")
customerId_SubTotal.take(10)

(customerId, subTotal)


[(11318, 299.98),
 (11318, 299.95),
 (11318, 99.96),
 (11318, 299.98),
 (11318, 129.99),
 (4530, 199.99),
 (4530, 299.98),
 (4530, 79.95),
 (2568, 50.0),
 (2568, 199.99)]

#### customerId == 9515 --> orders' subtotals

In [49]:
print("(customerId = 9515)")
customerId_SubTotal.filter(lambda x: x[0] == 9515).take(100)

(customerId = 9515)


[(9515, 119.98),
 (9515, 199.99),
 (9515, 199.99),
 (9515, 250.0),
 (9515, 599.99),
 (9515, 199.95),
 (9515, 1999.99),
 (9515, 399.98)]

### Preparing of Customer and Orders Pair RDDs

In [50]:
customerId_SubTotal.reduceByKey(lambda x,y: (x+y)).take(10)

[(11318, 2489.71),
 (4530, 1499.7900000000002),
 (2568, 925.9100000000001),
 (656, 1875.8000000000002),
 (4840, 1199.8799999999999),
 (2636, 1039.79),
 (1148, 4109.710000000001),
 (1406, 1601.8500000000001),
 (8646, 1639.83),
 (8912, 1819.7800000000002)]

In [51]:
customerId_Total = customerId_SubTotal.reduceByKey(lambda x,y: (x+y))\
.map(lambda x: (x[1],x[0]))\
.map(lambda x: (x[1],x[0]))

### [Example]: Most profitable 10 customers

In [52]:
print("(customerId, Total)")
customerId_Total.sortBy(lambda x: x[1], ascending=False).take(10)

(customerId, Total)


[(9337, 6585.330000000002),
 (3710, 6169.4),
 (10744, 5799.5),
 (749, 5759.539999999999),
 (5411, 5174.56),
 (8314, 5169.449999999999),
 (173, 5149.37),
 (5186, 5113.57),
 (7802, 4904.450000000001),
 (8290, 4754.369999999999)]

In [53]:
profitableCustomerName = customerId_Total.map(lambda x: (broadcast_customers.value.get(x[0]),x[1]))

In [54]:
print("(CustomerName, TotalAmount)")
profitableCustomerName.map(lambda x: (x[0][0],x[1])).sortBy(lambda x: x[1], ascending=False).take(10)

(CustomerName, TotalAmount)


[('Mary Smith', 6585.330000000002),
 ('Ashley Smith', 6169.4),
 ('Samantha Smith', 5799.5),
 ('Jesse Matthews', 5759.539999999999),
 ('Robert Crane', 5174.56),
 ('Angela Walsh', 5169.449999999999),
 ('Jose Smith', 5149.37),
 ('Jason Robinson', 5113.57),
 ('Mary Acevedo', 4904.450000000001),
 ('Benjamin Jennings', 4754.369999999999)]

### [Example]: Least profitable 10 customers

In [55]:
print("(CustomerName, TotalAmount)")
profitableCustomerName.map(lambda x: (x[0][0],x[1])).sortBy(lambda x: x[1], ascending = True).take(10)

(CustomerName, TotalAmount)


[('Mary Jenkins', 14.99),
 ('Mary Wood', 24.99),
 ('Frances Smith', 24.99),
 ('Mary Robinson', 30.0),
 ('Mary Kim', 30.0),
 ('Mary Long', 31.98),
 ('Nathan Morton', 31.99),
 ('Cynthia Crawford', 31.99),
 ('Mary Williams', 34.99),
 ('Mary Meyer', 39.98)]

### [Example]: Sales most total amount by Cities

In [56]:
profitableCustomerName.map(lambda x: (x[0][1],x[1])).reduceByKey(lambda x,y: (x+y)).take(10)

[('Memphis', 37013.700000000004),
 ('Irving', 12244.79),
 ('Augusta', 16462.95),
 ('Brooklyn', 201293.47000000003),
 ('Orlando', 32547.890000000003),
 ('Pharr', 16408.399999999998),
 ('Clearfield', 10603.990000000002),
 ('Florissant', 18462.910000000003),
 ('Los Angeles', 223489.27000000005),
 ('Greeley', 16578.010000000002)]

### [Example]: Sales least total amount by Cities

In [57]:
profitableCustomerName.map(lambda x: (x[0][1],x[1]))\
.reduceByKey(lambda x,y: (x+y))\
.sortBy(lambda x: x[1], ascending = True).take(10)

[('Malden', 299.98),
 ('Nashville', 799.9),
 ('Norwalk', 839.87),
 ('Conway', 959.94),
 ('Sumner', 1018.88),
 ('San Pedro', 1179.8),
 ('Toa Alta', 1297.62),
 ('Palo Alto', 1349.73),
 ('Grove City', 1349.96),
 ('Dubuque', 1448.9)]