In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import os

In [2]:
# Build the local Spark session used by every cell below. getOrCreate() hands
# back an already-running session when one exists, in which case only runtime
# SQL settings from this builder are applied.
spark = (
    SparkSession.builder
    .appName("join_products")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .master("local[*]")
    .getOrCreate()
)

23/04/15 14:04:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# The RDD-based cells below go through the SparkContext that backs the session.
sc = spark.sparkContext
# Restrict Spark logging to ERROR level (presumably to keep cell output short).
sc.setLogLevel("ERROR")

In [5]:
# Both CSV inputs are looked up in the process's current working directory.
current_dir = os.getcwd()
# os.path.join inserts exactly one separator, so the paths stay well-formed even
# when the working directory is the filesystem root (plain concatenation would
# produce "//products.csv" there).
products_path = os.path.join(current_dir, "products.csv")
order_items_path = os.path.join(current_dir, "order_items.csv")

In [35]:
# Read each CSV as an RDD of raw text lines. The header row is dropped by
# discarding any line that contains one of its column names.
productsRDD = (
    sc.textFile(products_path)
    .filter(lambda row: "productCategoryId" not in row)
)

order_itemsRDD = (
    sc.textFile(order_items_path)
    .filter(lambda row: "orderItemName" not in row)
)

In [36]:
# Show the first five raw lines of each input, with a divider between them.
# sep="\n" reproduces the newline that separate print() calls would emit.
print(
    productsRDD.take(5),
    "\n ==== \n",
    order_itemsRDD.take(5),
    sep="\n",
)

['1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy', "2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat", "3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat", "4,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat", '5,2,Riddell Youth Revolution Speed Custom Footbal,,199.99,http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet']

 ==== 

['1,1,957,1,299.98,299.98', '2,2,1073,1,199.99,199.99', '3,2,502,5,250.0,50.0', '4,2,403,1,129.99,129.99', '5,4,897,2,49.98,24.99']


In [37]:
def make_order_items_pair_rdd(line: str) -> tuple:
    """Convert one order_items CSV line into a ``(key, values)`` pair.

    The line is split on every comma; the third field (the product id) becomes
    the key so the resulting pair RDD can be joined on it. Fields beyond the
    sixth are ignored.

    NOTE: this is a plain ``str.split(",")``, so a quoted field that itself
    contains a comma would be split in the wrong place.

    Args:
        line: One comma-separated record with at least six fields.

    Returns:
        ``(orderItemProductId, (orderItemName, orderItemOrderId,
        orderItemQuantity, orderItemSubTotal, orderItemProductPrice))``,
        every element a string.

    Raises:
        IndexError: If the line has fewer than six comma-separated fields.
    """
    # Split once and index into the result, rather than re-splitting the same
    # line for each of the six fields.
    fields = line.split(",")

    orderItemName = fields[0]
    orderItemOrderId = fields[1]
    orderItemProductId = fields[2]
    orderItemQuantity = fields[3]
    orderItemSubTotal = fields[4]
    orderItemProductPrice = fields[5]

    return (
        orderItemProductId,
        (orderItemName, orderItemOrderId, orderItemQuantity, orderItemSubTotal, orderItemProductPrice),
    )


In [38]:
# Re-shape every raw order-item line into a (key, values) pair; the key is the
# third CSV field, as chosen by make_order_items_pair_rdd.
order_items_pairRDD = order_itemsRDD.map(make_order_items_pair_rdd)
# Preview five pairs to check the shape.
order_items_pairRDD.take(5)

[('957', ('1', '1', '1', '299.98', '299.98')),
 ('1073', ('2', '2', '1', '199.99', '199.99')),
 ('502', ('3', '2', '5', '250.0', '50.0')),
 ('403', ('4', '2', '1', '129.99', '129.99')),
 ('897', ('5', '4', '2', '49.98', '24.99'))]

In [39]:
def make_products_pair_rdd(line: str) -> tuple:
    """Convert one products CSV line into a ``(key, values)`` pair.

    The line is split on every comma; the first field (the product id) becomes
    the key so the resulting pair RDD can be joined on it. Fields beyond the
    sixth are ignored.

    NOTE: this is a plain ``str.split(",")``, so a quoted field that itself
    contains a comma (e.g. in a name or description) would be split in the
    wrong place.

    Args:
        line: One comma-separated record with at least six fields.

    Returns:
        ``(productId, (productCategoryId, productName, productDescription,
        productPrice, productImage))``, every element a string.

    Raises:
        IndexError: If the line has fewer than six comma-separated fields.
    """
    # Split once and index into the result, rather than re-splitting the same
    # line for each of the six fields.
    fields = line.split(",")

    productId = fields[0]
    productCategoryId = fields[1]
    productName = fields[2]
    productDescription = fields[3]
    productPrice = fields[4]
    productImage = fields[5]

    return (
        productId,
        (productCategoryId, productName, productDescription, productPrice, productImage),
    )

In [40]:
# Re-shape every raw product line into a (key, values) pair; the key is the
# first CSV field, as chosen by make_products_pair_rdd.
products_pairRDD = productsRDD.map(make_products_pair_rdd)
# Preview five pairs to check the shape.
products_pairRDD.take(5)

[('1',
  ('2',
   'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
   '',
   '59.98',
   'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy')),
 ('2',
  ('2',
   "Under Armour Men's Highlight MC Football Clea",
   '',
   '129.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat')),
 ('3',
  ('2',
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   '89.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')),
 ('4',
  ('2',
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   '89.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')),
 ('5',
  ('2',
   'Riddell Youth Revolution Speed Custom Footbal',
   '',
   '199.99',
   'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet'))]

In [41]:
# Join the two pair RDDs on their keys (the product id string on both sides).
# Each resulting element has the shape (key, (order_item_values, product_values)).
order_items_products_pairRDD = order_items_pairRDD.join(products_pairRDD)

In [46]:
# Preview a few joined records rather than collecting the whole RDD: collect()
# materialises every element on the driver and floods the cell output, while
# take(5) is enough to check the shape of the join result.
order_items_products_pairRDD.take(5)

[('957',
  (('1', '1', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('9', '5', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('12', '5', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('15', '7', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('34', '12', '1', '299.98', '299.98'),
   ('43',
    "Diamondb

In [43]:
# Number of order-item pairs going into the join.
order_items_pairRDD.count()

172198

In [44]:
# Number of product pairs going into the join.
products_pairRDD.count()

1345

In [45]:
# Number of elements produced by the join. Compare with
# order_items_pairRDD.count(): equal values suggest no order item was dropped
# or duplicated by the join (assumes product ids are unique in products — TODO confirm).
order_items_products_pairRDD.count()

172198