In [1]:
import pyspark
import random
# Reuse an already-running SparkContext if one exists (e.g. when this cell is
# executed a second time, or when the environment pre-creates a context):
# constructing a second SparkContext in the same process raises a ValueError.
# The application name only takes effect when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("PiSolution"))

In [2]:
# Input files (relative paths).
# The customers file is not needed by this analysis, so its path stays disabled:
# customersInputPath = "data/Customers.txt"
purchaseInputPath = "Purchases.txt"
catalogInputPath = "ItemsCatalog.txt"

# Output folders for the results of Part 1 and Part 2
outputPath1 = "outPart1/"
outputPath2 = "outPart2/"

In [3]:
# Define the RDDs associated with the input files (one element per text line)

# Purchases. Input format: Timestamp,Username,itemID,SalePrice
purchaseRDD = sc.textFile(purchaseInputPath)

# Catalog. Input format: itemID,Name,Category,Timestamp
catalogRDD = sc.textFile(catalogInputPath)

In [None]:
#########################################
# PART 1
#########################################

In [4]:
# Keep only the purchase lines whose timestamp starts with year 2020 or 2021.
# The filtered RDD is cached because both Part 1 and Part 2 start from it.
purchase20_21RDD = (
    purchaseRDD
    .filter(lambda record: record.startswith(("2020/", "2021/")))
    .cache()
)

In [5]:
# Map to a pairRDD with:
# key = itemId
# value = (2020: 0/1, 2021: 0/1)

def mapItemID20202021(line):
    """Turn one purchase line into an (itemId, (flag2020, flag2021)) pair.

    The line is comma-separated: the first field is a "/"-separated timestamp
    whose first component is the year, and the third field is the item id.

    Returns (itemId, (1, 0)) when the year is "2020" and (itemId, (0, 1)) for
    any other year. The caller only passes 2020/2021 lines, so "any other
    year" means 2021 in practice.
    """
    columns = line.split(",")
    item_id = columns[2]
    purchase_year = columns[0].split("/")[0]

    # Exactly one of the two counters is set, depending on the purchase year
    year_flags = (1, 0) if purchase_year == "2020" else (0, 1)
    return (item_id, year_flags)
    

# Apply the mapper to every 2020/2021 purchase line:
# one (itemId, (flag2020, flag2021)) pair is produced per purchase.
itemsYearsNumPurchsRDD = purchase20_21RDD.map(mapItemID20202021)

In [6]:
# Add up the per-purchase flags so that each item ends up with
# key = itemId
# value = (num. purchases in 2020, num. purchases in 2021)
# and then keep only the items that reach at least 10000 purchases in 2020
# and at least 10000 purchases in 2021.
validItems = (
    itemsYearsNumPurchsRDD
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .filter(lambda item_counts: all(yearly >= 10000 for yearly in item_counts[1]))
)

In [8]:
# Result of Part 1: only the itemIDs of the selected items
# (the per-year purchase counts are dropped).
resPart1 = validItems.keys()

# Writing to the first output folder is disabled in this notebook; the result
# is collected to the driver and displayed instead.
# resPart1.saveAsTextFile(outputPath1)
resPart1.collect()

['itemId100', 'itemId102']

In [None]:
#########################################
# PART 2
#########################################

In [9]:
# Start from the previously cached RDD of purchase lines
# and consider only the purchases made in 2020.
# Map each line to a pair with
# key = (itemId, month)
# value = userId
# and apply a distinct operation, so that each user is counted at most once
# per item and month of 2020.

def mapItemIdMonthUserID(line):
    """Turn one purchase line into a ((itemId, month), userId) pair.

    The line is comma-separated: the first field is a "/"-separated timestamp
    whose second component is the month, the second field is the user and the
    third field is the item id. The month is kept as the string found in the
    timestamp.
    """
    columns = line.split(",")
    date_parts = columns[0].split("/")

    return ((columns[2], date_parts[1]), columns[1])
    
    
    
# Restrict the cached 2020/2021 purchases to 2020, key them by (itemId, month)
# and drop duplicates so that each user appears at most once per item and month.
distinctPurchasesPerMoth2020RDD = (
    purchase20_21RDD
    .filter(lambda record: record.startswith("2020"))
    .map(mapItemIdMonthUserID)
    .distinct()
)

In [10]:
# Number of distinct customers for each (itemId, month):
# every distinct ((itemId, month), userId) pair contributes
# key = (itemId, month)
# value = +1
# and reduceByKey sums those ones.
# Only the (itemId, month) keys with at least 10 distinct customers are kept.
itemsMonthsMoreThan9 = (
    distinctPurchasesPerMoth2020RDD
    .mapValues(lambda _customer: 1)
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda keyed_count: keyed_count[1] >= 10)
)

In [11]:
# For each item, count the months of 2020 in which it was bought by >= 10
# distinct users.
#
# Each surviving (itemId, month) key is mapped to
# key = itemId
# value = +1
#
# and reduceByKey adds the ones up, giving the number of such months per item.
itemsNumMonthsCustomersMoreThan9 = (
    itemsMonthsMoreThan9
    .map(lambda keyed_count: (keyed_count[0][0], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [12]:
# Items to discard: those with more than 10 months (i.e. 11 or 12) each having
# >= 10 distinct customers. Such items have at most one month with fewer than
# 10 distinct customers.
itemsWithManyMounthsWithManyCustomers = (
    itemsNumMonthsCustomersMoreThan9
    .filter(lambda item_months: item_months[1] > 10)
)

In [13]:
# Filter only items which were inserted in catalog before 01/01/2020 and
# Map the catalogRDD into a pairRDD with
# key = itemId
# value = Category

def mapItemIdCategory(line):
    """Turn one catalog line into an (itemId, category) pair.

    The line is comma-separated: the first field is the item id and the third
    field is the category.
    """
    columns = line.split(",")
    return (columns[0], columns[2])

# Keep the catalog entries whose insertion timestamp (fourth field) precedes
# 2020/01/01, then convert them to (itemId, category) pairs.
# NOTE(review): the comparison is a plain string comparison, so it relies on
# timestamps being written year-first and zero-padded - confirm against the data.
itemCategoryRDD = (
    catalogRDD
    .filter(lambda record: record.split(",")[3] < "2020/01/01")
    .map(mapItemIdCategory)
)

In [14]:
# Result of Part 2: the (itemId, category) pairs of itemCategoryRDD whose itemId
# does NOT appear in itemsWithManyMounthsWithManyCustomers.
# Removing the "bad" items from the catalog (instead of selecting the "good"
# ones from the purchases) is needed to also account for the months without
# sales (i.e., without customers): such months never show up in the purchase
# data, yet a month without sales has less than 10 distinct customers.
resPart2 = itemCategoryRDD.subtractByKey(itemsWithManyMounthsWithManyCustomers)

In [16]:
# Writing to the second output folder is disabled in this notebook; only the
# number of selected items is printed.
# resPart2.saveAsTextFile(outputPath2)
print(resPart2.count())

997
