In [1]:
import pyspark
import random
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises an error,
# whereas getOrCreate returns the existing one (the conf is then ignored).
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [29]:
# Input files read by this notebook
tvSeriesPath = "TVSeries.txt"
espisodesPath = "Episodes.txt"
custWatchedPath = "CustomerWatched.txt"

# The customers file is not needed by either part of this program
# customersPath = "data/Customers.txt"

# Output folders, one per part
outputPath1, outputPath2 = "outPart1/", "outPart2/"

In [30]:
# Read each input file as an RDD of text lines
custWatchedRDD, espisodesRDD, tvSeriesRDD = (
    sc.textFile(path)
    for path in (custWatchedPath, espisodesPath, tvSeriesPath)
)

In [31]:
###########################################################
# Part 1
###########################################################

In [32]:
# Select only comedy TV series
# and map the result to pairs (SID, None) for the join with episodes
def filterComedy(l):
    """Tell whether one TV-series record belongs to the Comedy genre.

    Parameters
    ----------
    l : str
        One comma-separated input line; the genre is read from the third field.

    Returns
    -------
    bool
        True only when the third field equals 'Comedy'. Lines with fewer than
        three fields (for example a blank trailing line) are rejected instead
        of raising IndexError, which would abort the whole Spark job.
    """
    fields = l.split(",")
    if len(fields) < 3:
        return False
    # Strip so that stray spaces or a carriage return left by Windows line
    # endings do not hide an otherwise valid match.
    return fields[2].strip() == 'Comedy'

def mapSIDNone(l):
    """Reduce one comma-separated line to the pair (SID, None).

    The SID is the text before the first comma (the whole line when there is
    no comma). The None value is a placeholder: only the key matters for the
    join this pair is used in.
    """
    sid, _comma, _remainder = l.partition(",")
    return sid, None

# Keep the comedy series only, then reduce every record to (SID, None)
comedyTVSeries = (
    tvSeriesRDD
    .filter(filterComedy)
    .map(mapSIDNone)
)

In [33]:
# Map episodes to pairs:
# key = SID
# value = SeasonNumber

def mapSIDSeasonOne(l):
    """Turn one comma-separated episode line into the pair (SID, SeasonNumber).

    The first field is used as SID and the second as SeasonNumber; both are
    returned as strings, exactly as they appear in the line.
    """
    columns = l.split(",")
    return columns[0], columns[1]

# episodesPairs is consumed by both Part 1 (join with the comedy series) and
# Part 2 (distinct seasons per series). Cache it so the episodes file is read
# and parsed once instead of once per downstream action.
episodesPairs = espisodesRDD.map(mapSIDSeasonOne).cache()

In [34]:
# Restrict the episodes to comedy TV series with an inner join on SID;
# every joined record has the shape (SID, (seasonNumber, None)).
#
# Re-key each surviving episode as
# - key = (SID, seasonNumber) - identifies one season of one TV series
# - value = 1 - this episode counts once
# and add the values up per key to obtain the number of episodes of each season
TVseriesSeasonsNumEpisodes = (
    episodesPairs
    .join(comedyTVSeries)
    .map(lambda joined: ((joined[0], joined[1][0]), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [36]:
# Average number of episodes per season, for each comedy TV series
#
# 1. Re-key every season as (SID, (number of episodes, 1))
# 2. Add both components per SID: total episodes and number of seasons
# 3. Divide total episodes by number of seasons inside mapValues

TVseriesAvgNumEpisodes = (
    TVseriesSeasonsNumEpisodes
    .map(lambda season: (season[0][0], (season[1], 1)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .mapValues(lambda totals: totals[0] / totals[1])
)

In [38]:
# NOTE: storing the result in the first output folder is currently disabled
# (the saveAsTextFile call below is commented out); the result is collected
# to the driver and displayed instead.
# TVseriesAvgNumEpisodes.saveAsTextFile(outputPath1)
TVseriesAvgNumEpisodes.collect()

[('SID1', 7.5), ('SID3', 1.0)]

In [None]:
###########################################################
# Part 2
###########################################################

In [39]:
# Number of distinct seasons of each TV series
#
# episodesPairs holds one (SID, SeasonNumber) pair per episode, so:
# 1. distinct() leaves a single pair per season of each TV series
# 2. every remaining pair becomes (SID, 1)
# 3. reduceByKey adds the ones up, giving (SID, number of distinct seasons)

NumSeasonsTvSeries = (
    episodesPairs
    .distinct()
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [40]:
# Compute for each combination (customer, TV series) the number of distinct seasons of this TV series
# for which this customer watched at least one episode.
#
# Map each line of CustomerWatched.txt to pairs:
# - key = (customer, SID)
# - value = SeasonNumber
# 
# Apply distinct to consider each TV series season only one time for each customer
#
# Map the value part to +1 and then apply reduceByKey to count the number of distinct seasons of this TV series
# for which this customer watched at least one episode.
def mapCustSIDSeasonNum(l):
    """Turn one comma-separated line into ((customer id, SID), SeasonNumber).

    Field 0 is used as the customer id, field 2 as the SID and field 3 as the
    season number; field 1 is ignored. All values stay strings.
    """
    columns = l.split(",")
    return ((columns[0], columns[2]), columns[3])


# ((customer, SID), season) pairs -> deduplicated -> one unit per distinct
# season -> summed per (customer, SID)
CustomerTVSeriesNumSeasonsAtLeastOneVisualization = (
    custWatchedRDD
    .map(mapCustSIDSeasonNum)
    .distinct()
    .mapValues(lambda _season: 1)
    .reduceByKey(lambda left, right: left + right)
)

In [41]:
# Re-key CustomerTVSeriesNumSeasonsAtLeastOneVisualization, whose records are
# ((CID, SID), number of distinct seasons watched), as:
# - key = (SID, number of distinct seasons of this TV series for which the customer watched at least one episode)
# - value = CID of the customer

SIDNumSeasonsCustomer = CustomerTVSeriesNumSeasonsAtLeastOneVisualization.map(
    lambda rec: ((rec[0][1], rec[1]), rec[0][0])
)

In [43]:
# Inspect the intermediate pairs ((SID, seasons watched), CID) on the driver
SIDNumSeasonsCustomer.collect()

[(('SID1', 1), 'CID1'),
 (('SID2', 2), 'CID2'),
 (('SID1', 1), 'CID3'),
 (('SID3', 1), 'CID4'),
 (('SID1', 1), 'CID2'),
 (('SID3', 1), 'CID1')]

In [45]:
# Wrap every (SID, number of distinct seasons) record of NumSeasonsTvSeries
# so that the whole record becomes the key:
# - key = (SID, number of distinct seasons of this TV series)
# - value = None (placeholder, only the key is needed)

TvSeriesNumSeasonsNone = NumSeasonsTvSeries.map(
    lambda sidAndCount: (sidAndCount, None)
)

In [46]:
# Inspect the intermediate pairs ((SID, total seasons), None) on the driver
TvSeriesNumSeasonsNone.collect()

[(('SID1', 2), None), (('SID2', 2), None), (('SID3', 1), None)]

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60736)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [None]:
# Join SIDNumSeasonsCustomer with TvSeriesNumSeasonsNone on the key
# (SID, number of seasons).
# A pair survives the join only when the number of seasons of the TV series
# equals the number of seasons of that TV series for which the customer
# watched at least one episode.
#
# Each joined record is ((SID, number of seasons), (CID, None));
# the final map keeps only (SID, CID)

res2 = (
    SIDNumSeasonsCustomer
    .join(TvSeriesNumSeasonsNone)
    .map(lambda matched: (matched[0][0], matched[1][0]))
)

In [None]:
# Store the result in the second output folder
# NOTE(review): saveAsTextFile is expected to fail when the output folder
# already exists, so re-running this cell likely requires removing outPart2/
# first — confirm.
res2.saveAsTextFile(outputPath2)