In [1]:
import os

# Location of the Dodgers data set, as a URI prefix Spark can read.
# Set the DODGERS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's local directory is used, so
# the resulting paths are unchanged.
DATA_DIR = os.environ.get(
    "DODGERS_DATA_DIR",
    "file:////Users/delshawnkirksey/Development/spark/apps/tutorial_3_dodgers",
).rstrip("/")

# Traffic readings file and game-events file inside DATA_DIR.
trafficPath = DATA_DIR + "/Dodgers.data.txt"
gamesPath = DATA_DIR + "/Dodgers.events.txt"

In [19]:
# Load both files as RDDs of raw text lines.
# NOTE(review): `sc` is not defined in the visible cells -- presumably the
# SparkContext supplied by the PySpark kernel/shell; confirm.
traffic = sc.textFile(trafficPath)
games = sc.textFile(gamesPath)

In [3]:
"""
The following will take the traffic RDD
and parse the string dates into date objects
"""

from datetime import datetime
import csv
from io import StringIO

def parseTraffic(row):
    """Convert one comma-separated traffic line into a (datetime, int) pair.

    The first field is parsed with the "%m/%d/%Y %H:%M" pattern used by the
    .data file; the second field is converted to an int (number of cars).
    Any further fields on the line are ignored.

    Raises:
        ValueError: if the date or the count cannot be parsed.
        IndexError: if the line has fewer than two fields.
    """
    fields = row.split(",")
    timestamp = datetime.strptime(fields[0], "%m/%d/%Y %H:%M")
    car_count = int(fields[1])
    return (timestamp, car_count)

In [4]:
# Apply parseTraffic to every raw line of the traffic RDD, giving an RDD of
# (datetime, number of cars) pairs.
trafficParsed = traffic.map(parseTraffic)

In [8]:
# Collapse the per-reading records to one record per calendar day:
# key = date (time of day dropped), value = sum of the car counts for that day.
dailyTrend = (
    trafficParsed
    .map(lambda reading: (reading[0].date(), reading[1]))
    .reduceByKey(lambda running_total, cars: running_total + cars)
)

In [7]:
# Ten days with the most traffic, highest first.
# Each record is (date, total cars); negating the total makes takeOrdered's
# ascending order come out as descending traffic. takeOrdered is used instead
# of sortBy(...).take(10) so the full RDD is not sorted just to keep ten rows.
dailyTrend.takeOrdered(10, key=lambda day_total: -day_total[1])

[(datetime.date(2005, 7, 28), 7661),
 (datetime.date(2005, 7, 29), 7499),
 (datetime.date(2005, 8, 12), 7287),
 (datetime.date(2005, 7, 27), 7238),
 (datetime.date(2005, 9, 23), 7175),
 (datetime.date(2005, 7, 26), 7163),
 (datetime.date(2005, 5, 20), 7119),
 (datetime.date(2005, 8, 11), 7110),
 (datetime.date(2005, 9, 8), 7107),
 (datetime.date(2005, 9, 7), 7082)]

In [11]:
def parseGames(row):
    """Extract (game date, opponent) from one comma-separated events line.

    The first field is read as a date in "%m/%d/%y" form (two-digit year) and
    reduced to a datetime.date; the fifth field is returned unchanged as the
    opponent.

    Raises:
        ValueError: if the first field does not match the date pattern.
        IndexError: if the line has fewer than five fields.
    """
    columns = row.split(",")
    game_day = datetime.strptime(columns[0], "%m/%d/%y").date()
    return (game_day, columns[4])

# Apply parseGames to every raw line of the games RDD, giving an RDD of
# (date, opponent) pairs.
gamesParsed = games.map(parseGames)

In [12]:
# Join the two parsed RDDs on date. leftOuterJoin keeps every date from
# dailyTrend; a date with no matching game gets None in place of the opponent,
# so each record is (date, (total cars, opponent or None)).
dailyTrendCombined = dailyTrend.leftOuterJoin(gamesParsed)

In [16]:
def checkGameDay(row):
    """Label one joined record as a 'Game Day' or a 'Regular Day'.

    Args:
        row: a record indexed as (date, (cars, opponent)), where opponent is
            None when no game was joined to that date.

    Returns:
        A (date, opponent, day type, cars) tuple, where day type is
        "Regular Day" if opponent is None and "Game Day" otherwise.
    """
    day = row[0]
    cars = row[1][0]
    opponent = row[1][1]
    # Identity check: None is a singleton, and `== None` can be answered
    # wrongly by an object with a custom __eq__.
    day_type = "Regular Day" if opponent is None else "Game Day"
    return (day, opponent, day_type, cars)
    
# Tag every joined record with its day type; each element becomes
# (date, opponent or None, "Game Day"/"Regular Day", total cars).
dailyTrendbyGames = dailyTrendCombined.map(checkGameDay)

In [18]:
# View the 10 highest-traffic days, highest first.
# Element 3 of each record is the day's total car count; negating it makes
# takeOrdered's ascending order come out as descending traffic. takeOrdered
# avoids sorting the whole RDD just to keep ten rows.
dailyTrendbyGames.takeOrdered(10, key=lambda record: -record[3])

[(datetime.date(2005, 7, 28), 'Cincinnati', 'Game Day', 7661),
 (datetime.date(2005, 7, 29), 'St. Louis', 'Game Day', 7499),
 (datetime.date(2005, 8, 12), 'NY Mets', 'Game Day', 7287),
 (datetime.date(2005, 7, 27), 'Cincinnati', 'Game Day', 7238),
 (datetime.date(2005, 9, 23), 'Pittsburgh', 'Game Day', 7175),
 (datetime.date(2005, 7, 26), 'Cincinnati', 'Game Day', 7163),
 (datetime.date(2005, 5, 20), 'LA Angels', 'Game Day', 7119),
 (datetime.date(2005, 8, 11), 'Philadelphia', 'Game Day', 7110),
 (datetime.date(2005, 9, 8), None, 'Regular Day', 7107),
 (datetime.date(2005, 9, 7), 'San Francisco', 'Game Day', 7082)]

In [20]:
"""
Use combineByKey to obtain the average on game day vs. non game day.

Steps:
1. Map the RDD to a new Pair RDD of just Day Type and Traffic
2. Combine the RDD by the key: Day Type -- createCombiner Function
3. Merge the Traffic values based on Day Type -- merge Function
4. Combine the results to get total # of Day Types and total overall Traffic -- mergeCombiners Function
5. Map the VALUES ONLY (Traffic & # Day Types) dividing total Traffic by # of Day Types
6. Collect the data to return the results
"""
# Average daily traffic per day type, carried as a (traffic sum, day count)
# pair per key and divided at the end.
(
    dailyTrendbyGames
    .map(lambda record: (record[2], record[3]))  # (day type, traffic)
    .combineByKey(
        # first value seen for a key starts its (sum, count) pair
        lambda cars: (cars, 1),
        # fold another value for the same key into an existing pair
        lambda pair, cars: (pair[0] + cars, pair[1] + 1),
        # merge two partial pairs for the same key
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
    .collect()
)

[('Regular Day', 5411.329787234043), ('Game Day', 5948.604938271605)]