In [1]:
sc

<pyspark.context.SparkContext at 0x100776590>

In [2]:
import csv
import pandas as pd
from datetime import datetime
from pyspark.sql import functions

In [107]:
def parseCSV(idx, records):
    """Parse one partition of raw CSV lines into per-ping trip records.

    Parameters
    ----------
    idx : int
        Partition index supplied by ``mapPartitionsWithIndex``; not used.
    records : iterable of str
        Raw CSV lines. Column 0 is a timestamp in ``%Y-%m-%dT%H:%M:%SZ``
        form, column 4 is a numeric bearing, and column 7 is an
        underscore-delimited identifier: its third field is the bus line and
        the third dash-separated part of its second field is the trip id.

    Yields
    ------
    tuple
        ``(unique_key, tm, bus, tripid, direction, interval)`` where
        ``unique_key`` is date + bus + tripid, ``tm`` is the timestamp
        formatted as ``YYYY-MM-DD HH:MM:SS``, ``direction`` is 1 or 2 for the
        four lines listed below (0 for any other line) and ``interval`` is
        the quarter-hour bucket containing the timestamp, e.g. ``0:30-0:45``.

    Notes
    -----
    Blank input lines are skipped: ``csv.reader`` returns an empty row for
    them, which would otherwise fail on ``row[7]``. Any other malformed row
    still raises.
    """
    # bus line -> (low, high, direction when low <= bearing < high, direction otherwise)
    # Achilles changes - changed the values for the bearings and switched directions for Q48
    bearing_rules = {
        'BX1': (float('-inf'), 200, 1, 2),
        'BX6': (float('-inf'), 110, 1, 2),
        'BX13': (70, 150, 1, 2),
        'Q48': (100, 150, 2, 1),
    }

    for row in csv.reader(records):
        if not row:
            # Blank line in the input; nothing to parse.
            continue

        trip_fields = row[7].split('_')
        bus = trip_fields[2]
        tripid = trip_fields[1].split('-')[2]

        # Parse the timestamp once and derive every time-based field from it.
        t = datetime.strptime(row[0], '%Y-%m-%dT%H:%M:%SZ')
        date = t.strftime('%Y-%m-%d')
        tm = t.strftime('%Y-%m-%d %H:%M:%S')
        unique_key = date + bus + tripid

        # Create bus direction. The bearing is only converted for known lines,
        # so rows for other lines keep direction 0 whatever column 4 holds.
        direction = 0
        rule = bearing_rules.get(bus)
        if rule is not None:
            low, high, inside, outside = rule
            direction = inside if low <= float(row[4]) < high else outside

        # Quarter-hour bucket; hours are not zero-padded and the last quarter
        # ends at the next hour (so 23:45 maps to '23:45-24:00').
        slot = (t.minute // 15) * 15
        if slot < 45:
            end_hour, end_minute = t.hour, slot + 15
        else:
            end_hour, end_minute = t.hour + 1, 0
        interval = '%d:%02d-%d:%02d' % (t.hour, slot, end_hour, end_minute)

        yield unique_key, tm, bus, tripid, direction, interval

## Input CSV location.
# path = '/Users/JordanVani/Documents/NYU/BDM/nyc-bus-delay-event/Data/BDM_BusData.csv'
path = '/Users/JordanVani/Documents/NYU/BDM/Junk/data1.csv'

## Read the file as lines, then run parseCSV over each partition to build the record RDD.
raw_lines = sc.textFile(path)
data = raw_lines.mapPartitionsWithIndex(parseCSV)

## For each trip key (record field 0: date + bus + tripid), keep the record with
## the earliest timestamp. Timestamps are 'YYYY-MM-DD HH:MM:SS' strings, so
## string comparison orders them chronologically.
## Result pairs: (key, (time, bus, tripid, direction, interval)).
## mapValues leaves the key untouched and keeps the partitioning produced by
## reduceByKey, instead of dropping the key and rebuilding it from the value.
min_by_group = (data
                .map(lambda rec: (rec[0], rec[0:6]))
                .reduceByKey(lambda a, b: min(a, b, key=lambda rec: rec[1]))
                .mapValues(lambda rec: rec[1:6]))

## For each trip key, keep the latest timestamp.
## Result pairs: (key, time).
max_by_group = (data
                .map(lambda rec: (rec[0], rec[0:2]))
                .reduceByKey(lambda a, b: max(a, b, key=lambda rec: rec[1]))
                .mapValues(lambda rec: rec[1]))

## Join start and stop times.
def _trip_row(joined):
    """Flatten one joined pair into [id, start, stop, bus, tripid, interval].

    ``joined`` is ``(key, (first_record, stop))`` where ``first_record`` is
    ``(start, bus, tripid, direction, interval)``. The per-record direction is
    not carried into the output row.
    """
    trip_key, (first_record, stop) = joined
    start, bus, tripid, _direction, interval = first_record
    return [trip_key, start, stop, bus, tripid, interval]


# Each joined pair produces exactly one output row, so a plain map is enough.
rdd = min_by_group.join(max_by_group).map(_trip_row)

# Calculate duration of bus.
# Turn the joined RDD into a DataFrame with one row per trip key.
time_diff = rdd.toDF(['id', 'start', 'stop', 'bus', 'tripid', 'interval'])
# Cast the 'start' and 'stop' strings to Spark timestamps; the other columns
# are selected unchanged.
time_diff = time_diff.select('id', time_diff['start'].cast('timestamp'),
                             time_diff['stop'].cast('timestamp'), 'bus', 'tripid', 'interval')
# Column expression (not a DataFrame): stop minus start, each converted with
# unix_timestamp, i.e. the elapsed time in seconds.
# NOTE: `timeDiff` (this expression) and `time_diff` (the DataFrame) differ
# only in capitalisation.
timeDiff = (functions.unix_timestamp('stop', format="yyyy-MM-dd HH:mm:ss")
            - functions.unix_timestamp('start', format="yyyy-MM-dd HH:mm:ss"))
# Attach the elapsed time as the 'duration' column.
time_diff = time_diff.withColumn('duration', timeDiff)

# Calculate mean direction
# Build a DataFrame from every parsed record (the key column is named 'id_'
# here so it does not clash with 'id' in time_diff), then average the
# per-record direction for each trip key. The aggregate column is named
# 'avg(direction)' by Spark.
trip_dir = data.toDF(['id_', 'time', 'bus', 'tripid', 'direction', 'interval'])
trip_dir = trip_dir.groupby("id_").agg({'direction': 'avg'})

# Join direction back to data.
# Left outer join keeps every row of time_diff, matched on the trip key.
master = time_diff.join(trip_dir, time_diff.id == trip_dir.id_, how='left_outer')
# Keep the trip-level columns and expose the averaged direction as an integer
# column named 'direction'.
# NOTE(review): cast('int') on the average presumably truncates rather than
# rounds (e.g. 1.9 -> 1) - confirm this is the intended way to pick a
# trip's direction.
master = master.select('id', 'start', 'bus', 'tripid', 'interval', 'duration', 
                       functions.col('avg(direction)').cast('int').alias('direction'))

In [108]:
# Preview the first five rows of the per-trip table.
master.show(n=5)

+--------------------+--------------------+----+------+---------+--------+---------+
|                  id|               start| bus|tripid| interval|duration|direction|
+--------------------+--------------------+----+------+---------+--------+---------+
| 2014-08-09BX6116500|2014-08-09 00:00:...| BX6|116500|0:00-0:15|    1813|        1|
| 2014-08-09BX6124200|2014-08-09 00:44:...| BX6|124200|0:30-0:45|    3632|        1|
| 2014-08-09Q48117600|2014-08-09 00:00:...| Q48|117600|0:00-0:15|    1137|        1|
| 2014-08-09Q48122000|2014-08-09 00:11:...| Q48|122000|0:00-0:15|    2331|        1|
|2014-08-09BX13128300|2014-08-09 01:21:...|BX13|128300|1:15-1:30|    2051|        2|
+--------------------+--------------------+----+------+---------+--------+---------+
only showing top 5 rows



In [118]:
# Average trip duration and number of trips for each combination of bus line,
# start date, direction and start interval.
start_date = functions.date_format('start', 'yyyy-MM-dd').alias('date')
trips_by_slot = master.groupby("bus", start_date, "direction", "interval")
rdd_times = trips_by_slot.agg({"duration": "avg", "id": "count"})

In [123]:
# Show up to 100 aggregated rows ordered by bus line, date and interval.
# 'interval' is a string label, so it sorts as text rather than by time of day.
rdd_times.orderBy('bus', 'date', 'interval').show(100)

+----+----------+---------+---------+------------------+---------+
| bus|      date|direction| interval|     avg(duration)|count(id)|
+----+----------+---------+---------+------------------+---------+
| BX1|2014-08-09|        1|0:00-0:15|3198.5714285714284|        7|
| BX1|2014-08-09|        2|0:00-0:15|            1059.0|        4|
| BX1|2014-08-09|        1|0:30-0:45|            4735.0|        1|
| BX1|2014-08-09|        1|0:45-1:00|            3697.0|        1|
| BX1|2014-08-09|        2|1:00-1:15|               0.0|        1|
| BX1|2014-08-09|        1|1:45-2:00|              92.0|        1|
| BX1|2014-08-09|        2|2:00-2:15|             878.0|        1|
| BX1|2014-08-09|        1|2:15-2:30|             921.0|        1|
|BX13|2014-08-09|        2|0:00-0:15|            1958.0|        1|
|BX13|2014-08-09|        1|0:00-0:15|            1662.0|        9|
|BX13|2014-08-09|        1|0:15-0:30|2115.3333333333335|        3|
|BX13|2014-08-09|        1|0:30-0:45|            1456.8|      