In [1]:
sc

<pyspark.context.SparkContext at 0x100676590>

In [2]:
import csv
import pandas as pd
from datetime import datetime
from pyspark.sql import functions

In [3]:
def parseCSV(idx, records):
    for row in csv.reader(records):
        direction = 0
        bus = row[7].split('_')[2]
        tripid = row[7].split('_')[1].split('-')[2]
        start = int(row[0].split('T')[1].split(':')[0])
        minute = int(row[0].split('T')[1].split(':')[1])
        t = datetime.strptime(row[0], '%Y-%m-%dT%H:%M:%SZ')  
        date = t.strftime('%Y-%m-%d')
        tm = t.strftime('%Y-%m-%d %H:%M:%S')
        unique_key = str(date) + str(bus) + str(tripid)
        
        # Create bus direction
        if bus == 'BX1':
            if float(row[4]) < 190:
                direction = 1
            else:
                direction = 2
        elif bus == 'BX6':
            if float(row[4]) < 290:
                direction = 1
            else:
                direction = 2
        elif bus == 'BX13':
            if 70 <= float(row[4]) < 170:
                direction = 1
            else:
                direction = 2
        elif bus == 'Q48':
            if 100 <= float(row[4]) < 270:
                direction = 1
            else:
                direction = 2 
                
        if minute < 15:
            interval = str(start) + str(':00-') + str(start) + str(':15')
        elif 15 <= minute < 30:
            interval = str(start) + str(':15-') + str(start) + str(':30')
        elif 30 <= minute < 45:
            interval = str(start) + str(':30-') + str(start) + str(':45')
        elif 45 <= minute < 60:
            interval = str(start) + str(':45-') + str(start+1) + str(':00')

        yield unique_key, tm, bus, tripid, direction, interval

## Source data file. 
# path = '/Users/JordanVani/Documents/NYU/BDM/nyc-bus-delay-event/Data/BDM_BusData.csv'
path = '/Users/JordanVani/Documents/NYU/BDM/Junk/data1.csv'

## Parse datafile to RDD.
data = sc.textFile(path).mapPartitionsWithIndex(parseCSV)

## For each unique bus line, calculate route start time.
min_by_group = (data
                .map(lambda x: (x[0], x[0:2]))
                .reduceByKey(lambda x1, x2: min(x1, x2, key=lambda x: x[1]))
                .values())

## For each unique bus line, calculate route end time.
max_by_group = (data
                .map(lambda x: (x[0], x[0:2]))
                .reduceByKey(lambda x1, x2: max(x1, x2, key=lambda x: x[1]))
                .values())

## Join start and stop times.
rdd = min_by_group.join(max_by_group)
rdd = rdd.flatMap(lambda x: [[x[0], x[1][0], x[1][1]]])

## Calculate duration of bus.
time_diff = rdd.toDF(['id', 'start', 'stop'])
time_diff = time_diff.select('id', time_diff['start'].cast('timestamp'), time_diff['stop'].cast('timestamp'))
timeFmt = "yyyy-MM-dd HH:mm:ss"
timeDiff = (functions.unix_timestamp('stop', format=timeFmt)
            - functions.unix_timestamp('start', format=timeFmt))
time_diff = time_diff.withColumn('duration', timeDiff)
time_diff = time_diff.select('id', 'duration')

## Add route time to data.
data_df = data.toDF(['id', 'time', 'bus', 'tripid', 'direction', 'interval'])
d = data_df.join(time_diff, data_df.id == time_diff.id)

In [4]:
# data.take(5)
d.show(100)

+-------------------+-------------------+---+------+---------+---------+-------------------+--------+
|                 id|               time|bus|tripid|direction| interval|                 id|duration|
+-------------------+-------------------+---+------+---------+---------+-------------------+--------+
|2014-08-09BX6116500|2014-08-09 00:00:17|BX6|116500|        2|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:00:19|BX6|116500|        1|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:00:43|BX6|116500|        2|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:01:14|BX6|116500|        2|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:01:20|BX6|116500|        2|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:01:23|BX6|116500|        1|0:00-0:15|2014-08-09BX6116500|    1813|
|2014-08-09BX6116500|2014-08-09 00:02:16|BX6|116500|        2|0:00-0:15|2014-08-09