In [228]:
import pandas as pd
import numpy as np
import glob
import os
import bz2
import codecs
import json
import matplotlib.pylab as plt
import uuid
import random
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType, StringType
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, from_unixtime
%matplotlib inline

In [182]:
# Start from an empty Spark configuration object.
conf = SparkConf()

In [183]:
# Run Spark in local mode with 8 worker threads. setMaster returns the conf
# object itself, which is why this cell echoes a SparkConf repr.
conf.setMaster('local[8]')

<pyspark.conf.SparkConf at 0x7f39a4e1e080>

In [184]:
# Reuse an already-running SparkContext if one exists, otherwise start one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point: used to read the CSV and to build DataFrames from RDDs.
sql_sc = SQLContext(sc)

In [185]:
# Explicit schema for the trip CSV, so column types are fixed up front instead
# of being inferred. Each entry is (column name, Spark type class); every
# field is declared nullable.
schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in [
        ('VendorID', IntegerType),
        ('tpep_pickup_datetime', TimestampType),
        ('tpep_dropoff_datetime', TimestampType),
        ('passenger_count', IntegerType),
        ('trip_distance', DoubleType),
        ('pickup_longitude', DoubleType),
        ('pickup_latitude', DoubleType),
        ('RateCodeID', IntegerType),
        ('store_and_fwd_flag', StringType),
        ('dropoff_longitude', DoubleType),
        ('dropoff_latitude', DoubleType),
        ('payment_type', IntegerType),
        ('fare_amount', DoubleType),
        ('extra', DoubleType),
        ('mta_tax', DoubleType),
        ('tip_amount', DoubleType),
        ('tolls_amount', DoubleType),
        ('improvement_surcharge', DoubleType),
        ('total_amount', DoubleType),
    ]
])

In [186]:
# Input file to load; the commented-out line is an alternative input.
infilename = 'yellow_tripdata_2015-03.csv'
#infilename = 'sample.csv'
# Configure the reader step by step: apply the explicit schema, treat the
# first line as a header, and parse timestamps in 'yyyy-MM-dd HH:mm:ss' form.
raw_sdf = (sql_sc.read
    .schema(schema)
    .options(header=True, timestampFormat='yyyy-MM-dd HH:mm:ss')
    .csv(infilename))

In [187]:
# Derive epoch-millisecond columns from the parsed pickup/dropoff timestamps.
# unix_timestamp yields whole seconds, so it is scaled to milliseconds and then
# shifted by a fixed five hours.
# NOTE(review): a constant 5 h shift treats every record as UTC-5 and does not
# follow daylight-saving changes -- confirm this is intended for the date
# range being processed.
utc_offset_ms = 5*60*60*1000
clean_sdf = (raw_sdf
    .withColumn('tpep_pickup_timestamp_ms',
                unix_timestamp(raw_sdf['tpep_pickup_datetime'])*1000 + utc_offset_ms)
    .withColumn('tpep_dropoff_timestamp_ms',
                unix_timestamp(raw_sdf['tpep_dropoff_datetime'])*1000 + utc_offset_ms))

In [188]:
# Pull a single row to the driver as a pandas frame and print its column
# dtypes, as a quick check of the schema and the two derived columns.
clean_sdf.limit(1).toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 21 columns):
VendorID                     1 non-null int64
tpep_pickup_datetime         1 non-null datetime64[ns]
tpep_dropoff_datetime        1 non-null datetime64[ns]
passenger_count              1 non-null int64
trip_distance                1 non-null float64
pickup_longitude             1 non-null float64
pickup_latitude              1 non-null float64
RateCodeID                   1 non-null int64
store_and_fwd_flag           1 non-null object
dropoff_longitude            1 non-null float64
dropoff_latitude             1 non-null float64
payment_type                 1 non-null int64
fare_amount                  1 non-null float64
extra                        1 non-null float64
mta_tax                      1 non-null float64
tip_amount                   1 non-null float64
tolls_amount                 1 non-null float64
improvement_surcharge        1 non-null float64
total_amount                 1

In [189]:
# Preview five rows, including the derived *_timestamp_ms columns.
clean_sdf.limit(5).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp_ms,tpep_dropoff_timestamp_ms
0,1,2015-03-06 08:02:31,2015-03-06 08:09:55,1,1.2,-73.990211,40.750969,1,N,-73.987892,...,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8,1425646951000,1425647395000
1,1,2015-03-06 08:02:31,2015-03-06 08:15:23,1,3.2,-73.935188,40.80072,1,N,-73.952553,...,2,11.5,0.0,0.5,0.0,0.0,0.3,12.3,1425646951000,1425647723000
2,1,2015-03-06 08:02:31,2015-03-06 08:12:27,1,1.1,-73.963753,40.767937,1,N,-73.956947,...,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8,1425646951000,1425647547000
3,1,2015-03-06 08:02:31,2015-03-06 08:09:09,1,0.8,-73.997177,40.742168,1,N,-74.008064,...,1,6.0,0.0,0.5,1.0,0.0,0.3,7.8,1425646951000,1425647349000
4,1,2015-03-06 08:02:32,2015-03-06 08:19:37,1,2.7,-74.006844,40.730267,1,N,-73.97686,...,1,13.0,0.0,0.5,2.75,0.0,0.3,16.55,1425646952000,1425647977000


In [213]:
# Cutoff instant: 2015-03-09 00:00:00 localized to the fixed-offset zone
# 'Etc/GMT+5' (POSIX-style name, i.e. five hours behind UTC).
end_timestamp = pd.Timestamp('2015-03-09 00:00:00').tz_localize('Etc/GMT+5')
# Timestamp.value is in nanoseconds since the epoch; the trip columns are in
# milliseconds, so convert with exact integer arithmetic.
end_timestamp_ms = end_timestamp.value // 10**6
# Keep trips whose dropoff is at or before the cutoff.
filtered_sdf = clean_sdf.filter(clean_sdf['tpep_dropoff_timestamp_ms'] <= end_timestamp_ms)

In [214]:
# Number of trips that remain after the cutoff filter (this triggers a Spark job).
filtered_sdf.count()

3333939

In [215]:
# Preview five rows of the filtered trips.
filtered_sdf.limit(5).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp_ms,tpep_dropoff_timestamp_ms
0,1,2015-03-06 08:02:31,2015-03-06 08:09:55,1,1.2,-73.990211,40.750969,1,N,-73.987892,...,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8,1425646951000,1425647395000
1,1,2015-03-06 08:02:31,2015-03-06 08:15:23,1,3.2,-73.935188,40.80072,1,N,-73.952553,...,2,11.5,0.0,0.5,0.0,0.0,0.3,12.3,1425646951000,1425647723000
2,1,2015-03-06 08:02:31,2015-03-06 08:12:27,1,1.1,-73.963753,40.767937,1,N,-73.956947,...,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8,1425646951000,1425647547000
3,1,2015-03-06 08:02:31,2015-03-06 08:09:09,1,0.8,-73.997177,40.742168,1,N,-74.008064,...,1,6.0,0.0,0.5,1.0,0.0,0.3,7.8,1425646951000,1425647349000
4,1,2015-03-06 08:02:32,2015-03-06 08:19:37,1,2.7,-74.006844,40.730267,1,N,-73.97686,...,1,13.0,0.0,0.5,2.75,0.0,0.3,16.55,1425646952000,1425647977000


In [229]:
def create_events(trip_record, report_period_mean=1*60*1000, rng=None):
    """Expand one trip record into a list of event Rows.

    The list starts with a 'pickup' event and ends with a 'dropoff' event.
    Between them, 'route' events are emitted at random intervals; their
    position is interpolated on a straight line from the pickup point to the
    dropoff point. A trip whose dropoff is not later than its pickup yields
    only the pickup and dropoff events.

    All events of one trip share a freshly generated trip_id. Only the dropoff
    event carries the trip's distance, duration (in minutes) and total amount;
    the other events report 0.0 for those fields.

    Parameters
    ----------
    trip_record
        Object exposing these attributes: tpep_pickup_timestamp_ms,
        tpep_dropoff_timestamp_ms (epoch milliseconds), pickup_latitude,
        pickup_longitude, dropoff_latitude, dropoff_longitude,
        passenger_count, trip_distance and total_amount. The timestamps must
        not be None; the arithmetic below fails otherwise.
    report_period_mean : int, optional
        Mean gap between consecutive route events, in milliseconds. Each gap
        is drawn uniformly from the integers 0 .. 2 * report_period_mean.
        Defaults to one minute.
    rng : random.Random, optional
        Random source. When omitted, the module-level `random` functions and
        `uuid.uuid4()` are used, as before. When given, both the gaps and the
        trip_id are drawn from it, so a seeded generator makes the output
        reproducible.

    Returns
    -------
    list of Row
        One Row per event, with fields event_type, timestamp, trip_fraction,
        trip_id, lat, lon, passenger_count, trip_distance,
        trip_duration_minutes and total_amount.

    Raises
    ------
    ValueError
        If report_period_mean is not positive (every gap would be zero and
        the route loop could never reach the dropoff time).
    """
    if report_period_mean <= 0:
        raise ValueError('report_period_mean must be positive')

    if rng is None:
        rng = random
        trip_id = str(uuid.uuid4())
    else:
        # Build the id from the supplied generator so a seeded run is repeatable.
        trip_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))

    pickup_datetime = trip_record.tpep_pickup_timestamp_ms
    dropoff_datetime = trip_record.tpep_dropoff_timestamp_ms
    trip_duration = dropoff_datetime - pickup_datetime

    def make_event(event_type, timestamp, trip_fraction, lat, lon,
                   trip_distance=0.0, trip_duration_minutes=0.0, total_amount=0.0):
        # Single place that defines the event layout, so all event types
        # carry exactly the same fields in the same order.
        return Row(
            event_type=event_type,
            timestamp=timestamp,
            trip_fraction=trip_fraction,
            trip_id=trip_id,
            lat=lat,
            lon=lon,
            passenger_count=trip_record.passenger_count,
            trip_distance=trip_distance,
            trip_duration_minutes=trip_duration_minutes,
            total_amount=total_amount,
        )

    rows = [make_event('pickup', pickup_datetime, 0.0,
                       trip_record.pickup_latitude, trip_record.pickup_longitude)]

    # Route events: assume the route is a straight line from pickup to dropoff.
    # The gap to the next report is random, which gives timestamps with
    # non-zero milliseconds.
    lat_span = trip_record.dropoff_latitude - trip_record.pickup_latitude
    lon_span = trip_record.dropoff_longitude - trip_record.pickup_longitude
    report_timestamp = pickup_datetime + rng.randint(0, 2*report_period_mean)
    # The loop body only runs when dropoff > pickup, so trip_duration is
    # strictly positive whenever it is used as a divisor.
    while report_timestamp < dropoff_datetime:
        trip_fraction = (report_timestamp - pickup_datetime) / trip_duration
        rows.append(make_event(
            'route', report_timestamp, trip_fraction,
            trip_record.pickup_latitude + lat_span * trip_fraction,
            trip_record.pickup_longitude + lon_span * trip_fraction,
        ))
        report_timestamp += rng.randint(0, 2*report_period_mean)

    rows.append(make_event(
        'dropoff', dropoff_datetime, 1.0,
        trip_record.dropoff_latitude, trip_record.dropoff_longitude,
        trip_distance=trip_record.trip_distance,
        trip_duration_minutes=trip_duration / (60*1000),
        total_amount=trip_record.total_amount,
    ))
    return rows

In [230]:
%%time
# flatMap is lazy: this only records the transformation (hence the
# microsecond wall time). create_events runs once an action is triggered.
all_events_rdd = filtered_sdf.rdd.flatMap(create_events)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 172 µs


In [231]:
# First action on the lazy RDD. Nothing is cached, so every later action
# re-runs create_events; that is why the random trip_id values differ between
# the outputs shown in this notebook.
all_events_rdd.take(2)

[Row(event_type='pickup', lat=40.75096893310547, lon=-73.9902114868164, passenger_count=1, timestamp=1425646951000, total_amount=0.0, trip_distance=0.0, trip_duration_minutes=0.0, trip_fraction=0.0, trip_id='0198c7b9-957f-49ad-a555-c6eb2946549e'),
 Row(event_type='route', lat=40.74926344691096, lon=-73.98990560610659, passenger_count=1, timestamp=1425647009556, total_amount=0.0, trip_distance=0.0, trip_duration_minutes=0.0, trip_fraction=0.13188288288288288, trip_id='0198c7b9-957f-49ad-a555-c6eb2946549e')]

In [232]:
%%time
# No schema is passed here, so the column types are inferred from the Row objects.
all_events_sdf = sql_sc.createDataFrame(all_events_rdd)

CPU times: user 20 ms, sys: 4 ms, total: 24 ms
Wall time: 94.4 ms


In [233]:
# Print the leading rows to check the columns of the event DataFrame.
all_events_sdf.show()

+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+-------------------+--------------------+
|event_type|               lat|               lon|passenger_count|    timestamp|total_amount|trip_distance|trip_duration_minutes|      trip_fraction|             trip_id|
+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+-------------------+--------------------+
|    pickup| 40.75096893310547| -73.9902114868164|              1|1425646951000|         0.0|          0.0|                  0.0|                0.0|90b7ec34-04fb-435...|
|     route| 40.74926344691096|-73.98990560610659|              1|1425647009556|         0.0|          0.0|                  0.0|0.13188288288288288|90b7ec34-04fb-435...|
|     route| 40.74814647519911|-73.98970527607686|              1|1425647047906|         0.0|          0.0|                  0.0|0.21825675675675

In [234]:
# Add a human-readable rendering of the event time. 'timestamp' is in
# milliseconds, so it is divided by 1000 first; the rendered string has
# one-second resolution, so the millisecond part does not appear in it.
all_events2_sdf = all_events_sdf.withColumn('timestamp_str',  from_unixtime(all_events_sdf['timestamp' ] / 1000))

In [235]:
# Print the leading rows, now including the timestamp_str column.
all_events2_sdf.show()

+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+-------------------+--------------------+-------------------+
|event_type|               lat|               lon|passenger_count|    timestamp|total_amount|trip_distance|trip_duration_minutes|      trip_fraction|             trip_id|      timestamp_str|
+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+-------------------+--------------------+-------------------+
|    pickup| 40.75096893310547| -73.9902114868164|              1|1425646951000|         0.0|          0.0|                  0.0|                0.0|34a66b63-42d6-473...|2015-03-06 13:02:31|
|     route| 40.74926344691096|-73.98990560610659|              1|1425647009556|         0.0|          0.0|                  0.0|0.13188288288288288|34a66b63-42d6-473...|2015-03-06 13:03:29|
|     route| 40.74814647519911|-73.9897052760

In [236]:
%%time
# Order all events by their epoch-millisecond timestamp. This is lazy (note
# the millisecond wall time); the sort itself runs when the data is written.
sorted_sdf = all_events2_sdf.orderBy('timestamp')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 10.3 ms


In [237]:
%%time
# This is the step that actually executes the pipeline (see the wall time).
# The sorted events are written as JSON to the output path 'data.json';
# 'overwrite' mode replaces any output already at that path.
sorted_sdf.write.mode('overwrite').format('json').save('data.json')

CPU times: user 168 ms, sys: 40 ms, total: 208 ms
Wall time: 13min 54s


# Playground