In [202]:
import pandas as pd
import numpy as np
import glob
import os
import bz2
import codecs
import json
import matplotlib.pylab as plt
import uuid
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType, StringType
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, from_unixtime
%matplotlib inline

In [182]:
# Start from an empty Spark configuration; settings are applied in the cells below.
conf = SparkConf()

In [183]:
# Use the 'local[8]' master URL (Spark local mode, 8 worker threads).
conf.setMaster('local[8]')

<pyspark.conf.SparkConf at 0x7f39a4e1e080>

In [184]:
# Obtain the SparkContext via getOrCreate, passing the configuration built above,
# then wrap it in a SQLContext, which the rest of the notebook uses for DataFrame I/O.
sc = SparkContext.getOrCreate(conf=conf)
sql_sc = SQLContext(sc)

In [185]:
# Explicit schema for the trip CSV: one nullable field per column, listed in
# file order as (column name, Spark type class).
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ('VendorID', IntegerType),
        ('tpep_pickup_datetime', TimestampType),
        ('tpep_dropoff_datetime', TimestampType),
        ('passenger_count', IntegerType),
        ('trip_distance', DoubleType),
        ('pickup_longitude', DoubleType),
        ('pickup_latitude', DoubleType),
        ('RateCodeID', IntegerType),
        ('store_and_fwd_flag', StringType),
        ('dropoff_longitude', DoubleType),
        ('dropoff_latitude', DoubleType),
        ('payment_type', IntegerType),
        ('fare_amount', DoubleType),
        ('extra', DoubleType),
        ('mta_tax', DoubleType),
        ('tip_amount', DoubleType),
        ('tolls_amount', DoubleType),
        ('improvement_surcharge', DoubleType),
        ('total_amount', DoubleType),
    ]
])

In [186]:
# Input file to load; switch to the commented-out sample file for a quick run.
infilename = 'yellow_tripdata_2015-03.csv'
#infilename = 'sample.csv'

# Read the CSV with the explicit schema (no type inference); the first line is a
# header and timestamps are formatted like '2015-03-06 08:02:31'.
raw_sdf = sql_sc.read.csv(
    infilename,
    schema=schema,
    header=True,
    timestampFormat='yyyy-MM-dd HH:mm:ss',
)

In [187]:
# Add epoch-millisecond versions of the pickup and dropoff times.
#
# The CSV holds New York local wall-clock times with no zone information, so they
# must be shifted to UTC before being turned into epoch values. A fixed +5h shift
# is only right while New York is on EST; DST began on 2015-03-08, after which the
# offset is 4h. to_utc_timestamp with the 'America/New_York' zone applies the
# correct offset per row (identical to +5h for rows before the DST switch).
# NOTE(review): like the previous fixed-offset version, this assumes the Spark
# session/JVM time zone is UTC when the CSV timestamps are parsed - confirm.
clean_sdf = (raw_sdf
    .withColumn(
        'tpep_pickup_timestamp_ms',
        unix_timestamp(to_utc_timestamp(raw_sdf['tpep_pickup_datetime'], 'America/New_York')) * 1000)
    .withColumn(
        'tpep_dropoff_timestamp_ms',
        unix_timestamp(to_utc_timestamp(raw_sdf['tpep_dropoff_datetime'], 'America/New_York')) * 1000))

In [188]:
# Pull a single row into pandas just to inspect the resulting column names and dtypes.
clean_sdf.limit(1).toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 21 columns):
VendorID                     1 non-null int64
tpep_pickup_datetime         1 non-null datetime64[ns]
tpep_dropoff_datetime        1 non-null datetime64[ns]
passenger_count              1 non-null int64
trip_distance                1 non-null float64
pickup_longitude             1 non-null float64
pickup_latitude              1 non-null float64
RateCodeID                   1 non-null int64
store_and_fwd_flag           1 non-null object
dropoff_longitude            1 non-null float64
dropoff_latitude             1 non-null float64
payment_type                 1 non-null int64
fare_amount                  1 non-null float64
extra                        1 non-null float64
mta_tax                      1 non-null float64
tip_amount                   1 non-null float64
tolls_amount                 1 non-null float64
improvement_surcharge        1 non-null float64
total_amount                 1

In [189]:
# Preview the first five rows, including the two derived *_timestamp_ms columns.
clean_sdf.limit(5).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp_ms,tpep_dropoff_timestamp_ms
0,1,2015-03-06 08:02:31,2015-03-06 08:09:55,1,1.2,-73.990211,40.750969,1,N,-73.987892,...,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8,1425646951000,1425647395000
1,1,2015-03-06 08:02:31,2015-03-06 08:15:23,1,3.2,-73.935188,40.80072,1,N,-73.952553,...,2,11.5,0.0,0.5,0.0,0.0,0.3,12.3,1425646951000,1425647723000
2,1,2015-03-06 08:02:31,2015-03-06 08:12:27,1,1.1,-73.963753,40.767937,1,N,-73.956947,...,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8,1425646951000,1425647547000
3,1,2015-03-06 08:02:31,2015-03-06 08:09:09,1,0.8,-73.997177,40.742168,1,N,-74.008064,...,1,6.0,0.0,0.5,1.0,0.0,0.3,7.8,1425646951000,1425647349000
4,1,2015-03-06 08:02:32,2015-03-06 08:19:37,1,2.7,-74.006844,40.730267,1,N,-73.97686,...,1,13.0,0.0,0.5,2.75,0.0,0.3,16.55,1425646952000,1425647977000


In [191]:
# Keep only trips whose dropoff happened at or before the cutoff instant.
# 'Etc/GMT+5' follows the POSIX sign convention, i.e. it denotes UTC-5.
end_timestamp = pd.Timestamp('2015-03-03 00:00:00').tz_localize('Etc/GMT+5')

# Timestamp.value is nanoseconds since the epoch; the dropoff column is in milliseconds.
end_timestamp_ms = end_timestamp.value // 10**6
cutoff_condition = 'tpep_dropoff_timestamp_ms <= %d' % end_timestamp_ms
filtered_sdf = clean_sdf.filter(cutoff_condition)

In [192]:
# Number of trips remaining after the cutoff filter.
filtered_sdf.count()

771187

In [193]:
# Preview five of the filtered trips.
filtered_sdf.limit(5).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tpep_pickup_timestamp_ms,tpep_dropoff_timestamp_ms
0,2,2015-03-02 11:16:24,2015-03-02 11:37:03,2,3.23,-73.98748,40.738121,1,N,-73.953362,...,1,15.5,0.0,0.5,3.26,0.0,0.3,19.56,1425312984000,1425314223000
1,1,2015-03-01 16:42:33,2015-03-01 16:59:29,1,2.3,-73.979179,40.777756,1,N,-73.953178,...,1,11.5,0.0,0.5,2.45,0.0,0.3,14.75,1425246153000,1425247169000
2,1,2015-03-01 16:42:33,2015-03-01 17:09:10,1,9.2,-73.865578,40.770863,1,N,-73.984856,...,1,26.5,0.0,0.5,6.5,5.33,0.3,39.13,1425246153000,1425247750000
3,1,2015-03-01 16:42:34,2015-03-01 16:44:52,1,0.4,-73.952545,40.772148,1,N,-73.948593,...,2,3.5,0.0,0.5,0.0,0.0,0.3,4.3,1425246154000,1425246292000
4,1,2015-03-01 16:42:34,2015-03-01 16:47:09,1,0.9,-73.954231,40.778606,1,N,-73.947395,...,1,5.0,0.0,0.5,1.15,0.0,0.3,6.95,1425246154000,1425246429000


In [194]:
def create_events(trip_record, report_period_ms=1*60*1000):
    """Expand one trip record into a list of pickup, route and dropoff event Rows.

    The route is assumed to be a straight line from the pickup point to the
    dropoff point; a 'route' event is emitted every ``report_period_ms``
    strictly between pickup and dropoff, with its position linearly
    interpolated. All events of one trip share a freshly generated random
    ``trip_id`` (uuid4), so the id differs each time the record is re-evaluated.

    Args:
        trip_record: object exposing the attributes tpep_pickup_timestamp_ms,
            tpep_dropoff_timestamp_ms, pickup_latitude, pickup_longitude,
            dropoff_latitude, dropoff_longitude, passenger_count,
            trip_distance and total_amount.
        report_period_ms: spacing of 'route' events in milliseconds
            (default: one minute). Must be positive.

    Returns:
        A list of Row objects: one 'pickup', zero or more 'route', one
        'dropoff'. Trip distance, duration (minutes) and total amount are only
        filled in on the 'dropoff' event and are 0.0 elsewhere. An empty list
        is returned when either timestamp is missing, because no event can be
        placed in time for such a record.

    Raises:
        ValueError: if ``report_period_ms`` is not positive (the reporting
            loop would never terminate).
    """
    if report_period_ms <= 0:
        raise ValueError('report_period_ms must be positive')

    pickup_datetime = trip_record.tpep_pickup_timestamp_ms
    dropoff_datetime = trip_record.tpep_dropoff_timestamp_ms
    # Null timestamps (e.g. unparseable CSV values) would otherwise raise a
    # TypeError inside the Spark task and abort the whole job.
    if pickup_datetime is None or dropoff_datetime is None:
        return []

    trip_id = str(uuid.uuid4())
    trip_duration = dropoff_datetime - pickup_datetime

    def interpolate(start, end, fraction):
        # Linear interpolation; a missing endpoint yields a missing position.
        if start is None or end is None:
            return None
        return start + (end - start) * fraction

    def make_event(event_type, timestamp, trip_fraction, lat, lon,
                   trip_distance=0.0, trip_duration_minutes=0.0, total_amount=0.0):
        # Single place that defines the event layout shared by all event types.
        return {
            'event_type': event_type,
            'timestamp': timestamp,
            'trip_fraction': trip_fraction,
            'trip_id': trip_id,
            'lat': lat,
            'lon': lon,
            'passenger_count': trip_record.passenger_count,
            'trip_distance': trip_distance,
            'trip_duration_minutes': trip_duration_minutes,
            'total_amount': total_amount,
        }

    events = [make_event('pickup', pickup_datetime, 0.0,
                         trip_record.pickup_latitude, trip_record.pickup_longitude)]

    # Route events strictly between pickup and dropoff. For zero-length or
    # negative-length trips the loop body never runs, so trip_duration is
    # never used as a divisor when it is zero.
    report_timestamp = pickup_datetime + report_period_ms
    while report_timestamp < dropoff_datetime:
        trip_fraction = (report_timestamp - pickup_datetime) / trip_duration
        events.append(make_event(
            'route', report_timestamp, trip_fraction,
            interpolate(trip_record.pickup_latitude, trip_record.dropoff_latitude, trip_fraction),
            interpolate(trip_record.pickup_longitude, trip_record.dropoff_longitude, trip_fraction),
        ))
        report_timestamp += report_period_ms

    events.append(make_event(
        'dropoff', dropoff_datetime, 1.0,
        trip_record.dropoff_latitude, trip_record.dropoff_longitude,
        trip_distance=trip_record.trip_distance,
        trip_duration_minutes=trip_duration / (60*1000),
        total_amount=trip_record.total_amount,
    ))
    return [Row(**event) for event in events]

In [195]:
%%time
# Expand every filtered trip into its list of events and flatten the lists into one RDD.
# The timing recorded for this cell is in the millisecond range, i.e. the events
# are not materialised here.
all_events_rdd = filtered_sdf.rdd.flatMap(create_events)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 35.9 ms


In [196]:
# Sanity check: look at the first two generated events.
all_events_rdd.take(2)

[Row(event_type='pickup', lat=40.738121032714844, lon=-73.98748016357422, passenger_count=2, timestamp=1425312984000, total_amount=0.0, trip_distance=0.0, trip_duration_minutes=0.0, trip_fraction=0.0, trip_id='1fdd710d-7605-42f3-b2fa-6810ffe9c85a'),
 Row(event_type='route', lat=40.73950725490764, lon=-73.98582792859389, passenger_count=2, timestamp=1425313044000, total_amount=0.0, trip_distance=0.0, trip_duration_minutes=0.0, trip_fraction=0.048426150121065374, trip_id='1fdd710d-7605-42f3-b2fa-6810ffe9c85a')]

In [197]:
%%time
# Turn the RDD of event Rows into a DataFrame; no explicit schema is passed here.
all_events_sdf = sql_sc.createDataFrame(all_events_rdd)

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 144 ms


In [198]:
# Print the first rows of the event DataFrame.
all_events_sdf.show()

+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+--------------------+--------------------+
|event_type|               lat|               lon|passenger_count|    timestamp|total_amount|trip_distance|trip_duration_minutes|       trip_fraction|             trip_id|
+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+--------------------+--------------------+
|    pickup|40.738121032714844|-73.98748016357422|              2|1425312984000|         0.0|          0.0|                  0.0|                 0.0|181af4c1-dde4-486...|
|     route| 40.73950725490764|-73.98582792859389|              2|1425313044000|         0.0|          0.0|                  0.0|0.048426150121065374|181af4c1-dde4-486...|
|     route|40.740893477100435|-73.98417569361357|              2|1425313104000|         0.0|          0.0|                  0.0| 0.09685230

In [207]:
# Add a human-readable rendering of the event time: 'timestamp' is in milliseconds,
# from_unixtime expects seconds, hence the division by 1000.
# NOTE(review): the rendered string presumably depends on the Spark session time zone - confirm.
all_events2_sdf = all_events_sdf.withColumn('timestamp_str',  from_unixtime(all_events_sdf['timestamp' ] / 1000))

In [208]:
# Print the first rows, now including the timestamp_str column.
all_events2_sdf.show()

+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+--------------------+--------------------+-------------------+
|event_type|               lat|               lon|passenger_count|    timestamp|total_amount|trip_distance|trip_duration_minutes|       trip_fraction|             trip_id|      timestamp_str|
+----------+------------------+------------------+---------------+-------------+------------+-------------+---------------------+--------------------+--------------------+-------------------+
|    pickup|40.738121032714844|-73.98748016357422|              2|1425312984000|         0.0|          0.0|                  0.0|                 0.0|662c31aa-e58c-4f4...|2015-03-02 16:16:24|
|     route| 40.73950725490764|-73.98582792859389|              2|1425313044000|         0.0|          0.0|                  0.0|0.048426150121065374|662c31aa-e58c-4f4...|2015-03-02 16:17:24|
|     route|40.740893477100435|-73.98417

In [209]:
%%time
# Order all events by their epoch-millisecond timestamp.
# The timing recorded for this cell is in the millisecond range, i.e. the sort
# is not executed here.
sorted_sdf = all_events2_sdf.orderBy('timestamp')

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 11.6 ms


In [212]:
%%time
# Write the time-ordered events as JSON to 'data13.json', replacing any previous output.
# Judging by the recorded wall time (minutes), this is where the pipeline actually executes.
sorted_sdf.write.mode('overwrite').format('json').save('data13.json')

CPU times: user 52 ms, sys: 32 ms, total: 84 ms
Wall time: 3min 3s


# Playground