In [54]:
import pandas as pd
import numpy as np
import glob
import os
import bz2
import codecs
import json
import matplotlib.pylab as plt
import uuid
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
%matplotlib inline

In [55]:
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
sql_sc = SQLContext(sc)

In [91]:
%%time
infilename = 'yellow_tripdata_2015-03.csv'
inchunks = pd.read_csv(infilename, chunksize=100000)
rdd = sc.emptyRDD()
for chunk in inchunks:
    rdd += sc.parallelize(list([row for index,row in chunk.iterrows()]))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.readRDDFromFile.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.api.python.PythonRDD$.readRDDFromFile(PythonRDD.scala:482)
	at org.apache.spark.api.python.PythonRDD.readRDDFromFile(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor43.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [92]:
pd.DataFrame(rdd.head())

AttributeError: 'RDD' object has no attribute 'head'

In [73]:
def create_events(trip_record):
    events = []
    dropoff_datetime = pd.to_datetime(trip_record['tpep_dropoff_datetime']).tz_localize('Etc/GMT+5')
    if dropoff_datetime > pd.to_datetime('2015-03-04 00:00:00-05:00', utc=True):
        return events
    pickup_datetime = pd.to_datetime(trip_record['tpep_pickup_datetime']).tz_localize('Etc/GMT+5')
    trip_id = str(uuid.uuid4())
    events.append({
        'event_type': 'pickup',
        'timestamp': pickup_datetime,
        'trip_fraction': 0.0,
        'trip_id': trip_id,
        'location': {
            'lat': trip_record.pickup_latitude,
            'lon': trip_record.pickup_longitude,
        },
        'passenger_count': trip_record.passenger_count,
    })
    
    # Create route events every 1 minute.
    # Assume that route is a straight line from pickup to dropoff.
    trip_duration = dropoff_datetime - pickup_datetime
    report_period = pd.Timedelta(1, 'm')
    report_timestamp = pickup_datetime
    while True:
        report_timestamp += report_period
        if report_timestamp >= dropoff_datetime:
            break
        trip_fraction = (report_timestamp - pickup_datetime) / trip_duration
        events.append({
            'event_type': 'route',
            'timestamp': report_timestamp,
            'trip_fraction': trip_fraction,
            'trip_id': trip_id,
            'location': {
                'lat': trip_record.pickup_latitude + (trip_record.dropoff_latitude - trip_record.pickup_latitude) * trip_fraction,
                'lon': trip_record.pickup_longitude + (trip_record.dropoff_longitude - trip_record.pickup_longitude) * trip_fraction,
            },
            'passenger_count': trip_record.passenger_count,
        })

    events.append({
        'event_type': 'dropoff',
        'timestamp': dropoff_datetime,
        'trip_fraction': 1.0,
        'trip_id': trip_id,
        'location': {
            'lat': trip_record.dropoff_latitude,
            'lon': trip_record.dropoff_longitude,
        },
        'passenger_count': trip_record.passenger_count,
        'trip_distance': trip_record.trip_distance,
        'trip_duration_minutes': trip_duration / pd.Timedelta(1, 'm'),
        'total_amount': trip_record.total_amount,
    })
    return events

In [75]:
all_events_rdd = rdd.flatMap(create_events)

In [76]:
all_events_rdd.count()

42

In [88]:
def serialize_event(event):
    event['timestamp'] = int(event['timestamp'].value / 1e6)
    return json.dumps(event)

In [89]:
serialized_rdd = all_events_rdd.sortBy(lambda r: r['timestamp']).map(serialize_event)

In [90]:
serialized_rdd.collect()

['{"event_type": "pickup", "timestamp": 1425646951000, "trip_fraction": 0.0, "trip_id": "4273e632-bf82-4213-876a-26cf822f2248", "location": {"lat": 40.75096893310547, "lon": -73.9902114868164}, "passenger_count": 1}',
 '{"event_type": "pickup", "timestamp": 1425646951000, "trip_fraction": 0.0, "trip_id": "a5e85b53-d3b0-431d-a5cc-d1061c90c2fa", "location": {"lat": 40.80072021484375, "lon": -73.93518829345702}, "passenger_count": 1}',
 '{"event_type": "pickup", "timestamp": 1425646951000, "trip_fraction": 0.0, "trip_id": "8710c7c5-ca13-4910-9ae8-adf48b5c4221", "location": {"lat": 40.76793670654297, "lon": -73.96375274658203}, "passenger_count": 1}',
 '{"event_type": "pickup", "timestamp": 1425646951000, "trip_fraction": 0.0, "trip_id": "cac387db-d8ef-4f8d-a931-789b54b12d08", "location": {"lat": 40.74216842651367, "lon": -73.99717712402342}, "passenger_count": 1}',
 '{"event_type": "route", "timestamp": 1425647011000, "trip_fraction": 0.13513513513513514, "trip_id": "4273e632-bf82-4213-87

In [83]:
serialized_rdd.coalesce(1).saveAsTextFile('data2.json')

In [None]:
%%time
all_events_df = pd.DataFrame(all_events).sort_values(['timestamp'])
all_events_df.head(10)

In [None]:
all_events_df.to_json('data.ndjson', orient='records', lines=True)

# Playground

In [41]:
all_events_df.index.to_period(freq='d').start_time

DatetimeIndex(['2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-06'],
              dtype='datetime64[ns]', name='timestamp', freq=None)

In [50]:
begin_playback_period = pd.Period('2015-05-06', 'D')
begin_playback_period

Period('2015-05-06', 'D')

In [54]:
begin_playback_original_timestamp = begin_playback_period.start_time.tz_localize('US/Eastern')
begin_playback_original_timestamp

Timestamp('2015-05-06 00:00:00-0400', tz='US/Eastern')

In [65]:
realtime_timestamp = pd.Timestamp.now('US/Eastern')
realtime_timestamp

Timestamp('2018-02-27 19:40:38.762375-0500', tz='US/Eastern')

In [68]:
realtime_period = pd.Period(realtime_timestamp, freq='D')
realtime_period

Period('2018-02-27', 'D')

In [69]:
delta_periods = realtime_period - begin_playback_period
delta_periods

1028

In [70]:
realtime_timestamp - (realtime_period - begin_playback_period) * realtime_period.freq.delta

Timestamp('2015-05-06 20:40:38.762375-0400', tz='US/Eastern')

In [86]:
t = pd.to_datetime(1430883460000, unit='ms', utc=True).tz_convert('US/Pacific')
str(t)

'2015-05-05 20:37:40-07:00'

In [80]:
pd.Timestamp.now('UTC')

Timestamp('2018-02-28 01:16:44.285336+0000', tz='UTC')

In [83]:
(realtime_period - begin_playback_period) * realtime_period.freq.delta / pd.Timedelta(1, 'ms')

88819200000.0

In [28]:
t = pd.Timestamp.now('UTC')
t

Timestamp('2018-02-28 05:40:28.684808+0000', tz='UTC')

In [30]:
t.value / 1e6

1519796428684.8079