In [1]:
import pandas as pd
import numpy as np
import glob
import os
import bz2
import codecs
import json
import matplotlib.pylab as plt
import uuid
%matplotlib inline

In [2]:
%%time
infilename = 'yellow_tripdata_2015-03.csv'
raw_df = pd.read_csv(infilename)

CPU times: user 49.5 s, sys: 4.83 s, total: 54.3 s
Wall time: 54.6 s


In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13351609 entries, 0 to 13351608
Data columns (total 19 columns):
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID               int64
store_and_fwd_flag       object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(12), int64(4), object(3)
memory usage: 1.9+ GB


In [4]:
def clean_data(df):
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'].values).tz_localize('Etc/GMT+5')
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'].values).tz_localize('Etc/GMT+5')
    return df

In [82]:
# df = raw_df.head(1).copy()
df = raw_df.copy()

In [83]:
%%time
df = clean_data(df)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2015-03-06 08:02:31-05:00,2015-03-06 08:09:55-05:00,1,1.2,-73.990211,40.750969,1,N,-73.987892,40.738037,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8
1,1,2015-03-06 08:02:31-05:00,2015-03-06 08:15:23-05:00,1,3.2,-73.935188,40.80072,1,N,-73.952553,40.765373,2,11.5,0.0,0.5,0.0,0.0,0.3,12.3
2,1,2015-03-06 08:02:31-05:00,2015-03-06 08:12:27-05:00,1,1.1,-73.963753,40.767937,1,N,-73.956947,40.78027,2,8.0,0.0,0.5,0.0,0.0,0.3,8.8
3,1,2015-03-06 08:02:31-05:00,2015-03-06 08:09:09-05:00,1,0.8,-73.997177,40.742168,1,N,-74.008064,40.739281,1,6.0,0.0,0.5,1.0,0.0,0.3,7.8
4,1,2015-03-06 08:02:32-05:00,2015-03-06 08:19:37-05:00,1,2.7,-74.006844,40.730267,1,N,-73.97686,40.750671,1,13.0,0.0,0.5,2.75,0.0,0.3,16.55


In [84]:
df = df[df.tpep_dropoff_datetime < pd.to_datetime('2015-03-04 00:00:00-05:00', utc=True)]
len(df)

1168099

In [85]:
def create_events(trip_record):
    events = []
    trip_id = str(uuid.uuid4())
    events.append({
        'event_type': 'pickup',
        'timestamp': trip_record.tpep_pickup_datetime,
        'trip_fraction': 0.0,
        'trip_id': trip_id,
        'location': {
            'lat': trip_record.pickup_latitude,
            'lon': trip_record.pickup_longitude,
        },
        'passenger_count': trip_record.passenger_count,
    })
    
    # Create route events every 1 minute.
    # Assume that route is a straight line from pickup to dropoff.
    trip_duration = trip_record.tpep_dropoff_datetime - trip_record.tpep_pickup_datetime
    report_period = pd.Timedelta(1, 'm')
    report_timestamp = trip_record.tpep_pickup_datetime
    while True:
        report_timestamp += report_period
        if report_timestamp >= trip_record.tpep_dropoff_datetime:
            break
        trip_fraction = (report_timestamp - trip_record.tpep_pickup_datetime) / trip_duration
        events.append({
            'event_type': 'route',
            'timestamp': report_timestamp,
            'trip_fraction': trip_fraction,
            'trip_id': trip_id,
            'location': {
                'lat': trip_record.pickup_latitude + (trip_record.dropoff_latitude - trip_record.pickup_latitude) * trip_fraction,
                'lon': trip_record.pickup_longitude + (trip_record.dropoff_longitude - trip_record.pickup_longitude) * trip_fraction,
            },
            'passenger_count': trip_record.passenger_count,
        })

    events.append({
        'event_type': 'dropoff',
        'timestamp': trip_record.tpep_dropoff_datetime,
        'trip_fraction': 1.0,
        'trip_id': trip_id,
        'location': {
            'lat': trip_record.dropoff_latitude,
            'lon': trip_record.dropoff_longitude,
        },
        'passenger_count': trip_record.passenger_count,
        'trip_distance': trip_record.trip_distance,
        'trip_duration_minutes': trip_duration / pd.Timedelta(1, 'm'),
        'total_amount': trip_record.total_amount,
    })
    return events

In [86]:
create_events(df.iloc[0])

[{'event_type': 'pickup',
  'location': {'lat': 40.774150848388672, 'lon': -73.872817993164062},
  'passenger_count': 1,
  'timestamp': Timestamp('2015-03-03 18:17:50-0500', tz='Etc/GMT+5'),
  'trip_fraction': 0.0,
  'trip_id': '3c3b4b72-5436-47a5-9792-371a10dabcfc'},
 {'event_type': 'route',
  'location': {'lat': 40.77324652370018, 'lon': -73.871542693190435},
  'passenger_count': 1,
  'timestamp': Timestamp('2015-03-03 18:18:50-0500', tz='Etc/GMT+5'),
  'trip_fraction': 0.016877637130801686,
  'trip_id': '3c3b4b72-5436-47a5-9792-371a10dabcfc'},
 {'event_type': 'route',
  'location': {'lat': 40.772342199011696, 'lon': -73.870267393216807},
  'passenger_count': 1,
  'timestamp': Timestamp('2015-03-03 18:19:50-0500', tz='Etc/GMT+5'),
  'trip_fraction': 0.03375527426160337,
  'trip_id': '3c3b4b72-5436-47a5-9792-371a10dabcfc'},
 {'event_type': 'route',
  'location': {'lat': 40.771437874323205, 'lon': -73.868992093243179},
  'passenger_count': 1,
  'timestamp': Timestamp('2015-03-03 18:20:

In [87]:
def flatten_list(li):
    result = []
    for item in li:
        result.extend(item)
    return result

In [88]:
%%time
all_events = flatten_list([create_events(r) for _, r in df.iterrows()])

CPU times: user 1h 57min 10s, sys: 1min 18s, total: 1h 58min 29s
Wall time: 1h 58min 4s


In [92]:
len(all_events)

18916770

In [93]:
%%time
all_events_df = pd.DataFrame(all_events).sort_values(['timestamp'])
all_events_df.head(10)

CPU times: user 2min 30s, sys: 17.9 s, total: 2min 48s
Wall time: 2min 48s


In [94]:
all_events_df.to_json('data.ndjson', orient='records', lines=True)

MemoryError: 

# Playground

In [41]:
all_events_df.index.to_period(freq='d').start_time

DatetimeIndex(['2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-05',
               '2015-05-05', '2015-05-05', '2015-05-05', '2015-05-06'],
              dtype='datetime64[ns]', name='timestamp', freq=None)

In [50]:
begin_playback_period = pd.Period('2015-05-06', 'D')
begin_playback_period

Period('2015-05-06', 'D')

In [54]:
begin_playback_original_timestamp = begin_playback_period.start_time.tz_localize('US/Eastern')
begin_playback_original_timestamp

Timestamp('2015-05-06 00:00:00-0400', tz='US/Eastern')

In [65]:
realtime_timestamp = pd.Timestamp.now('US/Eastern')
realtime_timestamp

Timestamp('2018-02-27 19:40:38.762375-0500', tz='US/Eastern')

In [68]:
realtime_period = pd.Period(realtime_timestamp, freq='D')
realtime_period

Period('2018-02-27', 'D')

In [69]:
delta_periods = realtime_period - begin_playback_period
delta_periods

1028

In [70]:
realtime_timestamp - (realtime_period - begin_playback_period) * realtime_period.freq.delta

Timestamp('2015-05-06 20:40:38.762375-0400', tz='US/Eastern')

In [86]:
t = pd.to_datetime(1430883460000, unit='ms', utc=True).tz_convert('US/Pacific')
str(t)

'2015-05-05 20:37:40-07:00'

In [80]:
pd.Timestamp.now('UTC')

Timestamp('2018-02-28 01:16:44.285336+0000', tz='UTC')

In [83]:
(realtime_period - begin_playback_period) * realtime_period.freq.delta / pd.Timedelta(1, 'ms')

88819200000.0

In [28]:
t = pd.Timestamp.now('UTC')
t

Timestamp('2018-02-28 05:40:28.684808+0000', tz='UTC')

In [30]:
t.value / 1e6

1519796428684.8079

In [3]:
pd.Timestamp('2015-03-06 08:02:31').value / 1e6

1425628951000.0