# Trying to Replicate This (Original geopandas Implementation)

In [1]:
def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Fill in missing taxi-zone ids by joining points against the taxi_zones shapefile.

    Longitudes (`lon_var`) and latitudes (`lat_var`) of `df` are turned into
    points and spatially joined (`within`) with the shapefile at
    '../shapefiles/taxi_zones.shp', which is hard coded. Missing coordinates
    are replaced by 0.0, and a coordinate of exactly 0.0 is treated as
    "no coordinate" -- reasonable for a New York dataset, but wrong for a
    global one.

    Only rows whose `locid_var` is null AND whose longitude and latitude are
    both nonzero receive the id found by the join; every other row keeps the
    value it already had.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing latitude, longitude and location-id columns
        (for a dask DataFrame, apply this function per partition).
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be np.nan.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows with
        a non-null value are not overwritten.

    Returns
    -------
    pandas.Series
        Location ids named `locid_var`. When no row needs an id the original
        column is returned untouched; if the spatial join raises ValueError
        the original column is returned cast to float64.
    """
    localdf = df[[lon_var, lat_var, locid_var]].copy()
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    localdf['replace_locid'] = (localdf[locid_var].isnull()
                                & (localdf[lon_var] != 0.)
                                & (localdf[lat_var] != 0.))

    # Nothing to look up: skip the shapefile read and the heavy geo imports.
    if not localdf['replace_locid'].any():
        return df[locid_var]

    import traceback

    import geopandas
    from shapely.geometry import Point

    shape_df = geopandas.read_file('../shapefiles/taxi_zones.shp')
    shape_df = shape_df.drop(
        ['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"], axis=1)
    # NOTE(review): the {'init': ...} CRS form and sjoin's `op=` keyword are
    # kept as written; newer geopandas releases may require 'EPSG:4326' and
    # `predicate=` instead -- confirm against the installed version.
    shape_df = shape_df.to_crs({'init': 'epsg:4326'})

    try:
        local_gdf = geopandas.GeoDataFrame(
            localdf, crs={'init': 'epsg:4326'},
            geometry=[Point(xy) for xy in
                      zip(localdf[lon_var], localdf[lat_var])])

        local_gdf = geopandas.sjoin(
            local_gdf, shape_df, how='left', op='within')

        # one point can intersect more than one zone -- for example if on
        # the boundary between two zones. Deduplicate by taking first valid.
        local_gdf = local_gdf[~local_gdf.index.duplicated(keep='first')]

        # Keep the joined id only where a replacement was requested; all other
        # rows fall back to the id they came in with. `where` builds a new
        # Series instead of writing through `.values`, which is not guaranteed
        # to be a writable view of the column.
        resolved = local_gdf['LocationID'].where(
            local_gdf['replace_locid'], local_gdf[locid_var])

        return resolved.rename(locid_var)
    except ValueError as ve:
        print(ve)
        # Exceptions have no `.stacktrace()` method; print the real traceback.
        traceback.print_exc()
        return df[locid_var].astype('float64')

# What I Have So Far

In [2]:
import os
import dask_cudf
import cuspatial
import numpy as np

In [3]:
# Target dtype for every column that can appear in the raw taxi CSVs.
dtype_list = dict(
    dropoff_datetime='str',        # object; set by parse_dates in pandas read_csv
    dropoff_latitude='float64',
    dropoff_taxizone_id='float64',
    dropoff_longitude='float64',
    ehail_fee='float64',
    extra='float64',
    fare_amount='float64',
    improvement_surcharge='float64',
    junk1='str',                   # object
    junk2='str',                   # object
    mta_tax='float64',
    passenger_count='str',         # object
    payment_type='str',            # object
    pickup_datetime='str',         # object; set by parse_dates in pandas read_csv
    pickup_latitude='float64',
    pickup_taxizone_id='float64',
    pickup_longitude='float64',
    rate_code_id='str',            # object
    store_and_fwd_flag='str',      # object
    tip_amount='float64',
    tolls_amount='float64',
    total_amount='float64',
    trip_distance='float64',
    trip_type='str',               # object
    vendor_id='str',               # object
)

# Paths to the raw download directories (one per data source) plus the
# parquet output directory.
relative_path = '../00_download_scripts/raw_data'
config = {f'{source}_raw_data_path': f'{relative_path}/{folder}/'
          for source, folder in (('citibike', 'bike'),
                                 ('taxi', 'taxi'),
                                 ('uber', 'uber'),
                                 ('subway', 'subway'))}
config['parquet_output_path'] = 'data/'

def glob(x):
    '''
    Return the paths matching the pathname pattern `x`, in sorted order.

    Thin wrapper around the standard library's `glob.glob` (called with its
    default, non-recursive settings) whose result list is sorted before it
    is returned. An empty list is returned when nothing matches.
    '''
    import glob as stdlib_glob

    matches = stdlib_glob.glob(x)
    matches.sort()
    return matches

In [4]:
def get_yellow():
    """Read every yellow-cab CSV and return one frame with a unified schema.

    The raw files come in three column layouts, selected by file name:

    * 2009-2014: coordinates, no improvement surcharge, no zone ids
    * 2015 through 2016-06: coordinates plus improvement surcharge
    * 2016-07 onward: zone ids instead of coordinates (plus two junk columns)

    Columns a layout lacks are filled with placeholders so all three line up:
    zone ids become -1.0, coordinates become 0.0, `ehail_fee` (and the
    pre-2015 `improvement_surcharge`) become NaN. The frames are concatenated
    with alphabetized columns, every column listed in `dtype_list` is cast to
    its listed dtype, and `trip_type` is finally set to 'yellow'.

    Returns
    -------
    The concatenated frame produced by `dask_cudf.concat`.
    """
    taxi_dir = config['taxi_raw_data_path']

    def matching(pattern):
        # sorted raw files in the taxi directory whose name matches `pattern`
        return glob(os.path.join(taxi_dir, pattern))

    def load(paths, schema):
        # every layout is read with the same CSV options; only the file list
        # and the column names differ
        return dask_cudf.read_csv(paths,
                                  header=0,
                                  na_values=["NA"],
                                  parse_dates=[1, 2],
                                  infer_datetime_format=True,
                                  dtype=dtype_list,
                                  names=schema.split(','))

    # tag file paths and column names by schema (x < 2015, 2015 <= x <= 2016.5, 2016.5 < x)
    pre_2015_schema = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount"
    pre_2015_files = (matching('yellow_tripdata_201[0-4]*.csv')
                      + matching('yellow_tripdata_2009*.csv'))

    mid_schema = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount"
    mid_files = (matching('yellow_tripdata_2015*.csv')
                 + matching('yellow_tripdata_2016-0[1-6].csv'))

    late_schema = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2"
    late_files = (matching('yellow_tripdata_2016-0[7-9].csv')
                  + matching('yellow_tripdata_2016-1[0-2].csv')
                  + matching('yellow_tripdata_201[7-9]*.csv'))

    # pre 2015: no zone ids, no e-hail fee, no improvement surcharge
    yellow1 = load(pre_2015_files, pre_2015_schema)
    yellow1['dropoff_taxizone_id'] = -1.0
    yellow1['pickup_taxizone_id'] = -1.0
    yellow1['ehail_fee'] = np.nan
    yellow1['improvement_surcharge'] = np.nan
    yellow1['improvement_surcharge'] = yellow1['improvement_surcharge'].astype('float32')
    yellow1['trip_type'] = -1.0

    # january 2015 - june 2016: no zone ids, no e-hail fee
    yellow2 = load(mid_files, mid_schema)
    yellow2['dropoff_taxizone_id'] = -1.0
    yellow2['pickup_taxizone_id'] = -1.0
    yellow2['ehail_fee'] = np.nan
    yellow2['trip_type'] = -1.0

    # post june 2016: zone ids instead of coordinates, two junk columns
    yellow3 = load(late_files, late_schema)
    yellow3['dropoff_latitude'] = 0.0
    yellow3['dropoff_longitude'] = 0.0
    yellow3['pickup_latitude'] = 0.0
    yellow3['pickup_longitude'] = 0.0
    yellow3['ehail_fee'] = np.nan
    yellow3['trip_type'] = -1.0
    yellow3 = yellow3.drop(['junk1', 'junk2'], axis=1)

    # stack the three eras using the alphabetized columns of the first frame
    column_order = sorted(yellow1.columns)
    yellow = dask_cudf.concat([era[column_order]
                               for era in (yellow1, yellow2, yellow3)])

    # enforce the declared dtype on every column that has one
    for field in [name for name in yellow.columns if name in dtype_list]:
        yellow[field] = yellow[field].astype(dtype_list[field])

    yellow['trip_type'] = 'yellow'

    return yellow

In [5]:
# Build the combined yellow-cab frame, then show which DataFrame class
# get_yellow() handed back.
all_trips = get_yellow()
type(all_trips)

dask_cudf.core.DataFrame

In [6]:
# Percentage of trips that have a dropoff latitude but only the -1 placeholder
# as dropoff zone. `len()` on the dask frames counts rows directly, so neither
# the filtered rows nor the whole dataset has to be materialized with
# `.compute()` just to be counted.
dropoff_needs_zone = all_trips.loc[(all_trips.dropoff_taxizone_id == -1) & (all_trips.dropoff_latitude != 0)]
(len(dropoff_needs_zone) / len(all_trips)) * 100

71.42036363636363

In [7]:
# Percentage of trips that have a pickup latitude but only the -1 placeholder
# as pickup zone. `len()` on the dask frames counts rows directly, so neither
# the filtered rows nor the whole dataset has to be materialized with
# `.compute()` just to be counted.
pickup_needs_zone = all_trips.loc[(all_trips.pickup_taxizone_id == -1) & (all_trips.pickup_latitude != 0)]
(len(pickup_needs_zone) / len(all_trips)) * 100

71.40818181818182

In [8]:
# Baseline before zone assignment: dropoffs after 2016-01-01 with the -1 placeholder zone.
no_dropoff_zone = all_trips.dropoff_taxizone_id == -1
after_cutoff = all_trips.dropoff_datetime > '2016-01-01 00:00:00'
len(all_trips.loc[no_dropoff_zone & after_cutoff].compute())

100000

In [9]:
# Baseline before zone assignment: dropoffs in zone 161 after 2017-01-01.
in_zone = all_trips.dropoff_taxizone_id == 161
after_cutoff = all_trips.dropoff_datetime > '2017-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

11314

In [10]:
# Baseline before zone assignment: dropoffs in zone 161 after 2016-01-01.
in_zone = all_trips.dropoff_taxizone_id == 161
after_cutoff = all_trips.dropoff_datetime > '2016-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

11314

In [11]:
# Baseline before zone assignment: dropoffs in zone 183 after 2016-01-01.
in_zone = all_trips.dropoff_taxizone_id == 183
after_cutoff = all_trips.dropoff_datetime > '2016-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

11

In [12]:
# Baseline before zone assignment: dropoffs in zone 185 after 2017-01-01.
in_zone = all_trips.dropoff_taxizone_id == 185
after_cutoff = all_trips.dropoff_datetime > '2017-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

40

In [13]:
# Baseline before zone assignment: dropoffs in zone 185 after 2016-01-01.
in_zone = all_trips.dropoff_taxizone_id == 185
after_cutoff = all_trips.dropoff_datetime > '2016-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

40

In [14]:
# Baseline before zone assignment: dropoffs in zone 200 after 2016-01-01.
in_zone = all_trips.dropoff_taxizone_id == 200
after_cutoff = all_trips.dropoff_datetime > '2016-01-01 00:00:00'
len(all_trips.loc[in_zone & after_cutoff].compute())

83

In [15]:
# Batch boundaries for the point-in-polygon passes: 0, 31, 62, ... closed off
# with the final edge 263, so consecutive entries delimit slices of at most 31
# zones. (Presumably 31 is the per-call polygon limit of
# cuspatial.point_in_polygon and 263 the number of zones -- confirm.)
pip_iterations = [*np.arange(0, 263, 31), 263]

# Load the taxi zone polygons once, at module level. assign_taxi_zones below
# indexes the result positionally: [0] (sliced per batch) and [1] are handed
# to point_in_polygon as its two offset arguments -- presumably polygon and
# ring offsets -- and [2]['x'] / [2]['y'] as the polygon vertex coordinates.
taxi_zones = cuspatial.read_polygon_shapefile('zones/cu_taxi_zones.shp')

def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """
    Derives missing taxi zone ids from coordinates with cuspatial.

    Longitudes (`lon_var`) and latitudes (`lat_var`) of `df` are tested against
    the module-level `taxi_zones` polygons, one batch of polygons per
    `pip_iterations` interval, and the matching zone is written to `locid_var`.

    Missing coordinates are replaced by 0.0, and a coordinate of exactly 0.0 is
    treated as "no coordinate". Which is reasonable for a dataset of New York,
    but a bit edgy for a global dataset.

    Only rows whose `locid_var` equals the placeholder -1.0 AND whose longitude
    and latitude are both nonzero are updated; every other row keeps the id it
    already had. Points that fall in no polygon keep -1.0.

    Parameters
    ----------
    df : cudf.DataFrame
        DataFrame containing latitude, longitude and location-id columns
        (for a dask_cudf.DataFrame, apply via `map_partitions`).
    lon_var : string
        Name of column in `df` containing longitude values. Missing (null) or
        0.0 values are treated as invalid.
    lat_var : string
        Name of column in `df` containing latitude values. Missing (null) or
        0.0 values are treated as invalid.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows whose
        value is not -1.0 are not overwritten.

    Returns
    -------
    The `locid_var` column of a working copy, cast to float64. `df` itself is
    not modified.
    """
    # focus location columns (copy, so the caller's frame is left untouched)
    localdf = df[[lon_var, lat_var, locid_var]].copy()

    # fill missing lat/long values
    localdf[lon_var] = localdf[lon_var].fillna(value=0.0)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.0)

    # (bool column) is location id missing && do we have lat/long coordinates?
    localdf['replace_locid'] = ((localdf[locid_var] == -1.0)
                                & (localdf[lon_var] != 0.0)
                                & (localdf[lat_var] != 0.0)
                               )

    # nothing to replace: skip the point-in-polygon passes entirely
    if not localdf['replace_locid'].any():
        localdf[locid_var] = localdf[locid_var].astype('float64')
        return localdf[locid_var]

    # Eligibility is decided once, up front, and given a fresh positional index
    # so it can be combined with the per-polygon result columns.
    # (assumes point_in_polygon returns one row per input point, in input
    # order, with a default index -- TODO confirm)
    needs_zone = localdf['replace_locid'].reset_index(drop=True)

    # go through the zones one pip_iterations interval at a time
    for start, end in zip(pip_iterations[:-1], pip_iterations[1:]):
        # derive taxi zones from coordinates
        t_zones = cuspatial.point_in_polygon(localdf[lon_var],
                                             localdf[lat_var],
                                             taxi_zones[0][start:end],
                                             taxi_zones[1],
                                             taxi_zones[2]['x'],
                                             taxi_zones[2]['y'])
        # insert taxi zones into the location id column, but only for rows
        # that were flagged as needing one -- existing ids are preserved
        # NOTE(review): `j` is whatever label cuspatial gives the polygon's
        # result column. Zone id 0.0 shows up in this notebook's output, which
        # suggests a 0-based polygon position rather than the shapefile's
        # LocationID -- confirm, and offset if so.
        for j in t_zones.columns:
            localdf[locid_var].loc[t_zones[j] & needs_zone] = j

    return localdf[locid_var].astype('float64')

In [16]:
# derive & assign dropoff & pickup taxi zones (dropoff first, then pickup);
# both trip ends use the same call and differ only in the column prefix
for trip_end in ('dropoff', 'pickup'):
    zone_column = f'{trip_end}_taxizone_id'
    all_trips[zone_column] = all_trips.map_partitions(
        assign_taxi_zones,
        lon_var=f'{trip_end}_longitude',
        lat_var=f'{trip_end}_latitude',
        locid_var=zone_column,
        meta=(zone_column, np.float64),
    )

In [17]:
# all_trips = all_trips[sorted(all_trips.columns)]
# all_trips = all_trips.repartition(npartitions=1200)

In [18]:
def _sort_partition_by_pickup(partition):
    """Order the rows of a single partition by pickup time."""
    return partition.sort_values('pickup_datetime')


# sorting happens inside each partition only; partitions are not merged
all_trips = all_trips.map_partitions(_sort_partition_by_pickup, meta=all_trips)

In [19]:
%%time
# Execute the lazy pipeline built above (CSV reads, zone assignment,
# per-partition sort) and keep the concrete result as `df`; the bare `df`
# on the last line displays it.
df = all_trips.compute()
df

CPU times: user 23.8 s, sys: 7.57 s, total: 31.4 s
Wall time: 31.5 s


Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
29265,2010-01-01 00:22:59,40.696554,-73.928529,36.0,,0.5,17.3,,0.5,1,...,-73.969274,237.0,1,0,0.00,0.00,18.30,6.20,yellow,CMT
46774,2010-01-01 00:11:39,0.0,0.0,-1.0,,0.5,8.9,,0.5,1,...,0.000000,-1.0,1,0,2.50,0.00,12.40,2.80,yellow,CMT
16590,2010-01-01 00:07:06,40.733424,-74.00257,251.0,,0.5,4.5,,0.5,1,...,-73.995042,234.0,1,0,0.00,0.00,5.50,0.70,yellow,CMT
93244,2010-01-01 00:37:31,40.759157,-73.995959,248.0,,0.0,45.0,,0.5,1,...,-73.776814,136.0,1,0,0.00,4.57,50.07,18.30,yellow,CMT
42240,2010-01-01 00:35:00,40.761327,-73.980508,229.0,,0.5,14.9,,0.5,5,...,-73.976473,162.0,1,,0.00,0.00,15.90,1.99,yellow,VTS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30398,2019-02-01 00:01:39,0.0,0.0,48.0,,0.5,6.5,0.3,0.5,1,...,0.000000,100.0,1,N,2.34,0.00,10.14,0.99,yellow,2
3419,2019-02-01 00:02:52,0.0,0.0,237.0,,0.5,5.5,0.3,0.5,1,...,0.000000,140.0,1,N,1.35,0.00,8.15,0.50,yellow,1
96017,2019-02-01 00:11:38,0.0,0.0,164.0,,0.5,9.0,0.3,0.5,2,...,0.000000,68.0,1,N,0.00,0.00,10.30,0.80,yellow,1
87599,2019-02-01 00:08:20,0.0,0.0,68.0,,0.5,5.0,0.3,0.5,1,...,0.000000,100.0,1,N,1.89,0.00,8.19,0.68,yellow,2


In [20]:
# Trips whose dropoff zone is still the -1 placeholder after assignment.
dropoff_unzoned = df.dropoff_taxizone_id == -1
len(df.loc[dropoff_unzoned])

17940

In [21]:
# Dropoffs after 2017-01-01 that still have the -1 placeholder zone.
dropoff_unzoned = df.dropoff_taxizone_id == -1
after_cutoff = df.dropoff_datetime > '2017-01-01 00:00:00'
len(df.loc[dropoff_unzoned & after_cutoff])

0

In [22]:
# Dropoffs after 2016-01-01 that still have the -1 placeholder zone.
dropoff_unzoned = df.dropoff_taxizone_id == -1
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
len(df.loc[dropoff_unzoned & after_cutoff])

1833

In [23]:
# Dropoffs after 2016-01-01 that have a latitude but still no zone.
dropoff_unzoned = df.dropoff_taxizone_id == -1
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
has_latitude = df.dropoff_latitude != 0
len(df.loc[dropoff_unzoned & after_cutoff & has_latitude])

270

In [24]:
# Dropoffs that have a latitude but still no zone.
dropoff_unzoned = df.dropoff_taxizone_id == -1
has_latitude = df.dropoff_latitude != 0
len(df.loc[dropoff_unzoned & has_latitude])

3564

In [25]:
# Same count as a percentage of all trips.
dropoff_unzoned = df.dropoff_taxizone_id == -1
has_latitude = df.dropoff_latitude != 0
unresolved_share = len(df.loc[dropoff_unzoned & has_latitude]) / len(df)
unresolved_share * 100

0.32399999999999995

In [26]:
# Pickups that have a latitude but still no zone.
pickup_unzoned = df.pickup_taxizone_id == -1
has_latitude = df.pickup_latitude != 0
len(df.loc[pickup_unzoned & has_latitude])

2616

In [27]:
# Same count as a percentage of all trips.
pickup_unzoned = df.pickup_taxizone_id == -1
has_latitude = df.pickup_latitude != 0
unresolved_share = len(df.loc[pickup_unzoned & has_latitude]) / len(df)
unresolved_share * 100

0.23781818181818185

In [28]:
# Inspect the dropoffs after 2016-01-01 that have a latitude but no zone.
dropoff_unzoned = df.dropoff_taxizone_id == -1
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
has_latitude = df.dropoff_latitude != 0
df.loc[dropoff_unzoned & after_cutoff & has_latitude]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
56465,2016-01-01 02:42:03,41.041924,-74.144775,-1.0,,0.0,125.00,0.3,0.5,1,...,-73.982910,232.0,5,N,34.08,10.50,170.38,32.71,yellow,2
22586,2016-01-01 03:12:25,40.765263,-74.025215,-1.0,,0.0,60.00,0.3,0.0,1,...,-74.025215,-1.0,5,N,10.00,0.00,70.30,0.90,yellow,1
34282,2016-01-01 03:32:02,40.739738,-74.028519,-1.0,,0.0,64.00,0.3,0.0,1,...,-74.028526,-1.0,5,N,0.00,0.00,64.30,3.00,yellow,1
32009,2016-01-01 04:34:14,40.831398,-74.208260,-1.0,,0.0,100.00,0.3,0.0,2,...,-74.208252,-1.0,5,N,0.00,10.50,110.80,0.00,yellow,1
38315,2016-01-01 04:54:58,40.758411,-73.958092,-1.0,,0.5,17.50,0.3,0.5,1,...,-74.007683,83.0,1,N,0.00,0.00,18.80,5.90,yellow,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43990,2016-01-31 07:20:03,40.906124,-73.838425,-1.0,,0.0,56.50,0.3,0.5,2,...,-73.983582,47.0,1,N,0.00,0.00,57.30,20.10,yellow,1
64280,2016-01-31 07:42:54,40.631008,-73.719086,-1.0,,0.0,21.00,0.3,0.5,1,...,-73.781784,136.0,1,N,5.45,0.00,27.25,6.81,yellow,2
58588,2016-01-31 09:06:29,40.479477,-74.406914,-1.0,,0.0,232.74,0.3,0.0,1,...,-74.406914,-1.0,5,N,0.00,0.00,233.04,0.00,yellow,1
77308,2016-01-31 10:58:40,40.535858,-74.534012,-1.0,,0.0,300.00,0.3,0.0,1,...,-73.801193,215.0,5,N,0.00,21.58,321.88,55.48,yellow,2


In [29]:
# df.loc[df.dropoff_taxizone_id == -1].head()

In [30]:
# After assignment: dropoffs in zone 161 after 2017-01-01.
in_zone = df.dropoff_taxizone_id == 161
after_cutoff = df.dropoff_datetime > '2017-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

11314

In [31]:
# After assignment: dropoffs in zone 161 after 2016-01-01.
in_zone = df.dropoff_taxizone_id == 161
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

14256

In [32]:
# After assignment: dropoffs in zone 183 after 2016-01-01.
in_zone = df.dropoff_taxizone_id == 183
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

12

In [33]:
# After assignment: dropoffs in zone 185 after 2017-01-01.
in_zone = df.dropoff_taxizone_id == 185
after_cutoff = df.dropoff_datetime > '2017-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

40

In [34]:
# After assignment: dropoffs in zone 185 after 2016-01-01.
in_zone = df.dropoff_taxizone_id == 185
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

2661

In [35]:
# After assignment: dropoffs in zone 200 after 2016-01-01.
in_zone = df.dropoff_taxizone_id == 200
after_cutoff = df.dropoff_datetime > '2016-01-01 00:00:00'
len(df.loc[in_zone & after_cutoff])

101

In [36]:
# Pickup zone distribution for rows with latitude 0 picked up before 2016-06-01.
zero_latitude = df.pickup_latitude == 0
before_cutoff = df.pickup_datetime < '2016-06-01 00:00:00'
df.loc[zero_latitude & before_cutoff].pickup_taxizone_id.value_counts()

-1.0      14510
 193.0        1
 264.0        1
Name: pickup_taxizone_id, dtype: int32

In [37]:
# Pickup zone distribution for rows with latitude 0 picked up after 2016-06-01.
zero_latitude = df.pickup_latitude == 0
after_cutoff = df.pickup_datetime > '2016-06-01 00:00:00'
df.loc[zero_latitude & after_cutoff].pickup_taxizone_id.value_counts()

237.0    12506
236.0    11896
161.0    11796
230.0    10616
162.0    10431
         ...  
184.0        1
206.0        1
214.0        1
240.0        1
253.0        1
Name: pickup_taxizone_id, Length: 236, dtype: int32

In [38]:
# Dropoff zone distribution for rows with latitude 0 dropped off before 2016-06-01.
zero_latitude = df.dropoff_latitude == 0
before_cutoff = df.dropoff_datetime < '2016-06-01 00:00:00'
df.loc[zero_latitude & before_cutoff].dropoff_taxizone_id.value_counts()

-1.0      14376
 193.0        2
Name: dropoff_taxizone_id, dtype: int32

In [39]:
# Dropoff zone distribution for rows with latitude 0 dropped off after 2016-06-01.
zero_latitude = df.dropoff_latitude == 0
after_cutoff = df.dropoff_datetime > '2016-06-01 00:00:00'
df.loc[zero_latitude & after_cutoff].dropoff_taxizone_id.value_counts()

236.0    12505
161.0    11314
237.0    11081
170.0     9541
162.0     9204
         ...  
187.0        2
30.0         1
44.0         1
59.0         1
176.0        1
Name: dropoff_taxizone_id, Length: 256, dtype: int32

In [40]:
# Inspect rows with dropoff latitude 0 that were dropped off after 2016-06-01.
zero_latitude = df.dropoff_latitude == 0
after_cutoff = df.dropoff_datetime > '2016-06-01 00:00:00'
df.loc[zero_latitude & after_cutoff]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
63760,2017-01-01 00:04:57,0.0,0.0,43.0,,0.5,5.5,0.3,0.5,1,...,0.0,237.0,1,N,0.00,0.0,6.80,0.70,yellow,1
90030,2017-01-01 00:22:02,0.0,0.0,189.0,,0.5,19.0,0.3,0.5,3,...,0.0,88.0,1,N,0.00,0.0,20.30,5.10,yellow,1
86198,2017-01-01 00:21:06,0.0,0.0,238.0,,0.5,14.0,0.3,0.5,1,...,0.0,68.0,1,N,3.06,0.0,18.36,3.19,yellow,2
88151,2017-01-01 00:13:01,0.0,0.0,148.0,,0.5,9.0,0.3,0.5,3,...,0.0,234.0,1,N,0.00,0.0,10.30,1.70,yellow,1
27818,2017-01-01 00:14:26,0.0,0.0,114.0,,0.5,8.5,0.3,0.5,5,...,0.0,186.0,1,N,0.00,0.0,9.80,1.65,yellow,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30398,2019-02-01 00:01:39,0.0,0.0,48.0,,0.5,6.5,0.3,0.5,1,...,0.0,100.0,1,N,2.34,0.0,10.14,0.99,yellow,2
3419,2019-02-01 00:02:52,0.0,0.0,237.0,,0.5,5.5,0.3,0.5,1,...,0.0,140.0,1,N,1.35,0.0,8.15,0.50,yellow,1
96017,2019-02-01 00:11:38,0.0,0.0,164.0,,0.5,9.0,0.3,0.5,2,...,0.0,68.0,1,N,0.00,0.0,10.30,0.80,yellow,1
87599,2019-02-01 00:08:20,0.0,0.0,68.0,,0.5,5.0,0.3,0.5,1,...,0.0,100.0,1,N,1.89,0.0,8.19,0.68,yellow,2


In [41]:
# Inspect rows with dropoff latitude 0 that were dropped off before 2016-06-01.
zero_latitude = df.dropoff_latitude == 0
before_cutoff = df.dropoff_datetime < '2016-06-01 00:00:00'
df.loc[zero_latitude & before_cutoff]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
46774,2010-01-01 00:11:39,0.0,0.0,-1.0,,0.5,8.9,,0.5,1,...,0.0,-1.0,1,0,2.50,0.0,12.40,2.80,yellow,CMT
79628,2010-01-01 00:50:00,0.0,0.0,-1.0,,0.5,12.1,,0.5,3,...,0.0,-1.0,1,,0.00,0.0,13.10,3.23,yellow,VTS
68167,2010-01-01 00:50:00,0.0,0.0,-1.0,,0.5,8.9,,0.5,2,...,0.0,-1.0,1,,0.00,0.0,9.90,2.38,yellow,VTS
28611,2010-01-01 00:52:00,0.0,0.0,-1.0,,0.5,10.1,,0.5,3,...,0.0,-1.0,1,,0.00,0.0,11.10,2.79,yellow,VTS
48707,2010-01-01 01:13:00,0.0,0.0,-1.0,,0.5,9.3,,0.5,2,...,0.0,-1.0,1,,0.00,0.0,10.30,2.44,yellow,VTS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27201,2016-01-31 21:48:58,0.0,0.0,-1.0,,0.5,8.0,0.3,0.5,1,...,0.0,-1.0,1,N,1.85,0.0,11.15,1.80,yellow,1
48618,2016-01-31 22:58:35,0.0,0.0,-1.0,,0.5,8.0,0.3,0.5,1,...,0.0,-1.0,1,N,0.00,0.0,9.30,1.80,yellow,1
2661,2016-01-31 23:10:36,0.0,0.0,-1.0,,0.5,6.5,0.3,0.5,1,...,0.0,-1.0,1,N,1.56,0.0,9.36,1.00,yellow,1
8544,2003-01-01 00:15:05,0.0,0.0,193.0,,0.0,0.0,0.0,0.0,6,...,0.0,264.0,1,N,0.00,0.0,0.00,0.00,yellow,2


In [42]:
# Inspect every row whose dropoff latitude is 0.
zero_latitude = df.dropoff_latitude == 0
df.loc[zero_latitude]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
46774,2010-01-01 00:11:39,0.0,0.0,-1.0,,0.5,8.9,,0.5,1,...,0.0,-1.0,1,0,2.50,0.0,12.40,2.80,yellow,CMT
79628,2010-01-01 00:50:00,0.0,0.0,-1.0,,0.5,12.1,,0.5,3,...,0.0,-1.0,1,,0.00,0.0,13.10,3.23,yellow,VTS
68167,2010-01-01 00:50:00,0.0,0.0,-1.0,,0.5,8.9,,0.5,2,...,0.0,-1.0,1,,0.00,0.0,9.90,2.38,yellow,VTS
28611,2010-01-01 00:52:00,0.0,0.0,-1.0,,0.5,10.1,,0.5,3,...,0.0,-1.0,1,,0.00,0.0,11.10,2.79,yellow,VTS
48707,2010-01-01 01:13:00,0.0,0.0,-1.0,,0.5,9.3,,0.5,2,...,0.0,-1.0,1,,0.00,0.0,10.30,2.44,yellow,VTS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30398,2019-02-01 00:01:39,0.0,0.0,48.0,,0.5,6.5,0.3,0.5,1,...,0.0,100.0,1,N,2.34,0.0,10.14,0.99,yellow,2
3419,2019-02-01 00:02:52,0.0,0.0,237.0,,0.5,5.5,0.3,0.5,1,...,0.0,140.0,1,N,1.35,0.0,8.15,0.50,yellow,1
96017,2019-02-01 00:11:38,0.0,0.0,164.0,,0.5,9.0,0.3,0.5,2,...,0.0,68.0,1,N,0.00,0.0,10.30,0.80,yellow,1
87599,2019-02-01 00:08:20,0.0,0.0,68.0,,0.5,5.0,0.3,0.5,1,...,0.0,100.0,1,N,1.89,0.0,8.19,0.68,yellow,2


In [43]:
# Inspect rows with no dropoff zone (-1) whose pickup zone came out as 0.
dropoff_unzoned = df.dropoff_taxizone_id == -1
pickup_zone_zero = df.pickup_taxizone_id == 0
df.loc[dropoff_unzoned & pickup_zone_zero]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
9139,2010-01-15 15:58:56,40.696372,-74.184743,-1.0,,0.0,18.3,,0.0,1,...,-74.182008,0.0,1,0,0.0,0.0,18.3,0.5,yellow,CMT
20793,2015-01-16 14:39:17,0.0,0.0,-1.0,,0.0,95.0,0.3,0.0,1,...,-74.182724,0.0,5,N,19.0,0.0,114.3,0.0,yellow,2


In [44]:
# Inspect rows whose dropoff zone came out as 0 while the pickup has no zone (-1).
dropoff_zone_zero = df.dropoff_taxizone_id == 0
pickup_unzoned = df.pickup_taxizone_id == -1
df.loc[dropoff_zone_zero & pickup_unzoned]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
73002,2011-01-08 07:16:00,40.68992,-74.178097,0.0,,0.0,70.0,,0.0,1,...,-74.185978,-1.0,5,,10.0,8.0,88.0,1.33,yellow,VTS
33148,2013-01-20 13:45:00,40.706888,-74.1615,0.0,,0.0,19.5,,0.5,1,...,-74.159677,-1.0,1,,3.9,0.0,23.9,6.53,yellow,VTS
13979,2013-01-28 11:04:41,40.690467,-74.177546,0.0,,0.0,95.0,,0.0,2,...,-74.185459,-1.0,5,N,0.0,0.0,95.0,2.8,yellow,CMT
82253,2014-01-02 18:58:56,40.69114,-74.177186,0.0,,0.0,52.0,,0.5,1,...,-74.187939,-1.0,2,N,27.5,0.0,80.0,0.9,yellow,CMT
14102,2009-01-23 15:35:02,40.694968,-74.177255,0.0,,0.0,93.9,,-1.0,1,...,-73.137393,-1.0,-1,,18.75,17.8,130.45,37.3,yellow,DDS
75748,2015-01-13 07:45:48,40.68935,-74.18428,0.0,,0.0,0.01,0.3,0.0,1,...,-74.128632,-1.0,5,N,0.0,0.0,0.31,5.4,yellow,1
14216,2015-01-16 11:35:54,40.695068,-74.177147,0.0,,0.0,71.0,0.3,0.0,4,...,0.0,-1.0,3,N,0.0,22.0,93.3,18.7,yellow,1
1692,2015-01-23 14:31:58,40.695438,-74.177673,0.0,,0.0,87.0,0.3,0.5,1,...,0.0,-1.0,5,N,17.56,0.0,105.36,0.0,yellow,2
