# Reference Implementation (geopandas) I Am Trying to Replicate

In [1]:
def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Joins DataFrame with Taxi Zones shapefile.

    This function takes longitude values provided by `lon_var`, and latitude
    values provided by `lat_var` in DataFrame `df`, and performs a spatial join
    with the NYC taxi_zones shapefile.

    The shapefile path is hard coded, as this function makes a hard assumption
    of latitude and longitude coordinates. It also assumes latitude=0 and
    longitude=0 is not a datapoint that can exist in your dataset, which is
    reasonable for a dataset of New York, but bad for a global dataset.

    Only rows whose coordinates are both non-missing and non-zero, and whose
    `locid_var` value is null, are updated. All other rows keep the value they
    already had.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        DataFrame containing latitudes, longitudes, and location_id columns.
        It is not modified.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be np.nan.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows with
        non-null values are not overwritten.

    Returns
    -------
    Series named `locid_var`. When no row needs a zone, `df[locid_var]` is
    returned unchanged. If the spatial join raises ValueError, the error is
    reported and `df[locid_var]` cast to float64 is returned instead.
    """
    import traceback

    # Imported locally so this cell does not depend on a later cell having
    # imported numpy first.
    import numpy as np

    localdf = df[[lon_var, lat_var, locid_var]].copy()
    # Missing coordinates become 0.0, which is the "no coordinate" sentinel.
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    localdf['replace_locid'] = (localdf[locid_var].isnull()
                                & (localdf[lon_var] != 0.)
                                & (localdf[lat_var] != 0.))

    if not np.any(localdf['replace_locid']):
        # Nothing to fill in: skip the shapefile read and the join entirely.
        return df[locid_var]

    # The geospatial stack is only needed (and only imported) when at least
    # one row actually requires a spatial join.
    import geopandas
    from shapely.geometry import Point

    shape_df = geopandas.read_file('../shapefiles/taxi_zones.shp')
    shape_df = shape_df.drop(
        ['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"], axis=1)
    shape_df = shape_df.to_crs({'init': 'epsg:4326'})

    try:
        local_gdf = geopandas.GeoDataFrame(
            localdf, crs={'init': 'epsg:4326'},
            geometry=[Point(xy) for xy in
                      zip(localdf[lon_var], localdf[lat_var])])

        # NOTE(review): newer geopandas releases rename `op` to `predicate`;
        # confirm against the installed version before upgrading.
        local_gdf = geopandas.sjoin(
            local_gdf, shape_df, how='left', op='within')

        # one point can intersect more than one zone -- for example if on
        # the boundary between two zones. Deduplicate by taking first valid.
        local_gdf = local_gdf[~local_gdf.index.duplicated(keep='first')]

        # Keep the joined LocationID only for rows flagged for replacement;
        # every other row gets its original location id back.
        zone_ids = local_gdf['LocationID'].where(
            local_gdf['replace_locid'], local_gdf[locid_var])

        return zone_ids.rename(locid_var)
    except ValueError as ve:
        print(ve)
        traceback.print_exc()
        return df[locid_var].astype(np.float64)

# What I Have So Far

In [2]:
import os
import dask_cudf
import cuspatial
import numpy as np

# Column name -> dtype used when reading and normalising the taxi CSVs.
# The datetime columns are listed as 'str' here; the read_csv calls in this
# notebook additionally pass parse_dates for them.
dtype_list = dict(
    dropoff_datetime='str',
    dropoff_latitude='float64',
    dropoff_taxizone_id='float64',
    dropoff_longitude='float64',
    ehail_fee='float64',
    extra='float64',
    fare_amount='float64',
    improvement_surcharge='float64',
    junk1='str',
    junk2='str',
    mta_tax='float64',
    passenger_count='str',
    payment_type='str',
    pickup_datetime='str',
    pickup_latitude='float64',
    pickup_taxizone_id='float64',
    pickup_longitude='float64',
    rate_code_id='str',
    store_and_fwd_flag='str',
    tip_amount='float64',
    tolls_amount='float64',
    total_amount='float64',
    trip_distance='float64',
    trip_type='str',
    vendor_id='str',
)

# Paths to the raw-data directories (one per data source) and the output
# directory, all relative to the notebook's working directory.
relative_path = '../00_download_scripts/raw_data'
config = {
    'citibike_raw_data_path': relative_path + '/bike/',
    'taxi_raw_data_path': relative_path + '/taxi/',
    'uber_raw_data_path': relative_path + '/uber/',
    'subway_raw_data_path': relative_path + '/subway/',
    'parquet_output_path': 'data/',
}

def glob(x):
    '''
    Return the paths matching the pathname pattern `x`, sorted ascending.

    Thin wrapper around the standard library's `glob.glob` (non-recursive)
    that sorts the result so callers get a deterministic file order.

    The pattern may contain simple shell-style wildcards a la fnmatch.
    Unlike fnmatch, filenames starting with a dot are special cases that
    are not matched by '*' and '?' patterns.

    Returns an empty list when nothing matches.
    '''
    # Imported under an alias: this function deliberately shadows the module
    # name `glob` at notebook level.
    import glob as _stdlib_glob

    matches = _stdlib_glob.glob(x)
    matches.sort()
    return matches

In [3]:
# Polygon-index boundaries used to feed the zones to point_in_polygon in
# slices of 31: 0, 31, 62, ..., 248, plus a final boundary at 263.
# NOTE(review): the slice size of 31 is presumably a per-call polygon limit of
# cuspatial.point_in_polygon -- confirm against the installed version.
pip_iterations = [*np.arange(0, 263, 31), 263]

# Polygon data read from the shapefile; indexed below as [0] (polygon
# offsets), [1] (ring offsets) and [2] (point coordinates with 'x'/'y').
taxi_zones = cuspatial.read_polygon_shapefile('zones/cu_taxi_zones.shp')

def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Derive missing taxi-zone ids from coordinates via point-in-polygon tests.

    GPU counterpart of the geopandas implementation: it relies on the
    module-level `cuspatial` import, the `taxi_zones` polygon data and the
    `pip_iterations` slice boundaries defined in the notebook.

    A row is updated only when its `locid_var` value is the placeholder
    ``-1.0`` AND both coordinates are non-missing and non-zero. Rows that
    already carry a zone id are never overwritten, even if their coordinates
    fall inside a polygon. Rows that need a zone but fall inside no polygon
    keep ``-1.0``.

    Parameters
    ----------
    df : DataFrame
        One partition holding the longitude, latitude and zone-id columns.
        It is not modified.
    lon_var : string
        Name of the longitude column. Missing values are treated as 0.0.
    lat_var : string
        Name of the latitude column. Missing values are treated as 0.0.
    locid_var : string
        Name of the zone-id column; ``-1.0`` marks "unknown".

    Returns
    -------
    float64 Series named `locid_var`, carrying the same index as `df`.
    """
    original_index = df.index

    # Work on positionally indexed copies so the boolean masks below line up
    # row-for-row with the point_in_polygon result regardless of df's index.
    lon = df[lon_var].fillna(0.0).reset_index(drop=True)
    lat = df[lat_var].fillna(0.0).reset_index(drop=True)
    # Explicit copy: the ids are updated in place below and must never alias
    # the caller's column.
    zone_ids = df[locid_var].astype('float64').copy().reset_index(drop=True)

    # Zone id is the -1.0 placeholder and usable coordinates are present.
    needs_zone = (zone_ids == -1.0) & (lon != 0.0) & (lat != 0.0)

    # Skip the polygon tests when nothing needs a zone (the notebook's
    # timings show this shortcut is noticeably faster).
    if np.any(needs_zone):
        # Walk the polygons one slice at a time, as dictated by pip_iterations.
        for start, end in zip(pip_iterations[:-1], pip_iterations[1:]):
            hits = cuspatial.point_in_polygon(lon,
                                              lat,
                                              taxi_zones[0][start:end],
                                              taxi_zones[1],
                                              taxi_zones[2]['x'],
                                              taxi_zones[2]['y'])
            # NOTE(review): the id written is the column label returned by
            # point_in_polygon (0 - 262 per the original comment); confirm it
            # matches the shapefile's LocationID numbering.
            for zone in hits.columns:
                # Only rows flagged as needing a zone may be overwritten.
                zone_ids.loc[hits[zone] & needs_zone] = zone

    zone_ids.index = original_index
    return zone_ids

In [4]:
# # WITHOUT np.any(localdf['replace_locid'])

# CPU times: user 31.6 s, sys: 10.6 s, total: 42.2 s
# Wall time: 42.4 s
    
# CPU times: user 31.6 s, sys: 10.6 s, total: 42.1 s
# Wall time: 42.3 s
    
# # WITH np.any(localdf['replace_locid'])

# CPU times: user 24.2 s, sys: 7.65 s, total: 31.9 s
# Wall time: 32 s

# CPU times: user 20.6 s, sys: 7.81 s, total: 28.4 s
# Wall time: 28.6 s

# CPU times: user 20.7 s, sys: 7.77 s, total: 28.5 s
# Wall time: 28.6 s

In [5]:
def get_yellow():
    """Load all yellow-taxi CSVs into one dask_cudf DataFrame with a common schema.

    The raw files come in three column layouts, selected by file name:

    * 2009-2014: coordinates, no improvement surcharge;
    * 2015 through June 2016: coordinates plus improvement surcharge;
    * July 2016 onwards: taxi-zone ids instead of coordinates, plus two
      junk columns that are dropped.

    Columns a layout lacks are filled with placeholders (-1.0 for zone ids,
    0.0 for coordinates, NaN for `ehail_fee` / `improvement_surcharge`) so the
    three frames can be concatenated with identical, alphabetised columns.
    Every column named in `dtype_list` is then cast to its listed dtype and
    `trip_type` is set to 'yellow'.

    Relies on the notebook-level `config`, `dtype_list` and `glob` helper.

    Returns
    -------
    dask_cudf DataFrame containing the trips of all three layouts.
    """
    def find(*patterns):
        # Sorted matches of each file-name pattern, concatenated in the
        # order the patterns are given.
        paths = []
        for pattern in patterns:
            paths = paths + glob(os.path.join(config['taxi_raw_data_path'], pattern))
        return paths

    def read(paths, schema):
        # Every layout is read identically; only the files and the column
        # names differ.
        return dask_cudf.read_csv(paths,
                                  header=0,
                                  na_values=["NA"],
                                  parse_dates=[1, 2],
                                  infer_datetime_format=True,
                                  dtype=dtype_list,
                                  names=schema.split(','))

    # column names and file lists per layout (x < 2015, 2015 <= x <= 2016.5, 2016.5 < x)
    schema_early = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount"
    files_early = find('yellow_tripdata_201[0-4]*.csv',
                       'yellow_tripdata_2009*.csv')
    schema_mid = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount"
    files_mid = find('yellow_tripdata_2015*.csv',
                     'yellow_tripdata_2016-0[1-6].csv')
    schema_late = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2"
    files_late = find('yellow_tripdata_2016-0[7-9].csv',
                      'yellow_tripdata_2016-1[0-2].csv',
                      'yellow_tripdata_201[7-9]*.csv')

    # pre-2015 layout: zone ids, e-hail fee and surcharge are absent
    early = read(files_early, schema_early)
    early['dropoff_taxizone_id'] = -1.0
    early['pickup_taxizone_id'] = -1.0
    early['ehail_fee'] = np.nan
    early['improvement_surcharge'] = np.nan
    early['improvement_surcharge'] = early['improvement_surcharge'].astype('float32')
    early['trip_type'] = -1.0

    # January 2015 - June 2016 layout: zone ids and e-hail fee are absent
    mid = read(files_mid, schema_mid)
    mid['dropoff_taxizone_id'] = -1.0
    mid['pickup_taxizone_id'] = -1.0
    mid['ehail_fee'] = np.nan
    mid['trip_type'] = -1.0

    # post June 2016 layout: coordinates and e-hail fee are absent
    late = read(files_late, schema_late)
    late['dropoff_latitude'] = 0.0
    late['dropoff_longitude'] = 0.0
    late['pickup_latitude'] = 0.0
    late['pickup_longitude'] = 0.0
    late['ehail_fee'] = np.nan
    late['trip_type'] = -1.0
    late = late.drop(['junk1', 'junk2'], axis=1)

    # Stack the three layouts using the alphabetised columns of the first.
    ordered = sorted(early.columns)
    yellow = dask_cudf.concat([early[ordered], mid[ordered], late[ordered]])

    # Normalise dtypes for every column that has an entry in dtype_list.
    for field in list(yellow.columns):
        if field in dtype_list:
            yellow[field] = yellow[field].astype(dtype_list[field])

    yellow['trip_type'] = 'yellow'

    return yellow

In [6]:
# Build the schema-unified yellow-taxi frame and show its type.
all_trips = get_yellow()
type(all_trips)

dask_cudf.core.DataFrame

In [7]:
# %%time
# all_trips.compute()

In [8]:
# Derive & assign the taxi zones partition by partition, first for the
# dropoff end of each trip and then for the pickup end.
for endpoint in ('dropoff', 'pickup'):
    zone_col = f'{endpoint}_taxizone_id'
    all_trips[zone_col] = all_trips.map_partitions(
        assign_taxi_zones,
        lon_var=f'{endpoint}_longitude',
        lat_var=f'{endpoint}_latitude',
        locid_var=zone_col,
        meta=(zone_col, np.float64),
    )

In [9]:
# %%time
# all_trips.dropoff_taxizone_id.head()

In [10]:
# all_trips = all_trips[sorted(all_trips.columns)]
# all_trips = all_trips.repartition(npartitions=1200)

In [11]:
# cudf.Series(['NaN']).astype('float64').values[0]

In [12]:
# Sort the rows of each partition by pickup time. The sort runs separately
# inside every partition; the partitions themselves are not reordered.
all_trips = all_trips.map_partitions(
    lambda part: part.sort_values('pickup_datetime'),
    meta=all_trips,
)

In [13]:
%%time
# Run the whole pipeline, collect the result into a single frame, and display it.
df = all_trips.compute()
df

CPU times: user 23.9 s, sys: 7.75 s, total: 31.6 s
Wall time: 31.8 s


Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
29265,2010-01-01 00:22:59,40.696554,-73.928529,36.0,,0.5,17.3,,0.5,1,...,-73.969274,237.0,1,0,0.00,0.00,18.30,6.20,yellow,CMT
46774,2010-01-01 00:11:39,0.0,0.0,-1.0,,0.5,8.9,,0.5,1,...,0.000000,-1.0,1,0,2.50,0.00,12.40,2.80,yellow,CMT
16590,2010-01-01 00:07:06,40.733424,-74.00257,251.0,,0.5,4.5,,0.5,1,...,-73.995042,234.0,1,0,0.00,0.00,5.50,0.70,yellow,CMT
93244,2010-01-01 00:37:31,40.759157,-73.995959,248.0,,0.0,45.0,,0.5,1,...,-73.776814,136.0,1,0,0.00,4.57,50.07,18.30,yellow,CMT
42240,2010-01-01 00:35:00,40.761327,-73.980508,229.0,,0.5,14.9,,0.5,5,...,-73.976473,162.0,1,,0.00,0.00,15.90,1.99,yellow,VTS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30398,2019-02-01 00:01:39,0.0,0.0,48.0,,0.5,6.5,0.3,0.5,1,...,0.000000,100.0,1,N,2.34,0.00,10.14,0.99,yellow,2
3419,2019-02-01 00:02:52,0.0,0.0,237.0,,0.5,5.5,0.3,0.5,1,...,0.000000,140.0,1,N,1.35,0.00,8.15,0.50,yellow,1
96017,2019-02-01 00:11:38,0.0,0.0,164.0,,0.5,9.0,0.3,0.5,2,...,0.000000,68.0,1,N,0.00,0.00,10.30,0.80,yellow,1
87599,2019-02-01 00:08:20,0.0,0.0,68.0,,0.5,5.0,0.3,0.5,1,...,0.000000,100.0,1,N,1.89,0.00,8.19,0.68,yellow,2


In [14]:
# # WITHOUT
# # 264.0    317126
# # 237.0     29957
# # 160.0     28804
# # 236.0     27873
# # 229.0     27792
# #           ...  
# # 104.0         1
# # 105.0         1
# # 155.0         1
# # 207.0         1
# # 254.0         1
# # Name: pickup_taxizone_id, Length: 250, dtype: int32

# # WITH
# 1.0      738887
# 0.0      314548
# 251.0     18189
# 248.0     11152
# 262.0      9459
# 261.0      3499
# 257.0      2080
# 258.0      1412
# 260.0       560
# 249.0        89
# 252.0        44
# 253.0        39
# 255.0        19
# 250.0         8
# 256.0         8
# 259.0         6
# 254.0         1
# Name: pickup_taxizone_id, dtype: int32

In [15]:
# Row count per derived pickup zone id. The commented lines below are
# presumably the output of an earlier run, kept for comparison.
df.pickup_taxizone_id.value_counts()
# -1.0      800000
#  237.0     12506
#  236.0     11896
#  161.0     11796
#  230.0     10616
#            ...  
#  184.0         1
#  206.0         1
#  214.0         1
#  240.0         1
#  253.0         1
# Name: pickup_taxizone_id, Length: 237, dtype: int32

237.0    42463
236.0    39769
161.0    38110
234.0    36110
229.0    33589
         ...  
58.0         1
86.0         1
103.0        1
104.0        1
105.0        1
Name: pickup_taxizone_id, Length: 263, dtype: int32

In [16]:
# # WITHOUT 
# # 264.0    317940
# # 160.0     31278
# # 236.0     28284
# # 237.0     26682
# # 229.0     25947
# #           ...  
# # 188.0         3
# # 32.0          2
# # 105.0         2
# # 99.0          1
# # 203.0         1
# # Name: dropoff_taxizone_id, Length: 261, dtype: int32

# # WITH
# 1.0      735292
# 0.0      314414
# 251.0     15414
# 248.0     13979
# 262.0     10027
# 261.0      3536
# 258.0      2825
# 257.0      2481
# 260.0      1023
# 252.0       436
# 249.0       201
# 255.0       116
# 253.0        89
# 259.0        69
# 250.0        48
# 256.0        45
# 254.0         5
# Name: dropoff_taxizone_id, dtype: int32

In [17]:
# Row count per derived dropoff zone id. The commented lines below are
# presumably the output of an earlier run, kept for comparison.
df.dropoff_taxizone_id.value_counts()
# -1.0      800000
#  236.0     12505
#  161.0     11314
#  237.0     11081
#  170.0      9541
#            ...  
#  187.0         2
#  30.0          1
#  44.0          1
#  59.0          1
#  176.0         1
# Name: dropoff_taxizone_id, Length: 257, dtype: int32

236.0    40789
237.0    37763
161.0    35326
164.0    31822
234.0    31734
         ...  
109.0       13
58.0        10
199.0        8
105.0        2
99.0         1
Name: dropoff_taxizone_id, Length: 264, dtype: int32

In [18]:
# Trips dropped off after 2015-01-01 00:00:00 whose dropoff zone id equals 1.
df.loc[(df.dropoff_datetime > '2015-01-01 00:00:00') & (df.dropoff_taxizone_id == 1)]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
8229,2017-01-01 06:20:52,0.0,0.0,1.0,,0.0,63.5,0.3,0.0,2,...,0.0,48.0,3,N,0.00,16.50,80.30,17.30,yellow,1
33703,2017-01-01 13:09:12,0.0,0.0,1.0,,0.0,59.5,0.3,0.0,1,...,0.0,234.0,3,N,19.21,17.05,96.06,14.67,yellow,2
9176,2017-01-01 14:54:00,0.0,0.0,1.0,,0.0,71.5,0.3,0.0,1,...,0.0,43.0,3,N,0.00,12.50,84.30,18.80,yellow,1
83745,2017-01-01 15:19:54,0.0,0.0,1.0,,0.0,68.5,0.3,0.0,1,...,0.0,100.0,3,N,0.00,12.50,81.30,18.94,yellow,2
12581,2017-01-01 22:58:16,0.0,0.0,1.0,,0.5,46.5,0.3,0.5,2,...,0.0,48.0,1,N,0.00,10.50,58.30,16.60,yellow,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71500,2019-01-31 10:07:35,0.0,0.0,1.0,,0.0,70.0,0.3,0.0,1,...,0.0,163.0,3,N,0.00,24.00,94.30,18.00,yellow,1
95492,2019-01-31 14:36:32,0.0,0.0,1.0,,0.0,66.0,0.3,0.0,1,...,0.0,230.0,3,N,15.35,10.50,92.15,17.50,yellow,1
52045,2019-01-31 15:01:47,0.0,0.0,1.0,,0.0,72.5,0.3,0.0,1,...,0.0,230.0,3,N,26.04,14.00,112.84,19.44,yellow,2
45394,2019-01-31 15:33:58,0.0,0.0,1.0,,0.0,66.5,0.3,0.0,4,...,0.0,163.0,3,N,16.44,15.40,98.64,17.87,yellow,2


In [19]:
# Every trip whose dropoff zone id equals 1, regardless of date.
df.loc[df.dropoff_taxizone_id == 1]#.to_pandas().sample(25)

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
90635,2011-01-23 20:14:00,40.620568,-73.794635,1.0,,0.5,6.9,,0.5,1,...,-73.787460,136.0,1,,2.00,0.00,9.90,1.51,yellow,VTS
89956,2013-01-09 14:34:00,40.620177,-73.860112,1.0,,0.0,32.0,,0.5,1,...,-73.860112,1.0,1,,12.00,0.00,44.50,10.96,yellow,VTS
81880,2014-01-26 11:15:31,40.623990,-73.803349,1.0,,0.0,8.0,,0.5,1,...,-73.803349,1.0,1,N,1.70,0.00,10.20,1.90,yellow,CMT
68584,2009-01-12 00:27:54,40.632522,-73.837519,1.0,,0.0,22.2,,-1.0,1,...,-73.837519,1.0,-1,,3.33,0.00,25.53,9.50,yellow,CMT
73081,2009-01-21 12:52:14,40.620347,-73.855868,1.0,,0.0,45.0,,-1.0,2,...,-73.855868,1.0,-1,,0.00,4.15,49.15,17.30,yellow,CMT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71500,2019-01-31 10:07:35,0.000000,0.000000,1.0,,0.0,70.0,0.3,0.0,1,...,0.000000,163.0,3,N,0.00,24.00,94.30,18.00,yellow,1
95492,2019-01-31 14:36:32,0.000000,0.000000,1.0,,0.0,66.0,0.3,0.0,1,...,0.000000,230.0,3,N,15.35,10.50,92.15,17.50,yellow,1
52045,2019-01-31 15:01:47,0.000000,0.000000,1.0,,0.0,72.5,0.3,0.0,1,...,0.000000,230.0,3,N,26.04,14.00,112.84,19.44,yellow,2
45394,2019-01-31 15:33:58,0.000000,0.000000,1.0,,0.0,66.5,0.3,0.0,4,...,0.000000,163.0,3,N,16.44,15.40,98.64,17.87,yellow,2


In [20]:
# Three randomly sampled trips with dropoff zone id 251; the selection is
# converted to pandas before sampling.
df.loc[df.dropoff_taxizone_id == 251].to_pandas().sample(3)

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
41446,2013-01-04 21:09:00,40.734762,-74.002175,251.0,,0.5,7.5,,0.5,1,...,-73.991878,100.0,1,,2.0,0.0,10.5,1.67,yellow,VTS
70063,2011-01-14 22:07:14,40.730375,-74.004715,251.0,,0.5,21.3,,0.5,1,...,-73.950234,244.0,1,N,0.0,0.0,22.3,7.4,yellow,CMT
20495,2009-01-14 19:05:37,40.736088,-73.997786,251.0,,0.0,7.1,,-1.0,1,...,-73.990501,100.0,-1,,3.0,0.0,10.1,1.3,yellow,CMT


In [21]:
# Trips dropped off after 2019-01-01 00:00:00 whose dropoff zone id equals 1.
df.loc[(df.dropoff_datetime > '2019-01-01 00:00:00') & (df.dropoff_taxizone_id == 1)]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
86663,2019-01-01 09:31:09,0.0,0.0,1.0,,0.0,60.5,0.3,0.0,1,...,0.0,161.0,3,N,0.00,12.5,73.30,15.80,yellow,2
52386,2019-01-01 12:46:36,0.0,0.0,1.0,,0.0,70.5,0.3,0.0,4,...,0.0,233.0,3,N,0.00,16.5,87.30,19.08,yellow,2
36848,2019-01-01 14:53:42,0.0,0.0,1.0,,0.0,56.0,0.3,0.0,2,...,0.0,144.0,3,N,10.50,12.5,79.30,13.03,yellow,2
95742,2019-01-01 18:04:51,0.0,0.0,1.0,,0.0,57.0,0.3,0.0,1,...,0.0,231.0,3,N,13.96,12.5,83.76,13.70,yellow,1
16547,2019-01-01 18:20:44,0.0,0.0,1.0,,0.0,57.5,0.3,0.0,1,...,0.0,125.0,3,N,10.00,22.0,89.80,14.32,yellow,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71500,2019-01-31 10:07:35,0.0,0.0,1.0,,0.0,70.0,0.3,0.0,1,...,0.0,163.0,3,N,0.00,24.0,94.30,18.00,yellow,1
95492,2019-01-31 14:36:32,0.0,0.0,1.0,,0.0,66.0,0.3,0.0,1,...,0.0,230.0,3,N,15.35,10.5,92.15,17.50,yellow,1
52045,2019-01-31 15:01:47,0.0,0.0,1.0,,0.0,72.5,0.3,0.0,1,...,0.0,230.0,3,N,26.04,14.0,112.84,19.44,yellow,2
45394,2019-01-31 15:33:58,0.0,0.0,1.0,,0.0,66.5,0.3,0.0,4,...,0.0,163.0,3,N,16.44,15.4,98.64,17.87,yellow,2


In [22]:
# Trips dropped off after 2010-01-01 00:00:00 whose dropoff zone id equals 1.
df.loc[(df.dropoff_datetime > '2010-01-01 00:00:00') & (df.dropoff_taxizone_id == 1)]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
90635,2011-01-23 20:14:00,40.620568,-73.794635,1.0,,0.5,6.9,,0.5,1,...,-73.787460,136.0,1,,2.00,0.00,9.90,1.51,yellow,VTS
89956,2013-01-09 14:34:00,40.620177,-73.860112,1.0,,0.0,32.0,,0.5,1,...,-73.860112,1.0,1,,12.00,0.00,44.50,10.96,yellow,VTS
81880,2014-01-26 11:15:31,40.623990,-73.803349,1.0,,0.0,8.0,,0.5,1,...,-73.803349,1.0,1,N,1.70,0.00,10.20,1.90,yellow,CMT
8229,2017-01-01 06:20:52,0.000000,0.000000,1.0,,0.0,63.5,0.3,0.0,2,...,0.000000,48.0,3,N,0.00,16.50,80.30,17.30,yellow,1
33703,2017-01-01 13:09:12,0.000000,0.000000,1.0,,0.0,59.5,0.3,0.0,1,...,0.000000,234.0,3,N,19.21,17.05,96.06,14.67,yellow,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71500,2019-01-31 10:07:35,0.000000,0.000000,1.0,,0.0,70.0,0.3,0.0,1,...,0.000000,163.0,3,N,0.00,24.00,94.30,18.00,yellow,1
95492,2019-01-31 14:36:32,0.000000,0.000000,1.0,,0.0,66.0,0.3,0.0,1,...,0.000000,230.0,3,N,15.35,10.50,92.15,17.50,yellow,1
52045,2019-01-31 15:01:47,0.000000,0.000000,1.0,,0.0,72.5,0.3,0.0,1,...,0.000000,230.0,3,N,26.04,14.00,112.84,19.44,yellow,2
45394,2019-01-31 15:33:58,0.000000,0.000000,1.0,,0.0,66.5,0.3,0.0,4,...,0.000000,163.0,3,N,16.44,15.40,98.64,17.87,yellow,2
