# Trying to Replicate This

In [1]:
def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Joins DataFrame with Taxi Zones shapefile.

    Points built from `df[lon_var]` / `df[lat_var]` are spatially joined
    (point *within* polygon) against the NYC taxi_zones shapefile, and the
    matched `LocationID` is used for every row whose `df[locid_var]` is null.

    The shapefile path is hard coded. NaN coordinates are replaced by 0.0 and
    a coordinate equal to 0.0 is treated as "no location", which is reasonable
    for a dataset of New York but bad for a global dataset.

    `np` (numpy) is read from the enclosing namespace, so it must be imported
    before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        DataFrame containing latitudes, longitudes, and location_id columns.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be np.nan.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows with
        non-null values are not overwritten.

    Returns
    -------
    Series
        Location ids named `locid_var`. If no row qualifies for replacement
        `df[locid_var]` is returned unchanged. If the spatial join raises
        ValueError, the error and its traceback are printed and
        `df[locid_var]` cast to float64 is returned.
    """
    localdf = df[[lon_var, lat_var, locid_var]].copy()
    # NaN coordinates become 0.0 so that a single "!= 0" test covers both
    # missing and zeroed-out coordinates.
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    localdf['replace_locid'] = (localdf[locid_var].isnull()
                                & (localdf[lon_var] != 0.)
                                & (localdf[lat_var] != 0.))

    # Nothing to fill in: skip loading the shapefile and the geo stack.
    if not np.any(localdf['replace_locid']):
        return df[locid_var]

    # Deferred so the heavy geospatial dependencies are only required when a
    # join is actually performed.
    import geopandas
    from shapely.geometry import Point

    shape_df = geopandas.read_file('../shapefiles/taxi_zones.shp')
    shape_df.drop(['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"],
                  axis=1, inplace=True)
    # NOTE(review): the {'init': ...} CRS form and sjoin's `op=` keyword are
    # deprecated in recent geopandas/pyproj releases -- confirm against the
    # installed version before upgrading.
    shape_df = shape_df.to_crs({'init': 'epsg:4326'})

    try:
        local_gdf = geopandas.GeoDataFrame(
            localdf, crs={'init': 'epsg:4326'},
            geometry=[Point(xy) for xy in
                      zip(localdf[lon_var], localdf[lat_var])])

        local_gdf = geopandas.sjoin(
            local_gdf, shape_df, how='left', op='within')

        # one point can intersect more than one zone -- for example if on
        # the boundary between two zones. Deduplicate by taking first valid.
        local_gdf = local_gdf[~local_gdf.index.duplicated(keep='first')]

        # Rows that already had an id keep it; only flagged rows take the
        # id produced by the join.
        local_gdf.LocationID.values[~local_gdf.replace_locid] = (
            (local_gdf[locid_var])[~local_gdf.replace_locid]).values

        return local_gdf.LocationID.rename(locid_var)
    except ValueError as ve:
        # ValueError has no `stacktrace()` method; calling it raised
        # AttributeError inside the handler and skipped the fallback below.
        import traceback
        print(ve)
        traceback.print_exc()
        return df[locid_var].astype(np.float64)

# What I Have So Far

In [2]:
import os
import dask_cudf
import cuspatial
import numpy as np

In [3]:
# Target dtype for every column that can appear in the unified trips frame.
# Columns typed 'str' would be `object` in pandas; the two *_datetime columns
# are additionally listed in `parse_dates` by the readers.
dtype_list = dict(
    dropoff_datetime='str',
    dropoff_latitude='float64',
    dropoff_taxizone_id='float64',
    dropoff_longitude='float64',
    ehail_fee='float64',
    extra='float64',
    fare_amount='float64',
    improvement_surcharge='float64',
    junk1='str',
    junk2='str',
    mta_tax='float64',
    passenger_count='str',
    payment_type='str',
    pickup_datetime='str',
    pickup_latitude='float64',
    pickup_taxizone_id='float64',
    pickup_longitude='float64',
    rate_code_id='str',
    store_and_fwd_flag='str',
    tip_amount='float64',
    tolls_amount='float64',
    total_amount='float64',
    trip_distance='float64',
    trip_type='str',
    vendor_id='str',
)

# Locations of the raw downloads (one sub-directory per source) and of the
# parquet output, all relative to the notebook.
relative_path = '../00_download_scripts/raw_data'
config = {
    f'{source}_raw_data_path': f'{relative_path}/{folder}/'
    for source, folder in (('citibike', 'bike'),
                           ('taxi', 'taxi'),
                           ('uber', 'uber'),
                           ('subway', 'subway'))
}
config['parquet_output_path'] = 'data/'

def glob(x):
    '''
    Return the paths matching pathname pattern `x`, sorted ascending.

    Thin wrapper around the standard library's `glob.glob(x)` (non-recursive
    call) whose only addition is sorting the result.

    The pattern may contain simple shell-style wildcards a la fnmatch.
    Unlike fnmatch, filenames starting with a dot are special cases that
    are not matched by '*' and '?' patterns.
    '''
    import glob as _stdlib_glob

    matches = _stdlib_glob.glob(x)
    matches.sort()
    return matches

In [4]:
def get_green():
    """Build one lazy dask_cudf frame holding every green-cab trip file.

    The raw CSVs come in four layouts (pre-2015, 2015 H1, 2015 H2 - 2016 H1,
    2016 H2 onwards). Each layout is read with its own column list, padded
    with placeholder columns so all four share the same columns, and the
    pieces are appended in chronological order with alphabetised columns.

    Placeholders: -1.0 for taxi-zone ids (and for `improvement_surcharge` in
    the oldest layout) that the layout lacks, 0.0 for coordinates that the
    newest layout lacks. Finally each column listed in `dtype_list` is cast
    to its target dtype and `trip_type` is overwritten with 'green'.

    Returns
    -------
    dask_cudf DataFrame
    """
    taxi_dir = config['taxi_raw_data_path']

    def _files(*patterns):
        # Sorted matches of each pattern, concatenated in the order given.
        matched = []
        for pattern in patterns:
            matched.extend(glob(os.path.join(taxi_dir, pattern)))
        return matched

    def _read(paths, schema):
        # Same reader options for every layout; only files and names differ.
        return dask_cudf.read_csv(paths,
                                  header=0,
                                  na_values=["NA"],
                                  parse_dates=[1, 2],
                                  infer_datetime_format=True,
                                  dtype=dtype_list,
                                  names=schema.split(','))

    schema_pre_2015 = "vendor_id,pickup_datetime,dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,total_amount,payment_type,trip_type,junk1,junk2"
    schema_2015_h1 = "vendor_id,pickup_datetime,dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,junk1,junk2"
    schema_2015_h2_2016_h1 = "vendor_id,pickup_datetime,dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type"
    schema_2016_h2_plus = "vendor_id,pickup_datetime,dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_taxizone_id,dropoff_taxizone_id,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,junk1,junk2"

    files_pre_2015 = _files('green_tripdata_201[34]*.csv')
    files_2015_h1 = _files('green_tripdata_2015-0[1-6].csv')
    files_2015_h2_2016_h1 = _files('green_tripdata_2015-0[7-9].csv',
                                   'green_tripdata_2015-1[0-2].csv',
                                   'green_tripdata_2016-0[1-6].csv')
    files_2016_h2_plus = _files('green_tripdata_2016-0[7-9].csv',
                                'green_tripdata_2016-1[0-2].csv',
                                'green_tripdata_201[7-9]*.csv')

    zone_id_columns = ('dropoff_taxizone_id', 'pickup_taxizone_id')
    junk_columns = ['junk1', 'junk2']

    # pre-2015: no zone ids, no improvement surcharge
    era_pre_2015 = _read(files_pre_2015, schema_pre_2015)
    for placeholder in zone_id_columns + ('improvement_surcharge',):
        era_pre_2015[placeholder] = -1.0
    era_pre_2015 = era_pre_2015.drop(junk_columns, axis=1)

    # 2015 H1: surcharge present, still no zone ids
    era_2015_h1 = _read(files_2015_h1, schema_2015_h1)
    for placeholder in zone_id_columns:
        era_2015_h1[placeholder] = -1.0
    era_2015_h1 = era_2015_h1.drop(junk_columns, axis=1)

    # 2015 H2 - 2016 H1: same as above but without the two junk columns
    era_2015_h2 = _read(files_2015_h2_2016_h1, schema_2015_h2_2016_h1)
    for placeholder in zone_id_columns:
        era_2015_h2[placeholder] = -1.0

    # 2016 H2 onwards: zone ids instead of coordinates
    era_2016_h2 = _read(files_2016_h2_plus, schema_2016_h2_plus)
    for coordinate in ('dropoff_latitude', 'dropoff_longitude',
                       'pickup_latitude', 'pickup_longitude'):
        era_2016_h2[coordinate] = 0.0
    era_2016_h2 = era_2016_h2.drop(junk_columns, axis=1)

    # Stack the eras using the alphabetised column order of the first one.
    column_order = sorted(era_pre_2015.columns)
    green = era_pre_2015[column_order]
    for later_era in (era_2015_h1, era_2015_h2, era_2016_h2):
        green = green.append(later_era[column_order])

    for field in [name for name in green.columns if name in dtype_list]:
        green[field] = green[field].astype(dtype_list[field])

    # The per-trip trip_type read from the CSV is replaced by the source tag.
    green['trip_type'] = 'green'

    return green

In [5]:
def get_yellow():
    """Build one lazy dask_cudf frame holding every yellow-cab trip file.

    The raw CSVs come in three layouts (before 2015, 2015 - 2016 H1, and
    2016 H2 onwards). Each layout is read with its own column list, padded
    with placeholder columns so all three share the same columns, and the
    pieces are appended with alphabetised columns.

    Placeholders: -1.0 for missing taxi-zone ids and for `trip_type`, NaN
    for `ehail_fee` (and for `improvement_surcharge` before 2015), 0.0 for
    coordinates missing from the newest layout. Finally each column listed
    in `dtype_list` is cast to its target dtype and `trip_type` is set to
    'yellow'.

    Returns
    -------
    dask_cudf DataFrame
    """
    taxi_dir = config['taxi_raw_data_path']

    def _files(*patterns):
        # Sorted matches of each pattern, concatenated in the order given.
        matched = []
        for pattern in patterns:
            matched.extend(glob(os.path.join(taxi_dir, pattern)))
        return matched

    def _read(paths, schema):
        # Same reader options for every layout; only files and names differ.
        return dask_cudf.read_csv(paths,
                                  header=0,
                                  na_values=["NA"],
                                  parse_dates=[1, 2],
                                  infer_datetime_format=True,
                                  dtype=dtype_list,
                                  names=schema.split(','))

    schema_pre_2015 = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount"
    schema_2015_2016_h1 = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount"
    schema_2016_h2_plus = "vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,junk1,junk2"

    # The 2010-2014 files are listed before the 2009 ones.
    files_pre_2015 = _files('yellow_tripdata_201[0-4]*.csv',
                            'yellow_tripdata_2009*.csv')
    files_2015_2016_h1 = _files('yellow_tripdata_2015*.csv',
                                'yellow_tripdata_2016-0[1-6].csv')
    files_2016_h2_plus = _files('yellow_tripdata_2016-0[7-9].csv',
                                'yellow_tripdata_2016-1[0-2].csv',
                                'yellow_tripdata_201[7-9]*.csv')

    zone_id_columns = ('dropoff_taxizone_id', 'pickup_taxizone_id')

    # before 2015: no zone ids, no surcharge
    era_pre_2015 = _read(files_pre_2015, schema_pre_2015)
    for placeholder in zone_id_columns:
        era_pre_2015[placeholder] = -1.0
    era_pre_2015['ehail_fee'] = np.nan
    era_pre_2015['improvement_surcharge'] = np.nan
    era_pre_2015['improvement_surcharge'] = (
        era_pre_2015['improvement_surcharge'].astype('float32'))
    era_pre_2015['trip_type'] = -1.0

    # January 2015 - June 2016: surcharge present, still no zone ids
    era_2015 = _read(files_2015_2016_h1, schema_2015_2016_h1)
    for placeholder in zone_id_columns:
        era_2015[placeholder] = -1.0
    era_2015['ehail_fee'] = np.nan
    era_2015['trip_type'] = -1.0

    # after June 2016: zone ids instead of coordinates
    era_2016_h2 = _read(files_2016_h2_plus, schema_2016_h2_plus)
    for coordinate in ('dropoff_latitude', 'dropoff_longitude',
                       'pickup_latitude', 'pickup_longitude'):
        era_2016_h2[coordinate] = 0.0
    era_2016_h2['ehail_fee'] = np.nan
    era_2016_h2['trip_type'] = -1.0
    era_2016_h2 = era_2016_h2.drop(['junk1', 'junk2'], axis=1)

    # Stack the eras using the alphabetised column order of the first one.
    column_order = sorted(era_pre_2015.columns)
    yellow = era_pre_2015[column_order]
    for later_era in (era_2015, era_2016_h2):
        yellow = yellow.append(later_era[column_order])

    for field in [name for name in yellow.columns if name in dtype_list]:
        yellow[field] = yellow[field].astype(dtype_list[field])

    yellow['trip_type'] = 'yellow'

    return yellow

In [6]:
def get_uber():
    """Build one lazy dask_cudf frame holding the 2014 and 2015 Uber files.

    The 2014 files carry pickup coordinates but no zone id (filled with the
    -1.0 placeholder); the 2015 files carry a pickup zone id but no
    coordinates (filled with 0.0). After concatenation, every column named
    in `dtype_list` that Uber lacks is created as NaN, all `dtype_list`
    columns are cast to their target dtype, the junk columns are removed,
    `trip_type` is set to 'uber' and the columns are alphabetised.

    Returns
    -------
    dask_cudf DataFrame
    """
    uber_dir = config['uber_raw_data_path']

    # 2014: coordinates only
    names_2014 = "pickup_datetime,pickup_latitude,pickup_longitude,junk1".split(',')
    files_2014 = glob(os.path.join(uber_dir, 'uber*-???14.csv'))
    trips_2014 = (
        dask_cudf.read_csv(files_2014,
                           header=0,
                           na_values=["NA"],
                           parse_dates=[0,],
                           infer_datetime_format=True,
                           dtype=dtype_list,
                           names=names_2014)
        .drop(['junk1',], axis=1)
        .assign(pickup_taxizone_id=-1.0)
    )

    # 2015: zone id only
    names_2015 = "junk1,pickup_datetime,junk2,pickup_taxizone_id".split(',')
    files_2015 = glob(os.path.join(uber_dir, 'uber*15.csv'))
    trips_2015 = (
        dask_cudf.read_csv(files_2015,
                           header=0,
                           na_values=["NA"],
                           parse_dates=[1,],
                           infer_datetime_format=True,
                           dtype=dtype_list,
                           names=names_2015)
        .drop(['junk1', 'junk2'], axis=1)
        .assign(pickup_latitude=0.0, pickup_longitude=0.0)
    )

    shared_order = sorted(trips_2014.columns)
    uberdf = dask_cudf.concat([trips_2014[shared_order],
                               trips_2015[shared_order]])

    # Bring the frame up to the full taxi schema.
    for field, target_dtype in dtype_list.items():
        if field not in uberdf.columns:
            if field == 'pickup_datetime':
                continue
            # column unknown to Uber: create it empty before casting
            uberdf[field] = np.nan
        uberdf[field] = uberdf[field].astype(target_dtype)

    # junk1/junk2 were (re)created by the loop above because they are keys
    # of dtype_list; remove them again.
    uberdf = uberdf.drop(['junk1', 'junk2'], axis=1)

    uberdf['trip_type'] = 'uber'

    return uberdf[sorted(uberdf.columns)]


# %%time
# get_uber().compute()

In [7]:
# Build the three lazy per-source frames (green, then yellow, then uber).
green, yellow, uber = get_green(), get_yellow(), get_uber()

In [8]:
# Stack the sources in the order uber, green, yellow.
uber_and_green = uber.append(green)
all_trips = uber_and_green.append(yellow)
type(all_trips)

dask_cudf.core.DataFrame

In [9]:
%%time
# Materialise the lazy frame once, before any zone derivation, to get
# baseline counts of missing zone ids.
df = all_trips.compute()

CPU times: user 2.99 s, sys: 487 ms, total: 3.48 s
Wall time: 3.55 s


In [10]:
# Percentage of trips whose pickup zone is still the -1 placeholder, and the
# part of those that nevertheless have non-zero pickup coordinates.
no_pickup_zone = df.pickup_taxizone_id == -1
has_pickup_coords = (df.pickup_latitude != 0) & (df.pickup_longitude != 0)

missing_pickup_taxizone_ids = (len(df.loc[no_pickup_zone])/len(df))*100
unexpected_missing_pickup_taxizone_ids = (len(df.loc[no_pickup_zone & has_pickup_coords])/len(df))*100

print(f'MISSING: {str(missing_pickup_taxizone_ids)[:7]}%\nIDK WHY: {str(unexpected_missing_pickup_taxizone_ids)[:7]}%')

MISSING: 70.9256%
IDK WHY: 70.2762%


In [11]:
# Frequency of each pickup zone id (-1.0 is the "unknown" placeholder).
df['pickup_taxizone_id'].value_counts()

-1.0      1707623
 74.0       20472
 75.0       20120
 41.0       18125
 7.0        15255
           ...   
 187.0          3
 245.0          2
 30.0           1
 172.0          1
 176.0          1
Name: pickup_taxizone_id, Length: 253, dtype: int32

In [12]:
# Frequency of each dropoff zone id (-1.0 is the "unknown" placeholder).
df['dropoff_taxizone_id'].value_counts()

-1.0      1107623
 NaN       700000
 236.0      17076
 237.0      12967
 74.0       12588
           ...   
 204.0          5
 44.0           4
 30.0           3
 2.0            2
 176.0          2
Name: dropoff_taxizone_id, Length: 261, dtype: int32

In [13]:
# Drop the baseline frame; `df` is recomputed after the zones are assigned.
del df

In [14]:
# all_trips.compute().tail()

In [15]:
# %%time
# all_trips.pickup_datetime.compute()

In [16]:
# (len(all_trips.loc[(all_trips.dropoff_taxizone_id == -1) & (all_trips.dropoff_latitude != 0)].compute())/len(all_trips.compute()))*100

In [17]:
# (len(all_trips.loc[(all_trips.pickup_taxizone_id == -1) & (all_trips.pickup_latitude != 0)].compute())/len(all_trips.compute()))*100

In [18]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == -1) & (all_trips.dropoff_datetime > '2016-01-01 00:00:00')].compute())

In [19]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 161) & (all_trips.dropoff_datetime > '2017-01-01 00:00:00')].compute())

In [20]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 161) & (all_trips.dropoff_datetime > '2016-01-01 00:00:00')].compute())

In [21]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 183) & (all_trips.dropoff_datetime > '2016-01-01 00:00:00')].compute())

In [22]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 185) & (all_trips.dropoff_datetime > '2017-01-01 00:00:00')].compute())

In [23]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 185) & (all_trips.dropoff_datetime > '2016-01-01 00:00:00')].compute())

In [24]:
# len(all_trips.loc[(all_trips.dropoff_taxizone_id == 200) & (all_trips.dropoff_datetime > '2016-01-01 00:00:00')].compute())

In [25]:
# Slice boundaries used to walk the zone polygons 31 at a time:
# 0, 31, ..., 248 and a final boundary of 263.
# NOTE(review): 263 is presumably the polygon count of the shapefile below,
# and 31 the per-call polygon limit of cuspatial.point_in_polygon -- confirm.
pip_iterations = [*np.arange(0, 263, 31), 263]

# Polygon data read once at module level; assign_taxi_zones indexes it as
# taxi_zones[0], taxi_zones[1] and taxi_zones[2]['x'/'y'].
taxi_zones = cuspatial.read_polygon_shapefile('zones/cu_taxi_zones.shp')

def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """
    Derives Taxi Zones from shapefile.

    This function takes longitude values provided by `lon_var`, and latitude
    values provided by `lat_var` in DataFrame `df`, and runs a
    point-in-polygon test against the module-level `taxi_zones` polygons,
    `pip_iterations` polygons at a time.

    The polygons are hard coded in, as this function makes a hard assumption
    of latitude and longitude coordinates. It also assumes latitude=0.0 and
    longitude=0.0 is not a datapoint that can exist in your dataset. Which is
    reasonable for a dataset of New York, but a bit edgy for a global dataset.

    Only rows whose `locid_var` is the -1.0 placeholder, whose coordinates
    are both non-zero, and whose point falls inside a polygon are updated.
    The id written is the column label of the point_in_polygon result.
    NOTE(review): that label is presumably the polygon's position in the
    shapefile rather than its official LocationID -- confirm.

    Parameters
    ----------
    df : cudf.DataFrame or dask_cudf.DataFrame
        DataFrame containing latitudes, longitudes, and location_id columns.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be NaN or 0.0.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be NaN or 0.0.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Unknown
        ids should be -1.0; rows with any other value are not overwritten.

    Returns
    -------
    Series
        The `locid_var` column cast to float64, with derived ids filled in.
    """
    # focus location columns
    localdf = df[[lon_var, lat_var, locid_var]].copy()

    # fill missing lat/long values
    localdf[lon_var] = localdf[lon_var].fillna(value=0.0)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.0)

    # is the location id missing && do we have lat/long coordinates?
    needs_zone = ((localdf[locid_var] == -1.0)
                  & (localdf[lon_var] != 0.0)
                  & (localdf[lat_var] != 0.0))

    # Hold on to the Series that is mutated and returned, so the update does
    # not depend on `localdf[locid_var]` being a view of the frame.
    zone_ids = localdf[locid_var]

    # Skip the point-in-polygon work entirely when no row qualifies.
    if np.any(needs_zone):
        # NOTE(review): the point_in_polygon result is assumed to carry a
        # default 0..n-1 index, so the row filter is made positional to
        # combine with it -- confirm.
        needs_zone = needs_zone.reset_index(drop=True)

        # go through the zones one batch at a time
        for start, end in zip(pip_iterations[:-1], pip_iterations[1:]):
            # one boolean column per polygon in the batch
            t_zones = cuspatial.point_in_polygon(localdf[lon_var],
                                                 localdf[lat_var],
                                                 taxi_zones[0][start:end],
                                                 taxi_zones[1],
                                                 taxi_zones[2]['x'],
                                                 taxi_zones[2]['y'])
            for j in t_zones.columns:
                # Previously every point inside the polygon was overwritten,
                # including rows that already had a valid id.
                zone_ids.loc[t_zones[j] & needs_zone] = j

    return zone_ids.astype('float64')

In [26]:
# all_trips = get_uber()

# Derive and assign taxi zones per partition, first for the dropoff end of
# each trip and then for the pickup end.
for trip_end in ('dropoff', 'pickup'):
    zone_column = f'{trip_end}_taxizone_id'
    all_trips[zone_column] = all_trips.map_partitions(
        assign_taxi_zones,
        lon_var=f'{trip_end}_longitude',
        lat_var=f'{trip_end}_latitude',
        locid_var=zone_column,
        meta=(zone_column, np.float64))

In [27]:
# all_trips = all_trips[sorted(all_trips.columns)]
# all_trips = all_trips.repartition(npartitions=1200)

In [28]:
def _sort_partition_by_pickup(partition):
    """Order the rows of a single partition by `pickup_datetime`."""
    return partition.sort_values('pickup_datetime')


# Sorting is per partition only; partitions themselves are not reordered.
all_trips = all_trips.map_partitions(_sort_partition_by_pickup,
                                     meta=all_trips)

In [29]:
%%time
# Materialise the frame again; this run includes the zone derivation and the
# per-partition sort that were attached to `all_trips` above.
df = all_trips.compute()
df

CPU times: user 42.7 s, sys: 14.1 s, total: 56.8 s
Wall time: 56.9 s


Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
68772,,,,,,,,,,,...,-73.9952,147.0,,,,,,,uber,
85237,,,,,,,,,,,...,-73.9874,100.0,,,,,,,uber,
83557,,,,,,,,,,,...,-74.0041,230.0,,,,,,,uber,
96581,,,,,,,,,,,...,-73.9873,100.0,,,,,,,uber,
49440,,,,,,,,,,,...,-73.9922,151.0,,,,,,,uber,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30398,2019-02-01 00:01:39,0.0,0.0,48.0,,0.5,6.5,0.3,0.5,1,...,0.0000,100.0,1,N,2.34,0.0,10.14,0.99,yellow,2
3419,2019-02-01 00:02:52,0.0,0.0,237.0,,0.5,5.5,0.3,0.5,1,...,0.0000,140.0,1,N,1.35,0.0,8.15,0.50,yellow,1
96017,2019-02-01 00:11:38,0.0,0.0,164.0,,0.5,9.0,0.3,0.5,2,...,0.0000,68.0,1,N,0.00,0.0,10.30,0.80,yellow,1
87599,2019-02-01 00:08:20,0.0,0.0,68.0,,0.5,5.0,0.3,0.5,1,...,0.0000,100.0,1,N,1.89,0.0,8.19,0.68,yellow,2


In [30]:
# Percentage of trips whose pickup zone is still the -1 placeholder after the
# zone derivation, and the part of those with non-zero pickup coordinates.
no_pickup_zone = df.pickup_taxizone_id == -1
has_pickup_coords = (df.pickup_latitude != 0) & (df.pickup_longitude != 0)

missing_pickup_taxizone_ids = (len(df.loc[no_pickup_zone])/len(df))*100
unexpected_missing_pickup_taxizone_ids = (len(df.loc[no_pickup_zone & has_pickup_coords])/len(df))*100

print(f'MISSING: {str(missing_pickup_taxizone_ids)[:7]}%\nIDK WHY: {str(unexpected_missing_pickup_taxizone_ids)[:7]}%')

MISSING: 1.25866%
IDK WHY: 0.60919%


In [31]:
# Frequency of each pickup zone id after the zone derivation.
df['pickup_taxizone_id'].value_counts()

234.0    58579
237.0    58203
161.0    55542
160.0    53019
236.0    52423
         ...  
105.0        3
99.0         1
103.0        1
104.0        1
109.0        1
Name: pickup_taxizone_id, Length: 266, dtype: int32

In [32]:
# Frequency of each dropoff zone id after the zone derivation.
df['dropoff_taxizone_id'].value_counts()

NaN      700000
236.0     49084
237.0     41530
161.0     37885
160.0     33646
          ...  
172.0        19
109.0        18
199.0         8
105.0         2
99.0          1
Name: dropoff_taxizone_id, Length: 265, dtype: int32

In [33]:
# Number of trips whose pickup zone is still the -1 placeholder.
still_unzoned = df.pickup_taxizone_id == -1
len(df.loc[still_unzoned])

30304

In [34]:
# Total number of trips; the denominator of the percentages in this notebook.
len(df)

2407623

In [35]:
# Share (%) of all trips that lack a pickup zone and were picked up after the
# cut-off. NOTE(review): the loaders cast pickup_datetime to 'str' (see
# dtype_list), so this is presumably a lexicographic string comparison rather
# than a chronological one -- confirm.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2019-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.491522136148392

In [36]:
# Share (%) of all trips lacking a pickup zone, pickups after 2018-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2018-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.491522136148392

In [37]:
# Same as the 2018 cut-off, restricted to rows with non-zero coordinates.
unzoned_with_coords = ((df.pickup_taxizone_id == -1)
                       & (df.pickup_latitude != 0)
                       & (df.pickup_longitude != 0)
                       & (df.pickup_datetime > '2018-01-01 00:00:00'))
(len(df.loc[unzoned_with_coords])/len(df))*100

0.491522136148392

In [38]:
# Share (%) of all trips lacking a pickup zone, pickups after 2017-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2017-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.491522136148392

In [39]:
# Share (%) of all trips lacking a pickup zone, pickups after 2016-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2016-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.5736363209688561

In [40]:
# Share (%) of all trips lacking a pickup zone, pickups after 2015-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2015-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.6694154358884261

In [41]:
# Same as the 2015 cut-off, restricted to rows with non-zero coordinates.
unzoned_with_coords = ((df.pickup_taxizone_id == -1)
                       & (df.pickup_latitude != 0)
                       & (df.pickup_longitude != 0)
                       & (df.pickup_datetime > '2015-01-01 00:00:00'))
(len(df.loc[unzoned_with_coords])/len(df))*100

0.5056439484088664

In [42]:
# Share (%) of all trips lacking a pickup zone, pickups after 2014-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2014-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.7432226723203758

In [43]:
# Share (%) of all trips lacking a pickup zone, pickups after 2013-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2013-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.8475994788220581

In [44]:
# Same as the 2013 cut-off, restricted to rows with non-zero coordinates.
unzoned_with_coords = ((df.pickup_taxizone_id == -1)
                       & (df.pickup_latitude != 0)
                       & (df.pickup_longitude != 0)
                       & (df.pickup_datetime > '2013-01-01 00:00:00'))
(len(df.loc[unzoned_with_coords])/len(df))*100

0.5224239841536652

In [45]:
# Share (%) of all trips lacking a pickup zone, pickups after 2012-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2012-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

0.9558390163243996

In [46]:
# Share (%) of all trips lacking a pickup zone, pickups after 2011-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2011-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

1.072842384376624

In [47]:
# Share (%) of all trips lacking a pickup zone, pickups after 2010-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2010-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

1.1670847138443188

In [48]:
# Same as the 2010 cut-off, restricted to rows with non-zero coordinates.
unzoned_with_coords = ((df.pickup_taxizone_id == -1)
                       & (df.pickup_latitude != 0)
                       & (df.pickup_longitude != 0)
                       & (df.pickup_datetime > '2010-01-01 00:00:00'))
(len(df.loc[unzoned_with_coords])/len(df))*100

0.5821094083251407

In [49]:
# Share (%) of all trips lacking a pickup zone, pickups after 2009-01-01.
unzoned_after_cutoff = ((df.pickup_taxizone_id == -1)
                        & (df.pickup_datetime > '2009-01-01 00:00:00'))
(len(df.loc[unzoned_after_cutoff])/len(df))*100

1.258668819827689

In [50]:
# Same as the 2009 cut-off, restricted to rows with non-zero coordinates.
unzoned_with_coords = ((df.pickup_taxizone_id == -1)
                       & (df.pickup_latitude != 0)
                       & (df.pickup_longitude != 0)
                       & (df.pickup_datetime > '2009-01-01 00:00:00'))
(len(df.loc[unzoned_with_coords])/len(df))*100

0.6091900600716973

In [51]:
# Dropoffs after 2016-01-01 that have a non-zero latitude but still no zone.
# NOTE(review): dropoff_datetime is cast to 'str' by the loaders, so the date
# filter is presumably a string comparison -- confirm.
unzoned_dropoffs = ((df.dropoff_taxizone_id == -1)
                    & (df.dropoff_datetime > '2016-01-01 00:00:00')
                    & (df.dropoff_latitude != 0))
len(df.loc[unzoned_dropoffs])

538

In [52]:
# Dropoffs with a non-zero latitude that still have no zone (all dates).
unzoned_dropoffs = (df.dropoff_taxizone_id == -1) & (df.dropoff_latitude != 0)
len(df.loc[unzoned_dropoffs])

4470

In [53]:
# Share (%) of all trips whose dropoff has a non-zero longitude but no zone.
unzoned_dropoffs = (df.dropoff_taxizone_id == -1) & (df.dropoff_longitude != 0)
(len(df.loc[unzoned_dropoffs])/len(df))*100

0.18428964999919006

In [54]:
# Pickups with a non-zero latitude that still have no zone (all dates).
unzoned_pickups = (df.pickup_taxizone_id == -1) & (df.pickup_latitude != 0)
len(df.loc[unzoned_pickups])

14705

In [55]:
# Share (%) of all trips whose pickup has a non-zero longitude but no zone.
unzoned_pickups = (df.pickup_taxizone_id == -1) & (df.pickup_longitude != 0)
(len(df.loc[unzoned_pickups])/len(df))*100

0.6092315948136399

In [56]:
# Inspect the dropoffs after 2016-01-01 that have a latitude but no zone.
unzoned_dropoffs = ((df.dropoff_taxizone_id == -1)
                    & (df.dropoff_datetime > '2016-01-01 00:00:00')
                    & (df.dropoff_latitude != 0))
df.loc[unzoned_dropoffs]

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,dropoff_taxizone_id,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,pickup_taxizone_id,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id
80052,2016-01-01 00:23:30,40.724758,-74.073868,-1.0,-1.0,0.0,70.00,0.0,0.0,1,...,-74.073845,-1.0,5,N,0.00,0.00,70.00,0.00,green,2
12110,2016-01-02 00:31:01,40.707867,-73.998917,-1.0,-1.0,0.5,11.50,0.3,0.5,1,...,-73.954536,21.0,1,N,0.00,0.00,12.80,2.79,green,2
48660,2016-01-01 02:18:10,41.169556,-73.254982,-1.0,-1.0,0.5,222.50,0.3,0.5,1,...,-73.990753,28.0,4,N,0.00,5.54,229.34,53.27,green,2
11179,2016-01-01 01:32:42,40.919346,-73.866753,-1.0,-1.0,0.5,24.00,0.3,0.5,1,...,-73.936562,43.0,1,N,0.00,0.00,25.30,8.60,green,1
67437,2016-01-01 03:39:15,40.940136,-73.897469,-1.0,-1.0,0.5,17.00,0.3,0.5,1,...,-73.885063,174.0,1,N,5.49,0.00,23.79,5.52,green,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43990,2016-01-31 07:20:03,40.906124,-73.838425,-1.0,,0.0,56.50,0.3,0.5,2,...,-73.983582,47.0,1,N,0.00,0.00,57.30,20.10,yellow,1
64280,2016-01-31 07:42:54,40.631008,-73.719086,-1.0,,0.0,21.00,0.3,0.5,1,...,-73.781784,136.0,1,N,5.45,0.00,27.25,6.81,yellow,2
58588,2016-01-31 09:06:29,40.479477,-74.406914,-1.0,,0.0,232.74,0.3,0.0,1,...,-74.406914,-1.0,5,N,0.00,0.00,233.04,0.00,yellow,1
77308,2016-01-31 10:58:40,40.535858,-74.534012,-1.0,,0.0,300.00,0.3,0.0,1,...,-73.801193,215.0,5,N,0.00,21.58,321.88,55.48,yellow,2


In [57]:
# df.loc[df.dropoff_taxizone_id == -1].head()