# Tip Prediction, Part II: Experimental Features

Author's Workday ID: C175799, Initials: RPR

## Choose GPU

In [33]:
# From: https://github.com/keras-team/keras/issues/6031
import os
gpu_id = input( "Select GPU [0 or 1]: " )

if gpu_id in [ "0", "1" ]:
    os.environ[ "CUDA_VISIBLE_DEVICES" ] = gpu_id
else:
    print( "Invalid GPU id.  Defaulting to '0,1'" )

Select GPU [0 or 1]: 1


## Imports and Util Functions

In [34]:
import pandas as pd
import numpy as np
import time
import datetime
import math
import random

def get_time( output=True ):
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

def print_time( start_time, end_time, interval="seconds" ):
    
    if interval == "hours":
        print ( "Time to process: [%s] hours" % ( str( ( end_time - start_time ) / 60 / 60 ) ) )
    elif interval == "minutes":
        print ( "Time to process: [%s] minutes" % ( str( ( end_time - start_time ) / 60 ) ) )
    else:
        print ( "Time to process: [%s] seconds" % ( str( end_time - start_time ) ) )

print_time( 0, 3600, interval="hours" )
print_time( 0, 3600, interval="minutes" )

verbose = False

Time to process: [1.0] hours
Time to process: [60.0] minutes


## Load Data

In [35]:
page_start = get_time()
trips = pd.read_csv( "data/green-tripdata-2015-09-cleaned.csv", compression="gzip" )
trips.columns = map( str.lower, trips.columns )
if verbose: print( list( trips.columns ) )
print_time( page_start, get_time() )

2018.06.01 17:10
2018.06.01 17:10
Time to process: [7.059286594390869] seconds


## Add Payment Type Mean Encoding

In [36]:
rows = trips.shape[ 0 ]
# cc = 1
# cash = 2
trips_by_payment_type_grp = trips.groupby( "payment_type" ).payment_type.count()
print( trips_by_payment_type_grp / rows )

payment_type
1    0.469046
2    0.526515
3    0.002210
4    0.002186
5    0.000044
Name: payment_type, dtype: float64


In [37]:
# Calculate a mapping: {payment_type: target_mean}
payment_type_target_mean = trips.groupby( 'payment_type' ).tip_recorded.mean()
print( payment_type_target_mean )

# In our non-regularized case we just *map* the computed means to the `payment_type`'s
trips[ 'payment_type_target_enc' ] = trips[ 'payment_type' ].map( payment_type_target_mean )
#trips.head()

payment_type
1    0.860920
2    0.000001
3    0.000000
4    0.000314
5    0.000000
Name: tip_recorded, dtype: float64


## Add Tip Percent Mean Encoding

In [38]:
trips_by_tip_percent_grp = trips.groupby( "tip_percent_bin" ).tip_percent_bin.count()
trips_by_tip_percent_grp[ "percent" ] = trips_by_tip_percent_grp / rows * 100
#trips_by_tip_percent_grp

#print( trips_by_tip_percent_grp / rows * 100 )
# 0.0      59.726507
# 1.0       0.068346
# 2.0       0.053234
# 3.0       0.068071
# 4.0       0.135112
# 5.0       0.212113
# 6.0       0.286640
# 7.0       0.351552
# 8.0       0.450258
# 9.0       0.481649

In [39]:
# add one-hot for tip_percent_bin
tip_bins = trips.tip_percent_bin.unique()
tip_bins.sort()
#print( tip_bins )

for tip in tip_bins:

    trips[ "tip_percent_bin_" + str( int ( tip ) ) ] = ( trips.tip_percent_bin == tip ) * 1
    
trips[ 0:5 ]


Unnamed: 0,vendorid,store_and_fwd_flag,ratecodeid,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,...,tip_percent_bin_120,tip_percent_bin_121,tip_percent_bin_122,tip_percent_bin_123,tip_percent_bin_124,tip_percent_bin_125,tip_percent_bin_126,tip_percent_bin_127,tip_percent_bin_128,tip_percent_bin_129
0,2,False,5,-73.979485,40.684956,-73.979431,40.68502,1,0.0,7.8,...,0,0,0,0,0,0,0,0,0,0
1,2,False,5,-74.010796,40.912216,-74.01078,40.912212,1,0.0,45.0,...,0,0,0,0,0,0,0,0,0,0
2,2,False,1,-73.92141,40.766708,-73.914413,40.764687,1,0.59,4.0,...,0,0,0,0,0,0,0,0,0,0
3,2,False,1,-73.921387,40.766678,-73.931427,40.771584,1,0.74,5.0,...,0,0,0,0,0,0,0,0,0,0
4,2,False,1,-73.955482,40.714046,-73.944412,40.714729,1,0.61,5.0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
start_time = get_time()

# Calculate a mapping: {payment_type: target_mean}
for i in range( len( tip_bins ) ):
    
    bin_name = "tip_percent_bin_" + str( int( i ) )
    tip_bin_n_payment_type_mean = trips.groupby( 'payment_type' )[ bin_name ].mean() 
    tip_bin_n_ratecode_mean = trips.groupby( 'ratecodeid' )[ bin_name ].mean() 
    print( tip_bin_n_payment_type_mean )
    print( tip_bin_n_ratecode_mean )
    

    # create cols w/ default value: 0.0
    payment_type_col_name = "payment_type_tip_bin_" + str( i ) + "_enc"
    trips[ payment_type_col_name ] = 0.0

    ratecode_col_name = "ratecode_tip_bin_" + str( i ) + "_enc"
    trips[ ratecode_col_name ] = 0.0
    
    # In our non-regularized case we just *map* the computed means to the payment_type
    # See notebook for: https://www.coursera.org/learn/competitive-data-science/lecture/b5Gxv/concept-of-mean-encoding
    trips[ payment_type_col_name ] = trips[ 'payment_type' ].map( tip_bin_n_payment_type_mean )
    trips[ ratecode_col_name ] = trips[ 'ratecodeid' ].map( tip_bin_n_ratecode_mean )
    
    print() 
    
print_time( start_time, get_time(), interval="minutes" )
    
trips.head()

2018.06.01 17:10
payment_type
1    0.141376
2    0.999999
3    1.000000
4    0.999686
5    1.000000
Name: tip_percent_bin_0, dtype: float64
ratecodeid
1    0.592178
2    0.693263
3    0.692737
4    0.599628
5    0.860152
6    1.000000
Name: tip_percent_bin_0, dtype: float64

payment_type
1    0.001457
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_1, dtype: float64
ratecodeid
1    0.000667
2    0.000000
3    0.002793
4    0.000000
5    0.001647
6    0.000000
Name: tip_percent_bin_1, dtype: float64

payment_type
1    0.001135
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_2, dtype: float64
ratecodeid
1    0.000528
2    0.000310
3    0.001397
4    0.001862
5    0.000728
6    0.000000
Name: tip_percent_bin_2, dtype: float64

payment_type
1    0.001451
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_3, dtype: float64
ratecodeid
1    0.000675
2    0.000310
3    0.000000
4    0.005587
5    0.000958
6 

payment_type
1    0.015842
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_32, dtype: float64
ratecodeid
1    0.007585
2    0.000000
3    0.004190
4    0.001862
5    0.000153
6    0.000000
Name: tip_percent_bin_32, dtype: float64

payment_type
1    0.013212
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_33, dtype: float64
ratecodeid
1    0.006306
2    0.000310
3    0.002793
4    0.001862
5    0.001149
6    0.000000
Name: tip_percent_bin_33, dtype: float64

payment_type
1    0.010392
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_34, dtype: float64
ratecodeid
1    0.004956
2    0.007451
3    0.004190
4    0.003724
5    0.000115
6    0.000000
Name: tip_percent_bin_34, dtype: float64

payment_type
1    0.006584
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_35, dtype: float64
ratecodeid
1    0.003148
2    0.000931
3    0.004190
4    0.000000
5    0.000153
6    0.00000

payment_type
1    0.000066
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_64, dtype: float64
ratecodeid
1    0.000032
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_64, dtype: float64

payment_type
1    0.000057
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_65, dtype: float64
ratecodeid
1    0.000027
2    0.000000
3    0.000000
4    0.000000
5    0.000038
6    0.000000
Name: tip_percent_bin_65, dtype: float64

payment_type
1    0.00001
2    0.00000
3    0.00000
4    0.00000
5    0.00000
Name: tip_percent_bin_66, dtype: float64
ratecodeid
1    0.000005
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_66, dtype: float64

payment_type
1    0.000511
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_67, dtype: float64
ratecodeid
1    0.000242
2    0.000310
3    0.000000
4    0.000000
5    0.000115
6    0.000000
Nam

payment_type
1    0.00001
2    0.00000
3    0.00000
4    0.00000
5    0.00000
Name: tip_percent_bin_96, dtype: float64
ratecodeid
1    0.000005
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_96, dtype: float64

payment_type
1    0.000006
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_97, dtype: float64
ratecodeid
1    0.000003
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_97, dtype: float64

payment_type
1    0.00001
2    0.00000
3    0.00000
4    0.00000
5    0.00000
Name: tip_percent_bin_98, dtype: float64
ratecodeid
1    0.000005
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_98, dtype: float64

payment_type
1    0.000003
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_99, dtype: float64
ratecodeid
1    0.000001
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: ti

payment_type
1    0.000016
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_128, dtype: float64
ratecodeid
1    0.000008
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_128, dtype: float64

payment_type
1    0.000019
2    0.000000
3    0.000000
4    0.000000
5    0.000000
Name: tip_percent_bin_129, dtype: float64
ratecodeid
1    0.000009
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
Name: tip_percent_bin_129, dtype: float64

2018.06.01 17:13
Time to process: [3.0287057916323343] minutes


Unnamed: 0,vendorid,store_and_fwd_flag,ratecodeid,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,...,payment_type_tip_bin_125_enc,ratecode_tip_bin_125_enc,payment_type_tip_bin_126_enc,ratecode_tip_bin_126_enc,payment_type_tip_bin_127_enc,ratecode_tip_bin_127_enc,payment_type_tip_bin_128_enc,ratecode_tip_bin_128_enc,payment_type_tip_bin_129_enc,ratecode_tip_bin_129_enc
0,2,False,5,-73.979485,40.684956,-73.979431,40.68502,1,0.0,7.8,...,8.6e-05,0.0,6e-06,0.0,1.3e-05,0.0,1.6e-05,0.0,1.9e-05,0.0
1,2,False,5,-74.010796,40.912216,-74.01078,40.912212,1,0.0,45.0,...,8.6e-05,0.0,6e-06,0.0,1.3e-05,0.0,1.6e-05,0.0,1.9e-05,0.0
2,2,False,1,-73.92141,40.766708,-73.914413,40.764687,1,0.59,4.0,...,8.6e-05,4.1e-05,6e-06,3e-06,1.3e-05,6e-06,1.6e-05,8e-06,1.9e-05,9e-06
3,2,False,1,-73.921387,40.766678,-73.931427,40.771584,1,0.74,5.0,...,0.0,4.1e-05,0.0,3e-06,0.0,6e-06,0.0,8e-06,0.0,9e-06
4,2,False,1,-73.955482,40.714046,-73.944412,40.714729,1,0.61,5.0,...,0.0,4.1e-05,0.0,3e-06,0.0,6e-06,0.0,8e-06,0.0,9e-06


## Drop Temp Cols: Tip Percent Bins

In [41]:
start_time = get_time()

# drop temporary tip_percent_bin_* columns
print( len( trips.columns ) )
cols_to_drop = [ col for col in trips.columns if "tip_percent_bin_" in col ]
trips.drop( labels=cols_to_drop, axis=1, inplace=True )
print( len( trips.columns ) )

print_time( start_time, get_time() )

2018.06.01 17:13
435
305
2018.06.01 17:14
Time to process: [3.4071147441864014] seconds


## Show Mean Encoded Cols for Tip Bins by Payment Type

In [42]:
cols_to_show = [ col for col in trips.columns if "payment_type_tip_bin_" in col ]
cols_to_show.insert( 0, "payment_type" )
trips[ cols_to_show ].head( 10 )

Unnamed: 0,payment_type,payment_type_tip_bin_0_enc,payment_type_tip_bin_1_enc,payment_type_tip_bin_2_enc,payment_type_tip_bin_3_enc,payment_type_tip_bin_4_enc,payment_type_tip_bin_5_enc,payment_type_tip_bin_6_enc,payment_type_tip_bin_7_enc,payment_type_tip_bin_8_enc,...,payment_type_tip_bin_120_enc,payment_type_tip_bin_121_enc,payment_type_tip_bin_122_enc,payment_type_tip_bin_123_enc,payment_type_tip_bin_124_enc,payment_type_tip_bin_125_enc,payment_type_tip_bin_126_enc,payment_type_tip_bin_127_enc,payment_type_tip_bin_128_enc,payment_type_tip_bin_129_enc
0,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
1,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
2,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
3,2,0.999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0.999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
6,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
7,2,0.999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,0.141376,0.001457,0.001135,0.001451,0.002881,0.004522,0.006111,0.007495,0.009599,...,8.6e-05,7e-06,9e-06,1.2e-05,3e-06,8.6e-05,6e-06,1.3e-05,1.6e-05,1.9e-05
9,2,0.999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Show Mean Encoded Cols for Tip Bins by RatecodeID

In [43]:
cols_to_show = [ col for col in trips.columns if "ratecode_tip_bin_" in col ]
cols_to_show.insert( 0, "ratecodeid" )
trips[ cols_to_show ].head( 10 )

Unnamed: 0,ratecodeid,ratecode_tip_bin_0_enc,ratecode_tip_bin_1_enc,ratecode_tip_bin_2_enc,ratecode_tip_bin_3_enc,ratecode_tip_bin_4_enc,ratecode_tip_bin_5_enc,ratecode_tip_bin_6_enc,ratecode_tip_bin_7_enc,ratecode_tip_bin_8_enc,...,ratecode_tip_bin_120_enc,ratecode_tip_bin_121_enc,ratecode_tip_bin_122_enc,ratecode_tip_bin_123_enc,ratecode_tip_bin_124_enc,ratecode_tip_bin_125_enc,ratecode_tip_bin_126_enc,ratecode_tip_bin_127_enc,ratecode_tip_bin_128_enc,ratecode_tip_bin_129_enc
0,5,0.860152,0.001647,0.000728,0.000958,0.000919,0.001839,0.001877,0.002413,0.003792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.860152,0.001647,0.000728,0.000958,0.000919,0.001839,0.001877,0.002413,0.003792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
3,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
4,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
5,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
6,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
7,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
8,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06
9,1,0.592178,0.000667,0.000528,0.000675,0.001353,0.002127,0.002882,0.003537,0.004517,...,4.1e-05,4e-06,4e-06,6e-06,1e-06,4.1e-05,3e-06,6e-06,8e-06,9e-06


## Summary

In [44]:
print( "Columns after mean target encoding:", trips.shape[ 1 ] )

Columns after mean target encoding: 305


In [45]:
trips.head()

Unnamed: 0,vendorid,store_and_fwd_flag,ratecodeid,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,...,payment_type_tip_bin_125_enc,ratecode_tip_bin_125_enc,payment_type_tip_bin_126_enc,ratecode_tip_bin_126_enc,payment_type_tip_bin_127_enc,ratecode_tip_bin_127_enc,payment_type_tip_bin_128_enc,ratecode_tip_bin_128_enc,payment_type_tip_bin_129_enc,ratecode_tip_bin_129_enc
0,2,False,5,-73.979485,40.684956,-73.979431,40.68502,1,0.0,7.8,...,8.6e-05,0.0,6e-06,0.0,1.3e-05,0.0,1.6e-05,0.0,1.9e-05,0.0
1,2,False,5,-74.010796,40.912216,-74.01078,40.912212,1,0.0,45.0,...,8.6e-05,0.0,6e-06,0.0,1.3e-05,0.0,1.6e-05,0.0,1.9e-05,0.0
2,2,False,1,-73.92141,40.766708,-73.914413,40.764687,1,0.59,4.0,...,8.6e-05,4.1e-05,6e-06,3e-06,1.3e-05,6e-06,1.6e-05,8e-06,1.9e-05,9e-06
3,2,False,1,-73.921387,40.766678,-73.931427,40.771584,1,0.74,5.0,...,0.0,4.1e-05,0.0,3e-06,0.0,6e-06,0.0,8e-06,0.0,9e-06
4,2,False,1,-73.955482,40.714046,-73.944412,40.714729,1,0.61,5.0,...,0.0,4.1e-05,0.0,3e-06,0.0,6e-06,0.0,8e-06,0.0,9e-06


## Write to Parquet

In [48]:
start_time = get_time()
trips.to_parquet( "data/green-tripdata-2015-09-cleaned-plus-experimental-features.parquet", compression="gzip" )
print_time( start_time, get_time() )


2018.06.01 17:23
2018.06.01 17:23
Time to process: [19.018310070037842] seconds


## Time to Process Additional Features

In [None]:
print_time( page_start, get_time(), interval="minutes" )

In [None]:
trips.columns