# Feature engineering

## Data loading

In [3]:
import pandas as pd

# CSV file read by this notebook.
path = './artifacts/saved_df.csv'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(filepath_or_buffer=path, index_col=0)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [5]:
# Convert the time columns to proper pandas dtypes; 'duration_dt' must be a
# timedelta because later cells use its .dt accessor.
for column, converter in (
    ('datetime', pd.to_datetime),
    ('duration_dt', pd.to_timedelta),
):
    df[column] = converter(df[column])

## Data exploration

In [None]:
# Show the first five rows transposed, so each column reads as a row.
df.head(5).transpose()

In [7]:
# Flip the sign of 'paid_to_driver' so that driver payouts are positive
# (presumably the export stores them as negative amounts; confirm against the
# source data).
# The flip is guarded: an unconditional `-1 *` would negate the column again
# every time this cell is re-run. It is applied only while the column is still
# predominantly negative, which makes the cell idempotent.
if df['paid_to_driver'].median() < 0:
    df['paid_to_driver'] = -1 * df['paid_to_driver']

In [None]:
# Summary statistics of the numeric columns, used to spot suspicious values
# such as negative amounts.
df.describe()

In [None]:
# Inspect the rows whose total_earning is negative.
df.loc[df['total_earning'].lt(0)]

In [None]:
# Inspect the rows whose customer_fare is negative.
df.loc[df['customer_fare'].lt(0)]

In [11]:
# Remove, in place, every row whose total_earning is negative.
negative_earning_index = df.index[df['total_earning'] < 0]
df.drop(index=negative_earning_index, inplace=True)

In [None]:
# Sanity check: after the drop, no row with negative total_earning should remain.
df.loc[df['total_earning'].lt(0)]

In [None]:
# Inspect the rows whose paid_to_uber is negative.
df.loc[df['paid_to_uber'].lt(0)]

It seems that `total_earning` does not correspond to how much the driver actually earned, so it is better to use `paid_to_driver` as the driver's total earning.

In [31]:
# Separate the paid cancelations (no distance travelled) from the actual rides.
# Rows with distance_km <= 0 are treated as cancelations; everything else,
# including rows with a missing distance, stays in df_rides.
is_cancelation = df['distance_km'] <= 0
df_paid_cancelations = df[is_cancelation]
df_rides = df.drop(index=df_paid_cancelations.index)

## Feature engineering

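### Formation of base benchmarks

Durations are measured in minutes and distances in kilometres. The first three benchmarks are expressed as percentages of the customer fare.

- fee = paid_to_uber / customer_fare × 100
- earning_pct = paid_to_driver / customer_fare × 100
- earning_base_fare_pct = base_fare / customer_fare × 100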
- traffic_benchmark = duration / distance
- ted = paid_to_driver / duration
- bfd = base_fare / duration
- cfd = customer_fare / duration 
- pud = paid_to_uber / duration 
- tei = paid_to_driver / distance
- bfi = base_fare / distance
- cfi = customer_fare / distance 
- pui = paid_to_uber / distance
- tedi = paid_to_driver / (duration * distance)
- bfdi = base_fare / (duration * distance)
- cfdi = customer_fare / (duration * distance) 
- pudi = paid_to_uber / (duration * distance) 

In [41]:
# Denominators shared by the benchmarks: ride duration in minutes, distance in
# kilometres, and their product.
duration_min = df_rides['duration_dt'].dt.total_seconds() / 60
distance_km = df_rides['distance_km']
duration_x_distance = duration_min * distance_km

# Shares of the customer fare, expressed as percentages.
fare_shares = (
    ('fee', 'paid_to_uber'),
    ('earning_pct', 'paid_to_driver'),
    ('earning_base_fare_pct', 'base_fare'),
)
for feature_name, amount_col in fare_shares:
    df_rides[feature_name] = df_rides[amount_col] / df_rides['customer_fare'] * 100

# Minutes needed per kilometre.
df_rides['traffic_benchmark'] = duration_min / distance_km

# Monetary amounts per minute (suffix 'd'), per km ('i') and per minute*km ('di').
# The column-name prefix identifies the amount: te = paid_to_driver,
# bf = base_fare, cf = customer_fare, pu = paid_to_uber.
# The loop order reproduces the original column order (ted, bfd, cfd, pud,
# tei, ..., pudi), which a later cell relies on when slicing the last 16 columns.
amount_prefixes = (
    ('te', 'paid_to_driver'),
    ('bf', 'base_fare'),
    ('cf', 'customer_fare'),
    ('pu', 'paid_to_uber'),
)
denominators = (
    ('d', duration_min),
    ('i', distance_km),
    ('di', duration_x_distance),
)
for suffix, denominator in denominators:
    for prefix, amount_col in amount_prefixes:
        df_rides[prefix + suffix] = df_rides[amount_col] / denominator

In [43]:
# Display the rides frame transposed (one row per column, one column per ride).
df_rides.transpose()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,3157,3158,3159,3160,3161,3162,3163,3164,3165,3166
type,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,...,UberX,UberX,Comfort,Comfort,Comfort,Black,Black,Black,Black,Black
date,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,...,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25,2024-11-25
time,17:28:00,16:37:00,16:19:00,15:57:00,15:01:00,13:06:00,12:32:00,12:28:00,12:18:00,12:10:00,...,18:27:00,17:05:00,16:00:00,14:43:00,14:11:00,08:33:00,08:17:00,08:03:00,07:48:00,06:49:00
duration,11 min 32 sec,27 min 43 sec,16 min 18 sec,5 min 29 sec,2 min 54 sec,4 min 5 sec,21 min 33 sec,3 min 59 sec,1 min 54 sec,4 min 0 sec,...,27 min 0 sec,1 hr 20 min,51 min 54 sec,1 hr 8 min,25 min 27 sec,4 min 48 sec,7 min 49 sec,7 min 36 sec,7 min 42 sec,15 min 11 sec
distance,4.15 km,13.42 km,4.78 km,1.38 km,0.96 km,0.80 km,5.28 km,0.91 km,0.55 km,1.61 km,...,8.12 km,25.80 km,28.58 km,29.37 km,8.19 km,1.23 km,1.10 km,1.64 km,1.35 km,4.76 km
origin,"Rua Antônio das Chagas, Santo Amaro - São Paul...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Alameda Gabriel Monteiro da Silva, Jardim Amer...","R. Turiassu, Perdizes - São Paulo - SP, 05005-...","Rua das Tabocas, Alto de Pinheiros - São Paulo...","Terminal Butantã, São Paulo - SP, 05510-050, BR","Rua Capital Federal, Perdizes - São Paulo - SP...","R. Francisco Isoldi, Alto de Pinheiros - São P...","Rua Judite, Alto de Pinheiros - São Paulo - SP...","Av. Arruda Botelho, Alto de Pinheiros - São Pa...",...,"Rua Dona Ana Néri, Cambuci - São Paulo - SP, 0...","Rua dos Têxteis, Cidade Tiradentes - São Paulo...","Terminal 2, Aeroporto Internacional de São Pau...","Rua Martim Francisco, Consolação - Sao Paulo -...","Rua Itaici, Santana - São Paulo - SP, 02460-03...","R. Fradique Coutinho, Pinheiros - São Paulo - ...","R. Fradique Coutinho, São Paulo - SP, 05422-00...","Rua Francisco Leitão, Pinheiros - Sao Paulo - ...","Rua Lisboa, Jardim Paulista - São Paulo - SP, ...","Rua Alves Guimarães, Jardim Paulista - São Pau..."
destination,"R. Geórgia, Brooklin - São Paulo - SP, 04559-0...","Rua Quipa, Campo Limpo - São Paulo - SP, 05756...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Rua Ministro Godói, Perdizes - São Paulo - SP,...","R. Delfina, Vila Madalena - São Paulo - SP, 05...","Avenida Afrânio Peixoto, Butantã - São Paulo -...","Av. Rebouças, Bairro Pinheiros - São Paulo - S...","Rua Paulistânia, Sumarezinho - São Paulo - SP,...","Rua Gumercindo Fleury, Alto de Pinheiros - São...","Pç. Panamericana, Alto de Pinheiros - São Paul...",...,"Rua Doutor Olavo Egídio, Santana - São Paulo -...","Rua Vitoantônio Del Vechio, Mooca - São Paulo ...","Rua Ernesto Gould, Cidade Tiradentes - São Pau...","Terminal 3, Aeroporto Internacional de São Pau...","Rua Marquês de Itu, Consolação - São Paulo - S...","Rua Manuel Henrique Lopes, Pinheiros - São Pau...","R. Mourato Coelho Sobreloja, Vila Madalena - S...","R. Simão Álvares, Pinheiros - São Paulo - SP, ...","Av. Brasil, Jardins - São Paulo - SP, 01431-01...","R. Dona Veridiana, Higienópolis - São Paulo - ..."
total_earning,15.01,27.09,13.78,6.69,6.11,9.52,20.0,6.7,6.11,6.89,...,27.52,54.96,76.38,76.52,26.02,15.46,13.76,15.53,16.82,24.94
base_fare,11.01,27.09,13.78,6.69,6.11,6.52,20.0,6.7,6.11,6.89,...,25.02,54.96,74.35,75.09,26.02,9.96,9.51,11.03,10.32,20.3
customer_fare,19.97,34.95,22.96,10.96,9.98,10.9,24.88,10.97,9.94,10.99,...,30.3,60.23,90.31,128.42,43.37,18.49,16.02,17.89,22.13,33.75


In [54]:
# Summary statistics of the 16 engineered benchmark columns.
# They are selected by name rather than by position (`iloc[:, -16:]`), so the
# summary still covers the right columns if more columns are appended to
# df_rides later.
benchmark_cols = [
    'fee', 'earning_pct', 'earning_base_fare_pct', 'traffic_benchmark',
    'ted', 'bfd', 'cfd', 'pud',
    'tei', 'bfi', 'cfi', 'pui',
    'tedi', 'bfdi', 'cfdi', 'pudi',
]
df_rides[benchmark_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fee,3106.0,27.890808,14.827361,-85.273091,21.227895,34.550489,38.794046,41.131956
earning_pct,3106.0,70.028322,10.842473,32.239156,61.055754,65.135683,77.064336,133.773165
earning_base_fare_pct,3106.0,63.643565,9.462541,23.446659,59.974899,61.765504,66.603208,133.773165
traffic_benchmark,3106.0,3.587073,1.654064,0.805066,2.602176,3.348431,4.287037,50.833333
ted,3106.0,1.536228,0.813015,0.433043,1.107823,1.378844,1.723635,22.5
bfd,3106.0,1.371537,0.584133,0.391726,1.035075,1.270874,1.559897,11.4
cfd,3106.0,2.211992,1.073666,0.636702,1.609723,2.020268,2.531542,24.648
pud,3106.0,0.625579,0.496719,-5.82,0.366796,0.628633,0.860244,5.409836
tei,3106.0,5.553911,10.546732,1.161911,3.498484,4.641873,6.172056,476.0
bfi,3106.0,4.910562,9.21886,1.161911,3.291346,4.324893,5.495495,476.0
