# Feature engineering

## Data loading

In [1]:
import pandas as pd

# Location of the CSV produced by an earlier step; its first column holds the row index.
path = './artifacts/saved_df.csv'
df = pd.read_csv(filepath_or_buffer=path, index_col=0)

In [2]:
# Clean the loaded frame in one pass: discard rows containing any missing
# value, then rows that are exact duplicates, and finally renumber the index
# from 0 so it is contiguous again.
df = df.dropna().drop_duplicates().reset_index(drop=True)

In [3]:
# Overview of the cleaned frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3536 entries, 0 to 3535
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   type                 3536 non-null   object 
 1   date                 3536 non-null   object 
 2   time                 3536 non-null   object 
 3   duration             3536 non-null   object 
 4   distance             3536 non-null   object 
 5   origin               3536 non-null   object 
 6   destination          3536 non-null   object 
 7   total_earning        3536 non-null   float64
 8   base_fare            3536 non-null   float64
 9   customer_fare        3536 non-null   float64
 10  paid_to_driver       3536 non-null   float64
 11  paid_to_uber         3536 non-null   float64
 12  datetime             3536 non-null   object 
 13  duration_dt          3536 non-null   object 
 14  distance_km          3536 non-null   float64
 15  origin_lat_lng       3536 non-null   o

In [4]:
# Both columns come out of the CSV as plain text (dtype object); parse them
# into proper datetime / timedelta values so the `.dt` accessor can be used.
df = df.assign(
    datetime=pd.to_datetime(df['datetime']),
    duration_dt=pd.to_timedelta(df['duration_dt']),
)

## Data exploration

In [5]:
# Preview the first rows, transposed so each column of the frame reads as a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
type,UberX,UberX,UberX,UberX,UberX
date,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11
time,17:28:00,17:20:00,16:37:00,16:19:00,15:57:00
duration,11 min 32 sec,0 sec,27 min 43 sec,16 min 18 sec,5 min 29 sec
distance,4.15 km,---,13.42 km,4.78 km,1.38 km
origin,"Rua Antônio das Chagas, Santo Amaro - São Paul...","Rua Luís Correia de Melo, Santo Amaro - São Pa...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Alameda Gabriel Monteiro da Silva, Jardim Amer...","R. Turiassu, Perdizes - São Paulo - SP, 05005-..."
destination,"R. Geórgia, Brooklin - São Paulo - SP, 04559-0...","Rua Luís Correia de Melo, Santo Amaro - São Pa...","Rua Quipa, Campo Limpo - São Paulo - SP, 05756...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Rua Ministro Godói, Perdizes - São Paulo - SP,..."
total_earning,15.01,4.22,27.09,13.78,6.69
base_fare,11.01,4.22,27.09,13.78,6.69
customer_fare,19.97,5.25,34.95,22.96,10.96


In [6]:
# Invert the sign of paid_to_driver.
# NOTE: this cell is not idempotent — running it a second time flips the sign back.
df['paid_to_driver'] = -df['paid_to_driver']

In [7]:
# Summary statistics of the numeric, datetime and timedelta columns.
df.describe()

Unnamed: 0,total_earning,base_fare,customer_fare,paid_to_driver,paid_to_uber,datetime,duration_dt,distance_km
count,3536.0,3536.0,3536.0,3536.0,3536.0,3536,3536,3536.0
mean,24.180322,22.727927,35.175939,24.895082,9.432107,2024-05-24 22:10:41.419683328,0 days 00:19:07.527997737,6.534228
min,-10.82,1.89,-5.07,1.89,-42.08,2023-11-06 20:41:00,0 days 00:00:00,0.0
25%,12.7375,11.74,19.655,13.1675,4.8075,2024-01-20 20:38:45,0 days 00:08:30,2.27
50%,19.355,17.775,28.845,20.035,7.98,2024-05-02 23:01:00,0 days 00:14:44.500000,4.165
75%,30.0625,28.17,43.955,31.02,12.75,2024-09-21 04:30:30,0 days 00:24:40,8.16
max,203.47,203.47,374.87,209.47,123.4,2025-01-25 19:51:00,0 days 02:11:00,116.49
std,17.056696,16.655796,23.688978,17.493154,8.671385,,0 days 00:15:31.505233058,7.35755


In [8]:
# Inspect rides whose total_earning is negative.
df.loc[df['total_earning'].lt(0)]

Unnamed: 0,type,date,time,duration,distance,origin,destination,total_earning,base_fare,customer_fare,paid_to_driver,paid_to_uber,datetime,duration_dt,distance_km,origin_lat_lng,destination_lat_lng
83,Comfort,2023-11-18,20:26:00,3 min 41 sec,0.74 km,"Avenida Paulista, São Paulo - São Paulo - SP, ...","Rua Haddock Lobo, Cerqueira César - São Paulo ...",-10.82,7.87,-5.07,9.18,5.75,2023-11-18 20:26:00,0 days 00:03:41,0.74,"[-23.5573402, -46.6612825]","[-23.5579545, -46.6617369]"


In [9]:
# Inspect rides whose customer_fare is negative.
df.loc[df['customer_fare'].lt(0)]

Unnamed: 0,type,date,time,duration,distance,origin,destination,total_earning,base_fare,customer_fare,paid_to_driver,paid_to_uber,datetime,duration_dt,distance_km,origin_lat_lng,destination_lat_lng
83,Comfort,2023-11-18,20:26:00,3 min 41 sec,0.74 km,"Avenida Paulista, São Paulo - São Paulo - SP, ...","Rua Haddock Lobo, Cerqueira César - São Paulo ...",-10.82,7.87,-5.07,9.18,5.75,2023-11-18 20:26:00,0 days 00:03:41,0.74,"[-23.5573402, -46.6612825]","[-23.5579545, -46.6617369]"


In [10]:
# Remove every row whose total_earning is negative.
negative_earning_index = df.loc[df['total_earning'].lt(0)].index
df = df.drop(index=negative_earning_index)

In [11]:
# Sanity check: this selection should now be empty.
df.loc[df['total_earning'].lt(0)]

Unnamed: 0,type,date,time,duration,distance,origin,destination,total_earning,base_fare,customer_fare,paid_to_driver,paid_to_uber,datetime,duration_dt,distance_km,origin_lat_lng,destination_lat_lng


In [12]:
# Inspect rides where paid_to_uber is negative.
df.loc[df['paid_to_uber'].lt(0)]

Unnamed: 0,type,date,time,duration,distance,origin,destination,total_earning,base_fare,customer_fare,paid_to_driver,paid_to_uber,datetime,duration_dt,distance_km,origin_lat_lng,destination_lat_lng
7,UberX,2023-11-11,12:32:00,21 min 33 sec,5.28 km,"Rua Capital Federal, Perdizes - São Paulo - SP...","Av. Rebouças, Bairro Pinheiros - São Paulo - S...",20.00,20.00,24.88,20.00,-1.34,2023-11-11 12:32:00,0 days 00:21:33,5.28,"[-23.5407904, -46.6851003]","[-23.5645767, -46.678679]"
32,UberX,2023-11-09,19:33:00,9 min 51 sec,2.42 km,"Rua Roma, Lapa - São Paulo - SP, 05050-090, BR","Rua Bica de Pedra, Perdizes - São Paulo - SP, ...",12.02,11.02,13.37,12.02,-1.07,2023-11-09 19:33:00,0 days 00:09:51,2.42,"[-23.5227259, -46.7017216]","[-23.5398646, -46.6965347]"
51,UberX,2023-11-08,07:29:00,1 hr 5 min,15.86 km,"Avenida Doutor Salomão Vasconcelos, Cangaiba -...","Rua Florêncio de Abreu, Sé - São Paulo - SP, 0...",40.06,38.81,42.88,40.06,-5.13,2023-11-08 07:29:00,0 days 01:05:00,15.86,"[-23.504126, -46.524001]","[-23.539276, -46.6329022]"
74,UberX,2023-11-19,01:36:00,8 min 11 sec,4.22 km,"Rua Maria Amália Lopes Azevedo, Tremembé - São...","Rua Albertina V da Silva Gordo, Mandaqui - São...",16.00,13.50,17.59,16.00,-1.10,2023-11-19 01:36:00,0 days 00:08:11,4.22,"[-23.4564712, -46.5939624]","[-23.475604, -46.6236056]"
117,UberX,2023-11-15,21:19:00,21 min 31 sec,13.88 km,"Rua Calciolândia, Vila Medeiros - São Paulo - ...","R. Pedro de Castillo, Furnas - São Paulo - SP,...",30.01,30.01,32.21,30.01,-5.11,2023-11-15 21:19:00,0 days 00:21:31,13.88,"[-23.4935981, -46.5817175]","[-23.4198084, -46.5814597]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3450,Black,2025-01-16,07:24:00,24 min 18 sec,5.95 km,"Rua Harmonia, Pinheiros - Sao Paulo - SP, 0543...","Torre Sul Av. Chedid. Jafet, Vila Olímpia - Sã...",27.00,26.20,31.42,27.00,-5.09,2025-01-16 07:24:00,0 days 00:24:18,5.95,"[-23.5556505, -46.6877182]","[-23.5916743, -46.6889744]"
3471,VIP,2025-01-25,19:51:00,36 min 31 sec,16.23 km,"R. Ailson Simões, Cupecê - São Paulo - SP, 046...","Avenida Imperatriz Leopoldina, 420, São Bernar...",44.42,44.42,49.91,44.42,-6.99,2025-01-25 19:51:00,0 days 00:36:31,16.23,"[-23.6643409, -46.669048]","[-23.7050716, -46.5474969]"
3482,Black,2025-01-24,18:53:00,43 min 40 sec,11.08 km,"Av. Brigadeiro Faria Lima, Itaim Bibi - São Pa...","Rua Lino Coutinho, Ipiranga - São Paulo - SP, ...",59.24,45.24,70.89,59.24,-7.85,2025-01-24 18:53:00,0 days 00:43:40,11.08,"[-23.5865341, -46.6826006]","[-23.5997207, -46.601733]"
3490,Comfort,2025-01-23,18:04:00,1 hr 3 min,23.35 km,"Av. Presidente Juscelino Kubitschek, Vila Nova...","Av. Barão de Mauá, Jardim Chácara Inglesa - Sã...",60.34,60.34,71.22,60.34,-11.17,2025-01-23 18:04:00,0 days 01:03:00,23.35,"[-23.5904419, -46.6808145]","[-23.6959978, -46.5610264]"


It seems that total_earning does not correspond to how much the driver earned. It's better to use paid_to_driver as total earning.

In [13]:
# Partition df by travelled distance: rows with distance_km <= 0 are kept as
# paid cancelations, and every remaining row is treated as an actual ride.
no_distance_mask = df['distance_km'].le(0)
df_paid_cancelations = df.loc[no_distance_mask]
# drop() hands back a new frame, so columns can be added to df_rides later on.
df_rides = df.drop(index=df_paid_cancelations.index)

## Feature engineering

### Formation of base benchmarks:

Durations are expressed in minutes and distances in kilometres. The three fare-share features are percentages, and the per-duration benchmarks (`ted`, `bfd`, `cfd`, `pud`) add 5 minutes to the ride duration, as implemented in the cell below.

- fee = paid_to_uber / customer_fare * 100
- earning_pct = paid_to_driver / customer_fare * 100
- earning_base_fare_pct = base_fare / customer_fare * 100
- traffic_benchmark = duration / distance
- ted = paid_to_driver / (duration + 5)
- bfd = base_fare / (duration + 5)
- cfd = customer_fare / (duration + 5)
- pud = paid_to_uber / (duration + 5)
- tei = paid_to_driver / distance
- bfi = base_fare / distance
- cfi = customer_fare / distance
- pui = paid_to_uber / distance
- tedi = paid_to_driver / (duration * distance)
- bfdi = base_fare / (duration * distance)
- cfdi = customer_fare / (duration * distance)
- pudi = paid_to_uber / (duration * distance)

In [14]:
# Denominators shared by the benchmarks below, computed once instead of being
# re-derived on every line.
duration_min = df_rides['duration_dt'].dt.total_seconds() / 60
distance_km = df_rides['distance_km']

# Minutes added to the ride duration for the per-duration benchmarks only
# (ted, bfd, cfd, pud); the duration*distance benchmarks use the raw duration.
# NOTE(review): the rationale for this offset is not recorded in the notebook
# (presumably time spent outside the ride itself) — confirm and document it.
DURATION_OFFSET_MIN = 5

# Shares of the customer fare, expressed as percentages.
df_rides['fee'] = df_rides['paid_to_uber'] / df_rides['customer_fare'] * 100
df_rides['earning_pct'] = df_rides['paid_to_driver'] / df_rides['customer_fare'] * 100
df_rides['earning_base_fare_pct'] = df_rides['base_fare'] / df_rides['customer_fare'] * 100

# Minutes needed per kilometre travelled.
df_rides['traffic_benchmark'] = duration_min / distance_km

# Each monetary column is divided by each denominator. The resulting column
# name is <prefix><suffix>, e.g. 'te' + 'di' -> 'tedi'. Iterating suffixes in
# the outer loop keeps the column order ted, bfd, cfd, pud, tei, ..., pudi.
money_columns = {
    'te': 'paid_to_driver',
    'bf': 'base_fare',
    'cf': 'customer_fare',
    'pu': 'paid_to_uber',
}
denominators = {
    'd': duration_min + DURATION_OFFSET_MIN,  # per minute (offset duration)
    'i': distance_km,                         # per kilometre
    'di': duration_min * distance_km,         # per minute * kilometre
}
for suffix, denominator in denominators.items():
    for prefix, money_column in money_columns.items():
        df_rides[prefix + suffix] = df_rides[money_column] / denominator

In [15]:
# Show the rides frame transposed, so original and engineered columns read as rows.
df_rides.transpose()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,3526,3527,3528,3529,3530,3531,3532,3533,3534,3535
type,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,UberX,...,Comfort,VIP,Black,Comfort,Black,Comfort,Black,Comfort,Black,Prioridade
date,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11,...,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20,2025-01-20
time,17:28:00,16:37:00,16:19:00,15:57:00,15:01:00,13:06:00,12:32:00,12:28:00,12:18:00,12:10:00,...,16:37:00,15:53:00,15:37:00,15:16:00,14:23:00,10:35:00,08:21:00,08:04:00,07:27:00,06:14:00
duration,11 min 32 sec,27 min 43 sec,16 min 18 sec,5 min 29 sec,2 min 54 sec,4 min 5 sec,21 min 33 sec,3 min 59 sec,1 min 54 sec,4 min 0 sec,...,6 min 28 sec,22 min 8 sec,16 min 41 sec,16 min 57 sec,44 min 12 sec,13 min 15 sec,14 min 50 sec,16 min 39 sec,34 min 6 sec,51 min 38 sec
distance,4.15 km,13.42 km,4.78 km,1.38 km,0.96 km,0.80 km,5.28 km,0.91 km,0.55 km,1.61 km,...,1.65 km,6.60 km,12.28 km,5.42 km,17.86 km,3.87 km,6.84 km,7.05 km,12.61 km,25.17 km
origin,"Rua Antônio das Chagas, Santo Amaro - São Paul...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Alameda Gabriel Monteiro da Silva, Jardim Amer...","R. Turiassu, Perdizes - São Paulo - SP, 05005-...","Rua das Tabocas, Alto de Pinheiros - São Paulo...","Terminal Butantã, São Paulo - SP, 05510-050, BR","Rua Capital Federal, Perdizes - São Paulo - SP...","R. Francisco Isoldi, Alto de Pinheiros - São P...","Rua Judite, Alto de Pinheiros - São Paulo - SP...","Av. Arruda Botelho, Alto de Pinheiros - São Pa...",...,"Rua Cardeal Arcoverde, Pinheiros - São Paulo -...","Rua Blumenau, Vila Leopoldina - São Paulo - SP...","Rua Álvaro Rodrigues, Itaim Bibi - São Paulo -...","Praça Comandante Linneu Gomes, Campo Belo - Sã...","Rua Engenheiro Mac Lean, Santana - São Paulo -...","Rua Doutor Diogo de Faria, Vila Mariana - São ...","R. Eudoro Lemos, Santana - Sao Paulo - SP, 020...","R. Turiassu, Perdizes - São Paulo - SP, 05005-...","Pç. Cmte. Linneu Gomes, Vila Congonhas - São P...","Rua Américo Brasiliense, Centro - São Bernardo..."
destination,"R. Geórgia, Brooklin - São Paulo - SP, 04559-0...","Rua Quipa, Campo Limpo - São Paulo - SP, 05756...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Rua Ministro Godói, Perdizes - São Paulo - SP,...","R. Delfina, Vila Madalena - São Paulo - SP, 05...","Avenida Afrânio Peixoto, Butantã - São Paulo -...","Av. Rebouças, Bairro Pinheiros - São Paulo - S...","Rua Paulistânia, Sumarezinho - São Paulo - SP,...","Rua Gumercindo Fleury, Alto de Pinheiros - São...","Pç. Panamericana, Alto de Pinheiros - São Paul...",...,"Rua Teodoro Sampaio, Jardim Paulista - São Pau...","R. Mateus Grou, Pinheiros - São Paulo - SP, 05...","Av. José César de Oliveira, Vila Leopoldina - ...","Rua das Margaridas, Itaim Bibi - São Paulo - S...","Aeroporto Deputado Freitas Nobre (CGH), Vila C...","R. Treze de Maio, Bela Vista - São Paulo - SP,...","Av. Mandaqui, Limão - São Paulo - SP, 02550-00...","Av. Cruzeiro do Sul, Canindé - São Paulo - SP,...","Rua Doutor Homem de Melo, Perdizes - São Paulo...","Aeroporto Deputado Freitas Nobre (CGH), Vila C..."
total_earning,15.01,27.09,13.78,6.69,6.11,9.52,20.0,6.7,6.11,6.89,...,7.89,17.76,37.46,26.49,55.18,13.24,27.4,20.24,68.45,45.99
base_fare,11.01,27.09,13.78,6.69,6.11,6.52,20.0,6.7,6.11,6.89,...,7.89,16.26,37.46,26.49,55.18,13.24,25.15,20.24,65.78,42.75
customer_fare,19.97,34.95,22.96,10.96,9.98,10.9,24.88,10.97,9.94,10.99,...,13.15,22.95,56.71,37.0,88.48,22.07,41.17,33.73,84.67,71.25


In [16]:
# Summary statistics of the rides frame, engineered features included,
# transposed so that each feature is a row.
df_rides.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
total_earning,3470.0,24.558524,5.62,13.03,19.67,30.4275,203.47,16.991746
base_fare,3470.0,23.073118,5.62,11.97,18.005,28.5375,203.47,16.621987
customer_fare,3470.0,35.737026,8.58,19.95,29.365,44.7375,374.87,23.556641
paid_to_driver,3470.0,25.281115,5.62,13.4625,20.38,31.4,209.47,17.430603
paid_to_uber,3470.0,9.585254,-42.08,5.1025,8.09,12.87,123.4,8.680952
datetime,3470.0,2024-05-25 11:15:14.195965696,2023-11-06 20:41:00,2024-01-21 10:52:00,2024-05-03 10:31:30,2024-09-22 18:39:30,2025-01-25 19:51:00,
duration_dt,3470.0,0 days 00:19:29.290489913,0 days 00:00:50,0 days 00:08:48.250000,0 days 00:15:01.500000,0 days 00:24:53.750000,0 days 02:11:00,0 days 00:15:26.723051747
distance_km,3470.0,6.658297,0.02,2.36,4.24,8.23,116.49,7.371457
fee,3470.0,27.429581,-90.616246,20.360557,33.791462,38.763056,41.131956,15.284757
earning_pct,3470.0,70.204783,32.239156,61.043617,65.377545,77.643525,133.773165,10.90642


With these features we can have a better understanding of the data and the relationships between the variables. Now, let's generate some visualizations to understand the data better.

In [17]:
# Persist both partitions as CSV artifacts (the index is written as the first column).
for frame, csv_path in (
    (df_rides, './artifacts/df_rides.csv'),
    (df_paid_cancelations, './artifacts/df_paid_cancelations.csv'),
):
    frame.to_csv(csv_path)