In [1]:
!pip install haversine



In [2]:
import pandas as pd
from haversine import haversine

In [3]:
df = pd.read_csv('../../data/processed_data/riders_trips.csv')
df['signup_date'] = pd.to_datetime(df['signup_date'], utc=True)
df['pickup_time'] = pd.to_datetime(df['pickup_time'], utc=True)
df['dropoff_time'] = pd.to_datetime(df['dropoff_time'], utc=True)

df['age'] = df['age'].astype('int64')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   user_id           200000 non-null  object             
 1   signup_date       200000 non-null  datetime64[ns, UTC]
 2   loyalty_status    200000 non-null  object             
 3   age               200000 non-null  int64              
 4   city              200000 non-null  object             
 5   avg_rating_given  200000 non-null  float64            
 6   churn_prob        200000 non-null  float64            
 7   trip_id           200000 non-null  object             
 8   driver_id         200000 non-null  object             
 9   fare              200000 non-null  float64            
 10  surge_multiplier  200000 non-null  float64            
 11  tip               200000 non-null  float64            
 12  payment_type      200000 non-null  object   

user_id -> Unique identifier for each customer using the mobility platform.

signup_date -> Date on which the user registered or created an account on the platform.

loyalty_status -> Customer’s loyalty or membership tier, indicating level of engagement or rewards eligibility.

age -> Age of the user at the time of the record.

city -> Primary city where the user is registered or where the trip occurred.

avg_rating_given -> Average rating the user has given to drivers across completed trips.

churn_prob -> Predicted probability that the user will stop using the platform within a defined future period.

trip_id -> Unique identifier for an individual trip or ride.

driver_id
Unique identifier for the driver who completed the trip.

fare
Base fare charged for the trip, excluding tips.

surge_multiplier
Pricing multiplier applied to the base fare during periods of high demand.

tip
Optional gratuity given by the user to the driver.

payment_type
Method used by the user to pay for the trip (e.g., card, cash, digital wallet).

pickup_time
Timestamp indicating when the trip started.

dropoff_time
Timestamp indicating when the trip ended.

pickup_lat
Latitude coordinate of the trip pickup location.

pickup_lng
Longitude coordinate of the trip pickup location.

dropoff_lat
Latitude coordinate of the trip drop-off location.

dropoff_lng
Longitude coordinate of the trip drop-off location.

weather
Weather conditions at the time of the trip, which may influence demand and travel time.

### Data Quality Checks 

#### Typographic Errors

In [6]:
cat_cols = df.select_dtypes(include='object')

for col in cat_cols:
    print(f'{df[col].name}')
    print(f'{df[col].unique()}')
    print('\n')

user_id
['R00000' 'R00001' 'R00002' ... 'R09997' 'R09998' 'R09999']


loyalty_status
['Bronze' 'Silver' 'Gold' 'Platinum']


city
['Nairobi' 'Lagos' 'Cairo']


trip_id
['T001144' 'T022441' 'T024771' ... 'T166786' 'T176764' 'T187733']


driver_id
['D03414' 'D04441' 'D00635' ... 'D04427' 'D04825' 'D00014']


payment_type
['Card' 'Mobile Money' 'Cash']


weather
['Rainy' 'Sunny' 'Foggy' 'Cloudy']




No typographical error in the categorial labels.

#### Validating Dates 

In [7]:
df[['signup_date', 'pickup_time', 'dropoff_time']].describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max
signup_date,200000,2024-04-26 21:01:14.303999744+00:00,2023-04-27 00:00:00+00:00,2023-10-27 00:00:00+00:00,2024-04-25 00:00:00+00:00,2024-10-31 00:00:00+00:00,2025-04-26 00:00:00+00:00
pickup_time,200000,2024-10-27 02:05:30.728254976+00:00,2024-04-26 21:40:34+00:00,2024-07-28 05:28:59.750000128+00:00,2024-10-27 03:27:51+00:00,2025-01-26 12:48:44+00:00,2025-04-27 23:43:26+00:00
dropoff_time,200000,2024-10-27 02:37:28.166855168+00:00,2024-04-26 22:05:31+00:00,2024-07-28 05:56:58.750000128+00:00,2024-10-27 03:48:26.500000+00:00,2025-01-26 13:20:24.249999872+00:00,2025-04-28 00:12:26+00:00


In [8]:
acc_less_than_zero = df[df['signup_date'] > df['pickup_time']]

acc_less_than_zero['days_b4_signup'] = (acc_less_than_zero['signup_date'] - acc_less_than_zero['pickup_time']).dt.days

acc_less_than_zero[['signup_date', 'pickup_time', 'days_b4_signup']]

# .groupby('user_id')[['signup_date', 'pickup_time']].min()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc_less_than_zero['days_b4_signup'] = (acc_less_than_zero['signup_date'] - acc_less_than_zero['pickup_time']).dt.days


Unnamed: 0,signup_date,pickup_time,days_b4_signup
0,2025-01-24 00:00:00+00:00,2024-09-03 22:29:02+00:00,142
2,2025-01-24 00:00:00+00:00,2024-05-23 07:10:47+00:00,245
3,2025-01-24 00:00:00+00:00,2025-01-02 13:42:13+00:00,21
4,2025-01-24 00:00:00+00:00,2025-01-07 11:56:49+00:00,16
5,2025-01-24 00:00:00+00:00,2025-01-10 14:53:46+00:00,13
...,...,...,...
199995,2025-03-29 00:00:00+00:00,2025-02-08 08:03:19+00:00,48
199996,2025-03-29 00:00:00+00:00,2024-09-11 06:12:51+00:00,198
199997,2025-03-29 00:00:00+00:00,2024-12-12 15:47:38+00:00,106
199998,2025-03-29 00:00:00+00:00,2024-09-06 16:49:13+00:00,203


The above shows customers who have their pickup dates predating their signup dates. 
It suggests that they were somehow using the app without registering.
If it is so can their be called 'users'.

For the sake of the project, we would not make any reference to the sign_up column in the datasets. It would have been useful in determining the account age of customers. But because of the above insight we would drop it.

In [9]:
acc_less_than_zero.groupby('user_id')[['signup_date', 'pickup_time']].min()

Unnamed: 0_level_0,signup_date,pickup_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
R00000,2025-01-24 00:00:00+00:00,2024-05-01 07:21:52+00:00
R00001,2024-09-09 00:00:00+00:00,2024-05-10 18:14:41+00:00
R00002,2024-09-07 00:00:00+00:00,2024-06-18 17:48:24+00:00
R00003,2025-03-17 00:00:00+00:00,2024-05-15 05:13:12+00:00
R00004,2024-08-20 00:00:00+00:00,2024-05-23 13:02:45+00:00
...,...,...
R09985,2025-02-24 00:00:00+00:00,2024-07-16 05:14:57+00:00
R09994,2024-05-31 00:00:00+00:00,2024-04-27 08:20:06+00:00
R09995,2025-04-26 00:00:00+00:00,2024-06-07 05:03:58+00:00
R09997,2025-04-18 00:00:00+00:00,2024-04-29 07:30:01+00:00


Alternatively, we could drop these users data as they may be considered as 'users'

#### Checking numerical values

In [10]:
df.describe(include='number')

Unnamed: 0,age,avg_rating_given,churn_prob,fare,surge_multiplier,tip,pickup_lat,pickup_lng,dropoff_lat,dropoff_lng
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,34.66165,4.461348,0.285692,15.401285,1.1415,0.469566,11.8496,23.924133,11.849589,23.924173
std,9.508071,0.428432,0.159011,6.163199,0.255362,1.100545,13.362151,14.577572,13.362229,14.577642
min,18.0,2.6,0.002934,2.97,1.0,0.0,-1.78636,2.879224,-1.83322,2.830979
25%,28.0,4.2,0.161546,11.0,1.0,0.0,-1.172683,3.496574,-1.172868,3.497195
50%,35.0,4.5,0.265622,14.13,1.0,0.0,6.525574,31.238814,6.525235,31.239118
75%,41.0,4.8,0.388132,18.35,1.2,0.4,29.934766,36.703772,29.935056,36.704067
max,70.0,5.0,0.913302,82.74,3.8,21.86,30.544251,37.31709,30.592457,37.364817


No outrageous values in the numerical columns of the data

### Feature Engineering

#### Behaviour/Demand Features

In [11]:
df['pickup_time_year'] = df['pickup_time'].dt.year
df['pickup_time_month'] = df['pickup_time'].dt.month
df['pickup_time_month_year'] = df['pickup_time'].dt.strftime('%b %Y')
df["pickup_time_day"] = df["pickup_time"].dt.day_name()
df["pickup_time_day_num"] = df["pickup_time"].dt.day
df['pickup_hour'] = df['pickup_time'].dt.hour

df['time_of_day'] = pd.cut(df['pickup_hour'], bins=[-1, 5, 11, 17, 21, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening', 'Late Night'])

df["pickup_is_weekend"] = df["pickup_time"].dt.weekday >= 5  # Saturday=5, Sunday=6

df['pickup_is_peak_hour'] = df['pickup_hour'].between(7, 9) | df['pickup_hour'].between(16, 19)

df['pickup_is_night'] = df['pickup_hour'].between(22, 23) | df['pickup_hour'].between(0, 5)

def map_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Autumn"

df["pickup_time_season"] = df["pickup_time_month"].apply(map_season)

df['trip_duration_min'] = (df['dropoff_time'] - df['pickup_time']).dt.total_seconds() / 60

df['trip_distance_km'] = df.apply(
    lambda x: haversine(
        (x['pickup_lat'], x['pickup_lng']),
        (x['dropoff_lat'], x['dropoff_lng'])
    ), axis=1
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 33 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   user_id                 200000 non-null  object             
 1   signup_date             200000 non-null  datetime64[ns, UTC]
 2   loyalty_status          200000 non-null  object             
 3   age                     200000 non-null  int64              
 4   city                    200000 non-null  object             
 5   avg_rating_given        200000 non-null  float64            
 6   churn_prob              200000 non-null  float64            
 7   trip_id                 200000 non-null  object             
 8   driver_id               200000 non-null  object             
 9   fare                    200000 non-null  float64            
 10  surge_multiplier        200000 non-null  float64            
 11  tip                     20

#### Princing & Revenue Features

In [12]:
df['total_fare'] = df['fare'] * df['surge_multiplier']

df['total_fare_with_tip'] = df['total_fare'] + df['tip']

df['tip_percentage'] = df['tip'] / df['total_fare_with_tip']

df['is_surge_trip'] = df['surge_multiplier'] > 1

# df['base_fare_bucket'] = pd.qcut(df['fare'], q=4, labels=['low','medium','high','very_high'])

df['total_fare_bucket'] = pd.qcut(df['total_fare'], q=4, labels=['low','medium','high','very_high'])

df.head()

Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,trip_id,driver_id,fare,...,pickup_is_peak_hour,pickup_is_night,pickup_time_season,trip_duration_min,trip_distance_km,total_fare,total_fare_with_tip,tip_percentage,is_surge_trip,total_fare_bucket
0,R00000,2025-01-24 00:00:00+00:00,Bronze,34,Nairobi,5.0,0.142431,T001144,D03414,23.62,...,False,True,Autumn,26.0,2.732109,33.068,33.068,0.0,True,very_high
1,R00000,2025-01-24 00:00:00+00:00,Bronze,34,Nairobi,5.0,0.142431,T022441,D04441,16.31,...,False,False,Spring,6.0,3.010959,16.31,16.31,0.0,False,high
2,R00000,2025-01-24 00:00:00+00:00,Bronze,34,Nairobi,5.0,0.142431,T024771,D00635,9.66,...,True,False,Spring,56.0,0.966453,9.66,9.69,0.003096,False,low
3,R00000,2025-01-24 00:00:00+00:00,Bronze,34,Nairobi,5.0,0.142431,T042553,D03102,11.02,...,False,False,Winter,36.0,1.751916,12.122,12.672,0.043403,True,medium
4,R00000,2025-01-24 00:00:00+00:00,Bronze,34,Nairobi,5.0,0.142431,T055259,D03417,20.83,...,False,False,Winter,20.0,4.774206,20.83,21.74,0.041858,False,high


#### User Behaviour

In [13]:
## Create this based on the decision to drop unregisterted users
# df['account_age_days'] = (
#     df['pickup_time'] - df['signup_date']
# ).dt.days

user_agg = df.groupby('user_id').agg(
    user_trip_count=('trip_id', 'count'),
    avg_user_fare=('total_fare', 'mean'),
    avg_user_tip=('tip', 'mean')
).reset_index()

user_agg
# df = df.merge(user_agg, on='user_id', how='left')

Unnamed: 0,user_id,user_trip_count,avg_user_fare,avg_user_tip
0,R00000,25,16.404000,0.161200
1,R00001,14,13.923786,0.054286
2,R00002,24,19.905208,0.217083
3,R00003,9,16.142000,0.096667
4,R00004,16,23.860812,0.586250
...,...,...,...,...
9995,R09995,13,20.256462,0.247692
9996,R09996,15,14.297400,0.709333
9997,R09997,18,19.719278,0.259444
9998,R09998,22,18.657545,0.479091


#### Weather-Sensitive Features

In [14]:
df['bad_weather_flag'] = df['weather'].isin(['Rainy', 'Foggy']).astype(int)

In [15]:
df['weather_surge_interaction'] = df['bad_weather_flag'] * df['surge_multiplier']

In [16]:
weather_counts = df['weather'].value_counts(normalize=True)

# Map demand index (higher = more trips in that weather)
df['weather_demand_index'] = df['weather'].map(weather_counts)

In [17]:
df[['weather', 'bad_weather_flag', 'weather_demand_index', 'surge_multiplier', 'weather_surge_interaction']].head()

Unnamed: 0,weather,bad_weather_flag,weather_demand_index,surge_multiplier,weather_surge_interaction
0,Rainy,1,0.19988,1.4,1.4
1,Sunny,0,0.600755,1.0,0.0
2,Sunny,0,0.600755,1.0,0.0
3,Sunny,0,0.600755,1.1,0.0
4,Sunny,0,0.600755,1.0,0.0


In [18]:
df.drop(columns=['signup_date', 'dropoff_time'], inplace=True) 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   user_id                    200000 non-null  object             
 1   loyalty_status             200000 non-null  object             
 2   age                        200000 non-null  int64              
 3   city                       200000 non-null  object             
 4   avg_rating_given           200000 non-null  float64            
 5   churn_prob                 200000 non-null  float64            
 6   trip_id                    200000 non-null  object             
 7   driver_id                  200000 non-null  object             
 8   fare                       200000 non-null  float64            
 9   surge_multiplier           200000 non-null  float64            
 10  tip                        200000 non-null  float64     

In [19]:
with pd.option_context('display.max_columns', None):
    display(df)

Unnamed: 0,user_id,loyalty_status,age,city,avg_rating_given,churn_prob,trip_id,driver_id,fare,surge_multiplier,tip,payment_type,pickup_time,pickup_lat,pickup_lng,dropoff_lat,dropoff_lng,weather,pickup_time_year,pickup_time_month,pickup_time_month_year,pickup_time_day,pickup_time_day_num,pickup_hour,time_of_day,pickup_is_weekend,pickup_is_peak_hour,pickup_is_night,pickup_time_season,trip_duration_min,trip_distance_km,total_fare,total_fare_with_tip,tip_percentage,is_surge_trip,total_fare_bucket,bad_weather_flag,weather_surge_interaction,weather_demand_index
0,R00000,Bronze,34,Nairobi,5.0,0.142431,T001144,D03414,23.62,1.4,0.00,Card,2024-09-03 22:29:02+00:00,-1.115239,36.805339,-1.136842,36.793631,Rainy,2024,9,Sep 2024,Tuesday,3,22,Late Night,False,False,True,Autumn,26.0,2.732109,33.068,33.068,0.000000,True,very_high,1,1.4,0.199880
1,R00000,Bronze,34,Nairobi,5.0,0.142431,T022441,D04441,16.31,1.0,0.00,Card,2025-04-02 14:46:29+00:00,-1.350546,36.745210,-1.339873,36.770102,Sunny,2025,4,Apr 2025,Wednesday,2,14,Afternoon,False,False,False,Spring,6.0,3.010959,16.310,16.310,0.000000,False,high,0,0.0,0.600755
2,R00000,Bronze,34,Nairobi,5.0,0.142431,T024771,D00635,9.66,1.0,0.03,Card,2024-05-23 07:10:47+00:00,-1.316560,36.687127,-1.310676,36.680729,Sunny,2024,5,May 2024,Thursday,23,7,Morning,False,True,False,Spring,56.0,0.966453,9.660,9.690,0.003096,False,low,0,0.0,0.600755
3,R00000,Bronze,34,Nairobi,5.0,0.142431,T042553,D03102,11.02,1.1,0.55,Mobile Money,2025-01-02 13:42:13+00:00,-1.726473,37.301560,-1.713882,37.311035,Sunny,2025,1,Jan 2025,Thursday,2,13,Afternoon,False,False,False,Winter,36.0,1.751916,12.122,12.672,0.043403,True,medium,0,0.0,0.600755
4,R00000,Bronze,34,Nairobi,5.0,0.142431,T055259,D03417,20.83,1.0,0.91,Card,2025-01-07 11:56:49+00:00,-1.483414,36.974683,-1.474478,36.932673,Sunny,2025,1,Jan 2025,Tuesday,7,11,Morning,False,False,False,Winter,20.0,4.774206,20.830,21.740,0.041858,False,high,0,0.0,0.600755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,R09999,Gold,36,Nairobi,3.9,0.401529,T161109,D00887,18.04,1.0,0.49,Card,2025-02-08 08:03:19+00:00,-1.281023,36.756645,-1.274055,36.713220,Sunny,2025,2,Feb 2025,Saturday,8,8,Morning,True,True,False,Winter,32.0,4.889174,18.040,18.530,0.026444,False,high,0,0.0,0.600755
199996,R09999,Gold,36,Nairobi,3.9,0.401529,T166028,D02903,26.68,1.3,0.00,Cash,2024-09-11 06:12:51+00:00,-1.483096,36.833612,-1.497719,36.826340,Cloudy,2024,9,Sep 2024,Wednesday,11,6,Morning,False,False,False,Autumn,49.0,1.815820,34.684,34.684,0.000000,True,very_high,0,0.0,0.149370
199997,R09999,Gold,36,Nairobi,3.9,0.401529,T166786,D02777,9.10,1.0,1.46,Cash,2024-12-12 15:47:38+00:00,-1.135358,36.654228,-1.120021,36.655476,Sunny,2024,12,Dec 2024,Thursday,12,15,Afternoon,False,False,False,Winter,40.0,1.711043,9.100,10.560,0.138258,False,low,0,0.0,0.600755
199998,R09999,Gold,36,Nairobi,3.9,0.401529,T176764,D04642,20.27,1.0,0.08,Mobile Money,2024-09-06 16:49:13+00:00,-1.109425,36.967027,-1.065215,36.917535,Sunny,2024,9,Sep 2024,Friday,6,16,Afternoon,False,True,False,Autumn,17.0,7.378412,20.270,20.350,0.003931,False,high,0,0.0,0.600755


In [20]:
os.makedirs('../data/processed_data', exist_ok=True)
df.to_csv('../../data/processed_data/data_EDA.csv',index=False)