In [26]:
import pandas as pd
import numpy as np


# Load the ride-bookings CSV; the path is relative to the current working directory.
bookings_csv = "data/ncr_ride_bookings.csv"
df = pd.read_csv(bookings_csv)

# Bare last expression so the notebook displays the (rows, columns) tuple.
df.shape

(150000, 21)

In [27]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Date                               150000 non-null  object 
 1   Time                               150000 non-null  object 
 2   Booking ID                         150000 non-null  object 
 3   Booking Status                     150000 non-null  object 
 4   Customer ID                        150000 non-null  object 
 5   Vehicle Type                       150000 non-null  object 
 6   Pickup Location                    150000 non-null  object 
 7   Drop Location                      150000 non-null  object 
 8   Avg VTAT                           139500 non-null  float64
 9   Avg CTAT                           102000 non-null  float64
 10  Cancelled Rides by Customer        10500 non-null   float64
 11  Reason for cancelling by Customer  1050

In [28]:
# Preview the first rows (head() defaults to n=5) to eyeball the parsed values.
df.head()

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,Reason for cancelling by Customer,Cancelled Rides by Driver,Driver Cancellation Reason,Incomplete Rides,Incomplete Rides Reason,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,,,,,,737.0,48.21,4.1,4.3,UPI


In [29]:
# Report how many distinct values each object-dtype (string) column holds.
for column, dtype in df.dtypes.items():
    if dtype != "object":
        # Numeric, categorical and datetime columns are not part of this report.
        continue
    print(f"{column}: {df[column].nunique()} unique values")

Date: 365 unique values
Time: 62910 unique values
Booking ID: 148767 unique values
Booking Status: 5 unique values
Customer ID: 148788 unique values
Vehicle Type: 7 unique values
Pickup Location: 176 unique values
Drop Location: 176 unique values
Reason for cancelling by Customer: 5 unique values
Driver Cancellation Reason: 4 unique values
Incomplete Rides Reason: 3 unique values
Payment Method: 5 unique values


In [30]:
# Low-cardinality text columns to store as pandas categoricals instead of
# Python string objects.
category_columns = [
    "Booking Status",
    "Vehicle Type",
    "Pickup Location",
    "Drop Location",
    "Reason for cancelling by Customer",
    "Driver Cancellation Reason",
    "Incomplete Rides Reason",
    "Payment Method",
]

# Convert one column at a time; each column infers its own set of categories.
for category_column in category_columns:
    df[category_column] = df[category_column].astype("category")

In [31]:
# Re-check the dtypes after the categorical conversion.
# NOTE(review): if the goal is to compare memory before/after, the default
# info() only reports shallow usage for object columns; consider
# memory_usage="deep" -- confirm intent.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   Date                               150000 non-null  object  
 1   Time                               150000 non-null  object  
 2   Booking ID                         150000 non-null  object  
 3   Booking Status                     150000 non-null  category
 4   Customer ID                        150000 non-null  object  
 5   Vehicle Type                       150000 non-null  category
 6   Pickup Location                    150000 non-null  category
 7   Drop Location                      150000 non-null  category
 8   Avg VTAT                           139500 non-null  float64 
 9   Avg CTAT                           102000 non-null  float64 
 10  Cancelled Rides by Customer        10500 non-null   float64 
 11  Reason for cancelling by C

In [32]:
# Merge the separate Date and Time string columns into one datetime64 column.
# The guard keeps this cell re-runnable: after the first run Date/Time no
# longer exist, so an unguarded second run would raise a KeyError.
if "datetime" not in df.columns:
    df["datetime"] = pd.to_datetime(
        df["Date"] + " " + df["Time"],
        format="%Y-%m-%d %H:%M:%S",  # explicit format avoids format inference
    )

# errors="ignore" makes the drop a no-op when the source columns are already
# gone; rebinding `df` replaces the previous inplace=True mutation.
df = df.drop(columns=["Date", "Time"], errors="ignore")

In [33]:
# Re-check the schema after Date and Time were merged into `datetime`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   Booking ID                         150000 non-null  object        
 1   Booking Status                     150000 non-null  category      
 2   Customer ID                        150000 non-null  object        
 3   Vehicle Type                       150000 non-null  category      
 4   Pickup Location                    150000 non-null  category      
 5   Drop Location                      150000 non-null  category      
 6   Avg VTAT                           139500 non-null  float64       
 7   Avg CTAT                           102000 non-null  float64       
 8   Cancelled Rides by Customer        10500 non-null   float64       
 9   Reason for cancelling by Customer  10500 non-null   category      
 10  Cancelled Rides by D

In [38]:
# Snapshot of the float64 columns; these are the downcast candidates.
df_floats = df.select_dtypes(include=["float64"])

# Downcast column by column; pd.to_numeric picks the smallest float dtype
# (float32 at minimum) that can represent each column's values.
for float_column in df_floats.columns:
    df[float_column] = pd.to_numeric(df_floats[float_column], downcast="float")

In [39]:
# Re-check the dtypes after the float downcast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   Booking ID                         150000 non-null  object        
 1   Booking Status                     150000 non-null  category      
 2   Customer ID                        150000 non-null  object        
 3   Vehicle Type                       150000 non-null  category      
 4   Pickup Location                    150000 non-null  category      
 5   Drop Location                      150000 non-null  category      
 6   Avg VTAT                           139500 non-null  float32       
 7   Avg CTAT                           102000 non-null  float32       
 8   Cancelled Rides by Customer        10500 non-null   float32       
 9   Reason for cancelling by Customer  10500 non-null   category      
 10  Cancelled Rides by D