# About Dataset
🚗 Uber Ride Analytics Dataset 2024
This comprehensive dataset contains detailed ride-sharing data from Uber operations for the year 2024, providing rich insights into booking patterns, vehicle performance, revenue streams, cancellation behaviors, and customer satisfaction metrics.

## 📊 Dataset Overview
The dataset captures 148,770 total bookings across multiple vehicle types and provides a complete view of ride-sharing operations including successful rides, cancellations, customer behaviors, and financial metrics.
Key Statistics:
- Total Bookings: 148.77K rides
- Success Rate: 65.96% (93K completed rides)
- Cancellation Rate: 25% (37.43K cancelled bookings)
- Customer Cancellations: 19.15% (27K rides)
- Driver Cancellations: 7.45% (10.5K rides)

In [7]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:

# Load the ride-bookings CSV; the path is relative to the notebook's working directory.
data_path = Path("ncr_ride_bookings.csv")
df = pd.read_csv(data_path)

## EDA

In [9]:
# Report the (rows, columns) shape of the loaded frame.
loaded_shape = df.shape
print('Data Loaded: ', loaded_shape)

Data Loaded:  (150000, 21)


In [10]:
# Show the first five rows to sanity-check column names and value formats.
print('==== Quick overview ====')
preview_rows = df.head()
display(preview_rows)

==== Quick overview ====


Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Avg VTAT,Avg CTAT,...,Reason for cancelling by Customer,Cancelled Rides by Driver,Driver Cancellation Reason,Incomplete Rides,Incomplete Rides Reason,Booking Value,Ride Distance,Driver Ratings,Customer Rating,Payment Method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,,,,,,737.0,48.21,4.1,4.3,UPI


In [11]:
# Column dtypes and non-null counts.
info_header = '==== Data Info ===='
print(info_header)
df.info()

==== Data Info ====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Date                               150000 non-null  object 
 1   Time                               150000 non-null  object 
 2   Booking ID                         150000 non-null  object 
 3   Booking Status                     150000 non-null  object 
 4   Customer ID                        150000 non-null  object 
 5   Vehicle Type                       150000 non-null  object 
 6   Pickup Location                    150000 non-null  object 
 7   Drop Location                      150000 non-null  object 
 8   Avg VTAT                           139500 non-null  float64
 9   Avg CTAT                           102000 non-null  float64
 10  Cancelled Rides by Customer        10500 non-null   float64
 11  Reason for cancelli

In [12]:
# Summary statistics of the numeric columns, rounded to whole numbers.
print('==== Data Description ====')
numeric_summary = df.describe()
numeric_summary.round(0)

==== Data Description ====


Unnamed: 0,Avg VTAT,Avg CTAT,Cancelled Rides by Customer,Cancelled Rides by Driver,Incomplete Rides,Booking Value,Ride Distance,Driver Ratings,Customer Rating
count,139500.0,102000.0,10500.0,27000.0,9000.0,102000.0,102000.0,93000.0,93000.0
mean,8.0,29.0,1.0,1.0,1.0,508.0,25.0,4.0,4.0
std,4.0,9.0,0.0,0.0,0.0,396.0,14.0,0.0,0.0
min,2.0,10.0,1.0,1.0,1.0,50.0,1.0,3.0,3.0
25%,5.0,22.0,1.0,1.0,1.0,234.0,12.0,4.0,4.0
50%,8.0,29.0,1.0,1.0,1.0,414.0,24.0,4.0,4.0
75%,11.0,37.0,1.0,1.0,1.0,689.0,37.0,5.0,5.0
max,20.0,45.0,1.0,1.0,1.0,4277.0,50.0,5.0,5.0


In [13]:
# count / unique / top / freq for the object-dtype columns.
df.describe(include=['O'])

Unnamed: 0,Date,Time,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,Reason for cancelling by Customer,Driver Cancellation Reason,Incomplete Rides Reason,Payment Method
count,150000,150000,150000,150000,150000,150000,150000,150000,10500,27000,9000,102000
unique,365,62910,148767,5,148788,7,176,176,5,4,3,5
top,2024-11-16,17:44:57,"""CNR3648267""",Completed,"""CID6715450""",Auto,Khandsa,Ashram,Wrong Address,Customer related issue,Customer Demand,UPI
freq,462,16,3,93000,3,37419,949,936,2362,6837,3040,45909


In [None]:
# Datetime parsing: parse each raw column once and derive every feature from
# the parsed values instead of re-parsing the same column for each feature.
booking_date = pd.to_datetime(df['Date'], errors='coerce')
booking_time = pd.to_datetime(df['Time'], format='%H:%M:%S')

df['Date'] = booking_date
df['Time'] = booking_time.dt.time  # keep only the time-of-day part

df['Hour'] = booking_time.dt.hour
df['Day_of_week'] = booking_date.dt.day_name()
df['Month'] = booking_date.dt.month_name()



In [None]:
# Horizontal bar chart of the null count per column, sorted descending.
print('==== Missing Values ====')
null_counts = df.isnull().sum().sort_values(ascending=False)
null_counts.plot(
    kind='barh',
    figsize=(10, 6),
    title='Missing Values',
    xlabel='Missing Values',
    ylabel='Columns',
)
plt.show()


In [None]:
# Treat the literal string "null" as a missing value.
df = df.replace(to_replace='null', value=np.nan)

In [None]:
# Number of missing entries in every column.
print('Missing values each columns:')
df.isna().sum()

In [None]:
# Share of missing entries per column, in percent of all rows.
print('Percentage of missing values:')
missing_counts = df.isnull().sum()
missing_counts / len(df) * 100

In [None]:
# Impute missing values in one vectorized fillna call.
#
# * Event-flag columns only ever hold the value 1 (see the describe() output:
#   min == max == 1), so a missing flag is taken to mean "the event did not
#   happen" and is filled with 0. Filling them with the median (1) would mark
#   every booking as cancelled by customer, cancelled by driver AND incomplete.
# * All other numeric columns are filled with their median.
#   NOTE(review): for bookings that never produced a ride this invents values
#   (distance, fare, ratings); filter on 'Booking Status' before analysing them.
# * Object columns are filled with the placeholder 'Unknown'.
flag_columns = [
    col
    for col in ('Cancelled Rides by Customer', 'Cancelled Rides by Driver', 'Incomplete Rides')
    if col in df.columns
]
numeric_columns = df.select_dtypes(include=[np.number]).columns
categorical_columns = df.select_dtypes(include=['O']).columns

fill_values = {col: 0 for col in flag_columns}
# dropna() skips columns that are entirely missing (their median is NaN).
fill_values.update(df[numeric_columns.difference(flag_columns)].median().dropna().to_dict())
fill_values.update({col: 'Unknown' for col in categorical_columns})

df = df.fillna(fill_values)

In [None]:
# Re-count missing entries per column after imputation.
df.isna().sum()

In [None]:
# Check data duplicates
df.duplicated().sum()

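There are no fully duplicated rows in the data. Note that a few `Booking ID` values still appear more than once (148,767 unique IDs across 150,000 rows), so the ID on its own is not a unique key.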

In [None]:
# Strip leading/trailing whitespace from every object-dtype column.
# Values are cast to str first, so non-string objects end up as their text form.
object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    as_text = df[col].astype(str)
    df[col] = as_text.str.strip()

In [None]:
# First five rows after cleaning.
df.head(5)

### Visualization

In [None]:
# Number of bookings per hour of the day, as a bar chart.
rides_per_hour = df['Hour'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
rides_per_hour.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Rides by Hour')
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Rides')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
    

In [None]:
# Number of bookings per month, as a bar chart in calendar order.
# sort_index() would order the month *names* alphabetically (April, August, ...),
# so the counts are reindexed with an explicit January-December order instead.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
rides_per_month = df['Month'].value_counts().reindex(month_order, fill_value=0)

plt.figure(figsize=(12, 6))
rides_per_month.plot(kind='bar')
plt.title('Distribution of Rides by Month')
plt.xlabel('Month')
plt.ylabel('Number of Rides')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
    

In [None]:
# Number of bookings per weekday, Monday first.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
rides_per_weekday = df['Day_of_week'].value_counts().reindex(order)

fig, ax = plt.subplots(figsize=(10, 5))
rides_per_weekday.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Rides by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Number of Rides')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


### 📊 1. Temporal (Time) Visualization

Time Patterns:

In [None]:
# Heatmap of booking counts: hour of day (rows) x day of week (columns).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_table = (
    df.pivot_table(values='Booking ID', index='Hour', columns='Day_of_week',
                   aggfunc='count', fill_value=0)
    # pivot_table sorts the day names alphabetically; put them in weekday order.
    .reindex(columns=weekday_order, fill_value=0)
    # fmt='d' below requires integer cells (an empty hour/day cell would be NaN otherwise).
    .astype(int)
)

plt.figure(figsize=(12, 8))
sns.heatmap(pivot_table, annot=True, cmap='YlOrRd', fmt='d')
plt.title('Booking Patterns: Hour vs Day of Week')
plt.show()

Monthly Trends:


In [None]:
# Line chart of the number of bookings per calendar month (January -> December).
order_month = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
monthly_bookings = df.groupby('Month').size().reindex(order_month)

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(monthly_bookings.index, monthly_bookings.values, marker='o')
ax.set_title('Monthly Booking Trends 2024')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Bookings')
ax.grid(True)
plt.show()


### 🚗 2. Vehicle Type Visualization

Vehicle type distribution:

In [None]:
# Share of bookings per vehicle type, as a pie chart with one colour per type.
vehicle_counts = df['Vehicle Type'].value_counts()
palette_vehicle = sns.color_palette('viridis', len(df['Vehicle Type'].unique()))

plt.figure(figsize=(12, 9))
vehicle_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=palette_vehicle)
plt.title('Vehicle Type Distribution')
plt.ylabel('')  # hide the default series-name label
plt.legend(title='Vehicle Type')
plt.show()


Revenue by Vehicle Type

In [None]:
# Box plot of booking value for each vehicle type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x='Vehicle Type',
    y='Booking Value',
    hue='Vehicle Type',
    palette='viridis',
    ax=ax,
)
plt.xticks(rotation=0)
ax.set_title('Revenue Distribution by Vehicle Type')
plt.show()

### 📍 3. Location Visualization

Top Pickup/Drop Locations:

In [None]:
# Horizontal bar charts of the most frequent pickup and drop locations.
top_n = 10
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

panels = (
    (ax1, 'Pickup Location', 'viridis', f'Top {top_n} Pickup Locations'),
    (ax2, 'Drop Location', 'plasma', f'Top {top_n} Drop Locations'),
)
for ax, column, palette_name, panel_title in panels:
    top_locations = df[column].value_counts().nlargest(top_n)
    # Ask for exactly one colour per bar: a named seaborn palette defaults to
    # 6 colours, which made the colours repeat across the 10 bars.
    bar_colors = sns.color_palette(palette_name, len(top_locations))
    top_locations.plot(kind='barh', ax=ax, color=bar_colors)
    # barh draws the first (largest) entry at the bottom; flip so rank 1 is on top.
    ax.invert_yaxis()
    ax.set_title(panel_title)

plt.subplots_adjust(wspace=0.4, hspace=0.3)
plt.show()

### 💰 4. Revenue & Performance Visualization

Revenue Analysis:

In [None]:
# Scatter plot of ride distance against booking value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Ride Distance'], df['Booking Value'], alpha=0.6)
ax.set_xlabel('Ride Distance (km)')
ax.set_ylabel('Booking Value')
ax.set_title('Distance vs Revenue Correlation')
plt.show()

Payment Method Analysis:

In [None]:
# Grouped (side-by-side, not stacked) bar chart: payment-method counts per vehicle type.
payment_vehicle = pd.crosstab(index=df['Vehicle Type'], columns=df['Payment Method'])
payment_vehicle.plot(
    kind='bar',
    stacked=False,
    figsize=(12, 6),
    cmap='plasma',
)
plt.title('Payment Methods by Vehicle Type')
plt.xticks(rotation=0)
plt.show()

### ❌ 5. Cancellation Analysis Visualization

Cancellation Reasons:

In [None]:
# Donut charts of the stated cancellation reasons (customer vs driver).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Bookings without a cancellation carry the 'Unknown' placeholder written by
# the missing-value fill; drop it so the shares are computed over actual
# cancellations instead of being dominated by the placeholder.
customer_cancel = (
    df['Reason for cancelling by Customer'].value_counts().drop('Unknown', errors='ignore')
)
driver_cancel = (
    df['Driver Cancellation Reason'].value_counts().drop('Unknown', errors='ignore')
)

# wedgeprops width < 1 leaves the centre empty, turning the pie into a donut;
# pctdistance moves the percentage labels onto the ring.
donut_style = {'autopct': '%1.1f%%', 'wedgeprops': {'width': 0.4}, 'pctdistance': 0.8}

# Customer cancellations
ax1.pie(customer_cancel.values, labels=customer_cancel.index,
        colors=sns.color_palette('viridis'), **donut_style)
ax1.set_title('Customer Cancellation Reasons')

# Driver cancellations
ax2.pie(driver_cancel.values, labels=driver_cancel.index,
        colors=sns.color_palette('plasma'), **donut_style)
ax2.set_title('Driver Cancellation Reasons')

plt.show()

### ⭐ 6. Rating & Satisfaction Visualization

Rating Distributions:

In [None]:
# Histograms of driver and customer ratings.
# The raw data has 93,000 non-null ratings and 93,000 'Completed' bookings, so
# ratings appear to exist only for completed rides (NOTE(review): confirm).
# After the missing-value fill every other booking holds an imputed rating,
# which would show up as an artificial spike - plot completed rides only.
completed_rides = df[df['Booking Status'] == 'Completed']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

completed_rides['Driver Ratings'].hist(bins=20, ax=ax1, alpha=0.7)
ax1.set_title('Driver Ratings Distribution')

completed_rides['Customer Rating'].hist(bins=20, ax=ax2, alpha=0.7)
ax2.set_title('Customer Ratings Distribution')
plt.show()

Rating vs Revenue:

In [None]:
# Scatter plot of booking value against driver and customer ratings (two overlaid series).
fig, ax = plt.subplots(figsize=(10, 6))
booking_value = df['Booking Value']
ax.scatter(df['Driver Ratings'], booking_value, alpha=0.6, label='Driver Rating')
ax.scatter(df['Customer Rating'], booking_value, alpha=0.6, label='Customer Rating')
ax.set_xlabel('Rating')
ax.set_ylabel('Booking Value')
ax.legend()
ax.set_title('Rating vs Revenue Analysis')
plt.show()

### 📈 7. Dashboard-style Visualizations

Multi-metric Dashboard:

In [None]:
# 2x3 dashboard combining six summary views of the bookings data.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Booking status distribution
df['Booking Status'].value_counts().plot(kind='pie', ax=axes[0,0])
axes[0, 0].set_title('Booking Status')

# Hourly pattern
df['Hour'].value_counts().sort_index().plot(kind='line', ax=axes[0,1])
axes[0,1].set_title('Hourly Booking Pattern')

# Vehicle type performance
vehicle_revenue = df.groupby('Vehicle Type')['Booking Value'].mean()
vehicle_revenue.plot(kind='bar', ax=axes[0,2])
axes[0,2].set_title('Avg Revenue by Vehicle')

# Distance distribution
df['Ride Distance'].hist(bins=30, ax=axes[1,0])
axes[1,0].set_title('Distance Distribution')

# VTAT vs CTAT
axes[1,1].scatter(df['Avg VTAT'], df['Avg CTAT'])
axes[1,1].set_title('VTAT vs CTAT')

# Monthly trend
# groupby sorts the month names alphabetically, which scrambles a "trend" line;
# reindex to calendar order so the x-axis runs January -> December.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
monthly_revenue = df.groupby('Month')['Booking Value'].sum().reindex(month_order)
monthly_revenue.plot(kind='line', marker='o', ax=axes[1,2])
axes[1,2].set_title('Monthly Revenue Trend')
axes[1,2].tick_params(axis='x', labelrotation=45)  # keep the month names legible

plt.tight_layout()
plt.show()

📊 Key Insights from Dashboard:
1. Booking Status (Pie Chart)
- Completed rides dominate (~65-70%)
- Cancelled by Driver is significant (orange slice)
- Cancelled by Customer is visible (red slice)
- No Driver Found and Incomplete are relatively small
2. Hourly Booking Pattern (Line Chart)
- Peak hours: 9-10 AM and 5-6 PM
- Low activity: 12-5 AM (night hours)
- Classic pattern: Morning and evening rush hours
3. Average Revenue by Vehicle
- All vehicle types have similar average revenue (~480-490)
- This indicates consistent pricing strategy
- Auto slightly lower than others
4. Distance Distribution
- Highly skewed: Majority are short-distance trips
- Peak around 20-25 km
- Long tail: Few long-distance journeys
5. VTAT vs CTAT Scatter
- Dense cluster shows consistency in waiting times
- VTAT (Vehicle Time to Arrival) vs CTAT (Customer Time to Arrival)
- Pattern shows good correlation
6. Monthly Revenue Trend
- High volatility throughout the year
- Peak in June (~6.25M)
- Dip in January (~5.7M)
- Clear seasonal patterns visible


In [None]:
# Number of cancelled bookings (by customer or by driver) for each hour of the day.
cancelled_mask = df['Booking Status'].isin(['Cancelled by Customer', 'Cancelled by Driver'])
cancellation_by_hour = df[cancelled_mask].groupby('Hour').size()

# The series holds counts (not rates), and barh puts the hours on the y-axis,
# so the title and axis labels are set to match what is actually drawn.
cancellation_by_hour.plot(kind='barh', title='Number of Cancellations by Hour', figsize=(11, 9))
plt.xlabel('Number of Cancellations')
plt.ylabel('Hour of Day')
plt.show()


In [None]:
# Mean booking value per kilometre for each vehicle type.
df['Revenue_per_km'] = df['Booking Value'].div(df['Ride Distance'])
revenue_efficiency = df.groupby('Vehicle Type')['Revenue_per_km'].mean()

revenue_efficiency.plot(
    kind='bar',
    title='Revenue per km by Vehicle Type',
    figsize=(11, 9),
)
plt.xlabel('Vehicle Type')
plt.ylabel('Revenue per km')
plt.xticks(rotation=0)
plt.show()


In [None]:
# Heatmap of booking counts by ISO week number (rows) and hour of day (columns).
# NOTE(review): ISO weeks do not line up with calendar years - the last days of
# December can be numbered as week 1 and get merged with early January; confirm
# this is acceptable for the intended reading of the chart.
iso_calendar = df['Date'].dt.isocalendar()
df['Week'] = iso_calendar.week
weekly_demand = df.groupby(['Week', 'Hour']).size().unstack()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(weekly_demand, cmap='YlGnBu', ax=ax)
ax.set_title('Weekly Demand Pattern')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Week of Year')
plt.show()



📊 Weekly Demand Pattern Heatmap Analysis

This heatmap provides valuable insights about Uber demand patterns throughout 2024! Let me break down the key findings:

🔍 Key Insights:

### 1. Consistent Daily Patterns

- **Peak Hours (5-7 PM)**
  - Dark blue coloring indicates highest demand
  - Primary rush hour period

- **Secondary Peak (9-11 AM)**
  - Morning rush hour activity
  - Notable but lower than evening peak

- **Low Demand Period (12-6 AM)**
  - Yellow/light green coloring
  - Minimal ride activity

- **Evening Activity (8-10 PM)**
  - Maintains relatively high demand
  - Post-peak but significant usage

### 2. Weekly Patterns

- **Year-round Consistency**
  - Regular weekly pattern repetition
  - Predictable demand cycles

- **Weekend vs Weekday**
  - No significant visible differentiation
  - Weekly aggregation masks daily variations

- **Seasonal Variations**
  - Fluctuating demand across weeks
  - Notable periodic changes

### 3. Seasonal Trends

- **Mid-year Peak (Weeks 20-30)**
  - May-July shows heightened activity
  - Summer season demand surge

- **Year-end Activity (Weeks 45-52)**
  - November-December peak period
  - Holiday season impact

- **Early Year (Weeks 1-10)**
  - January-March shows lower demand
  - Post-holiday season decline


# Machine Learning Implementation

## 🎯 1. Booking Status Prediction - Classification

### Step 1: Data Preparation & Feature Engineering

In [None]:
# Class balance of the prediction target: absolute counts, then percentages.
status_counts = df['Booking Status'].value_counts()
status_share = df['Booking Status'].value_counts(normalize=True) * 100

print('Booking Status Distribution:')
print(status_counts)
print('Percentage')
print(status_share)

### Step 2: Feature Engineering

In [None]:
# Build the modelling frame: a copy of df plus derived features.
df_ml = df.copy()

# Temporal flags (stored as 0/1 integers)
weekend_days = ['Saturday', 'Sunday']
rush_hours = [8, 9, 17, 18, 19]
night_hours = [22, 23, 0, 1, 2, 3, 4, 5]
df_ml['Is_Weekend'] = df_ml['Day_of_week'].isin(weekend_days).astype(int)
df_ml['Is_Rush_Hour'] = df_ml['Hour'].isin(rush_hours).astype(int)
df_ml['Is_Night'] = df_ml['Hour'].isin(night_hours).astype(int)

# Cyclical encoding of the hour: map it onto a 24-hour circle so that
# hour 23 and hour 0 end up next to each other.
hour_angle = 2 * np.pi * df_ml['Hour'] / 24
df_ml['Hour_sin'] = np.sin(hour_angle)
df_ml['Hour_cos'] = np.cos(hour_angle)

# Ratio features
df_ml['Revenue_per_km'] = df_ml['Booking Value'] / df_ml['Ride Distance']
df_ml['VTAT_CTAT_ratio'] = df_ml['Avg VTAT'] / df_ml['Avg CTAT']

# Frequency encoding: replace each location by how often it occurs in the data
pickup_counts = df_ml['Pickup Location'].value_counts()
drop_counts = df_ml['Drop Location'].value_counts()
df_ml['Pickup_Frequency'] = df_ml['Pickup Location'].map(pickup_counts)
df_ml['Drop_Frequency'] = df_ml['Drop Location'].map(drop_counts)

print("New features created successfully!")


### Step 3: Feature Selection & Preprocessing

In [None]:
# Assemble the feature matrix X and target y for the booking-status classifier.
# NOTE(review): 'Booking Value', 'Ride Distance', 'Avg VTAT' / 'Avg CTAT' and the
# ratios derived from them are missing in the raw data for some non-completed
# bookings, so they may leak the target - confirm before trusting the scores.
numeric_features = [
    'Hour', 'Hour_sin', 'Hour_cos',
    'Is_Weekend', 'Is_Rush_Hour', 'Is_Night',
    'Ride Distance', 'Booking Value',
    'Avg VTAT', 'Avg CTAT', 'VTAT_CTAT_ratio',
    'Revenue_per_km',
    'Pickup_Frequency', 'Drop_Frequency'
]

# Categorical columns are label-encoded into '<name>_encoded' columns;
# the fitted encoders are kept so the integer codes can be mapped back later.
categorical_features = ['Vehicle Type', 'Day_of_week', 'Month']
label_encoders = {}
encoded_features = []
for col in categorical_features:
    encoded_name = f'{col}_encoded'
    le = LabelEncoder()
    df_ml[encoded_name] = le.fit_transform(df_ml[col])
    label_encoders[col] = le
    encoded_features.append(encoded_name)

feature_columns = numeric_features + encoded_features

# Prepare final dataset
X = df_ml[feature_columns]
y = df_ml['Booking Status']

print(f'Features shape: {X.shape}')
print(f'Target classes: {y.unique()}')

### Step 4: Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify so both parts keep the class mix of y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')
print('Training set target distribution:')
train_class_share = y_train.value_counts(normalize=True)
print(train_class_share)


### Step 5: Model Training & Evaluation

In [None]:
# Initialize the Random Forest classifier with fixed hyper-parameters.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split
print('Training Random Forest model...')
rf_model.fit(X_train, y_train)

# Predict labels and class probabilities for the held-out split
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.4f}')

# Per-class precision / recall / F1
print('Classification Report:')
report_text = classification_report(y_test, y_pred)
print(report_text)

### Step 6: Model Analysis & Visualization

In [None]:
# Confusion matrix of true vs predicted booking status on the test split.
# Passing labels= fixes the row/column order of the matrix to the same class
# order that is used for the tick labels, so cells and labels cannot drift
# apart if a class is missing from y_test / y_pred.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
# Rank the features by the fitted forest's importances and plot the ten largest.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)
top_features = feature_importance.head(10)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Feature Importance - Booking Status Prediction')
ax.set_xlabel('Importance')
plt.show()

### Step 7: Cross-Validation

In [None]:
# 5-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
# The f prefix was missing here, so the literal text '{cv_scores}' was printed
# instead of the per-fold scores.
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})')

### Step 8: Model Interpretation & Business Insights

In [None]:
# Compare actual and predicted booking status row by row on the test split.
prediction_analysis = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Correct': y_test == y_pred
})

# Share of correctly predicted rows for each actual status
success_by_status = prediction_analysis.groupby('Actual')['Correct'].mean()
print('Prediction Accuracy by Booking Status:')
# The loop variable is deliberately not called `accuracy`: that name holds the
# overall model accuracy computed in the training cell and must not be overwritten.
for status, status_accuracy in success_by_status.items():
    print(f'{status}: {status_accuracy:.4f}')