In [1]:
# Processing for Uber Data 2023

In [1]:
import pandas as pd

In [3]:
# Import file and print basic information 

csv_file_path = './driver_activity_2024_07_26.csv'
df = pd.read_csv(csv_file_path)

# Test dataframe
df.head()

  df = pd.read_csv(csv_file_path)


Unnamed: 0,weekstr,driver_hashed_uuid,total_session_mileage,total_passenger_mileage,online_minutes,rental_company,is_rental,trip_minutes,driver_pay_excl_bonus_excl_tips,driver_tips,driver_bonus
0,2023-06-12 00:00:00.000,1F12B9286DCD069992B36B817DDD997F,351.14,318.35,1297.483333,alm,True,914.2333333333332,1121.33,58.9,2.0
1,2023-06-12 00:00:00.000,BE8F1C4692F270AE9E42B68D9D62665D,343.64,292.48,1800.8,\N,False,984.7000000000002,973.52,79.2,0.0
2,2023-06-12 00:00:00.000,AB3B4EF2B8752C1099AAB6AD2058E402,442.56,419.49,3052.333333,\N,False,2043.2,1747.64,126.64,3.0
3,2023-06-12 00:00:00.000,32166A232214DE700E035DB9A20FDC69,421.35,324.0,2178.783333,\N,False,1359.85,1320.17,159.46,22.5
4,2023-06-12 00:00:00.000,D7684099A4ED802A5D947F0A461FDF80,39.85,34.8,301.05,\N,False,194.5,195.1,19.78,4.0


In [4]:
# Convert to numeric values and remove non-numeric values
df['total_session_mileage'] = pd.to_numeric(df['total_session_mileage'], errors='coerce')
df['driver_pay_excl_bonus_excl_tips'] = pd.to_numeric(df['driver_pay_excl_bonus_excl_tips'], errors='coerce')
df['driver_tips'] = pd.to_numeric(df['driver_tips'], errors='coerce')
df['driver_bonus'] = pd.to_numeric(df['driver_bonus'], errors='coerce')
df['online_minutes'] = pd.to_numeric(df['online_minutes'], errors='coerce')

# Remove null values from the data
df.dropna(subset=['total_session_mileage'], inplace=True)
df.dropna(subset=['is_rental'], inplace=True)
df.dropna(subset=['driver_tips'], inplace=True)
df.dropna(subset=['driver_bonus'], inplace=True)
df.dropna(subset=['online_minutes'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Ensure the 'is_rental' column contains only 'true' or 'false' as strings
df['is_rental'] = df['is_rental'].astype(str).str.strip().str.lower()

In [5]:
# Prepare an empty list to store values 
results = []

In [6]:
### Print values for EXPENSES

## Calculate the number of ALL drivers
total_drivers = df['driver_hashed_uuid'].nunique()
print("Total drivers:", total_drivers)
results.append(["Total drivers", total_drivers])

## Calculate the number of RENTERS
unique_renters = df.loc[df['is_rental'] == 'true', 'driver_hashed_uuid'].nunique()
print("Total renters:", unique_renters)
results.append(["Total renters", unique_renters])

## Calculate the number of OWNERS
unique_owners = df.loc[df['is_rental'] == 'false', 'driver_hashed_uuid'].nunique()
print("Total owners:", unique_owners)
results.append(["Total owners", unique_owners])

# Calculate duplicates
owners_set = set(df.loc[df['is_rental'] == 'false', 'driver_hashed_uuid'])
renters_set = set(df.loc[df['is_rental'] == 'true', 'driver_hashed_uuid'])
intersection_set = owners_set & renters_set
drivers_both_categories = len(intersection_set)
print("Drivers in both categories:", drivers_both_categories)
results.append(["Drivers in both categories", drivers_both_categories])

## Calculate the total annual miles for ALL
total_miles_all = df['total_session_mileage'].sum()
print("Total miles for all drivers:", total_miles_all)
results.append(["Total miles for all drivers", total_miles_all])

## Calculate the total annual miles for RENTERS
total_miles_renters = df.loc[df['is_rental'] == 'true', 'total_session_mileage'].sum()
print("Total miles for renters:", total_miles_renters)
results.append(["Total miles for renters", total_miles_renters])

## Calculate the total annual miles for OWNERS
total_miles_owners = df.loc[df['is_rental'] == 'false', 'total_session_mileage'].sum()
print("Total miles for owners:", total_miles_owners)
results.append(["Total miles for owners", total_miles_owners])

## Calculate the mean weekly miles for ALL
average_weekly_miles_all = df['total_session_mileage'].mean()
print("Mean weekly miles for all drivers:", average_weekly_miles_all)
results.append(["Mean weekly miles for all drivers", average_weekly_miles_all])

## Calculate the mean weekly miles for RENTERS
average_weekly_miles_renters = df.loc[df['is_rental'] == 'true', 'total_session_mileage'].mean()
print("Mean weekly miles for renters:", average_weekly_miles_renters)
results.append(["Mean weekly miles for renters", average_weekly_miles_renters])

## Calculate the mean weekly miles for OWNERS
average_weekly_miles_owners = df.loc[df['is_rental'] == 'false', 'total_session_mileage'].mean()
print("Mean weekly miles for owners:", average_weekly_miles_owners)
results.append(["Mean weekly miles for owners", average_weekly_miles_owners])

Total drivers: 90709
Total renters: 17013
Total owners: 87927
Drivers in both categories: 14233
Total miles for all drivers: 1120169975.7700002
Total miles for renters: 163542886.02999997
Total miles for owners: 956543784.37
Mean weekly miles for all drivers: 324.09540492903807
Mean weekly miles for renters: 365.4238498423521
Mean weekly miles for owners: 318.06823685206575


In [7]:
### Print values for EARNINGS

# Calculate the mean weekly salary
mean_weekly_salary = df['driver_pay_excl_bonus_excl_tips'].mean()
print("Mean weekly salary for all drivers: $", mean_weekly_salary)
results.append(["Mean weekly salary for all drivers", mean_weekly_salary])

# Calculate the mean weekly tips
mean_weekly_tips = df['driver_tips'].mean()
print("Mean weekly tips for all drivers: $", mean_weekly_tips)
results.append(["Mean weekly tips for all drivers", mean_weekly_tips])

# Calculate the mean weekly bonus
mean_weekly_bonus = df['driver_bonus'].mean()
print("Mean weekly bonus for all drivers: $", mean_weekly_bonus)
results.append(["Mean weekly bonus for all drivers", mean_weekly_bonus])

# Calculate the mean weekly overall earnings
total_earnings = mean_weekly_salary + mean_weekly_tips + mean_weekly_bonus
print("Mean weekly total earnings for all drivers: $", total_earnings)
results.append(["Mean weekly total earnings for all drivers", total_earnings])

# Calculate the mean weekly minutes worked
mean_weekly_minutes_worked = df['online_minutes'].mean()
print("Mean weekly minutes worked for all drivers:", mean_weekly_minutes_worked)
results.append(["Mean weekly minutes worked for all drivers", mean_weekly_minutes_worked])

# Calculate the mean weekly hours worked
mean_weekly_hours_worked = mean_weekly_minutes_worked / 60
print("Mean weekly hours worked for all drivers:", mean_weekly_hours_worked)
results.append(["Mean weekly hours worked for all drivers", mean_weekly_hours_worked])

# Calculate the mean hourly earnings
mean_hourly_earnings = total_earnings / mean_weekly_hours_worked
print("Mean hourly earnings: $", mean_hourly_earnings)
results.append(["Mean hourly earnings", mean_hourly_earnings])

# Calculate the mean annual earnings
mean_annual_earnings = total_earnings * 52
print("Mean annual earnings: $", mean_annual_earnings)
results.append(["Mean annual earnings", mean_annual_earnings])

Mean weekly salary for all drivers: $ 986.1991969759067
Mean weekly tips for all drivers: $ 95.24208961497462
Mean weekly bonus for all drivers: $ 13.240848168651226
Mean weekly total earnings for all drivers: $ 1094.6821347595326
Mean weekly minutes worked for all drivers: 1829.7616888585642
Mean weekly hours worked for all drivers: 30.496028147642736
Mean hourly earnings: $ 35.89589206370629
Mean annual earnings: $ 56923.4710074957


In [54]:
# Group by 'driver_hashed_uuid' and calculate the sum of the relevant columns
grouped_df = df.groupby('driver_hashed_uuid').sum()

# Calculate the total earnings for each driver
grouped_df['total_earnings'] = grouped_df['driver_pay_excl_bonus_excl_tips'] + grouped_df['driver_tips'] + grouped_df['driver_bonus']

# Max test
max_earnings = grouped_df['total_earnings'].max()
print("Max total earnings:", max_earnings)

# Define specific quintile ranges
bins = [0, 25000, 50000, 75000, 100000, max(max_earnings + 1, 100000)]

# Bin the total earnings
grouped_df['earnings_bin'] = pd.cut(grouped_df['total_earnings'], bins, right=False, include_lowest=True)

# Convert the Interval objects to strings for CSV export
grouped_df['earnings_bin_str'] = grouped_df['earnings_bin'].astype(str)

# Count the number of drivers in each bin
bin_distribution = grouped_df['earnings_bin_str'].value_counts().sort_index()

# Create a DataFrame for the bin distribution table
bin_distribution_table = bin_distribution.reset_index()
bin_distribution_table.columns = ['Earnings Range', 'Number of Drivers']

# Calculate the percentage of drivers in each bin
total_drivers = bin_distribution_table['Number of Drivers'].sum()
bin_distribution_table['Percentage of Drivers'] = (bin_distribution_table['Number of Drivers'] / total_drivers) * 100

# Display the bin distribution table
print(bin_distribution_table)

# Export the distribution table to a CSV file
bin_distribution_table.to_csv('./earnings_binned.csv', index=False)

Max total earnings: 190769.3770441479
           Earnings Range  Number of Drivers  Percentage of Drivers
0          [0.0, 25000.0)              29344                  32.35
1  [100000.0, 190769.377)               2366                   2.61
2      [25000.0, 50000.0)              27422                  30.23
3      [50000.0, 75000.0)              22114                  24.38
4     [75000.0, 100000.0)               9462                  10.43
5                     nan                  1                   0.00
