In [1]:
import pandas as pd

# Function to load and preprocess cab ride data
def load_and_preprocess_cab_data(filepath):
    """
    Load cab ride data from a CSV file and preprocess it.
    Args:
        filepath (str): The path to the cab ride data CSV file.
    Returns:
        DataFrame: Preprocessed cab ride data.
    """
    try:
        data_df = pd.read_csv(filepath)
        data_df = data_df[data_df['name'] != 'Taxi']
        data_df['date_time'] = pd.to_datetime(data_df['time_stamp'], unit='ms')
        data_df['car_type'] = data_df['name'].apply(determine_car)
        data_df['weekday'] = data_df['date_time'].dt.weekday.apply(lambda x: 1 if 0 <= x <= 4 else 0)
        data_df['rush_hour'] = data_df['date_time'].apply(is_rush_hour)
        return data_df
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None

# Function to determine car type
def determine_car(car):
    car_mapping = {
        "Black": "Luxury", "Lux Black": "Luxury", "Lux": "Luxury",
        "Black SUV": "Luxury SUV", "Lux Black XL": "Luxury SUV",
        "UberX": "Base", "Lyft": "Base",
        "UberXL": "Base XL", "Lyft XL": "Base XL",
        "UberPool": "Shared", "Shared": "Shared",
        "WAV": "Wheel Chair Accessible"
    }
    return car_mapping.get(car, "Other")

# Function to check if it's rush hour
def is_rush_hour(time_obj):
    morning_rush = time_obj.hour in range(7, 10)
    evening_rush = time_obj.hour in range(16, 19)
    return int(morning_rush or evening_rush)

# Function to load and preprocess weather data
def load_and_preprocess_weather_data(filepath):
    """
    Load weather data from a CSV file and preprocess it.
    Args:
        filepath (str): The path to the weather data CSV file.
    Returns:
        DataFrame: Preprocessed weather data.
    """
    try:
        weather_df = pd.read_csv(filepath)
        weather_df['date_time'] = pd.to_datetime(weather_df['time_stamp'], unit='s')
        weather_df['is_raining'] = weather_df['rain'].apply(lambda x: 1 if x > 0 else 0)
        weather_df['temp_groups'] = weather_df['temp'].apply(group_temp)
        return weather_df
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None

# Function to group temperature
def group_temp(temp):
    temp_ranges = [(30, 20), (40, 30), (50, 40), (float('inf'), 50)]
    for upper_bound, group in temp_ranges:
        if temp < upper_bound:
            return group

# Function to merge and clean datasets
def merge_and_clean_data(ride_data, weather_data):
    """
    Merge and clean ride and weather datasets.
    Args:
        ride_data (DataFrame): Preprocessed ride data.
        weather_data (DataFrame): Preprocessed weather data.
    Returns:
        DataFrame: Merged and cleaned dataset.
    """
    merged_df = pd.merge_asof(ride_data.sort_values('date_time'), 
                              weather_data.sort_values('date_time'), 
                              on='date_time', 
                              left_by='source', 
                              right_by='location',
                              direction='nearest')

    final_columns = [
        'date_time', 'distance', 'cab_type', 'source', 'destination',
        'car_type', 'weekday', 'rush_hour', 'is_raining', 'temp_groups',
        'surge_multiplier', 'price',
    ]

    return merged_df[final_columns]

# Function to get main data 
def get_cleaned_data():
    """
    Loads, merges, and cleans the main datasets.
    Returns:
        DataFrame: The cleaned data.
    """
    data_df = load_and_preprocess_cab_data("Data/cab_rides.csv")
    weather_df = load_and_preprocess_weather_data("Data/weather.csv")
    if data_df is not None and weather_df is not None:
        return merge_and_clean_data(data_df, weather_df)
    else:
        raise ValueError("Error in loading data.")
    
# Function to get base modeling data
def get_base_data():
    """
    Extract base data where surge_multiplier is 1.0.
    Returns:
        DataFrame: Filtered base data.
    """
    df = get_cleaned_data()
    base_df = df[df["surge_multiplier"] == 1.0]
    return base_df[["cab_type", "source", "destination", "car_type", "weekday", "rush_hour", "is_raining", "temp_groups", "surge_multiplier", "price"]]

# Function to get dynamic modeling data 
def get_dynamic_data():
    """
    Extract dynamic data.
    Returns:
        DataFrame: Filtered dynamic data.
    """
    df = get_cleaned_data()
    return df[["cab_type", "source", "destination", "car_type", "weekday", "rush_hour", "is_raining", "temp_groups", "surge_multiplier", "price"]]

# NOT YET FUNCTIONAL
# def get_demand_data():
#     """
#     Create dataset based off demand estimation calculation.
#     Returns:
#         DataFrame: Filtered Demand data.
#     """
#     df = get_cleaned_data()
#     demand_df = demand_estimation(df)
#     return demand_df[[set of columns to keep]]

# Main script execution
if __name__ == "__main__":
    data_df = load_and_preprocess_cab_data("Data/cab_rides.csv")
    weather_df = load_and_preprocess_weather_data("Data/weather.csv")

    if data_df is not None and weather_df is not None:
        merged_df = merge_and_clean_data(data_df, weather_df)
        # Save the cleaned data
        merged_df.to_csv("Data/base_cleaned.csv")
    

In [10]:
cleaned_df = get_cleaned_data()
print(cleaned_df.shape)
cleaned_df.head()

(637976, 12)


Unnamed: 0,date_time,distance,cab_type,source,destination,car_type,weekday,rush_hour,is_raining,temp_groups,surge_multiplier,price
0,2018-11-26 03:40:46.318,3.03,Lyft,Boston University,Theatre District,Luxury SUV,1,0,0,40,1.0,34.0
1,2018-11-26 03:40:46.319,1.3,Uber,South Station,Theatre District,Luxury,1,0,0,40,1.0,18.5
2,2018-11-26 03:40:46.320,2.43,Lyft,Northeastern University,Beacon Hill,Base,1,0,0,40,1.0,10.5
3,2018-11-26 03:40:46.320,2.71,Uber,Theatre District,Fenway,Base XL,1,0,0,40,1.0,32.0
4,2018-11-26 03:40:46.320,2.71,Uber,Theatre District,Fenway,Base,1,0,0,40,1.0,19.5


In [11]:
base_df = get_base_data()
print(base_df.shape)
base_df.head()

(617001, 10)


Unnamed: 0,cab_type,source,destination,car_type,weekday,rush_hour,is_raining,temp_groups,surge_multiplier,price
0,Lyft,Boston University,Theatre District,Luxury SUV,1,0,0,40,1.0,34.0
1,Uber,South Station,Theatre District,Luxury,1,0,0,40,1.0,18.5
2,Lyft,Northeastern University,Beacon Hill,Base,1,0,0,40,1.0,10.5
3,Uber,Theatre District,Fenway,Base XL,1,0,0,40,1.0,32.0
4,Uber,Theatre District,Fenway,Base,1,0,0,40,1.0,19.5


In [14]:
base_df[base_df['cab_type'] == 'Uber']['car_type'].unique()

array(['Luxury', 'Base XL', 'Base', 'Wheel Chair Accessible',
       'Luxury SUV', 'Shared'], dtype=object)

In [15]:
base_df[base_df['cab_type'] == 'Lyft']['car_type'].unique()

array(['Luxury SUV', 'Base', 'Shared', 'Luxury', 'Base XL'], dtype=object)

In [12]:
dynamic_df = get_dynamic_data()
print(dynamic_df.shape)
dynamic_df.head()

(637976, 10)


Unnamed: 0,cab_type,source,destination,car_type,weekday,rush_hour,is_raining,temp_groups,surge_multiplier,price
0,Lyft,Boston University,Theatre District,Luxury SUV,1,0,0,40,1.0,34.0
1,Uber,South Station,Theatre District,Luxury,1,0,0,40,1.0,18.5
2,Lyft,Northeastern University,Beacon Hill,Base,1,0,0,40,1.0,10.5
3,Uber,Theatre District,Fenway,Base XL,1,0,0,40,1.0,32.0
4,Uber,Theatre District,Fenway,Base,1,0,0,40,1.0,19.5


Reverse Eng

In [2]:
import pandas as pd

In [4]:
eta_df = pd.read_csv('Data/estimated_eta_added.csv')

surge_eta_df = eta_df[eta_df['surge_multiplier'] != 1.0]
no_surge_eta_df = eta_df[eta_df['surge_multiplier'] == 1.0]

### price = base_price * (1 + estimated_eta * estimated_demand)
### base_price = price / (1 + estimated_eta * estimated_demand)

def rev_base_price(row):
    return row['price'] / (1 + row['estimated_eta'] * row['estimated_demand'])

surge_eta_df['base_price'] = surge_eta_df.apply(rev_base_price, axis=1)
no_surge_eta_df['base_price'] = no_surge_eta_df.apply(rev_base_price, axis=1)

surge_eta_df['price_diff'] = surge_eta_df['price'] - surge_eta_df['base_price']
no_surge_eta_df['price_diff'] = no_surge_eta_df['price'] - no_surge_eta_df['base_price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surge_eta_df['base_price'] = surge_eta_df.apply(rev_base_price, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_surge_eta_df['base_price'] = no_surge_eta_df.apply(rev_base_price, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surge_eta_df['price_diff'] = surge_eta_df['price'] - su

In [5]:
surge_eta_df['price_diff'].describe()

count    20975.000000
mean         9.531648
std          9.358192
min          0.087162
25%          0.494316
50%          8.630501
75%         17.444087
max         58.279945
Name: price_diff, dtype: float64

In [6]:
no_surge_eta_df['price_diff'].describe()

count    617001.000000
mean          7.059071
std           5.531686
min           0.050287
25%           1.154406
50%           6.759392
75%          10.938054
max          54.178585
Name: price_diff, dtype: float64