In [4]:
import pandas as pd
import numpy as np
import random

In [6]:
# Load the raw rideshare data.
# NOTE(review): this always reads the raw file, so the "already exists"
# branches below only fire if the raw CSV itself already has those columns.
df = pd.read_csv("data/rideshare_kaggle.csv")

# Seeded generator so the synthetic columns come out identical on every
# Restart & Run All; change the seed to draw a different random assignment.
rng = np.random.default_rng(42)

# Synthetic categorical columns: column name -> values sampled uniformly per row.
synthetic_columns = {
    'gender': ['Male', 'Female'],
    'vehicle_type': ['Sedan', 'SUV', 'Bike', 'Hatchback'],
}

# Add each synthetic column only when it is missing, so existing data is
# never overwritten with random values.
for column_name, choices in synthetic_columns.items():
    if column_name not in df.columns:
        df[column_name] = rng.choice(choices, size=len(df))
        print(f"{column_name} column has been added.")
    else:
        print(f"{column_name} column already exists.")

# Location name -> (latitude, longitude) lookup used for both trip endpoints.
location_coords = {
    'Back Bay': (42.3503, -71.0810),
    'Beacon Hill': (42.3588, -71.0707),
    'North End': (42.3656, -71.0542),
    'North Station': (42.3664, -71.0622),
    'South Station': (42.3522, -71.0555),
    'Theatre District': (42.3519, -71.0645),
    'West End': (42.3655, -71.0661),
    'Financial District': (42.3550, -71.0552)
}

# Every coordinate column this step is responsible for.
coord_columns = ['source_lat', 'source_lon', 'dest_lat', 'dest_lon']

# (Re)build the coordinates if ANY of the four columns is missing, not just
# 'source_lat', so a partially enhanced frame is still completed.
if not all(col in df.columns for col in coord_columns):
    # Split the lookup into two flat dicts so Series.map can use them directly;
    # names absent from the lookup become NaN.
    lat_by_name = {name: lat for name, (lat, _lon) in location_coords.items()}
    lon_by_name = {name: lon for name, (_lat, lon) in location_coords.items()}

    df['source_lat'] = df['source'].map(lat_by_name)
    df['source_lon'] = df['source'].map(lon_by_name)
    df['dest_lat'] = df['destination'].map(lat_by_name)
    df['dest_lon'] = df['destination'].map(lon_by_name)

    # Surface location names that have no coordinates: their rows carry NaN
    # coordinates and would be removed by any later dropna() without notice.
    seen_names = set(df['source'].dropna().unique()) | set(df['destination'].dropna().unique())
    unmapped = sorted(str(name) for name in seen_names - set(location_coords))
    if unmapped:
        print(f"warning: no coordinates for locations: {', '.join(unmapped)}")

    print("coordinate columns have been added.")
else:
    print("coordinate columns already exist.")

# Persist the enhanced frame for the cleaning step.
enhanced_path = "data/enhanced_gig_data.csv"
df.to_csv(enhanced_path, index=False)
print(f"enhanced dataset has been saved to: {enhanced_path}")


gender column has been added.
vehicle_type column has been added.
coordinate columns have been added.
enhanced dataset has been saved to: data/enhanced_gig_data.csv


In [7]:
# Read back the enhanced dataset written by the previous step.
df = pd.read_csv("data/enhanced_gig_data.csv")

# Columns retained in the cleaned dataset, grouped by theme.
columns = (
    ['datetime', 'hour', 'day', 'month']                      # time of the ride
    + ['source', 'destination']                               # trip endpoints
    + ['cab_type', 'price', 'distance', 'surge_multiplier']   # ride details
    + ['gender', 'vehicle_type']                              # synthetic attributes
    + ['source_lat', 'source_lon', 'dest_lat', 'dest_lon']    # endpoint coordinates
)

# One pipeline: keep the selected columns, discard rows with any missing
# value, then parse the datetime strings into real timestamps.
df_clean = (
    df[columns]
    .dropna()
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)

# Write the cleaned frame out without the index column.
df_clean.to_csv("data/cleaned_gig_data.csv", index=False)

print("cleaned dataset saved successfully.")


cleaned dataset saved successfully.
