In [None]:
import pandas as pd
import numpy as np

In [None]:
# Name of the monthly trip export to clean; the same name is reused for the cleaned output file.
data_file_name = "202406-divvy-tripdata.csv"

# Parse the CSV from the local data/ folder.
raw_file = pd.read_csv(filepath_or_buffer=f"data/{data_file_name}")

# Frame that the following cleaning cells read from, followed by a quick preview of its first rows.
base_dataframe = pd.DataFrame(data=raw_file)
base_dataframe.head(n=5)

In [None]:
# fraction (0-1) of blank cells in each column: blank count divided by the number of rows
base_dataframe.isna().sum() / len(base_dataframe)

In [None]:
# keep only the rides where every column is filled in (a row with any blank cell is removed)
cleaned_data = base_dataframe.dropna(axis=0, how="any")

In [None]:
# dropping rows with the same start and end times
def get_matching_rows_indexes(df, col1, col2):
    """
    Collect the index labels of every row whose col1 value equals its col2 value.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.
    col1 (str): Name of the first column to compare.
    col2 (str): Name of the second column to compare.

    Returns:
    list: Index labels of the rows where both columns hold the same value.
    """
    # element-wise comparison of the two columns, row by row
    same_value_mask = df[col1].eq(df[col2])
    return df.index[same_value_mask].tolist()

# index labels of the rides whose start and end timestamps are identical
rides_with_the_same_start_and_end_time = get_matching_rows_indexes(
    cleaned_data,
    "started_at",
    "ended_at",
)
cleaned_data = cleaned_data.drop(index=rides_with_the_same_start_and_end_time)


In [None]:
# getting the distance of bike rides from longitude and latitude coordinates
def deg2rad(deg):
    """
    Turn an angle expressed in degrees into radians.

    Parameters:
    deg (float): Angle in degrees.

    Returns:
    float: The same angle in radians.
    """
    radians_per_degree = np.pi / 180
    return deg * radians_per_degree

def get_distance_from_lat_lon_in_km(lat1, lon1, lat2, lon2, radius_km=6371):
    """
    Calculate the great-circle distance between two points specified by latitude and
    longitude using the Haversine formula.

    Works element-wise: the arguments may be scalars, NumPy arrays or pandas Series
    (a Series input yields a Series result).

    Parameters:
    lat1 (float): Latitude of the first point in degrees.
    lon1 (float): Longitude of the first point in degrees.
    lat2 (float): Latitude of the second point in degrees.
    lon2 (float): Longitude of the second point in degrees.
    radius_km (float, optional): Radius of the sphere in kilometers.
        Defaults to 6371, the radius of the Earth in km.

    Returns:
    float: Distance between the two points in kilometers.
    """
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    # haversine of the central angle between the two points
    a = (np.sin(half_dlat) ** 2 +
         np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2)

    # Floating-point rounding can push `a` a hair outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt(1 - a) return NaN; clamp it.
    # NaN inputs still propagate as NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))

    # central angle in radians
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c  # Distance in km

# straight-line (great-circle) distance between each ride's start and end coordinates
cleaned_data["ride_distance_km"] = get_distance_from_lat_lon_in_km(
    lat1=cleaned_data["start_lat"],
    lon1=cleaned_data["start_lng"],
    lat2=cleaned_data["end_lat"],
    lon2=cleaned_data["end_lng"],
)

In [None]:
def get_seconds(time_str):
    """
    Convert a time written as hh:mm:ss into a number of seconds.

    Parameters:
    time_str (string): time in hh:mm:ss format

    Returns:
    int: time in seconds; 0 when the string does not consist of exactly
         three integer fields separated by ":"
    """
    try:
        hours, minutes, seconds = (int(field) for field in time_str.split(":"))
    except ValueError:
        # non-numeric field or wrong number of fields
        return 0

    return hours * 3600 + minutes * 60 + seconds
    
    
def get_zero_distance_indexes(df, col):
    """
    Collect the index labels of every row whose value in col equals 0.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.
    col (str): Name of the column to test.

    Returns:
    list: Index labels of the rows where col is 0.
    """
    is_zero_mask = df[col].eq(0)
    return df.index[is_zero_mask].tolist()


# converting the hh:mm:ss format of time into seconds
# This must run before the zero-length filter below: "ride_length" holds hh:mm:ss
# strings (get_seconds splits each value on ":"), so comparing that raw column with
# the number 0 never matches and no ride would be removed.
cleaned_data["ride_length_seconds"] = cleaned_data["ride_length"].apply(get_seconds)

# cleaning data of zero-length rides
# get_seconds also returns 0 for values it cannot parse, so those rows are dropped here as well
rides_with_zero_time = get_zero_distance_indexes(cleaned_data, "ride_length_seconds")
cleaned_data = cleaned_data.drop(rides_with_zero_time)

# converting member_casual to 1/0
# category codes are assigned in sorted label order
# NOTE(review): assumes the only labels are "casual" and "member" (casual -> 0, member -> 1); confirm
cleaned_data["member_casual"] = cleaned_data["member_casual"].astype("category")
cleaned_data["member_casual_binary"] = cleaned_data["member_casual"].cat.codes

# exporting the cleaned data to csv format (the row index is not written)
cleaned_data.to_csv(f"cleaned_data/clean-{data_file_name}", index=False)