<a href="https://colab.research.google.com/github/emilyberlinghoff/BikeShare/blob/main/flow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import pandas as pd
from collections import defaultdict

def add_running_inflow_outflow_flow(input_csv: str, output_csv: str) -> None:
    """
    Add running per-station outflow, inflow and net-flow columns to a trip CSV.

    Reads input_csv (which must contain the columns 'Start Time',
    'Start Station Id' and 'End Station Id'), sorts the trips by 'Start Time',
    and walks them in that order while keeping a running tally per station.

    For each trip:
      - the start station's outflow is incremented by 1
      - the end station's inflow is incremented by 1

    Four columns are then added, holding the tallies as of that trip
    (the trip itself already counted):
      1) 'Total outflow of start station'
      2) 'Total inflow of end station'
      3) 'Total flow of start station' = inflow - outflow for the start station
      4) 'Total flow of end station'   = inflow - outflow for the end station

    No datetime conversion is done here: 'Start Time' is compared in whatever
    form pd.read_csv produced, so the column must already sort chronologically
    in that form.

    Args:
        input_csv: Path of the trip CSV to read.
        output_csv: Path the augmented CSV is written to (without the index).

    Returns:
        None. The result is written to output_csv.
    """
    df = pd.read_csv(input_csv)

    # A stable sort keeps trips that share a Start Time in their original file
    # order, so the running tallies never depend on how the sort breaks ties.
    df = df.sort_values(by="Start Time", kind="stable")

    # Running counts of departures / arrivals seen so far, keyed by station id.
    station_outflow = defaultdict(int)
    station_inflow = defaultdict(int)

    # One entry per trip, in sorted order; attached to df as columns below.
    outflow_of_start = []
    inflow_of_end = []
    flow_of_start = []
    flow_of_end = []

    # Only the two id columns are needed, so iterate them directly instead of
    # using DataFrame.iterrows(), which constructs a full Series for every row.
    for start_id, end_id in zip(df["Start Station Id"], df["End Station Id"]):
        # Count this trip first so the recorded totals include it.
        station_outflow[start_id] += 1
        station_inflow[end_id] += 1

        outflow_of_start.append(station_outflow[start_id])
        inflow_of_end.append(station_inflow[end_id])

        # Net flow of a station is (arrivals - departures) so far.
        flow_of_start.append(station_inflow[start_id] - station_outflow[start_id])
        flow_of_end.append(station_inflow[end_id] - station_outflow[end_id])

    # List assignment is positional, so the values line up with the sorted rows.
    df["Total outflow of start station"] = outflow_of_start
    df["Total inflow of end station"] = inflow_of_end
    df["Total flow of start station"] = flow_of_start
    df["Total flow of end station"] = flow_of_end

    df.to_csv(output_csv, index=False)


if __name__ == "__main__":
    # Paths for one monthly ridership file and the flow file derived from it.
    input_csv = "Bike share ridership 2024-01.csv" # CHANGE THIS FOR EACH FILE
    output_csv = "Flow 2024-01.csv" # CHANGE THIS FOR EACH FILE
    add_running_inflow_outflow_flow(input_csv, output_csv)
    # Report the path that was actually written; the variable holding it here
    # is output_csv (no `output_file` is defined in this block).
    print(f"Saved updated file with per-row inflow/outflow/flow columns to {output_csv}")

Saved updated file with per-row inflow/outflow/flow columns to Flow 2024-01.csv


In [30]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_station_flow_by_week(csv_file: str, station_id: int) -> None:
    """
    Plot one station's running net flow, one PNG per week-like day range.

    Reads a CSV containing the columns:
      - Start Station Id, End Station Id
      - Total flow of start station, Total flow of end station
      - Start Time

    Keeps the trips in which station_id is the start or the end station and
    splits them by day of month into five chunks:
      1) Days 1-7
      2) Days 8-14
      3) Days 15-21
      4) Days 22-28
      5) Days 29-31

    Each chunk that has any data is saved to the current working directory as
    'Flow_<station_id>_Days_<start>_to_<end>.png'. Rows whose 'Start Time'
    cannot be parsed become NaT and therefore fall into no chunk.

    Args:
        csv_file: Path of the flow CSV to read.
        station_id: Station whose flow is plotted.

    Returns:
        None. Progress messages are printed and the plots are written to disk.
    """
    df = pd.read_csv(csv_file)

    # Parse the timestamps so they can be used as a time axis and split by day.
    df["Start Time"] = pd.to_datetime(df["Start Time"], errors="coerce", dayfirst=False)

    # Keep the trips where this station appears as start OR end.
    is_start = df["Start Station Id"] == station_id
    is_end = df["End Station Id"] == station_id
    df_station = df[is_start | is_end].copy()

    # Bail out before deriving any columns: there is nothing to compute or
    # plot, and this avoids running column logic on an empty frame.
    if df_station.empty:
        print(f"No data found for station {station_id}.")
        return

    # Pick, per row, the "Total flow" column that refers to this station:
    # the start-station value when the trip starts here, otherwise the
    # end-station value.
    starts_here = df_station["Start Station Id"] == station_id
    df_station["Station Flow"] = df_station["Total flow of start station"].where(
        starts_here, df_station["Total flow of end station"]
    )

    # Day of month drives the chunking; the chunks below only make sense when
    # the file covers a single calendar month.
    df_station["Day"] = df_station["Start Time"].dt.day

    # Stable sort: rows sharing a timestamp keep their order from the file.
    df_station = df_station.sort_values(by="Start Time", kind="stable")

    # Inclusive (first_day, last_day) ranges; the last one covers whatever
    # days past the 28th exist in the month.
    day_chunks = [
        (1, 7),
        (8, 14),
        (15, 21),
        (22, 28),
        (29, 31),
    ]

    for start_day, end_day in day_chunks:
        in_range = df_station["Day"].between(start_day, end_day)
        chunk_df = df_station[in_range]

        if chunk_df.empty:
            continue  # Nothing to plot for this day range

        output_filename = f"Flow_{station_id}_Days_{start_day}_to_{end_day}.png"

        plt.figure(figsize=(8, 4))
        try:
            plt.plot(
                chunk_df["Start Time"],
                chunk_df["Station Flow"],
                marker='o',
                markersize=2,
                linewidth=0.8,
            )
            plt.xticks(rotation=45, ha='right', fontsize=8)
            plt.yticks(fontsize=8)
            plt.xlabel("Time", fontsize=8)
            plt.ylabel("Total Flow (inflow - outflow)", fontsize=8)
            plt.title(
                f"Station {station_id} Flow: Days {start_day}-{end_day}",
                fontsize=10,
            )
            plt.tight_layout()
            plt.savefig(output_filename, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close()

        print(f"Saved plot: {output_filename}")

    print(f"All day-chunked plots for station {station_id} have been saved.")

# Example run: save the day-chunked flow plots for station 7269 from the
# January 2024 flow file.
plot_station_flow_by_week(csv_file="Flow 2024-01.csv", station_id=7269)

Saved plot: Flow_7269_Days_1_to_7.png
Saved plot: Flow_7269_Days_8_to_14.png
Saved plot: Flow_7269_Days_15_to_21.png
Saved plot: Flow_7269_Days_22_to_28.png
Saved plot: Flow_7269_Days_29_to_31.png
All day-chunked plots for station 7269 have been saved.
