In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [1]:
# Now let's read the processed data first

import pandas as pd
from pathlib import Path

# Period of the processed rides file to load.
year, month = 2024, 1

# Resolves to ../data/processed/rides_<year>_<zero-padded month>.csv
path = Path("..", "data", "processed", f"rides_{year}_{month:02}.csv")

# NOTE(review): rides.info() reports pickup_location_id as `object`; consider
# passing an explicit dtype to read_csv so IDs are typed consistently — confirm
# the intended ID type (float vs. string) before changing it.
rides = pd.read_csv(path)

# Peek at a slice from the middle of the frame rather than the first rows.
rides.iloc[1000:1020]

  rides = pd.read_csv(path)


Unnamed: 0,pickup_datetime,pickup_location_id
1000,2024-01-07 20:16:25.596,5282.02
1001,2024-01-12 06:24:37.146,6569.08
1002,2024-01-30 05:46:37.664,6569.08
1003,2024-01-18 17:25:35.727,5659.11
1004,2024-01-27 12:33:14.606,7954.12
1005,2024-01-31 06:35:25.341,5659.11
1006,2024-01-12 09:04:43.075,8156.01
1007,2024-01-19 16:35:20.070,5659.11
1008,2024-01-16 18:32:57.673,5659.11
1009,2024-01-29 07:04:52.189,4339.01


In [2]:
# Summarise row count, column dtypes and memory usage of the loaded frame.
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1880623 entries, 0 to 1880622
Data columns (total 2 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   pickup_datetime     object
 1   pickup_location_id  object
dtypes: object(2)
memory usage: 28.7+ MB


In [3]:
# We want to predict how many rides a particular location will get one hour ahead, so sub-hour detail
# (e.g. the 45:29 in 07:45:29) is not needed. We floor every timestamp to the start of its hour rather than
# rounding to the nearest hour: 07:45:29 becomes 07:00:00 and 08:10:00 becomes 08:00:00.

# Parse once and reuse: values that cannot be parsed become NaT (errors="coerce").
pickup_ts = pd.to_datetime(rides["pickup_datetime"], errors='coerce')

rides["pickup_datetime"] = pickup_ts
# Truncate each timestamp down to the start of its hour.
rides["pickup_hour"] = pickup_ts.dt.floor('h')

rides.tail()

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
1880618,2024-01-29 07:40:32.831,4956.02,2024-01-29 07:00:00
1880619,2024-01-29 11:56:47.527,6209.05,2024-01-29 11:00:00
1880620,2024-01-12 16:51:37.231,4172.04,2024-01-12 16:00:00
1880621,2024-01-26 09:32:45.932,7692.11,2024-01-26 09:00:00
1880622,2024-01-29 17:29:55.879,6459.07,2024-01-29 17:00:00


In [4]:
# Now let's count how many rides each location has in each hour

# One row per (hour, location) pair that has at least one ride; `name=` labels
# the count column directly instead of renaming the default `0` afterwards.
agg_rides = (
    rides
    .groupby(["pickup_hour", "pickup_location_id"])
    .size()
    .reset_index(name="rides")
)
agg_rides.tail()

Unnamed: 0,pickup_hour,pickup_location_id,rides
778782,2024-01-31 23:00:00,8664.06,1
778783,2024-01-31 23:00:00,8715.01,1
778784,2024-01-31 23:00:00,8734.04,1
778785,2024-01-31 23:00:00,8748.02,1
778786,2024-01-31 23:00:00,8795.01,1


In [5]:
def fill_missing_rides_full_range(df, hour_col, location_col, rides_col):
    """
    Expand sparse hourly ride counts into a dense (hour x location) grid.

    Every hour between the earliest and latest value of `hour_col` (inclusive,
    hourly frequency) is paired with every distinct value of `location_col`.
    Pairs that are absent from `df` get a ride count of 0.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters:
    - df: DataFrame with columns [hour_col, location_col, rides_col]
    - hour_col: Name of the column containing hourly timestamps
      (anything `pd.to_datetime` can parse)
    - location_col: Name of the column containing location IDs
    - rides_col: Name of the column containing ride counts

    Returns:
    - DataFrame ordered by hour, then by first appearance of each location,
      with `rides_col` as integers and missing combinations filled with 0.
      An empty input yields an empty frame with the same columns.
    """
    # `assign` returns a new frame, so the caller's DataFrame keeps its
    # original hour column instead of being converted in place.
    normalized = df.assign(**{hour_col: pd.to_datetime(df[hour_col])})

    # Nothing to expand: min()/max() would be NaT and date_range would raise.
    if normalized.empty:
        normalized[rides_col] = normalized[rides_col].astype(int)
        return normalized.reset_index(drop=True)

    # Full range of hours (from min to max) with hourly frequency
    full_hours = pd.date_range(
        start=normalized[hour_col].min(),
        end=normalized[hour_col].max(),
        freq="h"
    )

    # Cartesian product built by pandas (cross join) rather than a Python list
    # of tuples; row order is hours outer, locations inner, as before.
    hour_grid = pd.DataFrame({hour_col: full_hours})
    location_grid = pd.DataFrame({location_col: normalized[location_col].unique()})
    full_combinations = hour_grid.merge(location_grid, how="cross")

    # Left-join the observed counts onto the full grid
    merged_df = pd.merge(
        full_combinations, normalized, on=[hour_col, location_col], how="left"
    )

    # Combinations with no observed rides come back as NaN -> 0
    merged_df[rides_col] = merged_df[rides_col].fillna(0).astype(int)

    return merged_df

In [6]:
# Column names handed to the gap-filling helper.
hour_col, location_col, rides_col = "pickup_hour", "pickup_location_id", "rides"

# Densify the counts, then order rows as one contiguous hourly series per location.
agg_data_filled = (
    fill_missing_rides_full_range(agg_rides, hour_col, location_col, rides_col)
    .sort_values(["pickup_location_id", "pickup_hour"])
    .reset_index(drop=True)
)

In [7]:
# Show the first rows of the densified, location-sorted series.
agg_data_filled.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,2733.03,0
1,2024-01-01 01:00:00,2733.03,0
2,2024-01-01 02:00:00,2733.03,0
3,2024-01-01 03:00:00,2733.03,0
4,2024-01-01 04:00:00,2733.03,0


In [8]:
from typing import Optional, List
import plotly.express as px
import pandas as pd

def plot_rides(
    rides: pd.DataFrame,
    locations: Optional[List] = None
):
    """
    Draw an interactive line chart of ride counts over time, one line per location.

    Parameters:
    - rides: DataFrame with columns `pickup_hour`, `pickup_location_id` and `rides`.
    - locations: Optional collection of location IDs to keep. The values must
      be of the same type as the `pickup_location_id` column (this notebook
      passes floats). `None` or an empty collection plots every location.

    Returns:
    - None. The figure is rendered with `fig.show()`.
    """
    # Use an explicit None/len check instead of truthiness: `if locations`
    # raises ValueError for multi-element NumPy arrays and pandas Series.
    has_filter = locations is not None and len(locations) > 0

    if has_filter:
        rides_to_plot = rides[rides.pickup_location_id.isin(locations)]
    else:
        rides_to_plot = rides

    fig = px.line(
        rides_to_plot,
        x="pickup_hour",
        y="rides",
        color="pickup_location_id",
        template="none"
    )

    fig.show()

In [9]:
# Compare the hourly series of two locations; IDs are passed as floats here.
plot_rides(agg_data_filled, locations=[8664.06,2733.03])

In [11]:
# Period used to name the output file.
year, month = 2024, 1

# Resolves to ../data/processed/ts_data_<year>_<zero-padded month>.csv
path = Path("..", "data", "processed", f"ts_data_{year}_{month:02}.csv")

# Persist the dense hourly series without writing the row index as a column.
agg_data_filled.to_csv(path, index=False)