In [1]:
import os
import datetime
import pickle

import sklearn
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [2]:
# Filesystem locations (relative paths, resolved against the working directory)
model_output_dir = "../models/"  # root folder under which trained models are pickled
data_path = "../data/yellow_tripdata_2016-01.csv"  # CSV file read by the load step

In [3]:
# Hyperparameters shared by every random forest trained in this notebook
n_estimators: int = 100  # number of trees per forest; also embedded in the saved file names

In [4]:
# Read the complete CSV located at `data_path` into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

### Preprocessing

In [5]:
# Preprocessing, expressed as a single method chain:
#   1. parse the pickup timestamps into datetime values,
#   2. discard the columns that are not used further on,
#   3. index the rows by pickup time and order them chronologically,
#      which the date-based grouping and slicing in the experiments rely on.
data = (
    data.assign(
        tpep_pickup_datetime=pd.to_datetime(
            data["tpep_pickup_datetime"], format="%Y-%m-%d %H:%M:%S"
        )
    )
    .drop(columns=["tpep_dropoff_datetime", "store_and_fwd_flag"])
    .set_index("tpep_pickup_datetime")
    .sort_index()
)


## Experiment 1 - Disjoint Training Data Sets

- Each model is trained on the data of a single day

In [6]:
# Experiment 1: train one random forest per calendar day and pickle each model.

# Make sure the target directory exists before the first model is written;
# open(..., "wb") does not create missing parent directories.
daily_model_dir = model_output_dir + "sklearn/daily/"
os.makedirs(daily_model_dir, exist_ok=True)

# Group the data into segments of one day each (the index holds the pickup datetime)
daily_segments = data.groupby(pd.Grouper(freq="1D"))

# Iterate over daily segments
for date, segment_data in daily_segments:
    # Time-based grouping can yield a bin for a day that has no rows;
    # fitting on an empty frame would raise, so such days are skipped.
    if segment_data.empty:
        continue

    # Split data into features and target.
    # Only the target column is removed here; the dropoff datetime and the
    # categorical flag were already dropped during preprocessing.
    X = segment_data.drop(columns=["trip_distance"])
    y = segment_data["trip_distance"]

    # Define SKLearn Model
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=3)

    # Train SKLearn Model
    model.fit(X, y)

    # Save SKLearn Model (one file per day, named after the day's date)
    model_save_file = (
        daily_model_dir
        + f"sklearn_{n_estimators}_trees_daily_{date.date()}.pkl"
    )
    with open(model_save_file, "wb") as f:
        pickle.dump(model, f)

## Experiment 2 - Overlapping Training Data Sets

- Each model is trained on a 5-day window
- A new model is trained for each day (the window slides forward by one day)
- So, from one model to the next, 4/5 of the training data is identical

In [7]:
# Experiment 2: train one random forest per sliding 5-day window and pickle each model.

# NOTE(review): this import rebinds the name `datetime` from the module (imported
# in the first cell) to the class. It is kept so that any cell relying on that
# binding keeps working, but it would be cleaner to settle on one form up top.
from datetime import datetime, timedelta

window_days = 5     # length of every training window, in whole days
days_in_month = 31  # January 2016, the month covered by the loaded file

# Make sure the target directory exists before the first model is written;
# open(..., "wb") does not create missing parent directories.
five_day_model_dir = model_output_dir + "sklearn/5days/"
os.makedirs(five_day_model_dir, exist_ok=True)

# One window per possible start day: 1..27 for a 5-day window in a 31-day month
for i in range(1, days_in_month - window_days + 2):
    start_datetime = datetime(2016, 1, i, 0, 0, 0)
    # Midnight at the start of the LAST day that belongs to the window (day i + 4)
    end_datetime = start_datetime + timedelta(days=window_days - 1)
    # Exclusive upper bound: midnight after the last day.
    # Slicing `data[start_datetime:end_datetime]` would stop at 00:00:00 of the
    # last day and therefore cover only four days instead of five.
    window_stop = end_datetime + timedelta(days=1)

    in_window = (data.index >= start_datetime) & (data.index < window_stop)
    segment_data = data[in_window]

    # A window without any rows cannot be fitted; skip it instead of raising.
    if segment_data.empty:
        continue

    # Split data into features and target
    X = segment_data.drop(columns=["trip_distance"])
    y = segment_data["trip_distance"]

    # Define SKLearn Model
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=3)

    # Train SKLearn Model
    model.fit(X, y)

    # Save SKLearn Model (file name carries the first and the last day of the window)
    model_save_file = (
        five_day_model_dir
        + f"sklearn_{n_estimators}_trees_{start_datetime.date()}_{end_datetime.date()}.pkl"
    )
    with open(model_save_file, "wb") as f:
        pickle.dump(model, f)