In [2]:
import os
import datetime

import xgboost
import pandas as pd

In [3]:
# Input / output locations, relative to the notebook's working directory
model_output_dir = "../models/"  # trained models are written below this folder
data_path = "../data/yellow_tripdata_2016-01.csv"  # trip records that are loaded for training

In [4]:
# Model parameters
# Number of boosted trees per model; also embedded in the saved models' file names.
n_estimators = 1_000

In [5]:
# Read the complete trip-record CSV into memory as a single DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

### Preprocessing

In [6]:
# Preprocessing pipeline, expressed as one method chain:
#   1. parse the pickup timestamp strings into datetime objects
#   2. drop the columns that are not needed for training
#   3. use the pickup time as index - useful later on for time-based slicing
#   4. sort chronologically
data = (
    data
    .assign(
        tpep_pickup_datetime=lambda frame: pd.to_datetime(
            frame["tpep_pickup_datetime"], format="%Y-%m-%d %H:%M:%S"
        )
    )
    .drop(columns=["tpep_dropoff_datetime", "store_and_fwd_flag"])
    .set_index("tpep_pickup_datetime")
    .sort_index()
)


## Experiment 1 - Disjoint Training Data Sets

- Each model is trained on the data of a single day

In [6]:
# Create the output directory up front so that a missing folder cannot abort
# the run at save time, after a model has already been trained.
os.makedirs(model_output_dir, exist_ok=True)

# Group the data into segments of one calendar day each.
# `data` is indexed by pickup time, so the Grouper bins on the index.
daily_segments = data.groupby(pd.Grouper(freq="1D"))

# Train and persist one independent model per daily segment
for date, segment_data in daily_segments:

    # A daily bin that contains no trips cannot be used for fitting - skip it
    if segment_data.empty:
        continue

    # Split data into features and target.
    # The drop-off timestamp and the store_and_fwd_flag column were already
    # removed during preprocessing, so only the target column is dropped here.
    X = segment_data.drop(columns=["total_amount"])
    y = segment_data["total_amount"]

    # Define XGBoost Model
    x_model = xgboost.XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        max_depth=3,
        min_child_weight=3,
        gamma=0,
        subsample=0.8,
        reg_alpha=200,
        reg_lambda=200,
        colsample_bytree=0.8,
        n_jobs=-1
    )
    # Train XGBoost Model
    x_model.fit(X, y)

    # Save XGBoost Model; os.path.join works whether or not
    # `model_output_dir` ends with a path separator.
    model_save_file = os.path.join(
        model_output_dir,
        f"xgboost_{n_estimators}_trees_daily_{date.date()}.json"
    )
    x_model.save_model(model_save_file)
    

Data for 2016-01-01 00:00:00:
345037
Data for 2016-01-02 00:00:00:
312831
Data for 2016-01-03 00:00:00:
302878
Data for 2016-01-04 00:00:00:
316171
Data for 2016-01-05 00:00:00:
343251
Data for 2016-01-06 00:00:00:
348516
Data for 2016-01-07 00:00:00:
364894
Data for 2016-01-08 00:00:00:
392070
Data for 2016-01-09 00:00:00:
405825
Data for 2016-01-10 00:00:00:
351788
Data for 2016-01-11 00:00:00:
342651
Data for 2016-01-12 00:00:00:
367390
Data for 2016-01-13 00:00:00:
395090
Data for 2016-01-14 00:00:00:
396473
Data for 2016-01-15 00:00:00:
401289
Data for 2016-01-16 00:00:00:
411899
Data for 2016-01-17 00:00:00:
379156
Data for 2016-01-18 00:00:00:
341481
Data for 2016-01-19 00:00:00:
385187
Data for 2016-01-20 00:00:00:
382105
Data for 2016-01-21 00:00:00:
399654
Data for 2016-01-22 00:00:00:
420162
Data for 2016-01-23 00:00:00:
78133
Data for 2016-01-24 00:00:00:
159766
Data for 2016-01-25 00:00:00:
282087
Data for 2016-01-26 00:00:00:
327655
Data for 2016-01-27 00:00:00:
359180
Da

## Experiment 2 - Overlapping Training Data Sets

- Each model is trained on a 5-day sliding window
- A new model is trained for each start day, i.e. the window advances by one day per model
- Consecutive models therefore share 4 of their 5 days (4/5) of training data

In [8]:
# Sliding-window training: one model per start day, each trained on
# `window_days` consecutive calendar days.
#
# pandas Timestamps are used instead of `from datetime import datetime`, which
# would shadow the `datetime` module imported at the top of the notebook.
window_days = 5
first_start = pd.Timestamp("2016-01-01")
n_windows = 27  # start days 2016-01-01 .. 2016-01-27; the last window ends on 2016-01-31

# Create the output directory up front so that a missing folder cannot abort
# the run at save time, after a model has already been trained.
os.makedirs(model_output_dir, exist_ok=True)

for day_offset in range(n_windows):
    start_datetime = first_start + pd.Timedelta(days=day_offset)
    # Midnight of the LAST calendar day that belongs to the window; its date
    # is used in the file name.
    end_datetime = start_datetime + pd.Timedelta(days=window_days - 1)
    # Exclusive upper bound: midnight after the last day of the window.
    # Slicing `data[start_datetime:end_datetime]` is label-inclusive and would
    # stop at 00:00:00 of the last day, yielding only 4 days of data.
    window_stop = end_datetime + pd.Timedelta(days=1)

    in_window = (data.index >= start_datetime) & (data.index < window_stop)
    segment_data = data.loc[in_window]

    # A window that contains no trips cannot be used for fitting - skip it
    if segment_data.empty:
        continue

    # Split data into features and target
    X = segment_data.drop(columns=["total_amount"])
    y = segment_data["total_amount"]

    # Define XGBoost Model
    x_model = xgboost.XGBRegressor(
        learning_rate=0.1,
        n_estimators=n_estimators,
        max_depth=3,
        min_child_weight=3,
        gamma=0,
        subsample=0.8,
        reg_alpha=200,
        reg_lambda=200,
        colsample_bytree=0.8,
        n_jobs=-1
    )
    # Train XGBoost Model
    x_model.fit(X, y)

    # Save XGBoost Model; the file name carries the first and last day of the window
    model_save_file = os.path.join(
        model_output_dir,
        f"xgboost_{n_estimators}_trees_5days_{start_datetime.date()}_{end_datetime.date()}.json"
    )
    x_model.save_model(model_save_file)

## Experiment 3 - Incremental Training

- Each training step uses the data of a single day (1-day window)
- The initial model has 10 trees
- Each incremental model continues boosting from the previous day's model and adds 10 more trees

In [13]:
# Group the data into segments of one calendar day each.
# `data` is indexed by pickup time, so the Grouper bins on the index.
daily_segments = data.groupby(pd.Grouper(freq="1D"))

trees_per_round = 10
# Number of trees the model holds after the current round; used in the file name
total_trees = trees_per_round

params = {
    'max_depth': 3,
    'learning_rate': 0.1,
}

# Incremental models are stored in their own subfolder. Create it up front so
# that a missing folder cannot abort the run at save time.
incremental_output_dir = os.path.join(model_output_dir, "incremental")
os.makedirs(incremental_output_dir, exist_ok=True)

# No previous model exists for the first day, so training starts from scratch
model = None

# Iterate over daily segments in chronological order
for date, segment_data in daily_segments:

    # A daily bin that contains no trips cannot be used for boosting - skip it
    # without counting any trees for that day
    if segment_data.empty:
        continue

    # Split data into features and target.
    # The drop-off timestamp and the store_and_fwd_flag column were already
    # removed during preprocessing, so only the target column is dropped here.
    X = segment_data.drop(columns=["total_amount"])
    y = segment_data["total_amount"]

    # Train the model sequentially: continue from the previous day's model
    # (`xgb_model`) and boost for `trees_per_round` additional rounds
    model = xgboost.train(
        params,
        dtrain=xgboost.DMatrix(X, label=y),
        xgb_model=model,
        num_boost_round=trees_per_round,
    )

    # Save XGBoost Model
    model_save_file = os.path.join(
        incremental_output_dir,
        f"xgboost_{total_trees}_trees_incremental_{date.date()}.json"
    )
    model.save_model(model_save_file)

    total_trees += trees_per_round
    