In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from xgboost.sklearn import XGBRegressor
sns.set_style("darkgrid")

In [2]:
def generate_predict_start_dates(start_date, end_date, train_length, test_length, window_step):
    """Build the prediction start dates for a rolling train/test window.

    The first prediction date is ``start_date + train_length`` days, so the
    first training window begins at ``start_date``.  Further dates are added
    every ``window_step`` days for as long as the test window that follows a
    date (``date + test_length`` days) still ends on or before ``end_date``.

    Parameters
    ----------
    start_date, end_date : anything accepted by ``pd.to_datetime``
        Bounds of the available data range.
    train_length, test_length, window_step : number
        Lengths in days; fractional values are rounded up to whole days.

    Returns
    -------
    list of pandas.Timestamp
        Prediction start dates in increasing order.  The first date is always
        included, even if its test window would run past ``end_date``.

    Raises
    ------
    ValueError
        If ``window_step`` rounds to zero or a negative number of days, which
        would otherwise make the loop run forever.
    """
    # np.ceil is called directly rather than through the ``pandas.np`` alias,
    # which newer pandas releases no longer provide.
    train_time = pd.to_timedelta(np.ceil(train_length), unit="D")
    test_time = pd.to_timedelta(np.ceil(test_length), unit="D")
    window_time = pd.to_timedelta(np.ceil(window_step), unit="D")
    if window_time <= pd.Timedelta(0):
        raise ValueError("window_step must be a positive number of days")

    # Latest date whose test window still fits inside the data range.
    last_allowed = pd.to_datetime(end_date) - test_time

    current = pd.to_datetime(start_date) + train_time
    output_dates = [current]
    # Each date already sits train_time after the start of its training
    # window, so only the test window has to fit before end_date.  The check
    # is made on the candidate date itself before it is appended.
    while current + window_time <= last_allowed:
        current = current + window_time
        output_dates.append(current)
    return output_dates

In [3]:
def read_trimmed_data(city, veggie, data_dir):
    """Load the trimmed CSV for one city/veggie pair, ordered by date.

    The file read is ``data_dir + veggie + '_' + city + '_TRIM.csv'``; the
    pieces are concatenated as plain strings, so ``data_dir`` must already end
    with a path separator.

    Returns a DataFrame whose 'Date' column has been parsed with
    ``pd.to_datetime``, sorted ascending by 'Date', with a fresh 0..n-1 index.
    """
    csv_path = data_dir + veggie + '_' + city + '_TRIM.csv'
    frame = pd.read_csv(csv_path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame.sort_values(by='Date').reset_index(drop=True)

In [4]:
def nearest_date(dates, targdate):
    """Return the element of ``dates`` closest to ``targdate`` and its distance.

    Parameters
    ----------
    dates : iterable of date-like objects
        Candidates; each must support subtraction with ``targdate`` and
        ``abs()`` on the result (e.g. ``datetime`` or ``pandas.Timestamp``).
    targdate : date-like
        The date to search around.

    Returns
    -------
    (nearest, gap)
        ``nearest`` is the candidate with the smallest absolute difference to
        ``targdate`` (the first one wins on ties, as ``min`` does), and ``gap``
        is that absolute difference.

    Raises
    ------
    ValueError
        If ``dates`` is empty (raised by ``min``).
    """
    # The previous pre-loop only rebound its own loop variable and so never
    # converted anything; it has been dropped.
    nearest = min(dates, key=lambda candidate: abs(candidate - targdate))
    return nearest, abs(nearest - targdate)

In [5]:
def get_train_test(veggie_data, start_date, train_days, predict_days):
    """Split ``veggie_data`` into a training window and a test window.

    Parameters
    ----------
    veggie_data : pandas.DataFrame
        Must contain a 'Date' column parseable by ``pd.to_datetime``.
    start_date : pandas.Timestamp
        First day of the test window; training rows lie strictly before it.
    train_days, predict_days : number
        Nominal window lengths in days; fractional values are rounded up.

    Returns
    -------
    (training_set, test_set) : tuple of pandas.DataFrame
        ``training_set`` holds rows dated from the observed date closest to
        ``start_date - train_days`` up to, but excluding, ``start_date``.
        ``test_set`` holds rows dated from ``start_date`` through the observed
        date closest to ``start_date + predict_days``, inclusive.  Because the
        outer bounds snap to the nearest observed date, a window can be
        slightly longer or shorter than its nominal length.
    """
    all_dates = pd.to_datetime(veggie_data['Date'])
    # np.ceil is called directly rather than through the ``pandas.np`` alias,
    # which newer pandas releases no longer provide.
    train_time = pd.to_timedelta(np.ceil(train_days), unit="D")
    test_time = pd.to_timedelta(np.ceil(predict_days), unit="D")

    # Only the snapped dates are needed; the distances are discarded.
    nearest_date_train, _ = nearest_date(all_dates, start_date - train_time)
    nearest_date_test, _ = nearest_date(all_dates, start_date + test_time)

    # Filter on the parsed dates so the comparison uses the same values the
    # window bounds were snapped to.
    in_training = (all_dates >= nearest_date_train) & (all_dates < start_date)
    in_test = (all_dates >= start_date) & (all_dates <= nearest_date_test)
    return veggie_data[in_training], veggie_data[in_test]

In [6]:
def build_features(input_data):
    """Return the feature vector for one training window.

    The only feature is the mean of the 'Average Price' column of
    ``input_data``, wrapped in a one-element list.
    """
    mean_price = input_data['Average Price'].mean()
    return [mean_price]

In [7]:
def build_set(veggie_data, start_dates, train_days, predict_days):
    """Assemble model inputs and targets, one sample per prediction start date.

    For every date in ``start_dates`` the data is split with
    ``get_train_test``; the training part is turned into a feature list by
    ``build_features`` and the target is the mean 'Average Price' of the test
    part.

    Returns ``(training_features, predicting_set)``: a list of feature lists
    and a list of target means, both in the order of ``start_dates``.
    """
    feature_rows, targets = [], []
    for window_start in start_dates:
        train_part, test_part = get_train_test(veggie_data, window_start, train_days, predict_days)
        feature_rows.append(build_features(train_part))
        targets.append(test_part['Average Price'].mean())
    return feature_rows, targets

In [8]:
# City and produce tokens; they are used verbatim when building data file
# names and saved-model file names.
cities = ['NEW+YORK', 'LOS+ANGELES']
veggienames = [
    'APPLES', 'APRICOTS', 'ASPARAGUS', 'AVOCADOS', 'BANANAS',
    'BEANS', 'BEETS', 'BLACKBERRIES', 'BLUEBERRIES', 'BROCCOLI',
    'BRUSSELS+SPROUTS', 'CABBAGE', 'CANTALOUPS', 'CARROTS', 'CAULIFLOWER',
    'CELERY', 'CHERRIES', 'CLEMENTINES', 'CUCUMBERS', 'ENDIVE',
    'GARLIC', 'GINGER+ROOT', 'GRAPEFRUIT', 'GRAPES', 'HONEYDEWS',
    'KIWIFRUIT', 'LEMONS', 'LETTUCE%2C+ICEBERG', 'LETTUCE%2C+ROMAINE', 'LETTUCE%2C+RED+LEAF',
    'LETTUCE%2C+GREEN+LEAF', 'LIMES', 'MANGOES', 'NECTARINES', 'OKRA',
    'ORANGES', 'PEACHES', 'PEARS', 'PEAS+GREEN', 'PEPPERS%2C+BELL+TYPE',
    'PINEAPPLES', 'PLUMS', 'POTATOES', 'RADISHES', 'RASPBERRIES',
    'RHUBARB', 'SPINACH', 'SQUASH', 'STRAWBERRIES', 'TURNIPS',
]

# Input CSV directory and output directory for pickled models; both are
# joined to file names by plain string concatenation, hence the trailing '/'.
datadir = './trimmed_data_and_plots/'
modeldir = './saved_xgb_models/'

# Date range (YYYYMMDD) over which prediction start dates are generated.
train_start_date = '20070601'
train_end_date = '20170531'

# Window lengths in days: three-year training window, roughly three-month
# prediction window.
train_length = 3 * 365
predict_length = 90

# Number of days the train/test window is shifted between samples.
increment_test = 15

In [10]:
# Prediction start dates are computed once and shared by every city/veggie pair.
datelist = generate_predict_start_dates(train_start_date, train_end_date, train_length, predict_length, increment_test)

# Model paths are built as modeldir + file name, so the directory part of that
# prefix must exist before the first open(..., 'wb'); create it up front so a
# fresh checkout does not fail with FileNotFoundError.
model_parent = os.path.dirname(modeldir)
if model_parent:
    os.makedirs(model_parent, exist_ok=True)

# Train and pickle one independent XGBoost regressor per (city, veggie) pair.
for c in cities:
    for v in veggienames:
        data = read_trimmed_data(c, v, datadir)

        # One sample per prediction start date: features from the training
        # window, target from the test window.
        trainingX, predY = build_set(data, datelist, train_length, predict_length)

        model = XGBRegressor()

        model.fit(trainingX, predY, verbose=True)

        print('done training '+c+' '+v)
        # save the model
        with open(modeldir+c+'_'+v+'_model', 'wb') as fl:
            pickle.dump(model, fl)

done training NEW+YORK APPLES
done training NEW+YORK APRICOTS
done training NEW+YORK ASPARAGUS
done training NEW+YORK AVOCADOS
done training NEW+YORK BANANAS
done training NEW+YORK BEANS
done training NEW+YORK BEETS
done training NEW+YORK BLACKBERRIES
done training NEW+YORK BLUEBERRIES
done training NEW+YORK BROCCOLI
done training NEW+YORK BRUSSELS+SPROUTS
done training NEW+YORK CABBAGE
done training NEW+YORK CANTALOUPS
done training NEW+YORK CARROTS
done training NEW+YORK CAULIFLOWER
done training NEW+YORK CELERY
done training NEW+YORK CHERRIES
done training NEW+YORK CLEMENTINES
done training NEW+YORK CUCUMBERS
done training NEW+YORK ENDIVE
done training NEW+YORK GARLIC
done training NEW+YORK GINGER+ROOT
done training NEW+YORK GRAPEFRUIT
done training NEW+YORK GRAPES
done training NEW+YORK HONEYDEWS
done training NEW+YORK KIWIFRUIT
done training NEW+YORK LEMONS
done training NEW+YORK LETTUCE%2C+ICEBERG
done training NEW+YORK LETTUCE%2C+ROMAINE
done training NEW+YORK LETTUCE%2C+RED+LEA