In [None]:
"""Reads in a saved XGBoost model and predicts prices for the 3 months following the last date
contained in some input produce data."""

In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBRegressor  
import pickle
import datetime
import math
# Relative locations of the notebook's inputs and outputs.
model_dir = './saved_xgb_models/'        # pickled per-city/per-item models
data_dir = './trimmed_data_and_plots/'   # cleaned '<item>_<city>_TRIM.csv' files
predict_dir = './produce_predictions/'   # destination of the prediction CSVs

# Global seaborn look for any figure drawn in this notebook.
sns.set_style("darkgrid")

In [10]:
def read_stored_model(city, veggie, directory):
    """Load the pickled model saved for one city / produce item.

    Parameters
    ----------
    city, veggie : str
        Identifiers used to build the file name ``<city>_<veggie>_model``.
    directory : str
        Directory prefix; it is concatenated as-is, so it must end with a
        path separator.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    model_path = directory + city + '_' + veggie + '_model'
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point this at model files you created yourself.
    # The context manager guarantees the handle is closed even if
    # unpickling fails (the original left the file open).
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)

In [11]:
def read_trimmed_data(city, veggie, data_dir):
    """Read the cleaned CSV for one city / produce item.

    The file ``<data_dir><veggie>_<city>_TRIM.csv`` is loaded, its unnamed
    first column is renamed to 'Date' and parsed as datetimes, and the rows
    are returned sorted by date with a fresh 0..n-1 index.
    """
    csv_path = data_dir + veggie + '_' + city + '_TRIM.csv'
    frame = pd.read_csv(csv_path).rename(columns={'Unnamed: 0': 'Date'})
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame.sort_values(by='Date').reset_index(drop=True)

In [12]:
def normalize_price(input_data):
    """Rescale the 'Average Price' column in place and return the same frame.

    Prices are centred on their median (not the mean) and divided by their
    standard deviation. Ultimately not used by this notebook.
    """
    prices = input_data['Average Price']
    centre = prices.median()
    spread = prices.std()
    input_data['Average Price'] = (prices - centre) / spread
    return input_data

In [13]:
def nearest_date(dates, targdate):
    """Find the entry of ``dates`` closest to ``targdate``.

    Parameters
    ----------
    dates : pandas.Series of datetimes
    targdate : datetime-like
        The date to match.

    Returns
    -------
    (nearest, timedelta)
        The closest date in ``dates`` and its absolute distance from
        ``targdate``. On ties the first occurrence wins.
    """
    # Work on a positionally indexed copy of the distances so the result of
    # idxmin() is a *position*. The original passed the uncalled method
    # `diff.idxmin` to .iloc and relied on the index being exactly 0..n-1.
    diff = (dates - targdate).abs().reset_index(drop=True)
    nearest = dates.iloc[diff.idxmin()]
    timedelta = abs(nearest - targdate)
    return nearest, timedelta

In [14]:
def get_train(veggie_data, train_length):
    """Select the trailing window of rows used to build prediction features.

    Parameters
    ----------
    veggie_data : pandas.DataFrame
        Must contain a 'Date' column; the last row is taken as the most
        recent date, so the frame is expected to be sorted by date.
    train_length : number
        Length of the window in days (rounded up to whole days).

    Returns
    -------
    (training_set, start_date)
        ``training_set`` holds the rows from the date nearest to
        ``start_date - train_length`` up to, but not including,
        ``start_date``; ``start_date`` is the last date in ``veggie_data``.
    """
    all_dates = pd.to_datetime(veggie_data['Date'])
    start_date = all_dates.iloc[-1]
    # Use the train_length argument: the original read the notebook-level
    # global `train_days` here and silently ignored its own parameter.
    train_time = pd.to_timedelta(np.ceil(train_length), unit="D")
    # Snap the window start onto a date that actually exists in the data.
    nearest_date_train, _ = nearest_date(all_dates, start_date - train_time)
    in_window = (all_dates >= nearest_date_train) & (all_dates < start_date)
    training_set = veggie_data[in_window]
    return training_set, start_date

In [15]:
def build_features(input_data, start_date, test_days, train_days):
    """Build the flat feature list fed to the saved model.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Needs the columns 'Date', 'Average Price', the ticker columns
        'SYY', 'FDP', 'SENEA', 'CAG', 'KR' and the oil column 'POILWTIUSDM'.
    start_date : datetime-like
        Reference date; every window is measured back from it.
    test_days : number
        Length in days of each "same period in a previous year" window.
    train_days : number
        Training span in days; ``train_days // 365`` previous years are used.

    Returns
    -------
    list
        15 values per previous year followed by 15 values per recent window
        (6 windows), in a fixed order that must match the layout the model
        was trained with.
    """
    tickers = ('SYY', 'FDP', 'SENEA', 'CAG', 'KR')
    oil = 'POILWTIUSDM'

    test_time = pd.to_timedelta(np.ceil(test_days), unit="D")
    one_year = pd.Timedelta(days=365)
    # NOTE(review): the first two windows are both 14 days (the original
    # named them `ten` and `twenty` but built both from 14), so that feature
    # group appears twice. Kept as-is because changing it would alter the
    # feature layout the saved models expect - confirm against training code.
    recent_windows = [pd.Timedelta(days=n) for n in (14, 14, 30, 60, 90, 365)]

    dates = input_data['Date']
    # Row-to-row percentage change of the price; exact zeros are masked out
    # so flat stretches do not enter the mean/std below.
    pct_changes = pd.concat([dates, input_data['Average Price'].pct_change()], axis=1)
    pct_changes['Average Price'] = pct_changes['Average Price'].replace(0, np.nan)

    features = []

    # Same calendar window in each previous year.
    for y in range(int(train_days // 365)):
        window_start = start_date - (y + 1) * one_year
        in_window = (dates >= window_start) & (dates <= window_start + test_time)
        historical_prices = input_data[in_window]
        historical_pct = pct_changes[in_window]

        features.append(historical_prices['Average Price'].mean())
        features.append(historical_prices['Average Price'].std())
        # external data: stock tickers, then oil
        features.extend(historical_prices[col].mean() for col in tickers + (oil,))
        features.append(historical_pct['Average Price'].mean())
        features.append(historical_pct['Average Price'].std())
        # ticker changes are computed inside the window (oil is not included)
        features.extend(historical_prices[col].pct_change().mean() for col in tickers)

    # Trailing windows ending at start_date.
    for d in recent_windows:
        in_window = (dates >= (start_date - d)) & (dates <= start_date)
        recent_prices = input_data[in_window]
        recent_pct = pct_changes[in_window]

        features.append(recent_prices['Average Price'].mean())
        features.append(recent_prices['Average Price'].std())
        features.append(recent_prices['Date'].dt.month.mean())
        # just tickers because of oil sparsity
        features.extend(recent_prices[col].mean() for col in tickers)
        features.append(recent_pct['Average Price'].mean())
        features.append(recent_pct['Average Price'].std())
        features.extend(recent_prices[col].pct_change().mean() for col in tickers)

    return features

In [16]:
def build_corr_features(input_data, start_date, test_days, train_days):
    """Build a reduced, price-only feature list.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Needs the columns 'Date' and 'Average Price'.
    start_date : datetime-like
        Reference date; every window is measured back from it.
    test_days : number
        Length in days of each "same period in a previous year" window.
    train_days : number
        Training span in days; ``train_days // 365`` previous years are used.

    Returns
    -------
    list
        One mean price per previous year, followed by the mean price over
        the trailing 14 and 30 days.
    """
    test_time = pd.to_timedelta(np.ceil(test_days), unit="D")
    one_year = pd.Timedelta(days=365)
    recent_windows = [pd.Timedelta(days=14), pd.Timedelta(days=30)]

    dates = input_data['Date']
    prices = input_data['Average Price']
    features = []

    # Mean price over the same calendar window in each previous year.
    for y in range(int(train_days // 365)):
        window_start = start_date - (y + 1) * one_year
        in_window = (dates >= window_start) & (dates <= window_start + test_time)
        features.append(prices[in_window].mean())

    # Mean price over the trailing 14 and 30 days.
    for d in recent_windows:
        in_window = (dates >= (start_date - d)) & (dates <= start_date)
        features.append(prices[in_window].mean())

    return features

In [17]:
def current_price(input_data):
    """Return the mean 'Average Price' of the last 7 rows (fewer if the frame is shorter)."""
    latest_prices = input_data['Average Price'].iloc[-7:]
    return latest_prices.mean()

In [248]:
# Cities and produce items to predict for; the strings are used verbatim
# when building the data and model file names.
cities = ['NEW+YORK', 'LOS+ANGELES']
veggies = [
    'APPLES',
    'APRICOTS',
    'ASPARAGUS',
    'AVOCADOS',
    'BANANAS',
    'BEANS',
    'BEETS',
    'BLACKBERRIES',
    'BLUEBERRIES',
    'BROCCOLI',
    'CABBAGE',
    'CANTALOUPS',
    'CARROTS',
    'CAULIFLOWER',
    'CELERY',
    'CHERRIES',
    'CLEMENTINES',
    'CUCUMBERS',
    'ENDIVE',
    'GARLIC',
    'GINGER+ROOT',
    'GRAPEFRUIT',
    'GRAPES',
    'HONEYDEWS',
    'KIWIFRUIT',
    'LEMONS',
    'LETTUCE%2C+ICEBERG',
    'LETTUCE%2C+ROMAINE',
    'LETTUCE%2C+RED+LEAF',
    'LETTUCE%2C+GREEN+LEAF',
    'LIMES',
    'NECTARINES',
    'OKRA',
    'ORANGES',
    'PEACHES',
    'PEARS',
    'PEAS+GREEN',
    'PEPPERS%2C+BELL+TYPE',
    'PINEAPPLES',
    'PLUMS',
    'POTATOES',
    'RADISHES',
    'RASPBERRIES',
    'RHUBARB',
    'SPINACH',
    'SQUASH',
    'STRAWBERRIES',
]

# History window handed to the feature builders (three years, in days).
train_days = 3 * 365

# Prediction horizon in days.
test_days = 90

# Directory that holds the per-city '<city>_correlations.csv' files.
correlation_dir = './'

In [252]:
labels = ['Item', 'Current Price', '3 Month Prediction']

# Write one predictions CSV per city, with one row per produce item.
for c in cities:

    correlation_list = pd.read_csv(correlation_dir+c+'_correlations.csv')

    outputs = []
    # loop over veggies and make a prediction for each
    for v in veggies:

        input_data = read_trimmed_data(c, v, data_dir)
        # NOTE(review): vcorrs is looked up but no longer consumed. The call
        # below used to pass (vcorrs, c, data_dir) as extra arguments, which
        # build_features as defined in this notebook does not accept
        # (TypeError on Restart & Run All). If the saved models were trained
        # with correlation-based features (see build_corr_features), that
        # logic has to be restored before these predictions can be trusted.
        vcorrs = correlation_list[v+'_norm']
        historical_data, sdate = get_train(input_data, train_days)

        features = build_features(historical_data, sdate, test_days, train_days)
        # predict() expects a 2-D (n_samples, n_features) input; present the
        # feature list as a single row.
        inputX = np.asarray(features, dtype=float).reshape(1, -1)

        currentprice = current_price(historical_data)

        model = read_stored_model(c, v, model_dir)

        outputY = model.predict(inputX)
        print(v)

        outputs.append((v, currentprice, outputY[0]))

    all_predictions = pd.DataFrame.from_records(outputs, columns=labels)
    all_predictions.to_csv(predict_dir + c + '_predictions.csv')

    # end city loop


APPLES
APRICOTS
ASPARAGUS
AVOCADOS
BANANAS
BEANS
BEETS
BLACKBERRIES
BLUEBERRIES
BROCCOLI
BRUSSELS+SPROUTS
CABBAGE
CANTALOUPS
CARROTS
CAULIFLOWER
CELERY
CHERRIES
CLEMENTINES
CUCUMBERS
ENDIVE
GARLIC
GINGER+ROOT
GRAPEFRUIT
GRAPES
HONEYDEWS
KIWIFRUIT
LEMONS
LETTUCE%2C+ICEBERG
LETTUCE%2C+ROMAINE
LETTUCE%2C+RED+LEAF
LETTUCE%2C+GREEN+LEAF
LIMES
MANGOES
NECTARINES
OKRA
ORANGES
PEACHES
PEARS
PEAS+GREEN
PEPPERS%2C+BELL+TYPE
PINEAPPLES
PLUMS
POTATOES
RADISHES
RASPBERRIES
RHUBARB
SPINACH
SQUASH
STRAWBERRIES
TURNIPS
APPLES
APRICOTS
ASPARAGUS
AVOCADOS
BANANAS
BEANS
BEETS
BLACKBERRIES
BLUEBERRIES
BROCCOLI
BRUSSELS+SPROUTS
CABBAGE
CANTALOUPS
CARROTS
CAULIFLOWER
CELERY
CHERRIES
CLEMENTINES
CUCUMBERS
ENDIVE
GARLIC
GINGER+ROOT
GRAPEFRUIT
GRAPES
HONEYDEWS
KIWIFRUIT
LEMONS
LETTUCE%2C+ICEBERG
LETTUCE%2C+ROMAINE
LETTUCE%2C+RED+LEAF
LETTUCE%2C+GREEN+LEAF
LIMES
MANGOES
NECTARINES
OKRA
ORANGES
PEACHES
PEARS
PEAS+GREEN
PEPPERS%2C+BELL+TYPE
PINEAPPLES
PLUMS
POTATOES
RADISHES
RASPBERRIES
RHUBARB
SPINACH
SQUASH
STR