In [70]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import joblib
import pickle
import xgboost as xgb
import os
from utils import *
from xgboost import XGBRFRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import numpy as np
import time
import pylab as plt
# Print numpy arrays with 3 decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [71]:
def train_model(random_state=None):
    """Tune, train and persist the daily-sales XGBoost regressor.

    Reads ``./data.csv`` (GBK-encoded), sums ``ordered_units`` per
    (``asin``, ``data_date``), keeps only rows with positive sales, builds
    features with ``create_feature`` and one-hot encodes them.
    Hyper-parameters are chosen by a 30-candidate randomized search scored
    with R^2 on 3 forward-chaining (time-ordered) folds.

    Parameters
    ----------
    random_state : int or None, default None
        Seed forwarded to the randomized search and to XGBoost.  ``None``
        keeps the previous (unseeded) behaviour.

    Returns
    -------
    None
        The fitted encoder and model are written to ``./encoder.pkl`` and
        ``./regressor.pkl`` with ``joblib``.
    """
    data = pd.read_csv('./data.csv', encoding='GBK')

    # Aggregate to one row per product per day.  The string 'sum' selects the
    # pandas implementation directly instead of passing the Python builtin.
    train = (data.groupby(['asin', 'data_date'])
                 .agg({'ordered_units': 'sum'})
                 .reset_index())

    # Keep strictly positive sales only and parse the dates.
    train = train.loc[train['ordered_units'] > 0].assign(
        data_date=lambda frame: pd.to_datetime(frame['data_date']))

    # TimeSeriesSplit cuts folds by row position, so rows must be in
    # chronological order.  The groupby above leaves them ordered by asin
    # first, which would let every fold train on "future" dates.
    # NOTE(review): assumes create_feature preserves row order - confirm.
    train = train.sort_values(['data_date', 'asin']).reset_index(drop=True)

    train_featured = create_feature(train)
    x_train = train_featured.drop('ordered_units', axis=1)
    y_train = train_featured['ordered_units']

    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword; fall back so older installations keep working.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    x_train = encoder.fit_transform(x_train)

    # Hyper-parameter tuning.
    # ``xgb.XGBRegressor`` is used because only the ``xgb`` module alias (and
    # XGBRFRegressor) is imported by name in this notebook.
    regressor = xgb.XGBRegressor(n_estimators=500,
                                 learning_rate=0.2,
                                 max_depth=5,
                                 gamma=0,
                                 min_child_weight=1,
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 random_state=random_state)

    params_dict = {'n_estimators': np.arange(200, 700, 50),
                   # 0.1 .. 0.9; a learning rate of 0 cannot learn anything,
                   # so it is excluded from the candidates.
                   'learning_rate': np.arange(1, 10) / 10,
                   'max_depth': np.arange(3, 10, 1),
                   'gamma': np.arange(0, 5, 1),
                   'min_child_weight': np.arange(0, 5, 1),
                   'subsample': np.arange(0.4, 1, 0.1),
                   'colsample_bytree': np.arange(0.4, 1, 0.1)}

    timeKF = TimeSeriesSplit(n_splits=3)
    rscv = RandomizedSearchCV(regressor,
                              param_distributions=params_dict,
                              n_iter=30,
                              cv=timeKF,
                              scoring='r2',
                              random_state=random_state)
    rscv.fit(x_train, y_train)

    # With the default refit=True the search has already refit the best
    # candidate on the full training set, so a second fit is unnecessary.
    model = rscv.best_estimator_

    # Save the encoder and the model side by side; predict() loads both.
    joblib.dump(encoder, os.path.join('.', 'encoder.pkl'))
    joblib.dump(model, os.path.join('.', 'regressor.pkl'))

In [3]:
# train_model()

In [78]:
def predict(asin, start, end):
    """Predict ``ordered_units`` for one ASIN on every day in a date range.

    Parameters
    ----------
    asin : str
        Product identifier placed in the ``asin`` column of every row.
    start, end
        Range bounds handed to ``pandas.date_range`` (one row per day,
        both ends included).

    Returns
    -------
    Whatever the persisted model's ``predict`` returns for the encoded
    rows - one value per day in the range.

    Notes
    -----
    Loads ``./encoder.pkl`` and ``./regressor.pkl`` with ``joblib`` on every
    call, so both files must already exist.
    """
    fitted_encoder = joblib.load(os.path.join('.', 'encoder.pkl'))
    fitted_regressor = joblib.load(os.path.join('.', 'regressor.pkl'))

    # One row per calendar day, with the ASIN broadcast to every row.
    horizon = pd.DataFrame({'data_date': pd.date_range(start, end),
                            'asin': asin})

    # Same feature pipeline as at training time, then encode and score.
    encoded_rows = fitted_encoder.transform(create_feature(horizon))
    return fitted_regressor.predict(encoded_rows)

In [79]:
# Forecast one ASIN for every calendar day from 2019-10-01 through 2021-10-01.
predict('B074NYJL9J', '2019/10/1', '2021/10/1')

  df['weekofyear'] = df.data_date.dt.weekofyear


array([   9.612,    9.612,    9.612,    8.073,    9.612,   13.577,
         10.318,   10.054,   10.054,   10.054,    8.516,   10.054,
         14.019,   -4.851, -149.586,   19.642,   16.115,   16.591,
         16.115,   20.079,   13.653,   13.389,   13.389,   13.389,
         11.851,   13.389,   17.354,   13.365,   13.101,   13.101,
         13.101,   13.924,   15.462,   19.427,   12.674,   12.41 ,
         12.41 ,   12.41 ,   10.872,   12.41 ,   16.375,   15.726,
         15.462,   15.462,   15.462,   15.077,   18.99 ,   19.427,
         27.634,   25.356,   25.356,   25.356,   23.818,   25.356,
         29.321,   38.583,   38.319,   38.319,  110.204,   36.781,
         91.434,   30.602,  375.003,   99.32 ,  103.11 ,   97.204,
         90.793,  130.788,  123.6  ,  341.831,  331.249,  330.925,
        381.758,  716.348,  418.855,  417.967,   77.886,  117.272,
         58.726,   44.394,   70.138,   61.491,   40.344,   44.399,
         27.71 ,   34.688,   51.588,   48.937,   43.759,   51.