In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
from deepar.ts_dataset import TSTrainDataset, TSTestDataset
from deepar.learner import DeepARLearner
import tensorflow as tf
import numpy as np
import os 
import time
# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# NOTE(review): these variables are set after `import tensorflow`; they need to
# be in place before CUDA is initialised -- confirm the ordering is sufficient
# in your environment.
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Turn on memory growth for every visible GPU so TensorFlow allocates GPU
# memory on demand.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Show the message, then re-raise so the cell fails visibly and
        # "Run All" stops here. The interactive `exit` helper is not meant for
        # program control flow, and inside a notebook kernel its argument is
        # not an exit status.
        print(e)
        raise


In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/train_new.csv")

In [4]:
# Parse the date column ('日期') into datetimes so it can be compared against
# date cut-offs when splitting train/test.
data['日期'] = pd.to_datetime(data['日期'])
# Replace missing values of the target column ('固网投诉率') with 0.
data['固网投诉率'] = data['固网投诉率'].fillna(0)

In [None]:
# Quick look at the summary statistics of the target column.
data['固网投诉率'].describe()

In [5]:
# NOTE: this is odd -- after casting the target to int the results became
# normal. The loss being called did not change, and the data processing should
# not contain any handling that depends on the target's dtype.
# NOTE(review): if the column holds floats, astype(int) discards the fractional
# part of every value -- confirm this loss of precision is acceptable.
data['固网投诉率'] = data['固网投诉率'].astype(int)

区分训练集，测试集

In [6]:
# Chronological split: rows dated up to and including 2022-07-31 are used for
# training, rows dated 2022-08-01 or later are held out for testing.
train_df = data.loc[data['日期'].le('2022-07-31')]
test_df = data.loc[data['日期'].ge('2022-08-01')]

新建TSTrainDataset对象，其中id作为groupby_col需加到feat_static_cats中，count_data=False代表依然是用Gaussian_loss_likelihood

In [7]:
# Build the training dataset from the training rows. The grouping column `id`
# is also passed as a static categorical feature, and count_data=False is used
# (per the notebook text, this keeps the Gaussian likelihood loss).
ds = TSTrainDataset(
    df=train_df,
    date_col='日期',
    target_col='固网投诉率',
    groupby_col='id',
    freq='D',
    feat_static_cats=['id', '地市名称'],
    feat_static_reals=['地市编码'],
    count_data=False,
)

In [8]:
# Construct a throwaway learner (no extra arguments, result not kept) only to
# print the architecture of its underlying model.
DeepARLearner(ds).model.summary()

number of cats : 3, number of conts: 5


2022-12-02 21:42:01.348287: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-02 21:42:02.123913: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14253 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:52:00.0, compute capability: 8.6


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(64, 20)]           0           []                               
                                                                                                  
 input_2 (InputLayer)           [(64, 20)]           0           []                               
                                                                                                  
 input_3 (InputLayer)           [(64, 20)]           0           []                               
                                                                                                  
 embedding (Embedding)          (64, 20, 128)        21504       ['input_1[0][0]']                
                                                                                              

DeepAR的模型，可以调整rnn/lstm/gru，以及层数，个数，这边随便定了一下

In [9]:
# The learner that is actually trained below: LSTM cells, 10 cells per layer,
# 2 layers. The notebook text notes these values were chosen arbitrarily.
learner = DeepARLearner(
    ds,
    cell_type='lstm',
    num_cells=10,
    num_layers=2,
)

number of cats : 3, number of conts: 5


In [10]:
# Train the model and report how long training took (wall-clock seconds).
train_start_time = time.time()
# NOTE(review): the positional argument 1 is presumably the number of epochs
# (the second returned value is named `epochs`) -- confirm against
# DeepARLearner.fit.
best_metric, epochs = learner.fit(1)
train_end_time = time.time()
print(f"training period took {train_end_time - train_start_time} seconds")

2022-12-02 21:42:04.113342: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


training period took 25.576476097106934 seconds


In [11]:
# Display the two values returned by learner.fit().
best_metric, epochs

(<tf.Tensor: shape=(), dtype=float32, numpy=4.5841002>, 1)

In [12]:
# Build the test dataset from the training dataset object plus the test rows
# with the target column removed, so the true values are not passed in.
val_ds = TSTestDataset(ds, test_df.drop(columns=['固网投诉率']))

In [14]:
# Run inference on the test dataset and report how long it took (wall-clock
# seconds).
inference_start_time = time.time()
# NOTE(review): argument semantics are defined by DeepARLearner.predict --
# presumably one sample, returned as a point estimate, for the out-of-sample
# horizon only; confirm against the library.
pred = learner.predict(val_ds, samples=1, point_estimate=True, return_in_sample_predictions=False)
inference_end_time = time.time()
print(f"inference period took {inference_end_time - inference_start_time} seconds")

batch 0
horizon idx :0
in-sample ancestral sampling
batch 1
horizon idx :1
learn from test samples
batch 2
horizon idx :2
learn from test samples
batch 3
horizon idx :3
learn from test samples
batch 4
horizon idx :4
learn from test samples
batch 5
horizon idx :5
learn from test samples
batch 6
horizon idx :6
learn from test samples
batch 7
horizon idx :7
learn from test samples
batch 8
horizon idx :8
learn from test samples
batch 9
horizon idx :9
learn from test samples
batch 10
horizon idx :10
learn from test samples
batch 11
horizon idx :11
learn from test samples
batch 12
horizon idx :12
learn from test samples
batch 13
horizon idx :13
learn from test samples
batch 14
horizon idx :14
learn from test samples
batch 15
horizon idx :15
learn from test samples
batch 16
horizon idx :16
learn from test samples
batch 17
horizon idx :17
learn from test samples
batch 18
horizon idx :18
learn from test samples
batch 19
horizon idx :19
learn from test samples
batch 20
horizon idx :20
learn from

In [15]:
# Inspect the prediction array's shape.
# NOTE(review): presumably (number of series, horizon length, samples) -- confirm.
pred.shape

(167, 23, 1)

将结果平铺开，即可放回test_df中作为预测值

In [17]:
# Sort the test rows by series id, then by date, before attaching predictions.
# NOTE(review): this assumes learner.predict() returns series in the same id
# order and time steps in date order -- confirm, otherwise predictions will be
# attached to the wrong rows.
pred_df = test_df.sort_values(by=['id', '日期'])

In [20]:
# Preview the predictions flattened into a 1-D array.
pred.ravel()

array([2.8610442, 2.8983798, 3.0003827, ..., 6.0593147, 6.093418 ,
       6.1269655], dtype=float32)

In [21]:
# Store the flattened predictions as a new column, one value per row.
# The assignment is positional, so it relies on the row order of pred_df
# matching the order of the flattened prediction array.
pred_df['pred'] = pred.ravel()

In [34]:
# Round predictions to the nearest whole number (the target was cast to int
# earlier in the notebook).
pred_df['pred'] = pred_df['pred'].round()

## 计算metrics
由于真实值中存在 0，MAPE 的分母会为 0，因此改用 SMAPE、abs_error、MSE 这些指标进行对比

In [66]:

import pandas as pd
import numpy as np
  
# Define the function to return the SMAPE value
def calculate_smape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE).

    Each element contributes ``|predicted - actual| / ((|predicted| + |actual|) / 2)``
    and the mean of these terms is rounded to two decimals. The result is a
    fraction between 0 and 2 (it is not multiplied by 100).

    Elements where both ``actual`` and ``predicted`` are zero count as a
    perfect prediction and contribute 0. Without this the term would be 0/0,
    and a single such pair would turn the whole metric into NaN.

    Args:
        actual: Array-like of true values (list, NumPy array, pandas Series, ...).
        predicted: Array-like of predicted values, same shape as ``actual``.

    Returns:
        The mean SMAPE term, rounded to 2 decimal places.
    """
    # Work on plain float arrays so pandas index alignment cannot interfere.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_diff = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2

    # Divide only where the denominator is non-zero. The remaining positions
    # (both values are 0) keep the pre-filled 0.
    terms = np.divide(abs_diff, scale, out=np.zeros_like(abs_diff), where=scale != 0)

    return round(np.mean(terms), 2)

In [47]:
# Display the test rows together with their predictions.
pred_df

Unnamed: 0,日期,固网投诉率,地市编码,地市名称,id,pred
122,2022-08-01,2,30.0,HB1873194620,30_HB1873194620,3.0
123,2022-08-02,3,30.0,HB1873194620,30_HB1873194620,3.0
124,2022-08-03,4,30.0,HB1873194620,30_HB1873194620,3.0
125,2022-08-04,2,30.0,HB1873194620,30_HB1873194620,3.0
126,2022-08-05,3,30.0,HB1873194620,30_HB1873194620,3.0
...,...,...,...,...,...,...
24209,2022-08-19,7,335.0,HB1828888888,335_HB1828888888,6.0
24210,2022-08-20,6,335.0,HB1828888888,335_HB1828888888,6.0
24211,2022-08-21,6,335.0,HB1828888888,335_HB1828888888,6.0
24212,2022-08-22,6,335.0,HB1828888888,335_HB1828888888,6.0


### MSE

In [43]:
# Mean squared error between the actual target and the predictions.
# Computed with NumPy directly: `mean_squared_error` is only imported from
# sklearn in a later cell, so calling it here raises NameError when the
# notebook is run top-to-bottom on a fresh kernel.
mse = np.mean((pred_df['固网投诉率'].to_numpy() - pred_df['pred'].to_numpy()) ** 2)
print(mse)

5.4162978391044


### abs_error

In [53]:
# Total absolute error: sum of |actual - predicted| over all test rows.
abs_error = (pred_df['固网投诉率'] - pred_df['pred']).abs().sum()
print(abs_error)

5612.0


### SMAPE

In [59]:
# SMAPE between the actual target and the predictions, using the helper
# defined earlier in the notebook (actual values first, predictions second).
smape = calculate_smape(pred_df['固网投诉率'], pred_df['pred'])
print(smape)

0.39


In [68]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
# If MAPE really must be reported, add 1 to both the actual and the predicted
# values before comparing, so that actual values of 0 do not end up in the
# denominator.
mape = mean_absolute_percentage_error(pred_df['固网投诉率']+1, pred_df['pred']+1)
print(mape)

0.38296084649638623
