In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_model import ARMA,ARIMA
import statsmodels.api as sm
from itertools import product
import calendar
from datetime import timedelta

In [21]:
#隐藏警告
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Load the raw price table. `date` is parsed while reading so that the
# province/city/date sort applied to this frame orders rows chronologically
# instead of by the text of the date strings.
raw_data = pd.read_csv('data/raw_data.csv', parse_dates=['date'])

In [23]:
# Order rows by province, then city, then date (rebinding rather than mutating in place)
raw_data = raw_data.sort_values(by=['province', 'city', 'date'])

In [24]:
# Convert the `date` column to pandas datetimes
raw_data = raw_data.assign(date=pd.to_datetime(raw_data['date']))

In [25]:
# Index the frame by date while keeping `date` available as a regular column
raw_data = raw_data.set_index('date', drop=False)

In [26]:
# Display the prepared frame as the cell output (bare last expression of the cell).
raw_data

Unnamed: 0_level_0,province,city,date,price,compare
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-01-31,一线城市,上海,2011-01-31,23058.0,2.52%↑
2011-02-28,一线城市,上海,2011-02-28,23421.0,1.57%↑
2011-03-31,一线城市,上海,2011-03-31,23548.0,0.54%↑
2011-04-30,一线城市,上海,2011-04-30,23570.0,0.09%↑
2011-05-31,一线城市,上海,2011-05-31,23679.0,0.46%↑
...,...,...,...,...,...
2019-12-31,黑龙江,齐齐哈尔,2019-12-31,5534.0,0.11%↑
2020-01-31,黑龙江,齐齐哈尔,2020-01-31,5546.0,0.22%↑
2020-02-29,黑龙江,齐齐哈尔,2020-02-29,5548.0,0.04%↑
2020-03-31,黑龙江,齐齐哈尔,2020-03-31,5596.0,0.87%↑


In [7]:
def arima_model(df, ps=range(0, 5), ds=range(1, 3), qs=range(0, 5)):
    """Grid-search ARIMA(p, d, q) orders on ``df['price']`` and return the best fit.

    Every combination of ``ps`` x ``ds`` x ``qs`` is fitted with SARIMAX (no
    seasonal order is passed; stationarity and invertibility are not enforced)
    and the fitted result with the lowest AIC is kept. On ties the first
    candidate in iteration order wins.

    Parameters
    ----------
    df : DataFrame or mapping
        Must provide a ``'price'`` entry holding the series to model.
    ps, ds, qs : iterable of int, optional
        Candidate AR orders, differencing orders and MA orders. The defaults
        are the grid this function has always searched: p and q in 0..4,
        d in 1..2.

    Returns
    -------
    The fitted results object of the candidate with the smallest AIC.

    Raises
    ------
    RuntimeError
        If no candidate could be fitted with a comparable (non-NaN) AIC.
    """
    best_model = None
    best_aic = float("inf")

    for order in product(ps, ds, qs):
        try:
            model = sm.tsa.statespace.SARIMAX(
                df['price'],
                order=order,
                enforce_stationarity=False,
                enforce_invertibility=False,
            ).fit(disp=False)  # disp=False: keep optimizer chatter out of the notebook
        except (ValueError, np.linalg.LinAlgError):
            # This order cannot be estimated on this series; report it and move on.
            print('参数错误:', order)
            continue

        # A NaN AIC never compares as smaller, so such fits are skipped here.
        if model.aic < best_aic:
            best_model = model
            best_aic = model.aic

    if best_model is None:
        # Previously this fell through to `return best_model` with the name
        # unbound and surfaced as an UnboundLocalError.
        raise RuntimeError('arima_model: no (p, d, q) candidate could be fitted')
    return best_model

In [9]:
# Build date_list: the month-end dates to forecast after the latest observation.
df_month = raw_data[['price']]
future_month = 3  # number of months to forecast ahead
# raw_data is ordered by province/city/date, so its final row is only the last
# date of the last city; take the maximum to get the latest date overall.
last_month = pd.to_datetime(df_month.index.max())
# MonthEnd(k) moves k month-ends forward. For a month-end `last_month` this
# yields the same dates as adding each following month's day count by hand.
date_list = [last_month + pd.offsets.MonthEnd(k)
             for k in range(1, future_month + 1)]
print('date_list=', date_list)

date_list= [Timestamp('2020-05-31 00:00:00'), Timestamp('2020-06-30 00:00:00'), Timestamp('2020-07-31 00:00:00')]


In [None]:
# Fit one model per city, extend that city's frame with the forecast dates and
# write every city to data/data.csv (first city creates the file, the rest append).
for index, city in enumerate(raw_data['city'].unique()):
    df = raw_data[raw_data['city'] == city]
    model = arima_model(df)

    # Future rows carry only a NaN price; concat leaves the other columns
    # missing for those rows and they are filled in below.
    future = pd.DataFrame({'price': np.nan}, index=date_list)
    df_month = pd.concat([df, future])

    # get_prediction returns an interval-capable object; keep only the mean.
    # `end` is an inclusive zero-based position, so the last row is len - 1.
    prediction = model.get_prediction(start=0, end=len(df_month) - 1)
    # Assign by position: row i gets step i. This does not depend on the
    # prediction carrying a date index that matches date_list.
    # NOTE(review): assumes each city's series ends right before date_list[0].
    df_month['forecast'] = np.asarray(prediction.predicted_mean)

    df_month['date'] = df_month.index
    # Reassign instead of chained fillna(inplace=True), which is not guaranteed
    # to write back into the frame.
    df_month['province'] = df_month['province'].fillna(df['province'].iloc[0])
    df_month['city'] = city

    is_first = index == 0
    df_month.to_csv('data/data.csv',
                    mode='w' if is_first else 'a',
                    header=is_first)
