基于chapter14,18，19-1，新增了是否为工作日特征

### 19.1.2 资金流入流出预测-周期因子模型预测-增加是否为工作日特征

In [1]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import datetime
import numpy as np
from chinese_calendar import is_workday, is_holiday

#### 1 数据加载

In [2]:
data = pd.read_csv("../data/purchase_redeem/user_balance_table.csv")
data

Unnamed: 0,user_id,report_date,tBalance,yBalance,total_purchase_amt,direct_purchase_amt,purchase_bal_amt,purchase_bank_amt,total_redeem_amt,consume_amt,transfer_amt,tftobal_amt,tftocard_amt,share_amt,category1,category2,category3,category4
0,1,20140805,20385,20383,2,0,0,0,0,0,0,0,0,2,,,,
1,1,20140808,20391,20389,2,0,0,0,0,0,0,0,0,2,,,,
2,1,20140811,20397,20395,2,0,0,0,0,0,0,0,0,2,,,,
3,1,20140814,20403,20401,2,0,0,0,0,0,0,0,0,2,,,,
4,1,20140817,20409,20407,2,0,0,0,0,0,0,0,0,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840416,28033,20140825,550646,550585,61,0,0,0,0,0,0,0,0,61,,,,
2840417,28033,20140831,525707,538147,60,0,0,0,12500,12500,0,0,0,60,0.0,0.0,0.0,12500.0
2840418,28033,20140724,20487121,20484824,2297,0,0,0,0,0,0,0,0,2297,,,,
2840419,28033,20140727,20462288,20491722,2298,0,0,0,31732,0,31732,0,31732,2298,,,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2840421 entries, 0 to 2840420
Data columns (total 18 columns):
user_id                int64
report_date            int64
tBalance               int64
yBalance               int64
total_purchase_amt     int64
direct_purchase_amt    int64
purchase_bal_amt       int64
purchase_bank_amt      int64
total_redeem_amt       int64
consume_amt            int64
transfer_amt           int64
tftobal_amt            int64
tftocard_amt           int64
share_amt              int64
category1              float64
category2              float64
category3              float64
category4              float64
dtypes: float64(4), int64(14)
memory usage: 390.1 MB


#### 2 给数据添加时间维度

In [4]:
def add_timestamp(data):
    # 时间格式转换
    data["report_date"] = pd.to_datetime(data["report_date"], format="%Y%m%d")
    # 添加时间维度
    data["day"] = data["report_date"].dt.day
    data["month"] = data["report_date"].dt.month
    data["year"] = data["report_date"].dt.year
    data["week"] = data["report_date"].dt.week
    data["weekday"] = data["report_date"].dt.weekday
    return data
data = add_timestamp(data)
data

Unnamed: 0,user_id,report_date,tBalance,yBalance,total_purchase_amt,direct_purchase_amt,purchase_bal_amt,purchase_bank_amt,total_redeem_amt,consume_amt,...,share_amt,category1,category2,category3,category4,day,month,year,week,weekday
0,1,2014-08-05,20385,20383,2,0,0,0,0,0,...,2,,,,,5,8,2014,32,1
1,1,2014-08-08,20391,20389,2,0,0,0,0,0,...,2,,,,,8,8,2014,32,4
2,1,2014-08-11,20397,20395,2,0,0,0,0,0,...,2,,,,,11,8,2014,33,0
3,1,2014-08-14,20403,20401,2,0,0,0,0,0,...,2,,,,,14,8,2014,33,3
4,1,2014-08-17,20409,20407,2,0,0,0,0,0,...,2,,,,,17,8,2014,33,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840416,28033,2014-08-25,550646,550585,61,0,0,0,0,0,...,61,,,,,25,8,2014,35,0
2840417,28033,2014-08-31,525707,538147,60,0,0,0,12500,12500,...,60,0.0,0.0,0.0,12500.0,31,8,2014,35,6
2840418,28033,2014-07-24,20487121,20484824,2297,0,0,0,0,0,...,2297,,,,,24,7,2014,30,3
2840419,28033,2014-07-27,20462288,20491722,2298,0,0,0,31732,0,...,2298,,,,,27,7,2014,30,6


In [5]:
data["weekday"].value_counts()

6    410701
5    408824
4    407509
3    405907
2    404159
1    402536
0    400785
Name: weekday, dtype: int64

#### 3 筛选部分数据

In [6]:
def get_total_balance(data, begin):
    df_temp = data.copy()
    # 按照data进行聚合
    df_temp = df_temp.groupby(["report_date"])["total_purchase_amt", "total_redeem_amt"].sum()
    #还原date字段，重新索引
    df_temp.reset_index(inplace=True)
    df_temp = df_temp[(df_temp["report_date"] >= begin)]
    return df_temp

In [7]:
#筛选从2014-04-01之后的数据，作为平稳数据
total_balance = get_total_balance(data, "2014-03-01")
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt
243,2014-03-01,362865580,211279011
244,2014-03-02,276202230,246199417
245,2014-03-03,505305862,513017360
246,2014-03-04,524146340,250562978
247,2014-03-05,454295491,209072753
...,...,...,...
422,2014-08-27,302194801,468164147
423,2014-08-28,245082751,297893861
424,2014-08-29,267554713,273756380
425,2014-08-30,199708772,196374134


#### 4 生成测试数据

In [8]:
def generate_test_data(data):
    total_balance = data.copy()
    start = datetime.datetime(2014,9,1)
    end = datetime.datetime(2014,10,1)
    testdata = []
    while start != end:
        # 添加三个字段: date, total_purchase_amt, total_redeem_amt
        temp = [start, np.nan, np.nan]
        testdata.append(temp)
        # 日期+1
        start += datetime.timedelta(days=1)
    # 封装testdata
    testdata = pd.DataFrame(testdata)
    testdata.columns = total_balance.columns
    # 将testdata合并到total_balance中
    total_balance = pd.concat([total_balance, testdata], axis=0)
    return total_balance

In [9]:
total_balance = generate_test_data(total_balance)
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt
243,2014-03-01,362865580.0,211279011.0
244,2014-03-02,276202230.0,246199417.0
245,2014-03-03,505305862.0,513017360.0
246,2014-03-04,524146340.0,250562978.0
247,2014-03-05,454295491.0,209072753.0
...,...,...,...
25,2014-09-26,,
26,2014-09-27,,
27,2014-09-28,,
28,2014-09-29,,


在原有的数据上增加了30天

#### 5 对新生成的数据添加时间维度

In [10]:
total_balance = add_timestamp(total_balance)
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt,day,month,year,week,weekday
243,2014-03-01,362865580.0,211279011.0,1,3,2014,9,5
244,2014-03-02,276202230.0,246199417.0,2,3,2014,9,6
245,2014-03-03,505305862.0,513017360.0,3,3,2014,10,0
246,2014-03-04,524146340.0,250562978.0,4,3,2014,10,1
247,2014-03-05,454295491.0,209072753.0,5,3,2014,10,2
...,...,...,...,...,...,...,...,...
25,2014-09-26,,,26,9,2014,39,4
26,2014-09-27,,,27,9,2014,39,5
27,2014-09-28,,,28,9,2014,39,6
28,2014-09-29,,,29,9,2014,40,0


#### new1 添加是否为假期的特征 

In [11]:
# total_balance["is_holiday"] = total_balance["report_date"].apply(lambda x: is_holiday(x))
# total_balance["is_holiday"] = total_balance["is_holiday"].replace({True:1, False:0})
# total_balance

In [12]:
total_balance["is_holiday"] = total_balance["report_date"].apply(lambda x: 1 if is_holiday(x) else 0)
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt,day,month,year,week,weekday,is_holiday
243,2014-03-01,362865580.0,211279011.0,1,3,2014,9,5,1
244,2014-03-02,276202230.0,246199417.0,2,3,2014,9,6,1
245,2014-03-03,505305862.0,513017360.0,3,3,2014,10,0,0
246,2014-03-04,524146340.0,250562978.0,4,3,2014,10,1,0
247,2014-03-05,454295491.0,209072753.0,5,3,2014,10,2,0
...,...,...,...,...,...,...,...,...,...
25,2014-09-26,,,26,9,2014,39,4,0
26,2014-09-27,,,27,9,2014,39,5,1
27,2014-09-28,,,28,9,2014,39,6,0
28,2014-09-29,,,29,9,2014,40,0,0


#### new2 检查异常情况

In [13]:
# 不是真的工作日=》 周日， 不是真的假日=》工作日
for index, (weekday, is_holiday) in enumerate(zip(total_balance["weekday"].values, total_balance["is_holiday"].values)):
    # 如果不是周六日，但是是hoilday，需要设置为周日
    if weekday not in (5, 6) and is_holiday == 1:
        total_balance["weekday"].values[index] = 6
        print("to hoilday: ",index)
    # 如果是周六日，但是不是holiday,需要设置为周一
    elif weekday in (5, 6) and is_holiday == 0:
        total_balance["weekday"].values[index] = 0
        print("to workday: ",index)

to hoilday:  37
to hoilday:  61
to hoilday:  62
to workday:  64
to hoilday:  93
to hoilday:  191
to workday:  211


#### 6 计算weekday权重

In [14]:
# 备份一下total_balance
total_balance_bak = total_balance.copy()

In [15]:
total_balance = total_balance_bak.copy()
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt,day,month,year,week,weekday,is_holiday
243,2014-03-01,362865580.0,211279011.0,1,3,2014,9,5,1
244,2014-03-02,276202230.0,246199417.0,2,3,2014,9,6,1
245,2014-03-03,505305862.0,513017360.0,3,3,2014,10,0,0
246,2014-03-04,524146340.0,250562978.0,4,3,2014,10,1,0
247,2014-03-05,454295491.0,209072753.0,5,3,2014,10,2,0
...,...,...,...,...,...,...,...,...,...
25,2014-09-26,,,26,9,2014,39,4,0
26,2014-09-27,,,27,9,2014,39,5,1
27,2014-09-28,,,28,9,2014,39,0,0
28,2014-09-29,,,29,9,2014,40,0,0


In [16]:
week_weight = total_balance[["weekday", "total_purchase_amt", "total_redeem_amt"]].groupby("weekday", as_index=False).mean()
week_weight

Unnamed: 0,weekday,total_purchase_amt,total_redeem_amt
0,0,338381300.0,362141900.0
1,1,334051800.0,322126600.0
2,2,321387700.0,318196000.0
3,3,319556800.0,287699400.0
4,4,254247400.0,273919300.0
5,5,195866600.0,184054400.0
6,6,189331200.0,195007000.0


In [17]:
week_weight = week_weight.rename(columns={"total_purchase_amt": "purchase_weekday", "total_redeem_amt":"redeem_weekday"})
week_weight

Unnamed: 0,weekday,purchase_weekday,redeem_weekday
0,0,338381300.0,362141900.0
1,1,334051800.0,322126600.0
2,2,321387700.0,318196000.0
3,3,319556800.0,287699400.0
4,4,254247400.0,273919300.0
5,5,195866600.0,184054400.0
6,6,189331200.0,195007000.0


In [18]:
week_weight["purchase_weekday"] /= np.mean(total_balance["total_purchase_amt"])
week_weight["redeem_weekday"] /= np.mean(total_balance["total_redeem_amt"])

In [19]:
week_weight

Unnamed: 0,weekday,purchase_weekday,redeem_weekday
0,0,1.225286,1.317867
1,1,1.209608,1.172248
2,2,1.163752,1.157944
3,3,1.157122,1.046964
4,4,0.920635,0.996817
5,5,0.709237,0.669791
6,6,0.685572,0.709648


In [20]:
total_balance = pd.merge(total_balance, week_weight, on="weekday", how="left")
total_balance

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt,day,month,year,week,weekday,is_holiday,purchase_weekday,redeem_weekday
0,2014-03-01,362865580.0,211279011.0,1,3,2014,9,5,1,0.709237,0.669791
1,2014-03-02,276202230.0,246199417.0,2,3,2014,9,6,1,0.685572,0.709648
2,2014-03-03,505305862.0,513017360.0,3,3,2014,10,0,0,1.225286,1.317867
3,2014-03-04,524146340.0,250562978.0,4,3,2014,10,1,0,1.209608,1.172248
4,2014-03-05,454295491.0,209072753.0,5,3,2014,10,2,0,1.163752,1.157944
...,...,...,...,...,...,...,...,...,...,...,...
209,2014-09-26,,,26,9,2014,39,4,0,0.920635,0.996817
210,2014-09-27,,,27,9,2014,39,5,1,0.709237,0.669791
211,2014-09-28,,,28,9,2014,39,0,0,1.225286,1.317867
212,2014-09-29,,,29,9,2014,40,0,0,1.225286,1.317867


#### 7 统计日期因子

In [21]:
# 统计周一到周日，在1-31号中出现的频次["weekday", "day"]出现的频次
total_balance[["report_date", "weekday", "day"]]

Unnamed: 0,report_date,weekday,day
0,2014-03-01,5,1
1,2014-03-02,6,2
2,2014-03-03,0,3
3,2014-03-04,1,4
4,2014-03-05,2,5
...,...,...,...
209,2014-09-26,4,26
210,2014-09-27,5,27
211,2014-09-28,0,28
212,2014-09-29,0,29


In [22]:
total_balance[["report_date", "day", "weekday"]].groupby(["day", "weekday"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,report_date
day,weekday,Unnamed: 2_level_1
1,0,1
1,1,2
1,4,1
1,5,1
1,6,2
...,...,...
30,6,1
31,0,1
31,3,1
31,5,1


In [23]:
weekday_count = total_balance[["report_date", "day", "weekday"]].groupby(["day", "weekday"], as_index=False).count()
weekday_count

Unnamed: 0,day,weekday,report_date
0,1,0,1
1,1,1,2
2,1,4,1
3,1,5,1
4,1,6,2
...,...,...,...
173,30,6,1
174,31,0,1
175,31,3,1
176,31,5,1


In [24]:
weekday_count = pd.merge(weekday_count, week_weight, on="weekday")
weekday_count

Unnamed: 0,day,weekday,report_date,purchase_weekday,redeem_weekday
0,1,0,1,1.225286,1.317867
1,3,0,1,1.225286,1.317867
2,4,0,2,1.225286,1.317867
3,5,0,1,1.225286,1.317867
4,7,0,1,1.225286,1.317867
...,...,...,...,...,...
173,26,3,1,1.157122,1.046964
174,27,3,1,1.157122,1.046964
175,28,3,1,1.157122,1.046964
176,29,3,1,1.157122,1.046964


In [25]:
np.unique(total_balance["month"])

array([3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [26]:
len(np.unique(total_balance["month"]))

7

In [27]:
# 日期计算 = 周期因子 * （周一到周日在1到31号中出现的频次） / 一共有几个月
weekday_count["purchase_weekday"] = weekday_count["purchase_weekday"] * weekday_count["report_date"] / len(np.unique(total_balance["month"]))
weekday_count

Unnamed: 0,day,weekday,report_date,purchase_weekday,redeem_weekday
0,1,0,1,0.175041,1.317867
1,3,0,1,0.175041,1.317867
2,4,0,2,0.350082,1.317867
3,5,0,1,0.175041,1.317867
4,7,0,1,0.175041,1.317867
...,...,...,...,...,...
173,26,3,1,0.165303,1.046964
174,27,3,1,0.165303,1.046964
175,28,3,1,0.165303,1.046964
176,29,3,1,0.165303,1.046964


In [28]:
weekday_count["redeem_weekday"] = weekday_count["redeem_weekday"] * weekday_count["report_date"] / len(np.unique(total_balance["month"]))
weekday_count

Unnamed: 0,day,weekday,report_date,purchase_weekday,redeem_weekday
0,1,0,1,0.175041,0.188267
1,3,0,1,0.175041,0.188267
2,4,0,2,0.350082,0.376533
3,5,0,1,0.175041,0.188267
4,7,0,1,0.175041,0.188267
...,...,...,...,...,...
173,26,3,1,0.165303,0.149566
174,27,3,1,0.165303,0.149566
175,28,3,1,0.165303,0.149566
176,29,3,1,0.165303,0.149566


In [29]:
# 得到日期因子率
day_rate = weekday_count.drop(["weekday", "report_date"], axis=1).groupby("day", as_index=False).sum()
day_rate

Unnamed: 0,day,purchase_weekday,redeem_weekday
0,1,0.94936,0.964038
1,2,0.900438,0.898125
2,3,1.043957,1.017347
3,4,1.117475,1.143789
4,5,1.013554,1.004489
5,6,0.933071,0.923294
6,7,0.935311,0.944097
7,8,0.939622,0.925338
8,9,1.01112,1.026037
9,10,1.043957,1.017347


#### 8 按照日期day计算均值

In [30]:
total_balance[["day", "total_purchase_amt", "total_redeem_amt"]]

Unnamed: 0,day,total_purchase_amt,total_redeem_amt
0,1,362865580.0,211279011.0
1,2,276202230.0,246199417.0
2,3,505305862.0,513017360.0
3,4,524146340.0,250562978.0
4,5,454295491.0,209072753.0
...,...,...,...
209,26,,
210,27,,
211,28,,
212,29,,


In [31]:
day_mean = total_balance[["day", "total_purchase_amt", "total_redeem_amt"]].groupby("day", as_index=False).mean()
day_mean

Unnamed: 0,day,total_purchase_amt,total_redeem_amt
0,1,325339100.0,234767600.0
1,2,248125500.0,218607400.0
2,3,299480800.0,296824700.0
3,4,315980200.0,292559800.0
4,5,328627200.0,255248400.0
5,6,299845300.0,246095800.0
6,7,283672500.0,237483700.0
7,8,291866400.0,247525500.0
8,9,285660300.0,244026900.0
9,10,344634700.0,259385400.0


#### 9 合并day_mean和day_rate

In [32]:
day_rate

Unnamed: 0,day,purchase_weekday,redeem_weekday
0,1,0.94936,0.964038
1,2,0.900438,0.898125
2,3,1.043957,1.017347
3,4,1.117475,1.143789
4,5,1.013554,1.004489
5,6,0.933071,0.923294
6,7,0.935311,0.944097
7,8,0.939622,0.925338
8,9,1.01112,1.026037
9,10,1.043957,1.017347


In [33]:
day_pred = pd.merge(day_mean, day_rate, on="day", how="left")
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,purchase_weekday,redeem_weekday
0,1,325339100.0,234767600.0,0.94936,0.964038
1,2,248125500.0,218607400.0,0.900438,0.898125
2,3,299480800.0,296824700.0,1.043957,1.017347
3,4,315980200.0,292559800.0,1.117475,1.143789
4,5,328627200.0,255248400.0,1.013554,1.004489
5,6,299845300.0,246095800.0,0.933071,0.923294
6,7,283672500.0,237483700.0,0.935311,0.944097
7,8,291866400.0,247525500.0,0.939622,0.925338
8,9,285660300.0,244026900.0,1.01112,1.026037
9,10,344634700.0,259385400.0,1.043957,1.017347


#### 10 生成去掉周期因子影响之后的纯净的日期因子

In [34]:
day_pred["total_purchase_amt"] /= day_pred["purchase_weekday"]
day_pred["total_redeem_amt"] /= day_pred["redeem_weekday"]

In [35]:
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,purchase_weekday,redeem_weekday
0,1,342693100.0,243525200.0,0.94936,0.964038
1,2,275560900.0,243404300.0,0.900438,0.898125
2,3,286870800.0,291763600.0,1.043957,1.017347
3,4,282762700.0,255781200.0,1.117475,1.143789
4,5,324232600.0,254107700.0,1.013554,1.004489
5,6,321353000.0,266541000.0,0.933071,0.923294
6,7,303292200.0,251545900.0,0.935311,0.944097
7,8,310621000.0,267497500.0,0.939622,0.925338
8,9,282518600.0,237834400.0,1.01112,1.026037
9,10,330123400.0,254962600.0,1.043957,1.017347


#### 11 添加预测日期的report_data和weekday字段

In [36]:
# 添加日期字段report_date
for index, row in day_pred.iterrows():
    if row["day"] == 31: # 9月没有31日
        break
    day_pred.loc[index, "report_date"] = pd.to_datetime("2014/09/" + str(int(row["day"])))
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,purchase_weekday,redeem_weekday,report_date
0,1,342693100.0,243525200.0,0.94936,0.964038,2014-09-01
1,2,275560900.0,243404300.0,0.900438,0.898125,2014-09-02
2,3,286870800.0,291763600.0,1.043957,1.017347,2014-09-03
3,4,282762700.0,255781200.0,1.117475,1.143789,2014-09-04
4,5,324232600.0,254107700.0,1.013554,1.004489,2014-09-05
5,6,321353000.0,266541000.0,0.933071,0.923294,2014-09-06
6,7,303292200.0,251545900.0,0.935311,0.944097,2014-09-07
7,8,310621000.0,267497500.0,0.939622,0.925338,2014-09-08
8,9,282518600.0,237834400.0,1.01112,1.026037,2014-09-09
9,10,330123400.0,254962600.0,1.043957,1.017347,2014-09-10


In [37]:
# 添加weekday字段
day_pred["weekday"] = day_pred["report_date"].dt.weekday
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,purchase_weekday,redeem_weekday,report_date,weekday
0,1,342693100.0,243525200.0,0.94936,0.964038,2014-09-01,0.0
1,2,275560900.0,243404300.0,0.900438,0.898125,2014-09-02,1.0
2,3,286870800.0,291763600.0,1.043957,1.017347,2014-09-03,2.0
3,4,282762700.0,255781200.0,1.117475,1.143789,2014-09-04,3.0
4,5,324232600.0,254107700.0,1.013554,1.004489,2014-09-05,4.0
5,6,321353000.0,266541000.0,0.933071,0.923294,2014-09-06,5.0
6,7,303292200.0,251545900.0,0.935311,0.944097,2014-09-07,6.0
7,8,310621000.0,267497500.0,0.939622,0.925338,2014-09-08,0.0
8,9,282518600.0,237834400.0,1.01112,1.026037,2014-09-09,1.0
9,10,330123400.0,254962600.0,1.043957,1.017347,2014-09-10,2.0


#### 12 合并周期因子

In [38]:
day_pred = day_pred[["day", "total_purchase_amt", "total_redeem_amt", "report_date", "weekday"]]
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,report_date,weekday
0,1,342693100.0,243525200.0,2014-09-01,0.0
1,2,275560900.0,243404300.0,2014-09-02,1.0
2,3,286870800.0,291763600.0,2014-09-03,2.0
3,4,282762700.0,255781200.0,2014-09-04,3.0
4,5,324232600.0,254107700.0,2014-09-05,4.0
5,6,321353000.0,266541000.0,2014-09-06,5.0
6,7,303292200.0,251545900.0,2014-09-07,6.0
7,8,310621000.0,267497500.0,2014-09-08,0.0
8,9,282518600.0,237834400.0,2014-09-09,1.0
9,10,330123400.0,254962600.0,2014-09-10,2.0


In [39]:
day_pred = pd.merge(day_pred, week_weight, on="weekday")
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,report_date,weekday,purchase_weekday,redeem_weekday
0,1,342693100.0,243525200.0,2014-09-01,0.0,1.225286,1.317867
1,8,310621000.0,267497500.0,2014-09-08,0.0,1.225286,1.317867
2,15,290950300.0,227429700.0,2014-09-15,0.0,1.225286,1.317867
3,22,239030200.0,258116900.0,2014-09-22,0.0,1.225286,1.317867
4,29,240290900.0,265885800.0,2014-09-29,0.0,1.225286,1.317867
5,2,275560900.0,243404300.0,2014-09-02,1.0,1.209608,1.172248
6,9,282518600.0,237834400.0,2014-09-09,1.0,1.209608,1.172248
7,16,310952300.0,304559200.0,2014-09-16,1.0,1.209608,1.172248
8,23,225786100.0,255907800.0,2014-09-23,1.0,1.209608,1.172248
9,30,236676900.0,286278300.0,2014-09-30,1.0,1.209608,1.172248


#### 13 利用周期因子，进行预测

In [40]:
# 利用周期因子，进行预测：base * 周期因子=预测结果
day_pred["total_purchase_amt"] *= day_pred["purchase_weekday"]
day_pred["total_redeem_amt"] *= day_pred["redeem_weekday"]
day_pred

Unnamed: 0,day,total_purchase_amt,total_redeem_amt,report_date,weekday,purchase_weekday,redeem_weekday
0,1,419897000.0,320933900.0,2014-09-01,0.0,1.225286,1.317867
1,8,380599400.0,352526100.0,2014-09-08,0.0,1.225286,1.317867
2,15,356497200.0,299722100.0,2014-09-15,0.0,1.225286,1.317867
3,22,292880300.0,340163800.0,2014-09-22,0.0,1.225286,1.317867
4,29,294425000.0,350402200.0,2014-09-29,0.0,1.225286,1.317867
5,2,333320800.0,285330200.0,2014-09-02,1.0,1.209608,1.172248
6,9,341736900.0,278800900.0,2014-09-09,1.0,1.209608,1.172248
7,16,376130500.0,357018800.0,2014-09-16,1.0,1.209608,1.172248
8,23,273112800.0,299987400.0,2014-09-23,1.0,1.209608,1.172248
9,30,286286400.0,335589100.0,2014-09-30,1.0,1.209608,1.172248


In [41]:
# 根据日期进行排序
day_pred = day_pred.sort_values("report_date")[["report_date", "total_purchase_amt", "total_redeem_amt"]]
day_pred

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt
0,2014-09-01,419897000.0,320933900.0
5,2014-09-02,333320800.0,285330200.0
10,2014-09-03,333846400.0,337846000.0
14,2014-09-04,327191000.0,267793700.0
18,2014-09-05,298499900.0,253299000.0
22,2014-09-06,227915500.0,178526600.0
26,2014-09-07,207928700.0,178509100.0
1,2014-09-08,380599400.0,352526100.0
6,2014-09-09,341736900.0,278800900.0
11,2014-09-10,384181700.0,295232500.0


In [42]:
# 将日期改为需要的格式
day_pred["report_date"] = day_pred["report_date"].apply(lambda x: str(x).replace("-", "")[:8])
day_pred

Unnamed: 0,report_date,total_purchase_amt,total_redeem_amt
0,20140901,419897000.0,320933900.0
5,20140902,333320800.0,285330200.0
10,20140903,333846400.0,337846000.0
14,20140904,327191000.0,267793700.0
18,20140905,298499900.0,253299000.0
22,20140906,227915500.0,178526600.0
26,20140907,207928700.0,178509100.0
1,20140908,380599400.0,352526100.0
6,20140909,341736900.0,278800900.0
11,20140910,384181700.0,295232500.0


+ 思路：
+ 先去掉周期因子的影响，算出日期因子；
+ 再根据每个月的1号到31号的每天分别是周几，结合周期因子，得出预测值

In [44]:
day_pred.to_csv("rule_base_add_chinese_hoilday.csv", index=False, header=None)
# finalScore:127.5475

#### 14 各模型分数对比

+ Prophet   finalScore:78.5184
+ ARIMA模型  finalScore:100.9975
+ rule_base finalScore:130.6812
+ rule_base add chinese holiday finalScore：127.5475

可以看出，周期因子模型最好，ARIMA其次，Prophet最差

 ![](分数.png)