In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import LSTM, Dense,Dropout
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


## 导入原始数据

In [2]:
# Daily sales log; the `date` column is parsed into datetimes, reading the day before the month.
sales = pd.read_csv(
    '../data/sales_train.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    dayfirst=True,
)

# Rows (ID, shop_id, item_id) for which predictions are produced.
test = pd.read_csv('../data/test.csv')

In [3]:
# Preview the first five rows of the raw sales records.
sales.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


## 数据预处理


#### 处理异常值
 - 去除商品价格大于等于100000的商品记录
 - 去除商品日销量大于1000的商品记录
 - 用商店32、商品2973在date_block_num为4时的正价格中位数，替换商品价格小于等于0的商品记录

In [4]:
# Drop outlier rows: keep prices below 100000 and daily counts of at most 1000.
# .copy() detaches the filtered frame from the original, so the .loc assignment
# below unambiguously writes into `sales` and does not raise pandas'
# SettingWithCopyWarning.
sales = sales[(sales.item_price < 100000) & (sales.item_cnt_day < 1001)].copy()

# Reference rows: shop 32, item 2973, month block 4, with a positive price.
is_reference = (
    (sales.shop_id == 32)
    & (sales.item_id == 2973)
    & (sales.date_block_num == 4)
    & (sales.item_price > 0)
)
median = sales.loc[is_reference, 'item_price'].median()

# Replace every non-positive price with the reference median.
sales.loc[sales.item_price <= 0, 'item_price'] = median

- 对每一个商店的每一个商品销售量按月份汇总
- 数据集的每一行代表某一个商店的某一个商品34个月的销售量，如果销售量为NaN，则用0表示

In [5]:
# Month label ('YYYY-MM') for every sales row. `.dt.strftime` is the vectorised
# equivalent of applying strftime row by row and keeps the Series name 'date',
# which becomes the name of the first group key.
month_label = sales.date.dt.strftime('%Y-%m')

# Sum only item_cnt_day per (month, shop, item, month block). Aggregating the
# whole frame would also try to sum the raw datetime `date` column, which
# pandas >= 2.0 rejects instead of silently dropping.
df_sales = (
    sales.groupby([month_label, 'shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .reset_index()
)
df_sales = df_sales[['date', 'shop_id', 'item_id', 'item_cnt_day']]

# One row per (shop_id, item_id), one column per month; months without sales become 0.
df_sales = df_sales.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date',
    values='item_cnt_day',
    fill_value=0,
).reset_index()

In [6]:
# Preview the first five rows of the monthly sales table.
df_sales.head(n=5)

date,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- 将sales和test数据进行连接操作，结果产生一些NaN值，说明test中有一些不可见的**商店-商品**数据
- 不可见数据处理方法：用0进行替换

In [7]:
# Left-join the monthly sales table onto the test rows by (shop_id, item_id);
# test rows without a match get NaN in every month column.
df_sales_test = test.merge(df_sales, how='left', on=['shop_id', 'item_id'])
df_sales_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,,,,,,,,...,,,,,,,,,,


In [8]:
# Replace missing values with 0, then drop the identifier columns so only
# the monthly columns remain.
df_sales_test = (
    df_sales_test
    .fillna(0)
    .drop(['ID', 'shop_id', 'item_id'], axis=1)
)
df_sales_test.head(n=5)

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- 商品价格归一化

In [9]:
# Rescale item_price into the [0, 1] range with a min-max scaler.
scaler = MinMaxScaler(feature_range=(0, 1))
# The scaler expects a 2-D array: a single-column selection yields shape (n_rows, 1).
price_column = sales[["item_price"]].values
sales["item_price"] = scaler.fit_transform(price_column)

- 对每一个商店的每一个商品价格按月份汇总
- 数据集的每一行代表某一个商店的某一个商品34个月的价格

In [10]:
# Month label ('YYYY-MM') for every sales row; the Series keeps the name 'date',
# which becomes the name of the first group key.
month_label = sales.date.dt.strftime('%Y-%m')

# Average only item_price per (month, shop, item, month block). Aggregating the
# whole frame would also aggregate the raw datetime `date` column, which relies
# on pandas dropping it silently (behaviour removed in pandas 2.0).
df_price = (
    sales.groupby([month_label, 'shop_id', 'item_id', 'date_block_num'])['item_price']
    .mean()
    .reset_index()
)
df_price = df_price[['date', 'shop_id', 'item_id', 'item_price']]

# One row per (shop_id, item_id), one column per month; months without a
# price record stay NaN here.
df_price = df_price.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date',
    values='item_price',
).reset_index()
df_price.head()

date,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,30,,0.004475,,,,,,,...,,,,,,,,,,
1,0,31,,0.00733,,,,,,,...,,,,,,,,,,
2,0,32,0.003732,0.003732,,,,,,,...,,,,,,,,,,
3,0,33,0.00586,0.00586,,,,,,,...,,,,,,,,,,
4,0,35,0.004171,0.004171,,,,,,,...,,,,,,,,,,


- 将price和test数据进行连接操作，结果产生一些NaN值，说明test中有一些不可见的**商店-商品**数据
- 不可见数据处理方法：先用之前最近月份的价格填充，再用之后最近月份的价格填充；如果该商店-商品没有任何价格记录，则用0补全

In [11]:
# Left-join the monthly price table onto the test rows by (shop_id, item_id);
# unmatched rows and months without a price record are NaN.
df_price_test = test.merge(df_price, how='left', on=['shop_id', 'item_id'])
df_price_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,5,5037,,,,,,,,...,0.033766,,,,0.021941,0.02532,0.02532,0.016877,0.012659,
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,,,,,,,,...,,,,,0.015185,0.010117,,0.010117,0.016874,0.020252
3,3,5,5232,,,,,,,,...,,,,,,,,0.010117,,
4,4,5,5268,,,,,,,,...,,,,,,,,,,


In [12]:
# Keep only the monthly price columns.
df_price_test = df_price_test.drop(labels=['ID', 'shop_id', 'item_id'], axis=1)

# Fill gaps along each row: first carry the last known price forward, then
# fill the leading gap backwards from the first known price.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
df_price_test = df_price_test.ffill(axis=1).bfill(axis=1)

# Rows with no price in any month are still NaN at this point; use 0 for them.
df_price_test = df_price_test.fillna(0)
df_price_test.head()

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.043901,0.043901,0.043901,0.043901,0.043901,0.043901,0.043901,0.043901,0.043901,0.043901,...,0.033766,0.033766,0.033766,0.033766,0.021941,0.02532,0.02532,0.016877,0.012659,0.012659
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015185,0.015185,0.015185,0.015185,0.015185,0.015185,0.015185,0.015185,0.015185,0.015185,...,0.015185,0.015185,0.015185,0.015185,0.015185,0.010117,0.010117,0.010117,0.016874,0.020252
3,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,...,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117,0.010117
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 创建训练集
- 用前33个月的数据作为输入的X，第34个月的数据作为y进行训练

In [13]:
# Month used as the training label; every other month column is an input step.
TARGET = '2015-10'

# Sales and price histories without the target month, with a trailing feature
# axis added. The sample count and window length come from the data instead of
# being hard-coded, so the cell keeps working if the test set size changes.
X_train_sales = df_sales_test.drop(labels=[TARGET], axis=1).values[..., np.newaxis]
X_train_price = df_price_test.drop(labels=[TARGET], axis=1).values[..., np.newaxis]

# Stack the two features on the last axis: (samples, months, 2).
X_train = np.concatenate([X_train_sales, X_train_price], axis=2)

# Label: sales in the target month, as a column vector of shape (samples, 1).
y_train = df_sales_test[TARGET].values.reshape(-1, 1)

## 模型建立
> http://colah.github.io/posts/2015-08-Understanding-LSTMs/

#### 循环神经网络（RNN）
![RNN](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/RNN-unrolled.png)
#### 长短时记忆网络（LSTM）
![lstm](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

In [14]:
model = Sequential()
# Layer sizes; only indices 1-3 are referenced in this cell.
layers = [1, 64, 128, 1]

# Hidden layers: two stacked LSTMs. The first returns the full sequence so the
# second can consume it; the second returns only its final output.
model.add(LSTM(layers[1], input_shape=(33, 2), dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(layers[2], dropout=0.2, recurrent_dropout=0.2, return_sequences=False))

# Fully connected output layer
model.add(Dense(layers[3]))

# Compile the model
model.compile(loss='mse', optimizer='adam', metrics=['mean_squared_error'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 33, 64)            17152     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 116,097
Trainable params: 116,097
Non-trainable params: 0
_________________________________________________________________
None


## 进行训练

In [15]:
# Fit the network on the training tensors for 10 epochs.
BATCH = 2000  # samples per gradient update

print('Training time, it is...')
model.fit(x=X_train, y=y_train, batch_size=BATCH, epochs=10)

Training time, it is...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2706275c0b8>

## 创建测试集

In [16]:
# Slide the input window forward by one month: drop the oldest month
# ('2013-01') so the window ends at the most recent month column.
# A trailing feature axis is added; the sample count and window length come
# from the data instead of being hard-coded.
X_sales_test = df_sales_test.drop(labels=['2013-01'], axis=1).values[..., np.newaxis]
X_price_test = df_price_test.drop(labels=['2013-01'], axis=1).values[..., np.newaxis]

# Stack the two features on the last axis: (samples, months, 2).
X_test = np.concatenate([X_sales_test, X_price_test], axis=2)

## 预测并输出结果

In [17]:
# Predict the next month and clip the predictions into [0, 20].
y_pred = model.predict(X_test).clip(0., 20.)

# Build the submission with the IDs taken from `test` itself rather than from
# the positional row index, so the file stays correct even if the test IDs are
# not exactly 0..N-1. Row order matches `test` because the feature tables were
# built by left-joining onto it.
result = pd.DataFrame(
    {'ID': test['ID'].values, 'item_cnt_month': y_pred.ravel()},
    columns=['ID', 'item_cnt_month'],
)
result.to_csv('submission.csv', index=False)