In [165]:
# -*- encoding:urf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table like and matrices
import pandas as pd
import numpy as np

# Modeling Helper
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8,6

train_path = '../input/train.csv'
test_path = '../input/test.csv'
hol_path = '../input/holiday.csv'
train_date_path = '../input/train_date.csv'
cache_path = '../input/cache/'
output_path = '../output/'
train = pd.read_csv(train_path,encoding='gbk',engine='python')
test = pd.read_csv(test_path)
holiday = pd.read_csv(hol_path)

In [166]:
train.rename(columns = {'BigCode':'parClass','MidCode':'Class'},inplace = True)
# train

### 重新整合train集

In [167]:
train.loc[:,'saleCount'] = 1
coord_class = train.groupby(['Class','SaleDate'],as_index=False)['saleCount'].sum()
coord_parClass = train.groupby(['parClass','SaleDate'],as_index=False)['saleCount'].sum()
coord_parClass.rename(columns = {'parClass':'Class'},inplace=True)
train_new = pd.concat([coord_class,coord_parClass],axis = 0)
train_new.loc[:,'parClass'] = train_new.Class.map(lambda x: str(x)[:2])
train_new.loc[:,'parClass'] = train_new.parClass.map(lambda x: int(x))
train_new

Unnamed: 0,Class,SaleDate,saleCount,parClass
0,1001,20150101,3,10
1,1001,20150102,6,10
2,1001,20150103,7,10
3,1001,20150104,5,10
4,1001,20150105,9,10
5,1001,20150106,5,10
6,1001,20150107,28,10
7,1001,20150108,7,10
8,1001,20150109,4,10
9,1001,20150110,4,10


增加缺失日期

In [168]:
l = train_new.Class.unique()
tmp1 = train_date.copy()
tmp2 = train_date.copy()
tmp1.loc[:,'Class'] = 0
for i in l:
    tmp2.loc[:,'Class'] = i
    tmp1 = pd.concat([tmp1,tmp2],axis=0)
tmp1 = tmp1[tmp1.Class > 0]
tmp1 = tmp1[['Class','SaleDate']]
tmp1 = pd.merge(tmp1, train_new, on=['Class','SaleDate'], how='left')
tmp1.saleCount.fillna(0,inplace=True)
tmp1.loc[:,'parClass'] = tmp1.Class.map(lambda x: str(x)[:2])
tmp1.parClass = tmp1.parClass.astype('int')
tmp1.saleCount = tmp1.saleCount.astype('int')
tmp1 = tmp1[['Class','parClass','SaleDate','saleCount']]
print 'Are count numbers equal: ', train.saleCount.sum() * 2 == tmp1.saleCount.sum()
train_new = tmp1.copy()
train_new

Are count numbers equal:  True


Unnamed: 0,Class,parClass,SaleDate,saleCount
0,1001,10,20150101,3
1,1001,10,20150102,6
2,1001,10,20150103,7
3,1001,10,20150104,5
4,1001,10,20150105,9
5,1001,10,20150106,5
6,1001,10,20150107,28
7,1001,10,20150108,7
8,1001,10,20150109,4
9,1001,10,20150110,4


### 节假日特征

In [169]:
train = pd.merge(train, holiday, on = 'SaleDate',how = 'left')
train_new = pd.merge(train_new, holiday, on = 'SaleDate',how = 'left')
test = pd.merge(test, holiday, on = 'SaleDate',how = 'left')

### 时间特征

In [170]:
def timeHandle(s):
    s = str(s)
    s = [s[:4],s[4:6],s[6:]]
    return '-'.join(s)

In [171]:
train.SaleDate = train.SaleDate.map(lambda x: timeHandle(x))
train.SaleDate = pd.to_datetime(train.SaleDate)
train.loc[:,'month'] = train.SaleDate.dt.month
train.loc[:,'dayOfWeek'] = train.SaleDate.dt.dayofweek
train.loc[:,'dayOfYear'] = train.SaleDate.dt.dayofyear
train.loc[:,'weekOfYear'] = train.SaleDate.dt.weekofyear

train_new.SaleDate = train_new.SaleDate.map(lambda x: timeHandle(x))
train_new.SaleDate = pd.to_datetime(train_new.SaleDate)
train_new.loc[:,'month'] = train_new.SaleDate.dt.month
train_new.loc[:,'dayOfWeek'] = train_new.SaleDate.dt.dayofweek
train_new.loc[:,'dayOfYear'] = train_new.SaleDate.dt.dayofyear
train_new.loc[:,'weekOfYear'] = train_new.SaleDate.dt.weekofyear

test.SaleDate = test.SaleDate.map(lambda x: timeHandle(x))
test.SaleDate = pd.to_datetime(test.SaleDate)
test.loc[:,'month'] = test.SaleDate.dt.month
test.loc[:,'dayOfWeek'] = test.SaleDate.dt.dayofweek
test.loc[:,'dayOfYear'] = test.SaleDate.dt.dayofyear
test.loc[:,'weekOfYear'] = test.SaleDate.dt.weekofyear

### 商品特征

#### 商品类别

In [172]:
cumDict = {u'一般商品':0.6089,u'生鲜':0.3782,'联营商品':0.0129}
train.CumType = train.CumType.map(cumDict)

In [173]:
midClassSet = set(train.Class)
bigClassSet = set(train.parClass)
midClassDict = {}
bigClassDict = {}
classDict = {}
for eachMid in midClassSet:
    coord = train[train.Class == eachMid].groupby('CumType')['Class'].count()
    sum = 0 
    for i in range(len(coord)):
        sum += coord.index[i] *  coord.values[i]
    rate = round(sum / (1.0 * len(coord) + 1),2) ##修正
    midClassDict[eachMid] = rate
    classDict[eachMid] = rate
for eachBig in bigClassSet:
    coord = train[train.parClass == eachBig].groupby('CumType')['Class'].count()
    sum = 0 
    for i in range(len(coord)):
        sum += coord.index[i] *  coord.values[i]
    rate = round(sum / (1.0 * len(coord) + 1),2) 
    bigClassDict[eachBig] = rate
    classDict[eachBig] = rate
train.loc[:,'cumType'] = train.Class.map(midClassDict)
train.loc[:,'parCumType'] = train.parClass.map(bigClassDict)

train_new.loc[:,'cumType'] = train_new.Class.map(midClassDict)
train_new.loc[:,'parCumType'] = train_new.parClass.map(bigClassDict)

#最开始就需要改名
test.rename(columns={'Code':'Class'},inplace = True)
test.loc[:,'parClass'] = test.Class.map(lambda x: str(x)[:2])
test.loc[:,'parClass'] = test.parClass.map(lambda x: int(x))
test.loc[:,'cumType'] = test.Class.map(classDict)
test.loc[:,'parCumType'] = test.parClass.map(bigClassDict)

# 测试集中类cumType的缺失值处理,使用临近中类的type值
test = test.fillna('pad')

#### 商品热门指数

In [174]:
hotIndexDict = {}
parHotIndexDict = {}
totHotIndexDict = {}
totSaleCount = train.shape[0]
for eachMid in midClassSet:
    rate = round(train[train.Class == eachMid].shape[0] / (1.0 * totSaleCount),5)
    hotIndexDict[eachMid] = rate
    totHotIndexDict[eachMid] = rate
for eachBig in bigClassSet:
    rate = round(train[train.parClass == eachBig].shape[0] / (1.0 * totSaleCount),5)
    parHotIndexDict[eachBig] = rate
    totHotIndexDict[eachBig] = rate
    
train.loc[:,'hotIndex'] = train.Class.map(hotIndexDict)
train.loc[:,'parHotIndex'] = train.parClass.map(parHotIndexDict)

train_new.loc[:,'hotIndex'] = train.Class.map(hotIndexDict)
train_new.loc[:,'parHotIndex'] = train.parClass.map(parHotIndexDict)

test.loc[:,'hotIndex'] = train.Class.map(totHotIndexDict)
test.loc[:,'parHotIndex'] = train.parClass.map(parHotIndexDict)
test

Unnamed: 0,Class,SaleDate,SaleNum,holidayCluster,disHoliday,month,dayOfWeek,dayOfYear,weekOfYear,parClass,cumType,parCumType,hotIndex,parHotIndex
0,1001,2015-05-01,1,3,1.0,5,4,121,18,10,175.86,157.19,0.23107,0.33148
1,1001,2015-05-02,1,3,1.0,5,5,122,18,10,175.86,157.19,0.01721,0.13720
2,1001,2015-05-03,1,3,1.0,5,6,123,18,10,175.86,157.19,0.02758,0.11165
3,1001,2015-05-04,1,1,0.5,5,0,124,19,10,175.86,157.19,0.00145,0.11165
4,1001,2015-05-05,1,1,0.3,5,1,125,19,10,175.86,157.19,0.02758,0.11165
5,1001,2015-05-06,1,1,0.3,5,2,126,19,10,175.86,157.19,0.01446,0.07166
6,1001,2015-05-07,1,1,0.0,5,3,127,19,10,175.86,157.19,0.23107,0.33148
7,1001,2015-05-08,1,1,0.0,5,4,128,19,10,175.86,157.19,0.00677,0.13720
8,1001,2015-05-09,1,2,0.0,5,5,129,19,10,175.86,157.19,0.01042,0.02076
9,1001,2015-05-10,1,2,0.0,5,6,130,19,10,175.86,157.19,0.03410,0.20668


#### 商品工作日/非工作日销量统计值 - 用train_new表

In [175]:
train_new

Unnamed: 0,Class,parClass,SaleDate,saleCount,holidayCluster,disHoliday,month,dayOfWeek,dayOfYear,weekOfYear,cumType,parCumType,hotIndex,parHotIndex
0,1001,10,2015-01-01,3,3,1.0,1,3,1,1,175.86,157.19,0.23107,0.33148
1,1001,10,2015-01-02,6,3,1.0,1,4,2,1,175.86,157.19,0.01721,0.13720
2,1001,10,2015-01-03,7,3,1.0,1,5,3,1,175.86,157.19,0.02758,0.11165
3,1001,10,2015-01-04,5,1,0.5,1,6,4,1,175.86,157.19,0.00145,0.11165
4,1001,10,2015-01-05,9,1,0.3,1,0,5,2,175.86,157.19,0.02758,0.11165
5,1001,10,2015-01-06,5,1,0.3,1,1,6,2,175.86,157.19,0.01446,0.07166
6,1001,10,2015-01-07,28,1,0.0,1,2,7,2,175.86,157.19,0.23107,0.33148
7,1001,10,2015-01-08,7,1,0.0,1,3,8,2,175.86,157.19,0.00677,0.13720
8,1001,10,2015-01-09,4,1,0.0,1,4,9,2,175.86,157.19,0.01042,0.02076
9,1001,10,2015-01-10,4,2,0.0,1,5,10,2,175.86,157.19,0.03410,0.20668


In [176]:
train_wk = train_new[train_new.holidayCluster == 1]
train_hol = train_new[train_new.holidayCluster != 1]

coord = train_wk.groupby('Class',as_index = False)['saleCount'].agg({'wkDaySaleCount_median':'median'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_wk.groupby('Class',as_index = False)['saleCount'].agg({'wkDaySaleCount_mean':'mean'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_wk.groupby('Class',as_index = False)['saleCount'].agg({'wkDaySaleCount_max':'max'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_wk.groupby('Class',as_index = False)['saleCount'].agg({'wkDaySaleCount_min':'min'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')

coord = train_hol.groupby('Class',as_index = False)['saleCount'].agg({'holDaySaleCount_median':'median'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_hol.groupby('Class',as_index = False)['saleCount'].agg({'holDaySaleCount_mean':'mean'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_hol.groupby('Class',as_index = False)['saleCount'].agg({'holDaySaleCount_max':'max'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')
coord = train_hol.groupby('Class',as_index = False)['saleCount'].agg({'holDaySaleCount_min':'min'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
test = pd.merge(test, coord, on = 'Class', how='left')

coord = train_hol.groupby('Class',as_index=False)['saleCount'].agg({'holSaleCount':'count'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
coord = train_wk.groupby('Class',as_index=False)['saleCount'].agg({'wkSaleCount':'count'})
train_new = pd.merge(train_new, coord, on = 'Class', how='left')
train_new.loc[:,'wkHolRatio'] = train_new['wkSaleCount'] / (1.0 * train_new['holSaleCount'])

coord = train_new.groupby('Class',as_index=False)['wkHolRatio'].mean()
test = pd.merge(test, coord, on = 'Class', how='left')

del train_new['wkSaleCount'],train_new['holSaleCount']
del train_wk,train_hol

train_new

Unnamed: 0,Class,parClass,SaleDate,saleCount,holidayCluster,disHoliday,month,dayOfWeek,dayOfYear,weekOfYear,...,parHotIndex,wkDaySaleCount_median,wkDaySaleCount_mean,wkDaySaleCount_max,wkDaySaleCount_min,holDaySaleCount_median,holDaySaleCount_mean,holDaySaleCount_max,holDaySaleCount_min,wkHolRatio
0,1001,10,2015-01-01,3,3,1.0,1,3,1,1,...,0.33148,7.5,8.95,35,0,5.0,5.350,16,0,2.0
1,1001,10,2015-01-02,6,3,1.0,1,4,2,1,...,0.13720,7.5,8.95,35,0,5.0,5.350,16,0,2.0
2,1001,10,2015-01-03,7,3,1.0,1,5,3,1,...,0.11165,7.5,8.95,35,0,5.0,5.350,16,0,2.0
3,1001,10,2015-01-04,5,1,0.5,1,6,4,1,...,0.11165,7.5,8.95,35,0,5.0,5.350,16,0,2.0
4,1001,10,2015-01-05,9,1,0.3,1,0,5,2,...,0.11165,7.5,8.95,35,0,5.0,5.350,16,0,2.0
5,1001,10,2015-01-06,5,1,0.3,1,1,6,2,...,0.07166,7.5,8.95,35,0,5.0,5.350,16,0,2.0
6,1001,10,2015-01-07,28,1,0.0,1,2,7,2,...,0.33148,7.5,8.95,35,0,5.0,5.350,16,0,2.0
7,1001,10,2015-01-08,7,1,0.0,1,3,8,2,...,0.13720,7.5,8.95,35,0,5.0,5.350,16,0,2.0
8,1001,10,2015-01-09,4,1,0.0,1,4,9,2,...,0.02076,7.5,8.95,35,0,5.0,5.350,16,0,2.0
9,1001,10,2015-01-10,4,2,0.0,1,5,10,2,...,0.20668,7.5,8.95,35,0,5.0,5.350,16,0,2.0


#### 价格特征

In [177]:
coord_class = train.groupby('Class',as_index = False)['UnitPrice'].agg({'price_mean':'mean'})
coord_par_class = train.groupby('parClass',as_index = False)['UnitPrice'].agg({'price_mean':'mean'})
coord_par_class.rename(columns = {'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class, coord_par_class],axis = 0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')

coord_class = train.groupby('Class',as_index = False)['UnitPrice'].agg({'price_median':'median'})
coord_par_class = train.groupby('parClass',as_index = False)['UnitPrice'].agg({'price_median':'median'})
coord_par_class.rename(columns = {'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class, coord_par_class],axis = 0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')

#### 促销特征

商品促销时销量与不促销时销量的比值 

In [178]:
coord_class_bonus_count = train[train['Coupon'] == 1].groupby('Class',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count = train[train['Coupon'] == 1].groupby('parClass',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_bonus_count,coord_parclass_bonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classBonusSaleCount'] = train_new['classBonusSaleCount'].fillna(0)
test['classBonusSaleCount'] = test['classBonusSaleCount'].fillna(0)

coord_class_notbonus_count = train[train['Coupon'] == 0].groupby('Class',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count = train[train['Coupon'] == 0].groupby('parClass',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_notbonus_count,coord_parclass_notbonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classNotBonusSaleCount'] = train_new['classNotBonusSaleCount'].fillna(1)
test['classNotBonusSaleCount'] = test['classNotBonusSaleCount'].fillna(1)

# 计算促销与非促销的比值
train_new.loc[:,'bonusRatio'] = np.round(train_new['classBonusSaleCount'] / (1.0 * train_new['classNotBonusSaleCount']),4)
del train_new['classBonusSaleCount'],train_new['classNotBonusSaleCount']
test.loc[:,'bonusRatio'] = np.round(test['classBonusSaleCount'] / (1.0 * test['classNotBonusSaleCount']),4)
del test['classBonusSaleCount'],test['classNotBonusSaleCount']

商品节假日时促销销量与不促销销量的比值

In [179]:
train_wk = train[train.holidayCluster == 1]
train_hol = train[train.holidayCluster != 1]

coord_class_bonus_count = train_hol[train_hol['Coupon'] == 1].groupby('Class',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count = train_hol[train_hol['Coupon'] == 1].groupby('parClass',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_bonus_count,coord_parclass_bonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classBonusSaleCount'] = train_new['classBonusSaleCount'].fillna(0)
test['classBonusSaleCount'] = test['classBonusSaleCount'].fillna(0)

coord_class_notbonus_count = train_hol[train_hol['Coupon'] == 0].groupby('Class',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count = train_hol[train_hol['Coupon'] == 0].groupby('parClass',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_notbonus_count,coord_parclass_notbonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classNotBonusSaleCount'] = train_new['classNotBonusSaleCount'].fillna(1)
test['classNotBonusSaleCount'] = test['classNotBonusSaleCount'].fillna(1)

# 计算促销与非促销的比值
train_new.loc[:,'bonusHolRatio'] = np.round(train_new['classBonusSaleCount'] / (1.0 * train_new['classNotBonusSaleCount']),4)
del train_new['classBonusSaleCount'],train_new['classNotBonusSaleCount']
test.loc[:,'bonusHolRatio'] = np.round(test['classBonusSaleCount'] / (1.0 * test['classNotBonusSaleCount']),4)
del test['classBonusSaleCount'],test['classNotBonusSaleCount']

商品非节假日时促销销量与不促销销量的比值

In [180]:
coord_class_bonus_count = train_wk[train_wk['Coupon'] == 1].groupby('Class',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count = train_wk[train_wk['Coupon'] == 1].groupby('parClass',as_index=False)['saleCount'].agg({'classBonusSaleCount':'count'})
coord_parclass_bonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_bonus_count,coord_parclass_bonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classBonusSaleCount'] = train_new['classBonusSaleCount'].fillna(0)
test['classBonusSaleCount'] = test['classBonusSaleCount'].fillna(0)

coord_class_notbonus_count = train_wk[train_wk['Coupon'] == 0].groupby('Class',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count = train_wk[train_wk['Coupon'] == 0].groupby('parClass',as_index=False)['saleCount'].agg({'classNotBonusSaleCount':'count'})
coord_parclass_notbonus_count.rename(columns={'parClass':'Class'},inplace = True)
coord = pd.concat([coord_class_notbonus_count,coord_parclass_notbonus_count],axis=0)
train_new = pd.merge(train_new, coord, on = 'Class', how = 'left')
test = pd.merge(test, coord, on = 'Class', how = 'left')
train_new['classNotBonusSaleCount'] = train_new['classNotBonusSaleCount'].fillna(1)
test['classNotBonusSaleCount'] = test['classNotBonusSaleCount'].fillna(1)

# 计算促销与非促销的比值
train_new.loc[:,'bonusNotHolRatio'] = np.round(train_new['classBonusSaleCount'] / (1.0 * train_new['classNotBonusSaleCount']),4)
del train_new['classBonusSaleCount'],train_new['classNotBonusSaleCount']
test.loc[:,'bonusNotHolRatio'] = np.round(test['classBonusSaleCount'] / (1.0 * test['classNotBonusSaleCount']),4)
del test['classBonusSaleCount'],test['classNotBonusSaleCount']

del train_wk,train_hol

商品周几促销的比例

In [181]:
train_coupon = train[train.Coupon == 1]
coord = train_coupon.groupby(['Class','dayOfWeek'],as_index=False)['dayOfWeek'].agg({'dayOfWeekCount':'count'})
var = train_coupon.groupby(['Class'],as_index=False)['dayOfWeek'].agg({'classCouponCount':'count'})
coord = pd.merge(coord, var, on = 'Class',how='left' )
coord.loc[:,'bonusWeekProb'] = coord['dayOfWeekCount'] / np.round((1.0 * coord['classCouponCount']))
coord_c = coord.copy()

coord = train_coupon.groupby(['parClass','dayOfWeek'],as_index=False)['dayOfWeek'].agg({'dayOfWeekCount':'count'})
var = train_coupon.groupby(['parClass'],as_index=False)['dayOfWeek'].agg({'classCouponCount':'count'})
coord = pd.merge(coord, var, on = 'parClass',how='left' )
coord.loc[:,'bonusWeekProb'] = coord['dayOfWeekCount'] / np.round((1.0 * coord['classCouponCount']))
coord.rename(columns={'parClass':'Class'},inplace=True)
coord_pc = coord.copy()

coord = pd.concat([coord_c,coord_pc],axis=0)

train_new = pd.merge(train_new, coord[['Class','dayOfWeek','bonusWeekProb']], on = ['Class','dayOfWeek'],how='left')
train_new['bonusWeekProb'] = train_new['bonusWeekProb'].fillna(0)

test = pd.merge(test, coord[['Class','dayOfWeek','bonusWeekProb']], on = ['Class','dayOfWeek'],how='left')
test['bonusWeekProb'] = test['bonusWeekProb'].fillna(0)

In [182]:
test

Unnamed: 0,Class,SaleDate,SaleNum,holidayCluster,disHoliday,month,dayOfWeek,dayOfYear,weekOfYear,parClass,...,holDaySaleCount_mean,holDaySaleCount_max,holDaySaleCount_min,wkHolRatio,price_mean,price_median,bonusRatio,bonusHolRatio,bonusNotHolRatio,bonusWeekProb
0,1001,2015-05-01,1,3,1.0,5,4,121,18,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
1,1001,2015-05-02,1,3,1.0,5,5,122,18,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
2,1001,2015-05-03,1,3,1.0,5,6,123,18,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
3,1001,2015-05-04,1,1,0.5,5,0,124,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
4,1001,2015-05-05,1,1,0.3,5,1,125,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
5,1001,2015-05-06,1,1,0.3,5,2,126,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
6,1001,2015-05-07,1,1,0.0,5,3,127,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
7,1001,2015-05-08,1,1,0.0,5,4,128,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
8,1001,2015-05-09,1,2,0.0,5,5,129,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000
9,1001,2015-05-10,1,2,0.0,5,6,130,19,10,...,5.350,16.0,0.0,2.0,29.609892,29.8,0.0000,0.0000,0.0000,0.000000


### 测试集分离

In [183]:
week2 = ['2015-05-08','2015-05-09','2015-05-10','2015-05-11','2015-05-12','2015-05-13','2015-05-14']
week3 = ['2015-05-15','2015-05-16','2015-05-17','2015-05-18','2015-05-19','2015-05-20','2015-05-21']
week4 = ['2015-05-22','2015-05-23','2015-05-24','2015-05-25','2015-05-26','2015-05-27','2015-05-28']
test.loc[:,'saleCount'] = 0
del test['SaleNum']
test_1 = test[test['SaleDate'] <= '2015-05-07']
test_2 = test[test['SaleDate'].isin(week2)]
test_3 = test[test['SaleDate'].isin(week3)]
test_4 = test[test['SaleDate'].isin(week4)]
test_5 = test[test['SaleDate'] >= '2015-05-29']
test_1.to_csv(output_path + 'week1.csv',index = False)
test_2.to_csv(output_path + 'week2.csv',index = False)
test_3.to_csv(output_path + 'week3.csv',index = False)
test_4.to_csv(output_path + 'week4.csv',index = False)
test_5.to_csv(output_path + 'week5.csv',index = False)
print  np.setdiff1d(test.columns,train_new.columns)

[]


### 提取滚动特征
#### 第一周滚动特征提取

In [184]:
train_test = pd.concat([train_new, test_1], axis = 0)
train_test.head(1)

Unnamed: 0,Class,SaleDate,bonusHolRatio,bonusNotHolRatio,bonusRatio,bonusWeekProb,cumType,dayOfWeek,dayOfYear,disHoliday,...,parHotIndex,price_mean,price_median,saleCount,weekOfYear,wkDaySaleCount_max,wkDaySaleCount_mean,wkDaySaleCount_median,wkDaySaleCount_min,wkHolRatio
0,1001,2015-01-01,0.0,0.0,0.0,0.0,175.86,3,1,1.0,...,0.33148,29.609892,29.8,3,1,35.0,8.95,7.5,0.0,2.0


#### 滚动热门指数

In [220]:
# 类别上周总销量
lastWeekSaleCount_o = train_test.groupby(['Class','weekOfYear'],as_index=False)['saleCount'].agg({'lastWeekSaleCount':'sum'})
last2WeekSaleCount_o = train_test.groupby(['Class','weekOfYear'],as_index=False)['saleCount'].agg({'last2WeekSaleCount':'sum'})
lastMonthSaleCount_o = train_test.groupby(['Class','month'],as_index=False)['saleCount'].agg({'lastMothSaleCount':'sum'})
lastWeekSaleCount = lastWeekSaleCount_o.shift(1)
last2WeekSaleCount = last2WeekSaleCount_o.shift(2)
lastMonthSaleCount = lastMonthSaleCount_o.shift(1)
lastWeekSaleCount.weekOfYear = lastWeekSaleCount_o.weekOfYear
last2WeekSaleCount.weekOfYear = last2WeekSaleCount_o.weekOfYear
lastMonthSaleCount.month = lastMonthSaleCount_o.month
lastWeekSaleCount.Class = lastWeekSaleCount_o.Class
last2WeekSaleCount.Class = last2WeekSaleCount_o.Class
lastMonthSaleCount.Class = lastMonthSaleCount_o.Class
last2WeekSaleCount

Unnamed: 0,Class,weekOfYear,last2WeekSaleCount
0,10,1,
1,10,2,
2,10,3,30.0
3,10,4,87.0
4,10,5,72.0
5,10,6,87.0
6,10,7,108.0
7,10,8,49.0
8,10,9,107.0
9,10,10,52.0


#### 滚动热门指数

In [210]:
co = train_test.groupby(['Class','weekOfYear'],as_index=False)['saleCount'].sum()
var_1 = co.shift(1)
var_1.weekOfYear = co.weekOfYear
var_1.Class = co.Class
var_1 # 类别上周销量

Unnamed: 0,Class,weekOfYear,saleCount
0,10,1,
1,10,2,30.0
2,10,3,87.0
3,10,4,72.0
4,10,5,87.0
5,10,6,108.0
6,10,7,49.0
7,10,8,107.0
8,10,9,52.0
9,10,10,26.0


In [214]:
var_2 = train_test.groupby(['weekOfYear'],as_index=False)['saleCount'].sum().shift(1)
o = train_test.groupby(['weekOfYear'],as_index=False)['saleCount'].sum()
var_2.weekOfYear = o.weekOfYear
var_2.rename(columns={'saleCount':'lastWeekTotSaleCount'},inplace=True)
var_2 # 上周总销量

Unnamed: 0,weekOfYear,lastWeekTotSaleCount
0,1,
1,2,2554.0
2,3,5426.0
3,4,5346.0
4,5,6038.0
5,6,6000.0
6,7,4238.0
7,8,5532.0
8,9,6240.0
9,10,3734.0


In [215]:
new_var = pd.merge(var_1,var_2,on='weekOfYear',how='left')
new_var

Unnamed: 0,Class,weekOfYear,saleCount,lastWeekTotSaleCount
0,10,1,,
1,10,2,30.0,2554.0
2,10,3,87.0,5426.0
3,10,4,72.0,5346.0
4,10,5,87.0,6038.0
5,10,6,108.0,6000.0
6,10,7,49.0,4238.0
7,10,8,107.0,5532.0
8,10,9,52.0,6240.0
9,10,10,26.0,3734.0
