In [120]:
# -*- coding: utf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import pandas as pd
import numpy as np

# Visualization helpers
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization: render figures inline in the notebook, then apply
# matplotlib's 'ggplot' style sheet followed by seaborn's 'white' style.
# The order of the two style calls is kept as-is because the later call can
# override settings made by the earlier one.
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size, given as (width, height).
pylab.rcParams['figure.figsize'] = 8,6

# Locations of the input CSV files, relative to the notebook.
train_path = '../input/train.csv'
test_path = '../input/test.csv'
hol_path = '../input/holiday.csv'

# The holiday and test tables are read with pandas' default settings.
holiday = pd.read_csv(hol_path)
test = pd.read_csv(test_path)
# The training table is decoded as GBK and parsed with the pure-Python engine.
train = pd.read_csv(train_path, engine='python', encoding='gbk')

In [121]:
# Rename the category-code columns in place: 'BigCode' -> 'parClass' and
# 'MidCode' -> 'Class', the names the later feature cells refer to.
train.rename(columns={'BigCode': 'parClass', 'MidCode': 'Class'}, inplace=True)
# Preview only the first rows instead of rendering the whole frame.
train.head(10)

Unnamed: 0,custid,parClass,BigName,Class,MidName,SmallCode,SmallName,SaleDate,SaleMonth,Code,Standard,CumType,Unit,SaleNum,SalePrice,UnitPrice,Coupon
0,0,12,蔬果,1201,蔬菜,120109,其它蔬菜,20150101,201501,DW-1201090311,,生鲜,个,8.000,4.00,2.00,0
1,1,20,粮油,2014,酱菜类,201401,榨菜,20150101,201501,DW-2014010019,60g,一般商品,袋,6.000,3.00,0.50,0
2,2,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,20150101,201501,DW-1505020011,150g,一般商品,袋,1.000,2.40,2.40,0
3,3,15,日配,1503,冷藏料理,150305,冷藏面食类,20150101,201501,DW-1503050035,500g,一般商品,袋,1.000,6.50,8.30,0
4,4,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,20150101,201501,DW-1505020020,100g*8,一般商品,袋,1.000,11.90,11.90,0
5,5,30,洗化,3018,卫生巾,301802,夜用卫生巾,20150101,201501,DW-3018020109,10片,一般商品,包,1.000,8.90,8.90,0
6,6,12,蔬果,1201,蔬菜,120104,花果,20150101,201501,DW-1201040022,散称,生鲜,千克,0.964,8.07,5.60,0
7,7,20,粮油,2001,袋装速食面,200101,牛肉口味,20150101,201501,DW-2001010062,120g,一般商品,袋,1.000,2.50,3.00,0
8,8,13,熟食,1308,现制中式面点,130803,现制烙类,20150101,201501,DW-1308030035,个,生鲜,个,2.000,2.00,1.00,0
9,9,22,休闲,2203,膨化点心,220302,袋装薯片,20150101,201501,DW-2203020029,45g,一般商品,袋,1.000,4.00,4.00,0


### 节假日特征

In [122]:
# Left-join the holiday table onto both data sets by SaleDate. Every row of
# train/test is kept; rows whose SaleDate has no match in `holiday` get NaN
# in the columns that come from it.
train = train.merge(holiday, how='left', on='SaleDate')
test = test.merge(holiday, how='left', on='SaleDate')

### 时间特征

In [123]:
def timeHandle(s):
    """Turn a compact date such as 20150101 into the string '2015-01-01'.

    The argument is converted with ``str`` first, so both integers and
    strings are accepted. Characters 0-3 become the first field, characters
    4-5 the second, and everything from position 6 onwards the third. No
    validation is performed on the input.
    """
    text = str(s)
    year, month, day = text[:4], text[4:6], text[6:]
    return '%s-%s-%s' % (year, month, day)

In [124]:
def _add_date_features(frame):
    """Parse ``frame.SaleDate`` in place and add calendar feature columns.

    ``SaleDate`` values are first rewritten by ``timeHandle`` and then parsed
    with ``pd.to_datetime``. Four columns are added to ``frame``: 'month',
    'dayOfWeek', 'dayOfYear' and 'weekOfYear'.

    The frame is modified in place and nothing is returned. Calling this a
    second time on the same frame is not supported, because ``timeHandle``
    expects the original compact date values, not parsed timestamps.
    """
    frame.SaleDate = pd.to_datetime(frame.SaleDate.map(timeHandle))
    dates = frame.SaleDate.dt
    frame.loc[:, 'month'] = dates.month
    frame.loc[:, 'dayOfWeek'] = dates.dayofweek
    frame.loc[:, 'dayOfYear'] = dates.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is its replacement. Fall back to the old accessor
    # on pandas versions that predate `isocalendar`.
    if hasattr(dates, 'isocalendar'):
        # isocalendar() yields a nullable UInt32 column; cast to plain int64
        # to match what `weekofyear` used to return.
        frame.loc[:, 'weekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        frame.loc[:, 'weekOfYear'] = dates.weekofyear


# Apply the same date handling to both data sets.
_add_date_features(train)
_add_date_features(test)

### 商品特征

#### 9.商品类别

In [125]:
# Numeric code for each CumType label. The three values sum to 1.0, so they
# are presumably each label's share of the rows -- TODO confirm.
# All keys carry the u'' prefix: under Python 2 an unprefixed literal is a
# byte string and would never match the unicode labels read from the CSV.
cumDict = {u'一般商品': 0.6089, u'生鲜': 0.3782, u'联营商品': 0.0129}
# Labels missing from cumDict become NaN. Re-running this cell on the
# already-mapped column would therefore turn every value into NaN.
train.CumType = train.CumType.map(cumDict)

In [126]:
def _cum_type_rate(frame, column, key):
    """Return the smoothed CumType score for rows where ``frame[column] == key``.

    The selected rows are grouped by their (already numeric) 'CumType' value
    and counted. Each CumType value is multiplied by its row count, the
    products are summed, and the total is divided by
    ``number_of_distinct_CumType_values + 1`` (the +1 is the correction term
    used throughout this notebook). The result is rounded to 2 decimals.
    When no rows match, the result is 0.0.
    """
    counts = frame[frame[column] == key].groupby('CumType')['Class'].count()
    weighted = 0
    for cum_value, n_rows in zip(counts.index, counts.values):
        weighted += cum_value * n_rows
    return round(weighted / (1.0 * len(counts) + 1), 2)


# Distinct mid-level and top-level class codes present in train.
midClassSet = set(train.Class)
bigClassSet = set(train.parClass)

# One score per class code at each level.
midClassDict = {code: _cum_type_rate(train, 'Class', code) for code in midClassSet}
bigClassDict = {code: _cum_type_rate(train, 'parClass', code) for code in bigClassSet}

# Combined lookup covering both levels; a top-level entry wins if a code
# appears at both levels.
classDict = dict(midClassDict)
classDict.update(bigClassDict)

train.loc[:, 'cumType'] = train.Class.map(midClassDict)
train.loc[:, 'parCumType'] = train.parClass.map(bigClassDict)

# NOTE: this rename really belongs at the very start of the notebook.
test.rename(columns={'Code': 'Class'}, inplace=True)
# The parent class code is the integer formed by the first two characters of
# the class code. It is computed in a single assignment so the new column is
# created directly with integer values; writing strings first and then
# overwriting them through .loc can leave the column as object dtype on
# pandas versions that assign in place.
test.loc[:, 'parClass'] = test.Class.map(lambda code: int(str(code)[:2]))
# Look up the scores: classDict holds both mid-level and top-level codes,
# bigClassDict only top-level codes. Codes absent from the dict yield NaN.
test.loc[:, 'cumType'] = test.Class.map(classDict)
test.loc[:, 'parCumType'] = test.parClass.map(bigClassDict)

# Handle missing values in the test set (such as cumType for class codes with
# no entry in the lookup dict) by forward-filling each column from the
# previous row. The earlier `fillna('pad')` passed 'pad' as the fill VALUE,
# which wrote the literal string 'pad' into every NaN cell instead of
# padding. NaNs in the leading rows have no previous value and stay NaN.
test = test.ffill()