In [24]:
# -*- coding: utf-8 -*-
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import pandas as pd
import numpy as np

# Visualization helpers
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import missingno as msno

# Configure visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8,6

# Input CSV locations, resolved relative to the current working directory.
train_path, test_path = 'train.csv', 'test.csv'

# The training file is decoded as GBK and parsed with pandas' Python engine.
train = pd.read_csv(train_path, engine='python', encoding='gbk')
# NOTE(review): the test file is read with pandas defaults (no explicit
# encoding or engine), unlike train.csv -- confirm this is intentional.
test = pd.read_csv(test_path)

In [2]:
# Preview only the first rows instead of rendering the whole training frame.
train.head(10)

Unnamed: 0,custid,BigCode,BigName,MidCode,MidName,SmallCode,SmallName,SaleDate,SaleMonth,Code,Standard,CumType,Unit,SaleNum,SalePrice,UnitPrice,Coupon
0,0,12,蔬果,1201,蔬菜,120109,其它蔬菜,20150101,201501,DW-1201090311,,生鲜,个,8.000,4.00,2.00,0.0
1,1,20,粮油,2014,酱菜类,201401,榨菜,20150101,201501,DW-2014010019,60g,一般商品,袋,6.000,3.00,0.50,0.0
2,2,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,20150101,201501,DW-1505020011,150g,一般商品,袋,1.000,2.40,2.40,0.0
3,3,15,日配,1503,冷藏料理,150305,冷藏面食类,20150101,201501,DW-1503050035,500g,一般商品,袋,1.000,6.50,8.30,0.0
4,4,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,20150101,201501,DW-1505020020,100g*8,一般商品,袋,1.000,11.90,11.90,0.0
5,5,30,洗化,3018,卫生巾,301802,夜用卫生巾,20150101,201501,DW-3018020109,10片,一般商品,包,1.000,8.90,8.90,0.0
6,6,12,蔬果,1201,蔬菜,120104,花果,20150101,201501,DW-1201040022,散称,生鲜,千克,0.964,8.07,5.60,0.0
7,7,20,粮油,2001,袋装速食面,200101,牛肉口味,20150101,201501,DW-2001010062,120g,一般商品,袋,1.000,2.50,3.00,0.0
8,8,13,熟食,1308,现制中式面点,130803,现制烙类,20150101,201501,DW-1308030035,个,生鲜,个,2.000,2.00,1.00,0.0
9,9,22,休闲,2203,膨化点心,220302,袋装薯片,20150101,201501,DW-2203020029,45g,一般商品,袋,1.000,4.00,4.00,0.0


### Time Handling

In [6]:
def timeHandle(s):
    """Insert dashes into a YYYYMMDD-style value.

    The argument is converted with ``str`` and cut at fixed positions:
    characters 0-3, characters 4-5, and everything from position 6 onwards.
    No validation is performed, so shorter inputs simply yield empty parts
    (for example ``2015`` becomes ``'2015--'``).

    Parameters
    ----------
    s : any object whose ``str()`` form is the date, e.g. ``20150101``.

    Returns
    -------
    str
        The three slices joined with ``'-'``, e.g. ``'2015-01-01'``.
    """
    text = str(s)
    year, month, day = text[:4], text[4:6], text[6:]
    return '{}-{}-{}'.format(year, month, day)

In [26]:
# Calendar lookups, stored as YYYYMMDD integers (the same encoding the raw
# SaleDate column uses). The *Holiday lists are presumably public holidays and
# the extra *Leave entries presumably weekend days off -- confirm against the
# calendar used for this data set.
l_trainHoliday = [20150101, 20150102, 20150103,
                  20150218, 20150219, 20150220, 20150221, 20150222, 20150223, 20150224,
                  20150404, 20150405, 20150406]
# Every holiday is also a leave day, so the leave list is built on top of the
# holiday list instead of repeating the same dates by hand.
l_trainLeave = l_trainHoliday + [
    20150110, 20150111, 20150117, 20150118, 20150124, 20150125, 20150131,
    20150201, 20150207, 20150208, 20150214, 20150215,
    20150301, 20150307, 20150308, 20150314, 20150315, 20150321, 20150322, 20150328, 20150329,
    20150411, 20150412, 20150418, 20150419, 20150425, 20150426]
l_testHoliday = [20150501, 20150502, 20150503]
l_testLeave = l_testHoliday + [20150509, 20150510, 20150516, 20150517,
                               20150523, 20150524, 20150530, 20150531]
# Backward-compatible alias: the list was originally published under this
# misspelled name, which other cells may still reference.
l_tesLeave = l_testLeave

# Parse SaleDate once. The dtype guard makes the cell safe to re-run: after
# the first run the column already holds datetimes, so it must not be parsed
# as YYYYMMDD text a second time.
sale_date = train.SaleDate
if sale_date.dtype.kind != 'M':
    sale_date = pd.to_datetime(sale_date.astype(str), format='%Y%m%d')

# Integer YYYYMMDD key rebuilt from the parsed date, so the membership tests
# below work whether SaleDate was raw or already converted.
date_key = sale_date.dt.year * 10000 + sale_date.dt.month * 100 + sale_date.dt.day

# 0/1 calendar flags and date-part features.
train.loc[:, 'isHoliday'] = date_key.isin(l_trainHoliday).astype('int')
train.loc[:, 'isLeave'] = date_key.isin(l_trainLeave).astype('int')
train.loc[:, 'DayofMonth'] = sale_date.dt.day.astype('int')
train['SaleDate'] = sale_date
train.loc[:, 'Month'] = train.SaleDate.dt.month
train.loc[:, 'Weekday'] = train.SaleDate.dt.dayofweek  # Monday == 0

# Preview only the first rows instead of rendering the whole frame.
train.head(10)

Unnamed: 0,custid,BigCode,BigName,MidCode,MidName,SmallCode,SmallName,SaleDate,SaleMonth,Code,...,Unit,SaleNum,SalePrice,UnitPrice,Coupon,isHoliday,isLeave,DayofMonth,Month,Weekday
0,0,12,蔬果,1201,蔬菜,120109,其它蔬菜,2015-01-01,201501,DW-1201090311,...,个,8.000,4.00,2.00,0.0,1,1,1,1,3
1,1,20,粮油,2014,酱菜类,201401,榨菜,2015-01-01,201501,DW-2014010019,...,袋,6.000,3.00,0.50,0.0,1,1,1,1,3
2,2,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,2015-01-01,201501,DW-1505020011,...,袋,1.000,2.40,2.40,0.0,1,1,1,1,3
3,3,15,日配,1503,冷藏料理,150305,冷藏面食类,2015-01-01,201501,DW-1503050035,...,袋,1.000,6.50,8.30,0.0,1,1,1,1,3
4,4,15,日配,1505,冷藏乳品,150502,冷藏加味酸乳,2015-01-01,201501,DW-1505020020,...,袋,1.000,11.90,11.90,0.0,1,1,1,1,3
5,5,30,洗化,3018,卫生巾,301802,夜用卫生巾,2015-01-01,201501,DW-3018020109,...,包,1.000,8.90,8.90,0.0,1,1,1,1,3
6,6,12,蔬果,1201,蔬菜,120104,花果,2015-01-01,201501,DW-1201040022,...,千克,0.964,8.07,5.60,0.0,1,1,1,1,3
7,7,20,粮油,2001,袋装速食面,200101,牛肉口味,2015-01-01,201501,DW-2001010062,...,袋,1.000,2.50,3.00,0.0,1,1,1,1,3
8,8,13,熟食,1308,现制中式面点,130803,现制烙类,2015-01-01,201501,DW-1308030035,...,个,2.000,2.00,1.00,0.0,1,1,1,1,3
9,9,22,休闲,2203,膨化点心,220302,袋装薯片,2015-01-01,201501,DW-2203020029,...,袋,1.000,4.00,4.00,0.0,1,1,1,1,3
