In [1]:
# Order volume forecast
import numpy as np  
import pandas as pd  
from sklearn.ensemble import GradientBoostingRegressor  
from sklearn.model_selection import GridSearchCV  
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
data = pd.read_table('../../data/products_sales.txt', delimiter=',')

In [3]:
data.head()

Unnamed: 0,limit_infor,campaign_type,campaign_level,product_level,resource_amount,email_rate,price,discount_rate,hour_resouces,campaign_fee,orders
0,0,6,0,1,1,0.08,140.0,0.83,93,888,1981
1,0,0,0,1,1,0.1,144.0,0.75,150,836,986
2,0,1,1,1,1,0.12,149.0,0.84,86,1330,1416
3,0,3,1,2,1,0.12,141.0,0.82,95,2273,2368
4,0,0,0,1,1,0.1,146.0,0.59,73,1456,1529


In [4]:
data.dtypes

limit_infor          int64
campaign_type        int64
campaign_level       int64
product_level        int64
resource_amount      int64
email_rate         float64
price              float64
discount_rate      float64
hour_resouces        int64
campaign_fee         int64
orders               int64
dtype: object

In [5]:
data.isnull().sum()

limit_infor        0
campaign_type      0
campaign_level     0
product_level      0
resource_amount    0
email_rate         0
price              2
discount_rate      0
hour_resouces      0
campaign_fee       0
orders             0
dtype: int64

In [6]:
data.describe()

Unnamed: 0,limit_infor,campaign_type,campaign_level,product_level,resource_amount,email_rate,price,discount_rate,hour_resouces,campaign_fee,orders
count,731.0,731.0,731.0,731.0,731.0,731.0,729.0,731.0,731.0,731.0,731.0
mean,0.042408,2.997264,0.683995,1.395349,4.95212,0.474337,162.812071,0.809617,848.176471,3696.391245,4531.079343
std,0.404911,2.004787,0.465233,0.544894,1.838449,0.162913,14.267136,0.077679,686.622488,1908.643139,1932.532346
min,0.0,0.0,0.0,1.0,1.0,0.08,100.0,0.49,2.0,20.0,22.0
25%,0.0,1.0,0.0,1.0,3.0,0.34,152.0,0.77,315.5,2497.0,3199.0
50%,0.0,3.0,1.0,1.0,5.0,0.49,163.0,0.82,713.0,3662.0,4563.0
75%,0.0,5.0,1.0,2.0,7.0,0.61,173.0,0.87,1096.0,4795.5,6011.5
max,10.0,6.0,1.0,3.0,9.0,0.84,197.0,0.98,3410.0,33380.0,8714.0


In [7]:
col_names = ['limit_infor', 'campaign_type', 'campaign_level', 'product_level']  
for col_name in col_names: 
    unque_value = np.sort(data[col_name].unique()) 
    print ('{:*^50}'.format('{1} unique values:{0}').format(unque_value, col_name)) 

**************limit_infor unique values:[ 0  1 10]***************
**************campaign_type unique values:[0 1 2 3 4 5 6]***************
**************campaign_level unique values:[0 1]***************
**************product_level unique values:[1 2 3]***************


In [8]:
data[data["limit_infor"]==10]

Unnamed: 0,limit_infor,campaign_type,campaign_level,product_level,resource_amount,email_rate,price,discount_rate,hour_resouces,campaign_fee,orders
705,10,4,1,1,8,0.74,158.0,0.82,606,3784,4390


In [10]:
# process null value and outlier
sales_data = data.fillna(data['price'].mean())  
sales_data = sales_data[sales_data['limit_infor'].isin((0, 1))] 
sales_data['campaign_fee'] = sales_data['campaign_fee'].replace(33380, sales_data['campaign_fee'].mean())   

In [11]:
X = sales_data.ix[:, :-1]  
y = sales_data.ix[:, -1] 

model_gbr = GradientBoostingRegressor() 
parameters = {'loss': ['ls', 'lad', 'huber', 'quantile'],
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'alpha': [0.1, 0.3, 0.6, 0.9]}
model_gs = GridSearchCV(estimator=model_gbr, param_grid=parameters, cv=5) 
model_gs.fit(X, y) 
print ('Best score is:', model_gs.best_score_) 
print ('Best parameter is:', model_gs.best_params_)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


Best score is: 0.9316493410140942
Best parameter is: {'alpha': 0.9, 'min_samples_leaf': 3, 'loss': 'huber'}
