# Demand prediction baseline solution

Victor Kantor, xead.wl@gmail.com

In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.tsv")
test = pd.read_csv("test.tsv")
sample_submission = pd.read_csv("sample_submission.tsv")

In [2]:
from sklearn.model_selection import train_test_split

frac = 0.1 # fraction of learning examples used for model fitting

# sample the train set if your don't want to deel with all examples
train = train.sample(frac=frac, random_state=42)

X = train.drop(['Num','y'], axis=1)
y = train['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [3]:
important_features = ['week', 'item_id', 'shift', 'year', 'f1', 'f2', 'f3', 'f4', 'f5']

In [4]:
X_test = test[important_features]

In [5]:
def loss_func(F, A):
    if len(F) != len(A):
        raise BaseException
    res = 0
    for i in range(len(F)):
        res += float(abs(F[i] - A[i]) / (abs(A[i]) + abs(F[i])))
    res *= float(200) / len(F)
    return res

In [6]:
from sklearn.metrics import make_scorer
scorer = make_scorer(loss_func)

In [7]:
# XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=1,
#        learning_rate=0.1, max_delta_step=0, max_depth=15,
#        min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
#        objective='reg:linear', reg_alpha=0, reg_lambda=1,
#        scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [8]:
import xgboost as xgb

clf_config = {
    #'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators' : range(300,301), 'max_depth' : range(3,8)})
    'XGBRegressor' : (xgb.XGBRegressor(), {'max_depth' : [17], 
                                           'n_estimators' : [110],
                                           'learning_rate' : np.linspace(0.05, 0.3, 3),
                                           'min_child_weight' : range(0,5),
                                           'reg_alpha' : range(0,3),
#                                            'reg_labmda' : range(0,3),
                                           'scale_pos_weight' : range(0,3),})
}

In [10]:
from sklearn.model_selection import GridSearchCV

cv_values = {}
for name, (clf, args) in clf_config.items():
    print('Tuning parameters for %s' % name)
    gs_clf = GridSearchCV(estimator=clf, param_grid=args, scoring=scorer, verbose=10)
    gs_clf.fit(np.array(X[important_features]), np.array(y))
    
    print("\tBest params: {}".format(gs_clf.cv_results_))

Tuning parameters for XGBRegressor
Fitting 3 folds for each of 135 candidates, totalling 405 fits
[CV] reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 
[CV]  reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=40.182696, total=  12.6s
[CV] reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.9s remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.347349, total=  14.5s
[CV] reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.8s remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.234927, total=  12.7s
[CV] reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   40.8s remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=40.182696, total=  15.5s
[CV] reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   56.6s remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.795839, total=  14.3s
[CV] reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.407170, total=  13.7s
[CV] reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.4min remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=40.182696, total=  13.7s
[CV] reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.7min remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.640667, total=  14.4s
[CV] reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.9min remaining:    0.0s


[CV]  reg_alpha=0, scale_pos_weight=2, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.277284, total=  14.7s
[CV] reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.2min remaining:    0.0s


[CV]  reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.952597, total=  16.2s
[CV] reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 
[CV]  reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.560921, total=  15.9s
[CV] reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 
[CV]  reg_alpha=1, scale_pos_weight=0, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.454390, total=  17.9s
[CV] reg_alpha=1, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17 
[CV]  reg_alpha=1, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_depth=17, score=39.952597, total=  13.6s
[CV] reg_alpha=1, scale_pos_weight=1, learning_rate=0.05, min_child_weight=0, n_estimators=110, max_de

[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed: 64.5min finished


	Best params: {'param_scale_pos_weight': masked_array(data = [0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0
 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1
 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2
 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2],
             mask = [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False Fals

In [None]:
#model = #GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=43)
# model = xgb.XGBRegressor(n_estimators=400, max_depth=10)
# model.fit(X_train, y_train)


In [104]:
preds = model.predict(np.array(test.drop(['Num'], axis=1)))

In [105]:
import numpy as np
preds = np.array(preds)
y_test = np.array(y_test)

In [106]:
loss_func(preds, y_test)

BaseException: 

In [183]:
sample_submission['y'] = new_predicitons

In [184]:
sample_submission.head(5)

Unnamed: 0,Num,y
0,348622,1467.554199
1,348623,16877.605469
2,348624,212119.640625
3,348625,30401.132812
4,348626,3810.147461


In [185]:
# In GBM you can get some negative predictions:
print sample_submission[sample_submission['y'] < 0]

         Num          y
399   349692 -49.195480
625   349918 -82.665092
851   350364 -49.195480
1077  350590 -49.195480
1078  350591 -51.080593
1523  351036 -40.327545
1524  351037 -51.080593


In [186]:
sample_submission['y'] = sample_submission['y'].map(lambda x: x if x > 0 else 0.0)

In [187]:
sample_submission.to_csv("baseline_submission.tsv", sep=',', index=False)

In [None]:
import numpy as np
range(100, 501, 40)

NameError: name 'linspace' is not defined