In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor

### Preprocess

In [2]:
# Training history, plus the key columns used by every group-wise step below.
data = pd.read_csv('sample/train_set_weeks.csv', index_col='id')
categorical = ['idFilial', 'KanalDB', 'idSubGrp']

# Keep only the first test row of each key combination; it acts as the
# placeholder row for which the lag/rolling features are computed.
test = pd.read_csv('sample/test_set_weeks.csv', index_col='id')
test = test.groupby(categorical).head(1)

# Stack train and test rows. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0. Rows are later told apart by a
# missing 'value' (assumes the test file has no 'value' column -- TODO confirm).
data = pd.concat([data, test]).reset_index(drop=True).sort_values('N wk')
data.index.name = 'id'

# Lag feature: the value of the preceding row (in 'N wk' order) of the same
# group. Rows without a predecessor are dropped, which also removes test rows
# whose group never appears in the training file.
data['previous'] = data.groupby(categorical)['value'].shift(1)
data = data.dropna(subset=['previous']).sort_index()
grouped = data.groupby(categorical)['previous']

# Rolling mean (windows 2..9) and rolling std (windows 3..9) of the lag
# feature within each group.
# NOTE(review): the windows run in 'id' order (the frame was re-sorted by
# index above), so this assumes the training file is already week-ordered
# within each group -- confirm.
cumulative = []
for prefix, windows in (('avg', range(2, 10)), ('std', range(3, 10))):
    for window in windows:
        rolled = grouped.rolling(window)
        stat = rolled.mean() if prefix == 'avg' else rolled.std()
        # Bring the result back to a plain 'id' index so it aligns with `data`.
        stat = stat.reset_index().set_index('id').sort_index()
        column = '{}{}'.format(prefix, window)
        data[column] = stat['previous']
        cumulative.append(column)

# Rows with an incomplete window are unusable; the week columns are not
# used as model inputs.
data = data.dropna(subset=cumulative).drop(['N wk', 'wk'], axis=1)

is_test = data['value'].isnull()
train_labels = data.loc[~is_test, 'value']
test = data[is_test].drop('value', axis=1)
train = data[~is_test].drop('value', axis=1)

# Take the keys from the test rows that actually survived the dropna calls so
# that they stay row-aligned with the predictions made from `test`. The copy
# makes this an independent frame, so adding a column to it later does not
# raise SettingWithCopyWarning.
test_categorical = test[categorical].copy()

In [3]:
# One-hot encode the columns at positions 0, 1 and 2 and return dense arrays.
# NOTE(review): presumably these positions are the three `categorical` key
# columns -- verify the column order of `train`.
# NOTE(review): `categorical_features` was removed in scikit-learn 0.22, so
# this cell requires an older release.
encoder_options = dict(categorical_features=[0, 1, 2], sparse=False)
encoder = OneHotEncoder(**encoder_options)

# Fit on the training rows only, then apply the same encoding to the test rows.
train = encoder.fit_transform(train)
test = encoder.transform(test)

### Ridge

In [4]:
# Ridge regression settings.
# NOTE(review): `normalize` was removed in scikit-learn 1.2, so this cell
# requires an older release.
ridge_options = dict(alpha=0.01, normalize=True)
regressor_ridge = Ridge(**ridge_options)
regressor_ridge.fit(train, train_labels)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [5]:
# Attach the Ridge prediction to each test key combination. assign() returns a
# new frame, so the shared `test_categorical` frame is left untouched and no
# SettingWithCopyWarning can be raised.
ridge_by_group = test_categorical.assign(value=regressor_ridge.predict(test))

# Broadcast the per-group prediction to every row of the test file. 'id' is
# read as an ordinary column and restored as the index after the merge:
# merging on columns discards the index, which would otherwise replace the
# real ids in the submission with a fresh 0..n-1 counter.
submit_ridge = (pd.read_csv('sample/test_set_weeks.csv')
    .merge(ridge_by_group, on=categorical)
    .set_index('id'))
submit_ridge.to_csv('submit_ridge.csv')

### Trees

In [6]:
# Gradient-boosted trees settings.
trees_options = dict(n_estimators=5000, max_depth=8, learning_rate=0.05)
regressor_trees = LGBMRegressor(**trees_options)

# NOTE(review): no eval_set is passed, so presumably `eval_metric` has no
# effect on this fit -- confirm.
regressor_trees.fit(train, train_labels, eval_metric='rmse')

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.05,
       max_bin=255, max_depth=8, min_child_samples=10, min_child_weight=5,
       min_split_gain=0.0, n_estimators=5000, n_jobs=-1, num_leaves=31,
       objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=50000,
       subsample_freq=1)

In [7]:
# Attach the LightGBM prediction to each test key combination. assign()
# returns a new frame, so the shared `test_categorical` frame is left
# untouched and no SettingWithCopyWarning can be raised.
trees_by_group = test_categorical.assign(value=regressor_trees.predict(test))

# Broadcast the per-group prediction to every row of the test file. 'id' is
# read as an ordinary column and restored as the index after the merge:
# merging on columns discards the index, which would otherwise replace the
# real ids in the submission with a fresh 0..n-1 counter.
submit_trees = (pd.read_csv('sample/test_set_weeks.csv')
    .merge(trees_by_group, on=categorical)
    .set_index('id'))
submit_trees.to_csv('submit_trees.csv')

### Smooth

In [8]:
# Weighted average of the historical values of each group, where the weight of
# a row is its 'N wk' raised to `weight_power` (larger 'N wk' => larger weight).
data = pd.read_csv('sample/train_set_weeks.csv', index_col='id')

# Work in floating point: with an integer 'N wk' column the eighth power and,
# even sooner, the per-group sum of those powers can exceed the int64 range,
# and NumPy integers wrap around silently instead of raising.
weight_power = 8
data['N wk'] = data['N wk'].astype(float) ** weight_power

# Normalise the weights so that they sum to 1 within each group.
data = data.merge(data.groupby(categorical)['N wk'].sum().
    rename('normalization').reset_index(), on=categorical)
data['N wk'] /= data['normalization']

# Sum of weight * value per group is the weighted average.
data['value'] *= data['N wk']
predict = data.groupby(categorical)['value'].sum().reset_index()

In [9]:
# Broadcast the per-group smoothed prediction to every row of the test file.
# 'id' is read as an ordinary column and restored as the index after the
# merge: merging on columns discards the index, which would otherwise replace
# the real ids in the submission with a fresh 0..n-1 counter.
submit_smooth = (pd.read_csv('sample/test_set_weeks.csv')
    .merge(predict, on=categorical)
    .set_index('id'))
submit_smooth.to_csv('submit_smooth.csv')

### Ensemble

In [13]:
# Blend the three models with fixed weights (Ridge 0.8, trees 0.1, smooth 0.1).
#
# The blend is computed per key combination and merged back onto the test
# file by key instead of relying on index alignment: the per-model frames are
# merge results whose index is unrelated to the 'id' index of the test file,
# and each of them may have lost different rows in its own merge.
weighted_models = ((submit_ridge, 0.8), (submit_trees, 0.1), (submit_smooth, 0.1))

blend = None
for model_frame, weight in weighted_models:
    # Every row of a group carries the same prediction, so the first one
    # represents the group.
    per_group = model_frame.groupby(categorical)['value'].first() * weight
    blend = per_group if blend is None else blend + per_group

# A left merge keeps every test row; groups missing from any model get NaN.
submit_ensemble = (pd.read_csv('sample/test_set_weeks.csv')
    .merge(blend.rename('value').reset_index(), on=categorical, how='left')
    .set_index('id'))
submit_ensemble.to_csv('submit_ensemble.csv')