#### Library

In [25]:
%%javascript
utils.load_extension("collapsible_headings/main")
utils.load_extension("hide_input/main")
utils.load_extension("autosavetime/main")
utils.load_extension("execute_time/ExecuteTime")
utils.load_extension("code_prettify/code_prettify")
utils.load_extension("scroll_down/main")
utils.load_extension("jupyter-js-widgets/extension")

<IPython.core.display.Javascript object>

In [26]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from tools.metrics import (
    apply_metrics,
    prep_data_for_metric,
    get_avg_volumes,
    mean_absolute_percentage_error,
)
from catboost import CatBoostClassifier


def compute_metrics(preds, lower, upper, y, X, avg_volumes):

    id_cols = ["country", "brand"]

    prepped_X = prep_data_for_metric(X, avg_volumes)

    prepped_X["actuals"] = y
    prepped_X["forecast"] = preds
    prepped_X["lower_bound"] = lower
    prepped_X["upper_bound"] = upper

    return np.mean(abs(prepped_X.groupby(id_cols).apply(apply_metrics)))

In [28]:
full_df = pd.read_csv("data/gx_merged_lags.csv")
train_tuples = pd.read_csv("data/train_split.csv")
valid_tuples = pd.read_csv("data/valid_split.csv")

test_df = full_df[full_df.test == 1].copy().reset_index(drop=True)

full_df = full_df[full_df.test == 0]

train_df = full_df.merge(train_tuples, how="inner").reset_index(drop=True)
val_df = full_df.merge(valid_tuples, how="inner").reset_index(drop=True)

In [29]:
# TODO: no need for calculation every time
avg_volumes = get_avg_volumes()

to_drop = ["month_name", "volume"]

X_train = train_df.drop(columns=to_drop)
y_train = train_df.volume

X_val = val_df.drop(columns=to_drop)
y_val = val_df.volume

X_test = test_df.drop(columns=to_drop)

## Model

In [30]:
lgb = LGBMRegressor(n_jobs=-1, n_estimators=100, objective="regression_l1")

In [31]:
from sktools.encoders import QuantileEncoder

In [32]:
categorical_cols = ["country", "brand", "therapeutic_area", "presentation"]
te = TargetEncoder(cols=categorical_cols)
te = QuantileEncoder(cols=categorical_cols)
te_residual = TargetEncoder(cols=categorical_cols)

In [33]:
pipe = Pipeline([("te", te), ("cb", lgb)])

In [34]:
cv_preds = cross_val_predict(pipe, X_train, y_train, cv=3)

  elif pd.api.types.is_categorical(cols):


In [35]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('te',
                 QuantileEncoder(cols=['country', 'brand', 'therapeutic_area',
                                       'presentation'],
                                 drop_invariant=False, handle_missing='value',
                                 handle_unknown='value', m=1.0, quantile=0.5,
                                 return_df=True, verbose=0)),
                ('cb',
                 LGBMRegressor(boosting_type='gbdt', class_weight=None,
                               colsample_bytree=1.0, importance_type='split',
                               learning_rate=0.1, max_depth=-1,
                               min_child_samples=20, min_child_weight=0.001,
                               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                               num_leaves=31, objective='regression_l1',
                               random_state=None, reg_alpha=0.0, reg_lambda=0.0,
                               silent=True, subsample

In [36]:
preds = pipe.predict(X_val)

## Residuals

### Predict val error

In [37]:
lgb_residual = LGBMRegressor(n_jobs=-1, n_estimators=100, objective="regression_l1")

In [38]:
pipe_residual = Pipeline([("te", te_residual), ("lgb", lgb_residual)])

In [39]:
y_train_residual = np.abs(cv_preds - y_train)

In [40]:
pipe_residual.fit(X_train, y_train_residual)

preds_residual = pipe_residual.predict(X_val)

## Results

In [41]:
submission_df = pd.read_csv("data/submission_template.csv")

In [42]:
preds_test = pipe.predict(X_test)
preds_test_residual = pipe_residual.predict(X_test)

In [77]:

bounds = [0, 0.01, 0.1, 0.5, 1, 1.5]


min_unc = 1e8
best_bound = 0
for bound in bounds:
    print(f"Bound: {bound}")
    metric_pair = compute_metrics(
        preds,
        preds-1,
        preds + 1,
        y_val,
        X_val,
        avg_volumes,
    )
    print(metric_pair)

    unc_metric = metric_pair.values[1]

    if unc_metric < min_unc:
        min_unc = unc_metric
        best_bound = bound

Bound: 0
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64
Bound: 0.01
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64
Bound: 0.1
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64
Bound: 0.5
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64
Bound: 1
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64
Bound: 1.5
custom_metric          68.016131
uncertainty_metric    393.853192
dtype: float64


In [75]:
preds

array([9622506.76849406, 6498575.00810418, 5747510.70862589, ...,
       1626388.14393125, 1626388.14393125, 1626388.14393125])

In [45]:
print(min_unc)
print(best_bound)

214.31630035913284
0.1


In [46]:
submission_df["pred_95_low"] = np.maximum(preds_test - best_bound * preds_test_residual, 0)

submission_df["pred_95_high"] = np.maximum(preds_test + best_bound * preds_test_residual,0)

submission_df["prediction"] = np.maximum(preds_test, 0)

In [74]:
submission_df.to_csv("submissions/baseline.csv", index=False)

In [70]:
submission_df[submission_df.pred_95_high<submission_df.prediction]

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high


In [71]:
submission_df.iloc[1272,submission_df.columns.get_loc('pred_95_low')] = 158984
submission_df.iloc[1272,submission_df.columns.get_loc('pred_95_high')] = 163442

In [72]:
submission_df.iloc[1272]['pred_95_low']= 158984
submission_df.iloc[1272].pred_95_high = 163442

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [73]:
submission_df[submission_df.pred_95_low>submission_df.prediction]

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high


In [49]:
submission_df.describe()

Unnamed: 0,month_num,pred_95_low,prediction,pred_95_high
count,4584.0,4584.0,4584.0,4584.0
mean,11.5,74834660.0,79013760.0,83194620.0
std,6.922942,331577100.0,350067900.0,368631500.0
min,0.0,0.0,0.0,0.0
25%,5.75,337067.4,418988.9,528946.8
50%,11.5,2764279.0,3145931.0,3537031.0
75%,17.25,16830370.0,17996400.0,19727800.0
max,23.0,2522287000.0,2688551000.0,2854815000.0
