In [None]:
import pandas as pd
import os
import numpy as np
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_predict
from tools.metrics import (
    apply_metrics,
    prep_data_for_metric,
    get_avg_volumes,
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
def compute_metrics(preds, lower, upper, y, X, avg_volumes):
    """Score forecasts and their interval bounds per (country, brand) group.

    The frame returned by ``prep_data_for_metric(X, avg_volumes)`` is extended
    with four columns -- ``actuals``, ``forecast``, ``lower_bound`` and
    ``upper_bound`` -- and ``apply_metrics`` is then applied to every
    ``country``/``brand`` group.

    Args:
        preds: Point forecasts, stored in the ``forecast`` column.
        lower: Lower interval bounds, stored in ``lower_bound``.
        upper: Upper interval bounds, stored in ``upper_bound``.
        y: Observed values, stored in ``actuals``.
        X: Feature frame handed to ``prep_data_for_metric``.
        avg_volumes: Second argument forwarded to ``prep_data_for_metric``.

    Returns:
        ``np.mean`` of the absolute per-group ``apply_metrics`` output.

    Note:
        pandas aligns Series on index when they are assigned as columns; this
        presumably relies on ``prep_data_for_metric`` keeping the index of
        ``X`` -- TODO confirm.
    """
    scored = prep_data_for_metric(X, avg_volumes)

    # Attach the values to be scored, in the same column order as before.
    new_columns = (
        ("actuals", y),
        ("forecast", preds),
        ("lower_bound", lower),
        ("upper_bound", upper),
    )
    for column_name, values in new_columns:
        scored[column_name] = values

    per_group = scored.groupby(["country", "brand"]).apply(apply_metrics)
    return np.mean(abs(per_group))

In [None]:
# Top-level folders, resolved relative to the current working directory.
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'data')
submissions_folder = os.path.join(root_dir, 'submissions')

# Train / validation split files.
train_path = os.path.join(data_dir, 'train_split.csv')
valid_path = os.path.join(data_dir, 'valid_split.csv')

# Merged dataset, raw volumes and the engineered feature table.
full_dataset_path = os.path.join(data_dir, 'gx_merged_lags_months.csv')
volume_path = os.path.join(data_dir, 'gx_volume.csv')
features_path = os.path.join(data_dir, 'features', 'feat_02.csv')

# Template for the submission file.
submision_template_path = os.path.join(data_dir, 'submission_template.csv')

In [None]:
# Train / validation split files.
train = pd.read_csv(train_path)
validation = pd.read_csv(valid_path)

# Merged dataset and the engineered features joined onto it later.
full = pd.read_csv(full_dataset_path)
features = pd.read_csv(features_path)

# Volume table (its first CSV column becomes the index) and submission template.
volume = pd.read_csv(volume_path, index_col=0)
submision = pd.read_csv(submision_template_path)

In [None]:
# Left-join the engineered features onto the merged dataset, keeping every row of `full`.
full_enriched = full.merge(features, how='left', on=['country', 'brand', 'month_num'])

# Log of the (+1 shifted) ratio between `volume` and `volume_1`.
full_enriched['target'] = np.log(
    (full_enriched['volume'] + 1) / (full_enriched['volume_1'] + 1)
)

In [None]:
# Partition the enriched data on the `test` flag (1 -> test rows, 0 -> train/eval rows).
train_eval = full_enriched[full_enriched['test'] == 0]
test = full_enriched[full_enriched['test'] == 1]

In [None]:
# Spot-check the feature rows of a single country/brand pair.
features.query("country == 'country_9' and brand == 'brand_187'")

In [None]:
# Attach the enriched train/eval rows to each split; the inner join keeps only
# the (country, brand) pairs present on both sides.
train_with_features = train.merge(train_eval, how='inner', on=['country', 'brand'])
validation_with_features = validation.merge(train_eval, how='inner', on=['country', 'brand'])

In [None]:
# Columns removed from the model inputs.
# `log_relative_volume` is the label (train_y / val_y are read from it), so it
# must not stay among the features: leaving it in leaks the answer to the model.
# `target` is computed from `volume`, and `test` is only the split flag.
to_drop = ['volume', 'target', 'test', 'log_relative_volume']

# Columns passed to the target encoder.
categorical_cols = ['country', 'brand', 'therapeutic_area', 'presentation', 'month_name']

In [None]:
# Feature matrices: every split with the `to_drop` columns removed.
train_x = train_with_features.drop(columns=to_drop)
val_x = validation_with_features.drop(columns=to_drop)
test_x = test.drop(columns=to_drop)

# Labels: the `log_relative_volume` column of the train and validation splits.
train_y = train_with_features['log_relative_volume']
val_y = validation_with_features['log_relative_volume']

In [None]:
# Display the training feature matrix to check which columns the model receives.
train_x

In [None]:
# Linear baseline: target-encode the categoricals, mean-impute, standardise, then Lasso.
# NOTE: the following cell rebinds `pipe` to a random-forest pipeline, so when the
# notebook is run top to bottom this Lasso pipeline is never the one that gets fitted.
categorical_cols = ['country', 'brand', 'therapeutic_area', 'presentation', 'month_name']
te = TargetEncoder(cols=categorical_cols)
pipe = Pipeline(
    steps=[
        ("te", te),
        ("imp", SimpleImputer(strategy="mean")),
        ("sc", StandardScaler()),
        ("model", Lasso(alpha=0.001, max_iter=2000)),
    ]
)

In [None]:
# Tree-based model: target-encode the categoricals, then fit a random forest.
# `random_state` is fixed so that the fitted forest -- and every prediction,
# interval and metric derived from it -- is reproducible across notebook runs.
categorical_cols = ['country', 'brand', 'therapeutic_area', 'presentation', 'month_name']
te = TargetEncoder(cols=categorical_cols)
pipe  = Pipeline([
    ("te", te),
    ("model", RandomForestRegressor(random_state=42))
])

In [None]:
# Fit whichever pipeline `pipe` is currently bound to on the training split.
pipe.fit(train_x, train_y)

In [None]:
# Predict the label for the validation features with the fitted pipeline.
val_pred = pipe.predict(val_x)

In [None]:
# Keep the predictions next to the validation rows (adds a column in place).
validation_with_features['prediction'] = val_pred

In [None]:
# Bootstrap estimate, per month, of the relative error between the summed
# `target` and the summed `prediction` over the validation rows of that month.
# NOTE(review): the model is trained on `log_relative_volume`, while the error
# here is measured against `target` -- confirm the two columns are the same quantity.
n_months = range(24)
n_boots = 500

# Dedicated, seeded generator: the resampling was previously drawn from the
# unseeded global NumPy RNG, so the intervals changed on every run.
boot_rng = np.random.default_rng(42)

error_estimate = {}
for month in n_months:
    df_filter = validation_with_features[validation_with_features.month_num == month]
    n_rows = len(df_filter)
    if n_rows == 0:
        # No validation rows for this month: record NaN instead of dividing 0 by 0.
        error_estimate[month] = np.nan
        continue

    targets = np.asarray(df_filter.target, dtype=float)
    predictions = np.asarray(df_filter.prediction, dtype=float)

    # One row of resampled positions (with replacement) per bootstrap replicate.
    idx = boot_rng.integers(0, n_rows, size=(n_boots, n_rows))
    target_totals = targets[idx].sum(axis=1)
    prediction_totals = predictions[idx].sum(axis=1)

    # Mean over replicates of |sum(target) - sum(prediction)| / sum(prediction).
    error_estimate[month] = np.mean(
        np.abs((target_totals - prediction_totals) / prediction_totals)
    )

In [None]:
# Back-transform the predicted log ratio into a volume: volume_1 * exp(prediction).
# NOTE(review): `target` is built with a +1 shift on both volumes, which this inverse
# does not undo -- confirm how `log_relative_volume` is defined.
validation_with_features['predicted_volume'] = (
    validation_with_features['volume_1'] * np.exp(validation_with_features['prediction'])
)

In [None]:
def return_rate(month):
    """Return the bootstrapped relative error stored in `error_estimate` for `month`.

    Raises:
        KeyError: if `month` has no entry in `error_estimate`.
    """
    return error_estimate[month]


# Look the rate up from the `month_num` column alone. Series.map calls
# `return_rate` once per value, which avoids building a full row object for
# every record as the row-wise DataFrame.apply(axis=1) did, and still raises
# KeyError for a month that has no estimate.
validation_with_features['rate'] = validation_with_features['month_num'].map(return_rate)

In [None]:
# Symmetric relative band around the point forecast: predicted_volume * (1 -/+ rate).
# NOTE(review): the lower bound becomes negative whenever rate > 1 -- confirm
# whether it should be floored at zero.
validation_with_features['lower'] = (
    validation_with_features['predicted_volume'] * (1 - validation_with_features['rate'])
)
validation_with_features['upper'] = (
    validation_with_features['predicted_volume'] * (1 + validation_with_features['rate'])
)

In [None]:
# Load the average volumes via the project helper; they are passed to compute_metrics below.
avg_volumes = get_avg_volumes()

In [None]:
# Score the validation forecasts and their interval bounds against the observed volumes.
metric_pair = compute_metrics(
    preds=validation_with_features['predicted_volume'],
    lower=validation_with_features['lower'],
    upper=validation_with_features['upper'],
    y=validation_with_features['volume'],
    X=val_x,
    avg_volumes=avg_volumes,
)
print(metric_pair)

In [None]:
# Display the test feature matrix (the rows flagged test == 1, minus the dropped columns).
test_x