In [None]:
import pandas as pd
import os
import numpy as np
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_predict
from tools.metrics import (
    apply_metrics,
    prep_data_for_metric,
    get_avg_volumes,
)

In [1]:
def compute_metrics(preds, lower, upper, y, X, avg_volumes):
    """Score a point forecast and its interval with the project metric helpers.

    `X` is first passed through `prep_data_for_metric` together with
    `avg_volumes`. The actuals, the point forecast and both interval bounds are
    then attached as columns, and `apply_metrics` is evaluated once per
    (country, brand) group.

    Returns the result of `np.mean` over the absolute per-group metric values.
    """
    scored = prep_data_for_metric(X, avg_volumes)

    # Columns are attached in this order; the names are presumably the ones
    # `apply_metrics` reads (it lives in tools.metrics and is not visible here).
    attached_columns = {
        "actuals": y,
        "forecast": preds,
        "lower_bound": lower,
        "upper_bound": upper,
    }
    for column_name, values in attached_columns.items():
        scored[column_name] = values

    per_group = scored.groupby(["country", "brand"]).apply(apply_metrics)
    return np.mean(per_group.abs())

In [None]:
# Input files and the output folder, all resolved relative to the directory
# the notebook is launched from.
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'data')
train_path = os.path.join(data_dir, 'train_split.csv')
valid_path = os.path.join(data_dir, 'valid_split.csv')
full_dataset_path = os.path.join(data_dir, 'gx_merged_lags_months.csv')
volume_path = os.path.join(data_dir, 'gx_volume.csv')
submision_template_path = os.path.join(data_dir, 'submission_template.csv')
# The folder name has to be a string literal: the bare name `submissions` is
# not defined anywhere and raised NameError on a fresh kernel.
submissions_folder = os.path.join(root_dir, 'submissions')

In [None]:
# Load every input table from the paths configured above.
full = pd.read_csv(full_dataset_path)
train = pd.read_csv(train_path)
validation = pd.read_csv(valid_path)
# The first CSV column of the volume file is used as the index.
volume = pd.read_csv(volume_path, index_col=0)
submision = pd.read_csv(submision_template_path)

In [None]:
# Volume recorded at month_num == -1 for each country/brand row, with exact
# duplicate rows removed and the value column renamed to `volume_1`.
is_month_minus_one = volume['month_num'] == -1
volume_at_1 = (
    volume.loc[is_month_minus_one, ['country', 'brand', 'volume']]
    .drop_duplicates()
    .rename(columns={'volume': 'volume_1'})
)

In [None]:
# Attach the month -1 volume to every row, then express each volume as the log
# of (1 + volume) relative to that reference volume.
full_enriched = full.merge(volume_at_1, how='left', on=['country', 'brand'])
shifted_volume_ratio = (1 + full_enriched['volume']) / full_enriched['volume_1']
full_enriched['relative_volume'] = np.log(shifted_volume_ratio)

In [None]:
# Previous-month relative volume within each (test, country, brand) series.
# Rows are ordered by month first so that shift(1) really is "one month back".
series_keys = ['test', 'country', 'brand']
full_enriched = full_enriched.sort_values(by=series_keys + ['month_num'])
previous_relative_volume = (
    full_enriched.groupby(series_keys)['relative_volume'].shift(1)
)
# Month 0 has no usable predecessor inside the series, so its lag is fixed at 0.
full_enriched['lag_relative_volume'] = previous_relative_volume.mask(
    full_enriched['month_num'] == 0, 0
)

In [None]:
# Split on the `test` flag: 1 -> rows to forecast, 0 -> rows with known volume.
is_test_row = full_enriched['test'] == 1
is_train_eval_row = full_enriched['test'] == 0
test = full_enriched[is_test_row]
train_eval = full_enriched[is_train_eval_row]

In [None]:
# Pull the engineered features onto the train / validation splits by matching
# on country and brand (inner join: pairs missing from `train_eval` are dropped).
split_join_keys = ['country', 'brand']
train_with_features = train.merge(train_eval, how='inner', on=split_join_keys)
validation_with_features = validation.merge(train_eval, how='inner', on=split_join_keys)

In [None]:
# Columns removed from the model inputs: the raw volume and the target itself.
to_drop = [
    'volume',
    'relative_volume',
]
# Columns handed to the target encoder.
categorical_cols = [
    'country',
    'brand',
    'therapeutic_area',
    'presentation',
    'month_name',
]

In [None]:
# Model inputs (everything except `to_drop`) and the relative-volume target
# for each split; the test split only has inputs.
train_x, train_y = (
    train_with_features.drop(columns=to_drop),
    train_with_features['relative_volume'],
)
val_x, val_y = (
    validation_with_features.drop(columns=to_drop),
    validation_with_features['relative_volume'],
)
test_x = test.drop(columns=to_drop)

In [None]:
# Display the merged training frame as the cell output for a visual check.
train_with_features

In [None]:
# Target encoder restricted to the columns listed in `categorical_cols`.
te = TargetEncoder(cols=categorical_cols)

In [None]:
# Fix the forest's seed so that Restart & Run All reproduces the same fit,
# validation metric and submission file; all other hyper-parameters stay at
# the scikit-learn defaults.
ran_forest = RandomForestRegressor(random_state=42)

In [None]:
# Two-step model: target-encode the categoricals, then fit the random forest.
pipeline_steps = [('te', te), ('ran_forest', ran_forest)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline on the training split; the target is relative_volume.
pipe.fit(train_x, train_y)

In [None]:
# Predict the relative volume for the validation rows.
val_pred = pipe.predict(val_x)

In [None]:
# Keep the predictions next to the actuals; `val_pred` is positional, so this
# relies on `val_x` having the same row order as `validation_with_features`.
validation_with_features['prediction'] = val_pred

In [None]:
# Bootstrap, per forecast month, the relative gap between the summed actual and
# the summed predicted relative volume on the validation split. The mean gap
# per month is later used as the half-width (as a rate) of the interval.
n_months = range(24)
n_boots = 500
# Dedicated, seeded generator: the original drew from the unseeded global RNG,
# so the interval widths changed on every run.
bootstrap_rng = np.random.default_rng(0)
error_estimate = {}
for month in n_months:
    month_rows = validation_with_features[validation_with_features.month_num == month]
    n_rows = len(month_rows)
    if n_rows == 0:
        # No validation rows for this month: record "unknown" explicitly
        # instead of failing on a division by zero.
        error_estimate[month] = np.nan
        continue

    actual = month_rows['relative_volume'].to_numpy()
    predicted = month_rows['prediction'].to_numpy()

    # One row of resampled positions per bootstrap replicate (with replacement).
    resampled = bootstrap_rng.integers(0, n_rows, size=(n_boots, n_rows))
    actual_totals = actual[resampled].sum(axis=1)
    predicted_totals = predicted[resampled].sum(axis=1)

    relative_errors = np.abs((actual_totals - predicted_totals) / predicted_totals)
    error_estimate[month] = np.mean(relative_errors)

In [None]:
# Back-transform to the volume scale. The target was built as
# log((1 + volume) / volume_1), so the exact inverse is
# exp(prediction) * volume_1 - 1; the original omitted the "- 1" and was
# therefore biased upward by one unit.
validation_with_features['predicted_volume'] = (
    np.exp(validation_with_features['prediction']) * validation_with_features['volume_1'] - 1
)

In [None]:
def return_rate(month):
    """Return the bootstrapped error rate stored in `error_estimate` for `month`.

    Raises KeyError when no estimate exists for that month.
    """
    return error_estimate[month]

# One rate per row, looked up from the row's month number.
validation_with_features['rate'] = validation_with_features['month_num'].map(return_rate)

In [None]:
# Symmetric interval around the point forecast: +/- rate * predicted volume.
val_point_forecast = validation_with_features['predicted_volume']
val_error_rate = validation_with_features['rate']
validation_with_features['upper'] = val_point_forecast * (1 + val_error_rate)
validation_with_features['lower'] = val_point_forecast * (1 - val_error_rate)

In [None]:
# Average volumes from the project helper in tools.metrics; they are passed on
# to `prep_data_for_metric` through `compute_metrics`.
avg_volumes = get_avg_volumes()

In [None]:
# Score the validation forecast (point value plus interval) on the volume scale.
metric_inputs = dict(
    preds=validation_with_features['predicted_volume'],
    lower=validation_with_features['lower'],
    upper=validation_with_features['upper'],
    y=validation_with_features['volume'],
    X=val_x,
    avg_volumes=avg_volumes,
)
metric_pair = compute_metrics(**metric_inputs)
print(metric_pair)

In [None]:
# Working copy of the test rows, ordered so that each series is visited month
# by month; relative_volume starts at -inf as a "not predicted yet" marker.
test_x = test_x.reset_index(drop=True)
test_copy = test.sort_values(by=['country', 'brand', 'month_num']).copy()
test_copy['relative_volume'] = float('-inf')

In [None]:
# Recursive forecast over the test rows. `test_copy` is sorted by
# (country, brand, month_num), so when a row is reached the prediction for the
# previous month of the same series already exists and can be fed back in as
# `lag_relative_volume`.
#
# Changes from the original loop:
#   * the lag is a scalar taken from a dictionary of earlier predictions; the
#     original assigned a boolean-mask Series, which only worked through the
#     deprecated single-element Series -> float coercion;
#   * no full-frame boolean scan per row (was O(n^2));
#   * the model receives a one-row DataFrame that keeps the column dtypes
#     instead of an all-object transposed Series.
# Assumes (country, brand, month_num) identifies a single test row.
model_input_exclusions = ['relative_volume', 'volume']
relative_volume_pos = test_copy.columns.get_loc('relative_volume')
predicted_by_key = {}

for i in range(len(test_copy)):
    row = test_copy.iloc[i]
    country = row['country']
    brand = row['brand']
    month = row['month_num']

    if month == 0:
        # First forecast month: same convention as in training (lag fixed at 0).
        lag_relative_volume = 0
    else:
        previous_key = (country, brand, month - 1)
        if previous_key not in predicted_by_key:
            raise ValueError(
                f"No prediction available for month {month - 1} of "
                f"country={country!r}, brand={brand!r}; cannot build the lag feature."
            )
        lag_relative_volume = predicted_by_key[previous_key]

    features = test_copy.iloc[[i]].drop(columns=model_input_exclusions)
    features['lag_relative_volume'] = lag_relative_volume
    pred_val = pipe.predict(features)

    predicted_by_key[(country, brand, month)] = pred_val[0]
    test_copy.iloc[i, relative_volume_pos] = pred_val[0]

In [None]:
# Back-transform the recursive predictions to the volume scale. The target was
# log((1 + volume) / volume_1), so the inverse is exp(rv) * volume_1 - 1; the
# original left out the "- 1".
test_copy['prediction'] = np.exp(test_copy['relative_volume']) * test_copy['volume_1'] - 1
# Per-month error rate from the validation bootstrap (column-wise lookup
# instead of a row-wise apply).
test_copy['rate'] = test_copy['month_num'].map(return_rate)
test_copy['pred_95_low'] = (1 - test_copy['rate']) * test_copy['prediction']
test_copy['pred_95_high'] = (1 + test_copy['rate']) * test_copy['prediction']
# Keep only the columns written to the submission file.
test_copy_selected_colums = test_copy.loc[
    :, ['country', 'brand', 'month_num', 'pred_95_low', 'prediction', 'pred_95_high']
]

In [None]:
# Write the submission file. The output folder is created first because
# `to_csv` does not create missing directories and would raise OSError.
os.makedirs(submissions_folder, exist_ok=True)
test_copy_selected_colums.to_csv(
    os.path.join(submissions_folder, 'random_forest.csv'),
    index=False
)