In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# The data folder sits next to the current working directory (same parent).
root_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(root_dir, 'data')

# Every path below is a direct child of data_dir.
(
    full_raw_initial_dataset_path,
    volume_path,
    train_path,
    features_path,
) = [
    os.path.join(data_dir, leaf)
    for leaf in (
        'gx_merged_lags_months.csv',
        'gx_volume.csv',
        'train_split.csv',
        'features',
    )
]

In [None]:
# Read the merged dataset and the volume table (the first CSV column of the
# volume file is used as its index).
full_raw_initial_dataset = pd.read_csv(full_raw_initial_dataset_path)
volume = pd.read_csv(volume_path, index_col=0)

# Keep only the rows flagged test == 0, drop the flag column, then remove
# exact duplicate rows.
full_initial_dataset = (
    full_raw_initial_dataset[full_raw_initial_dataset['test'] == 0]
    .drop(columns=['test'])
    .drop_duplicates()
)

In [None]:
def find_closest_volume(country, brand, month_num, length_serie, func):
    """Aggregate the most recent volumes of one country/brand before a month.

    Reads the module-level ``volume`` DataFrame (columns ``country``,
    ``brand``, ``month_num`` and ``volume``), keeps the rows of the requested
    country and brand whose ``month_num`` is strictly smaller than
    ``month_num``, and applies ``func`` to the ``length_serie`` most recent
    of them.

    Parameters
    ----------
    country, brand
        Values compared for equality with ``volume.country`` / ``volume.brand``.
    month_num
        Exclusive upper bound on ``volume.month_num``.
    length_serie
        Maximum number of observations to aggregate; fewer are used when the
        matching history is shorter.
    func
        Callable applied to a 1-D NumPy array of volumes ordered from the
        most recent observation to the oldest.

    Returns
    -------
    The value returned by ``func``, or ``np.nan`` when no observation matches.
    """
    in_scope = (
        (volume.country == country)
        & (volume.brand == brand)
        & (volume.month_num < month_num)
    )
    # Most recent month first, so the head of the series is the closest history.
    recent_first = volume.loc[in_scope, :].sort_values(by=['month_num'], ascending=False)
    # Slicing past the end is safe, so the window never needs an explicit clamp.
    volumes_selected = recent_first['volume'].values[:length_serie]
    if volumes_selected.size == 0:
        # np.amax / np.amin raise ValueError on an empty array and np.mean /
        # np.median emit a RuntimeWarning; report "no history" as NaN instead.
        return np.nan
    return func(volumes_selected)

In [None]:
# One row per distinct (country, brand) pair found in the filtered dataset.
base_df = full_initial_dataset[['country', 'brand']].drop_duplicates()

In [None]:
# Mean volume over the 2 most recent observations with month_num < 0,
# computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
base_df['offset_mean_last_2_months'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    2,
    np.mean
)

In [None]:
# Mean volume over the 12 most recent observations with month_num < 0,
# computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
base_df['offset_mean_last_12_months'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    12,
    np.mean
)

In [None]:
# Median volume over the 12 most recent observations with month_num < 0,
# computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
base_df['offset_median_last_12_months'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    12,
    np.median
)

In [None]:
# Largest volume among the 12 most recent observations with month_num < 0,
# computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
# NOTE(review): this stores the column as float; assumes volume.volume is a
# float column (or that float output is acceptable) - confirm.
base_df['offset_max_last_12_months'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    12,
    np.amax
)

In [None]:
# Smallest volume among the 12 most recent observations with month_num < 0,
# computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
# NOTE(review): this stores the column as float; assumes volume.volume is a
# float column (or that float output is acceptable) - confirm.
base_df['offset_min_last_12_months'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    12,
    np.amin
)

In [None]:
# Volume of the single most recent observation with month_num < 0 (the mean
# of one value), computed per (country, brand) row.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
base_df['offset_mean_last_month'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    1,
    np.mean
)

In [None]:
# Mean volume over up to 400 of the most recent observations with
# month_num < 0, computed per (country, brand) row.
# NOTE(review): 400 presumably exceeds every available history so that the
# whole series is averaged - confirm against the data.
# otypes is passed explicitly: without it np.vectorize calls the function one
# extra time on the first row just to infer the output dtype, and it refuses
# to run on an empty base_df.
base_df['offset_mean_historical'] = np.vectorize(find_closest_volume, otypes=[float])(
    base_df['country'],
    base_df['brand'],
    0,
    400,
    np.mean
)

In [None]:
# Write base_df to offsets.csv (a relative path, so it is resolved against
# the working directory), leaving out the DataFrame index.
base_df.to_csv(path_or_buf='offsets.csv', index=False)