In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


The main idea behind this model is:

1. find `N` similar `(brand, region)` in the train dataset using the `brand_3`, `brand_3_market` and `brand_12_market` features.
2. compute the `mean` and the `std` deviation of these `N` groups
3. predict `(mean - X * std, mean + X * std)`, where `X ~ 1.2815`.

In [3]:
# CONSTANTS 
N = 5
X = 1.2815

# 1 - read data

In [4]:
from utils import get_data_from_path

In [5]:
train_path = '../data/data_split/train'
train_data = get_data_from_path(train_path)

val_path = '../data/data_split/val'
val_data = get_data_from_path(val_path)

test_path = '../data/data_split/test'
test_data = get_data_from_path(test_path)

# 2 - correlations

In [6]:
import numpy as np
import pandas as pd

In [7]:
index = ['month', 'region']
correlation_features = ['brand_3', 'brand_3_market', 'brand_12_market']

In [8]:
train_sales = train_data['sales_train']
val_sales = val_data['sales_train']
test_sales = test_data['sales_train']

In [9]:
pivoted_train_sales = pd.pivot(train_sales, index=index,columns='brand', values='sales').reset_index()
pivoted_val_sales = pd.pivot(val_sales, index=index,columns='brand', values='sales').reset_index()
pivoted_test_sales = pd.pivot(test_sales, index=index,columns='brand', values='sales').reset_index()

## 2.1 - datasets for validation

In [149]:
def get_correlation_dict(A: pd.DataFrame, B: pd.DataFrame, n: int):
    d = dict()
    for a_group, a_group_data in A.groupby('region'):
        tmp = dict()
        for b_group, b_group_data in B.groupby('region'):
            if a_group == b_group:
                continue
            corrs = []
            for cf in correlation_features:
                c = np.corrcoef(a_group_data[cf], b_group_data[cf])[0][1]
                corrs.append(c)
            tmp[b_group] = np.mean(corrs)
            
        d[a_group] = sorted(tmp, reverse=True, key = lambda x: tmp[x])[:n]
    return d, tmp

In [151]:
_, tmp = corr_dict_train = get_correlation_dict(A, B, n=N)

In [152]:
tmp

{'region_0': 0.6582416535935377,
 'region_1': 0.7846026705095182,
 'region_10': 0.7277913664052148,
 'region_100': 0.7337227033677758,
 'region_101': 0.5780563981933647,
 'region_102': 0.6280860401164047,
 'region_103': 0.5646658046171983,
 'region_104': 0.7046965172920299,
 'region_105': 0.5357161150783791,
 'region_106': 0.6672154216005725,
 'region_107': 0.7018781711821173,
 'region_108': 0.8002431632189406,
 'region_109': -0.018691019500649952,
 'region_11': 0.7641073447123574,
 'region_110': 0.33863982198988585,
 'region_111': 0.7515492692727835,
 'region_112': 0.6270180151465773,
 'region_113': 0.3221960139985767,
 'region_114': 0.7358264286653311,
 'region_115': 0.8449410268207934,
 'region_116': 0.7741244789079743,
 'region_117': 0.5459592641971396,
 'region_118': 0.5797097185434944,
 'region_119': 0.8400963634528628,
 'region_12': 0.7028219003686661,
 'region_120': 0.8754372762151915,
 'region_121': 0.5710675949704852,
 'region_122': 0.6429027916065323,
 'region_123': 0.747021

In [112]:
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

FEATURES = ('mean', 'std', percentile(10), percentile(90), 'min', 'max')

def get_features_using_correlations(train, test, corrs):
    test_features = []
    for region, region_data in test.groupby(['region']):
        x = train[train['region'].isin(corrs[region])].groupby('month').agg({'brand_1': FEATURES,
                                                                             'brand_2': FEATURES})
        for _, c in x.columns:
            region_data[f'brand_1_similar_{c}'] = x['brand_1'][c].values
            region_data[f'brand_2_similar_{c}'] = x['brand_2'][c].values
            
        test_features.append(region_data)
    return pd.concat(test_features).reset_index(drop=True)

In [103]:
# dataset to train the validation model
A = pivoted_train_sales.copy()
B = pivoted_train_sales.copy()
corr_dict_train = get_correlation_dict(A, B, n=N)
validation_train_features = get_features_using_correlations(train=A, test=B, corrs=corr_dict_train)

In [108]:
# dataset to test the validation model
A = pivoted_val_sales.copy()
B = pivoted_train_sales.copy()
corr_dict_val = get_correlation_dict(A, B, n=N)
validation_test_features = get_features_using_correlations(train=B, test=A, corrs=corr_dict_val)

In [142]:
to_drop = ['brand_1', 'brand_12_market', 'brand_2', 'brand_3', 'brand_3_market']

In [143]:
validation_train_features.drop(to_drop, axis=1).to_csv("train_correlation_features_for_validation.csv", index=False)
validation_test_features.drop(to_drop, axis=1).to_csv("test_correlation_features_for_validation.csv", index=False)

## 2.2 - dataset for test

In [135]:
# dataset to train the test model
A = pd.concat([pivoted_train_sales, pivoted_val_sales])
B = pd.concat([pivoted_train_sales, pivoted_val_sales])
corr_dict_train = get_correlation_dict(A, B, n=N)
train_train_features = get_features_using_correlations(train=A, test=B, corrs=corr_dict_train)

In [136]:
# dataset to test the test model
A = pivoted_test_sales.copy()
B = pd.concat([pivoted_train_sales, pivoted_val_sales])
corr_dict_test = get_correlation_dict(A, B, n=N)
train_test_features = get_features_using_correlations(train=B, test=A, corrs=corr_dict_test)

In [145]:
to_drop = ['brand_12_market', 'brand_3', 'brand_3_market']

In [146]:
train_train_features.drop(to_drop, axis=1).to_csv("train_correlation_features_for_test.csv", index=False)
train_test_features.drop(to_drop, axis=1).to_csv("test_correlation_features_for_test.csv", index=False)