The idea here is to find regions with similar GDPs and numbers of healthcare workers, and from those groups of regions compute the average sales for brands 1 and 2.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules (e.g. utils) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1 - read data

In [2]:
from utils import get_data_from_path

In [3]:
# Locations of the three data splits.
train_path = '../data/data_split/train'
val_path = '../data/data_split/val'
test_path = '../data/data_split/test'

# Load every split with the shared helper; each result is later indexed by
# table name ('regions', 'regions_hcps', 'sales_train').
train_data = get_data_from_path(train_path)
val_data = get_data_from_path(val_path)
test_data = get_data_from_path(test_path)

# 2 - groups

In [4]:
import pandas as pd
import numpy as np

In [6]:
# Pull the training tables out of the loaded split.
train_regions, train_regions_hcps = train_data['regions'], train_data['regions_hcps']
train_sales = train_data['sales_train']

# Region attributes joined with the per-region HCP counts. No key is given,
# so pandas joins on whatever columns the two tables have in common.
train_regions_data = pd.merge(train_regions, train_regions_hcps)

In [7]:
# Pull the validation tables out of the loaded split.
val_regions, val_regions_hcps = val_data['regions'], val_data['regions_hcps']
val_sales = val_data['sales_train']

# Region attributes joined with the per-region HCP counts. No key is given,
# so pandas joins on whatever columns the two tables have in common.
val_regions_data = pd.merge(val_regions, val_regions_hcps)

In [15]:
# Pull the test tables out of the loaded split.
test_regions, test_regions_hcps = test_data['regions'], test_data['regions_hcps']
test_sales = test_data['sales_train']

# Region attributes joined with the per-region HCP counts. No key is given,
# so pandas joins on whatever columns the two tables have in common.
test_regions_data = pd.merge(test_regions, test_regions_hcps)

In [16]:
# Specialty columns that are added up into a single per-region HCP total.
# The spellings must match the column names in the data exactly.
hcp_columns = ['Internal medicine', 'Internal medicine / pneumology',
               'General practicioner', 'Internal medicine and general practicioner',
               'Pediatrician']

# Add the row-wise total to each split's region table.
for regions_frame in (train_regions_data, val_regions_data, test_regions_data):
    regions_frame['total_hcps'] = regions_frame[hcp_columns].sum(axis=1)



## 2.1 - datasets for validation

In [9]:
# Bucket every region into quartile-based groups, one grouping per feature.
feats_to_group = ['pci16', 'pci18', 'Internal medicine / pneumology', 'total_hcps',
                  'population', 'Pediatrician']

for f in feats_to_group:
    # Bin edges are derived from the training regions only and then applied
    # unchanged to the validation regions.
    quartile_edges = np.quantile(train_regions_data[f], q=[0, 0.25, 0.5, 0.75])
    # Final edges: 0, train minimum, Q1, median, Q3, +inf. The leading 0 lets
    # values below the training minimum still land in a bin; the trailing inf
    # does the same for values above Q3.
    # NOTE: pd.cut bins are right-closed by default, so a value of exactly 0
    # (or a negative one) falls outside every bin and gets a NaN group.
    groups = np.concatenate(([0], quartile_edges, [np.inf]))
    # duplicates='drop' collapses coinciding edges (e.g. a training minimum of
    # 0, or tied quartiles) instead of raising "Bin edges must be unique".
    # When all edges are distinct the result is unchanged.
    train_regions_data[f'{f}_group'] = pd.cut(train_regions_data[f], groups, duplicates='drop')
    val_regions_data[f'{f}_group'] = pd.cut(val_regions_data[f], groups, duplicates='drop')

In [10]:
# Mean sales per group, one table per grouped feature, from training data only.
# The sales/regions join does not depend on the feature, so it is built once.
sales_with_regions = train_sales.merge(train_regions_data)

averages = {}
for f in feats_to_group:
    group_col = f'{f}_group'
    group_means = (
        sales_with_regions
        .groupby([group_col], as_index=False)[['sales']]
        .mean()
        .rename(columns={'sales': f'{f}_mean_sales'})
    )
    # Two columns per table: the group interval and that group's mean sales.
    averages[f] = group_means

In [11]:
# Attach each feature's group-mean sales to the training regions.
# merge() is an inner join by default, so a region whose group has no row in
# the averages table is dropped.
train_feats = train_regions_data
for feat in feats_to_group:
    train_feats = pd.merge(train_feats, averages[feat], on=f'{feat}_group')

# Expand to one row per sales record, then keep only the identifying columns
# and the generated *_mean_sales features.
train_feats = pd.merge(train_feats, train_sales)
mean_sales_cols = [c for c in train_feats.columns if c.endswith('mean_sales')]
columns = ['month', 'brand', 'region'] + mean_sales_cols
train_feats = train_feats[columns]

In [12]:
# Attach the training-derived group-mean sales to the validation regions.
# merge() is an inner join by default, so a region whose group has no row in
# the averages table is dropped.
val_feats = val_regions_data
for feat in feats_to_group:
    val_feats = pd.merge(val_feats, averages[feat], on=f'{feat}_group')

# Expand to one row per validation sales record, then keep only the
# identifying columns and the generated *_mean_sales features.
val_feats = pd.merge(val_feats, val_sales)
mean_sales_cols = [c for c in val_feats.columns if c.endswith('mean_sales')]
columns = ['month', 'brand', 'region'] + mean_sales_cols
val_feats = val_feats[columns]

In [13]:
# Persist the validation-stage features. The validation split plays the role
# of "test" here, hence the test_* file name for val_feats.
for feats_frame, csv_path in (
    (train_feats, '../data/features/train_group_features_for_validation.csv'),
    (val_feats, '../data/features/test_group_features_for_validation.csv'),
):
    feats_frame.to_csv(csv_path)

## 2.2 - datasets for test

In [20]:
# For the final (test) stage, train and validation are pooled and treated as
# a single training set. Row labels are kept as-is, so they may repeat.
regions_parts = [train_regions_data, val_regions_data]
sales_parts = [train_sales, val_sales]

train_and_val_regions_data = pd.concat(regions_parts)
train_and_val_sales = pd.concat(sales_parts)

In [21]:
# Bucket every region into quartile-based groups, one grouping per feature.
feats_to_group = ['pci16', 'pci18', 'Internal medicine / pneumology', 'total_hcps',
                  'population', 'Pediatrician']

for f in feats_to_group:
    # Bin edges are derived from the pooled train+validation regions and then
    # applied unchanged to the test regions.
    quartile_edges = np.quantile(train_and_val_regions_data[f], q=[0, 0.25, 0.5, 0.75])
    # Final edges: 0, pooled minimum, Q1, median, Q3, +inf. The leading 0 lets
    # values below the pooled minimum still land in a bin; the trailing inf
    # does the same for values above Q3.
    # NOTE: pd.cut bins are right-closed by default, so a value of exactly 0
    # (or a negative one) falls outside every bin and gets a NaN group.
    groups = np.concatenate(([0], quartile_edges, [np.inf]))
    # duplicates='drop' collapses coinciding edges (e.g. a minimum of 0, or
    # tied quartiles) instead of raising "Bin edges must be unique". When all
    # edges are distinct the result is unchanged.
    train_and_val_regions_data[f'{f}_group'] = pd.cut(
        train_and_val_regions_data[f], groups, duplicates='drop')
    test_regions_data[f'{f}_group'] = pd.cut(test_regions_data[f], groups, duplicates='drop')

In [22]:
# Mean sales per group, one table per grouped feature, from the pooled
# train+validation data.
# The sales/regions join does not depend on the feature, so it is built once.
sales_with_regions = train_and_val_sales.merge(train_and_val_regions_data)

averages = {}
for f in feats_to_group:
    group_col = f'{f}_group'
    group_means = (
        sales_with_regions
        .groupby([group_col], as_index=False)[['sales']]
        .mean()
        .rename(columns={'sales': f'{f}_mean_sales'})
    )
    # Two columns per table: the group interval and that group's mean sales.
    averages[f] = group_means

In [23]:
# Attach each feature's group-mean sales to the pooled train+validation regions.
# merge() is an inner join by default, so a region whose group has no row in
# the averages table is dropped.
train_feats = train_and_val_regions_data
for feat in feats_to_group:
    train_feats = pd.merge(train_feats, averages[feat], on=f'{feat}_group')

# Expand to one row per sales record, then keep only the identifying columns
# and the generated *_mean_sales features.
train_feats = pd.merge(train_feats, train_and_val_sales)
mean_sales_cols = [c for c in train_feats.columns if c.endswith('mean_sales')]
columns = ['month', 'brand', 'region'] + mean_sales_cols
train_feats = train_feats[columns]

In [24]:
# Attach the train+validation group-mean sales to the test regions.
# merge() is an inner join by default, so a region whose group has no row in
# the averages table is dropped.
test_feats = test_regions_data
for f in feats_to_group:
    test_feats = test_feats.merge(averages[f], on=f'{f}_group')

# Expand to one row per *test* sales record. This previously merged with
# val_sales (a copy-paste slip from the validation section), which paired the
# test regions with the validation split's sales rows.
test_feats = test_feats.merge(test_sales)

# Keep only the identifying columns and the generated *_mean_sales features.
columns = ['month', 'brand', 'region'] + [c for c in test_feats.columns if c.endswith('mean_sales')]
test_feats = test_feats[columns]

In [94]:
# Persist the test-stage features: pooled train+validation rows and test rows.
for feats_frame, csv_path in (
    (train_feats, '../data/features/train_group_features_for_test.csv'),
    (test_feats, '../data/features/test_group_features_for_test.csv'),
):
    feats_frame.to_csv(csv_path)

In [95]:
# Sanity check: list the columns of the train+validation feature table saved above.
train_feats.columns

Index(['month', 'brand', 'region', 'pci16_mean_sales', 'pci18_mean_sales',
       'Internal medicine / pneumology_mean_sales', 'total_hcps_mean_sales',
       'population_mean_sales', 'Pediatrician_mean_sales'],
      dtype='object')