In [None]:
cd ../..

In [None]:
import numpy as np
import pandas as pd

# Settings

In [None]:
# Directories, relative to the current working directory.
DATA_RAW_PATH = "data/raw"
FEATURES_PATH = "data/features"

# Raw input, read from DATA_RAW_PATH.
VOLUME_FILE = "gx_volume.csv"

# Pre-computed feature tables, read from FEATURES_PATH.
BASIC_FEATURES_FILE = "basic_features.csv"
MAX_AVG_FILE = "max_avg.csv"
MONTH_ENCODED_FILE = "months_encoded.csv"
EXTRA_FEATURES_FILE = "extra_features.csv"
# Backward-compatible alias: despite its name this holds a file name, not a
# path, and is joined with FEATURES_PATH when the table is loaded.
EXTRA_FEATURES_PATH = EXTRA_FEATURES_FILE

# Other raw file names, kept for reference; nothing in the visible cells reads them.
# GENERICS_FILE = "gx_num_generics.csv"
# PACKAGE_FILE = "gx_package.csv"
# CHANNELS_FILE = "gx_panel.csv"
# THERAPEUTIC_FILE = "gx_therapeutic_area.csv"

# Destination of the merged dataset. Built from FEATURES_PATH so the features
# directory is spelled out in exactly one place.
FINAL_FEATURES_FILE = "final_features.csv"
OUTPUT_PATH = f"{FEATURES_PATH}/{FINAL_FEATURES_FILE}"

# Retrieve data

In [None]:
# Load the raw volume table; the first CSV column is used as the index.
volume_csv = f"{DATA_RAW_PATH}/{VOLUME_FILE}"
volume = pd.read_csv(volume_csv, index_col=0)
# Preview the first two rows.
volume.head(2)

In [None]:
# Load the basic feature table from the features directory.
basic_features_csv = f"{FEATURES_PATH}/{BASIC_FEATURES_FILE}"
basic_features = pd.read_csv(basic_features_csv)
# Preview the first two rows.
basic_features.head(2)

In [None]:
# Load the max/avg feature table from the features directory.
max_avg_csv = f"{FEATURES_PATH}/{MAX_AVG_FILE}"
max_avg_features = pd.read_csv(max_avg_csv)
# Preview the first two rows.
max_avg_features.head(2)

In [None]:
# Load the month-encoded feature table from the features directory.
month_encoded_csv = f"{FEATURES_PATH}/{MONTH_ENCODED_FILE}"
month_encoded_features = pd.read_csv(month_encoded_csv)
# Preview the first two rows.
month_encoded_features.head(2)

In [None]:
# Load the extra feature table. EXTRA_FEATURES_PATH holds a file name (not a
# full path), so it is joined with the features directory like the other tables.
extra_features_csv = f"{FEATURES_PATH}/{EXTRA_FEATURES_PATH}"
extra_features = pd.read_csv(extra_features_csv)
# Preview the first two rows.
extra_features.head(2)

# Create dataset

### Basic Features

In [None]:
# Join keys used for the feature merges that follow.
id_cols = ["country", "brand"]

# Start the dataset from the volume table. A left join keeps every volume row;
# rows with no matching key get NaN in the added feature columns.
dataset = pd.merge(volume, basic_features, how="left", on=id_cols)

### Max/Avg

In [None]:
# Left-join the max/avg features onto the dataset using the keys in `id_cols`.
dataset = pd.merge(dataset, max_avg_features, how="left", on=id_cols)

### Extra Features

In [None]:
# Left-join the extra features onto the dataset using the keys in `id_cols`.
dataset = pd.merge(dataset, extra_features, how="left", on=id_cols)

### Month Encoded

In [None]:
# The month-encoded table is joined on the month number in addition to
# country/brand. A separate name is used for these keys so that `id_cols`
# keeps its two-column value and the earlier merge cells can still be re-run.
month_id_cols = ["country", "brand", "month_num"]

# Remove "month_name" from the right-hand table before joining (presumably so
# the dataset does not end up with a duplicated month_name column - TODO confirm).
# The result goes into a new frame rather than overwriting
# `month_encoded_features`, so executing this cell twice does not raise a
# KeyError on the already-dropped column.
month_encoded_keyed = month_encoded_features.drop(columns="month_name")

# Left join: all dataset rows are kept; unmatched rows get NaN features.
dataset = dataset.merge(month_encoded_keyed, on=month_id_cols, how="left")

### Normalizations

In [None]:
# Normalize volume: express each row's volume as a fraction of its
# "max_volume" value and store it in a new "volume_norm" column.
volume_values = dataset["volume"]
max_volume_values = dataset["max_volume"]
dataset["volume_norm"] = volume_values / max_volume_values

In [None]:
# Normalize Channel Rate: divide each channel-rate column by 100, in place.
# NOTE: the columns are overwritten, so re-running this cell divides the
# already-scaled values by 100 again.
channel_rate_cols = (
    "channel_rate_A",
    "channel_rate_B",
    "channel_rate_C",
    "channel_rate_D",
)
for rate_col in channel_rate_cols:
    dataset[rate_col] = dataset[rate_col] / 100

In [None]:
# Normalize Num Generics by a fixed divisor.
# Note: We've found a max of 50 competitors
# TODO: Scale by std/mean
# NOTE: the column is overwritten, so re-running this cell divides it again.
MAX_NUM_GENERICS = 50
generics_scaled = dataset["num_generics"] / MAX_NUM_GENERICS
dataset["num_generics"] = generics_scaled

### Drop redundant columns

In [None]:
# Columns removed from the final dataset.
# NOTE: dropping raises KeyError if any of them is already gone, e.g. when this
# cell is executed a second time on the same `dataset`.
redundant_columns = [
    "month_name",
    "channel_rate_D",
    "last_month_sin",
    "last_month_cos",
]
dataset = dataset.drop(redundant_columns, axis="columns")

In [None]:
# Preview two random rows of the assembled dataset. The fixed seed makes the
# displayed rows identical on every run, so the saved output is reproducible.
dataset.sample(2, random_state=42)

# Save dataset

In [None]:
# Write the final dataset to OUTPUT_PATH as CSV, without the row index.
dataset.to_csv(path_or_buf=OUTPUT_PATH, index=False)