In [None]:
cd ..

In [None]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Settings

In [None]:
# Folder (relative to the working directory) that holds the raw input CSVs.
DATA_RAW_PATH = "data/raw"
# File names of the raw inputs; the loading cells prefix them with DATA_RAW_PATH.
VOLUME_FILE = "gx_volume.csv"
# NOTE(review): SUBMISSIONS_FILE is not read by any cell visible in this notebook.
SUBMISSIONS_FILE = "submission_template.csv"
GENERICS_FILE = "gx_num_generics.csv"
PACKAGE_FILE = "gx_package.csv"
CHANNELS_FILE = "gx_panel.csv"
THERAPEUTIC_FILE = "gx_therapeutic_area.csv"
# Destination CSV for the assembled feature table (written by the last cell).
OUTPUT_PATH = "data/features/dataset_features.csv"

# Functions

In [None]:
def encode_variable(variable):
    """Encode the values of ``variable`` as integer labels.

    A fresh ``LabelEncoder`` is fitted on the given values and immediately
    used to transform those same values.

    Returns:
        The array of encoded labels, one per input value.
    """
    return LabelEncoder().fit_transform(variable)

def encode_month(month_name):
    """Return the cyclical ``(sin, cos)`` encoding of a month.

    Args:
        month_name: Month name parsed with the ``"%b"`` format
            (abbreviated name such as ``"Jan"``).

    Returns:
        Tuple ``(month_sin, month_cos)`` computed from the month number
        (1-12) mapped onto a full circle of 12 steps.
    """
    month_number = datetime.datetime.strptime(month_name, "%b").month
    # Same angle for both components: month_number twelfths of a full turn.
    angle = 2 * np.pi * month_number/12
    return np.sin(angle), np.cos(angle)

# Retrieve data

In [None]:
# Load the volume table; the first CSV column is used as the row index.
volume_path = "/".join([DATA_RAW_PATH, VOLUME_FILE])
volume = pd.read_csv(volume_path, index_col=0)
volume.head(n=2)

In [None]:
# Load the generics table; the first CSV column is used as the row index.
generics_path = "/".join([DATA_RAW_PATH, GENERICS_FILE])
generics = pd.read_csv(generics_path, index_col=0)
generics.head(n=2)

In [None]:
# Load the package table; the first CSV column is used as the row index.
packages_path = "/".join([DATA_RAW_PATH, PACKAGE_FILE])
packages = pd.read_csv(packages_path, index_col=0)
packages.head(n=2)

In [None]:
# Load the distribution-channel table; the first CSV column is used as the row index.
channels_path = "/".join([DATA_RAW_PATH, CHANNELS_FILE])
channels = pd.read_csv(channels_path, index_col=0)
channels.head(n=2)

In [None]:
# Load the therapeutic-area table; the first CSV column is used as the row index.
therapeutic_path = "/".join([DATA_RAW_PATH, THERAPEUTIC_FILE])
therapeutic = pd.read_csv(therapeutic_path, index_col=0)
therapeutic.head(n=2)

# Create base table

## Create list of countries and brands

In [None]:
# Base table: one row per (country, brand) pair.
# `volume` appears to hold several rows per pair (it is filtered on
# `month_num` further down), so duplicates are dropped here. Otherwise every
# pair would be repeated in all later merges and in the saved feature file.
dataset = (
    volume[["country", "brand"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
dataset.head(2)

## Encode country

In [None]:
# Integer label for each country, stored next to the original name.
dataset = dataset.assign(country_id=encode_variable(dataset["country"]))
dataset.head(n=2)

## Encode brand

In [None]:
# Integer label for each brand, stored next to the original name.
dataset = dataset.assign(brand_id=encode_variable(dataset["brand"]))
dataset.head(n=2)

## Add number of generics

In [None]:
# Left-join the generics table on (country, brand); any missing value in the
# resulting frame is replaced by 0.
dataset_with_generics = pd.merge(
    dataset,
    generics,
    how="left",
    on=["country", "brand"],
)
dataset = dataset_with_generics.fillna(0)
dataset.head(n=2)

## Add encoded package

In [None]:
# Left-join the package table on (country, brand).
dataset = pd.merge(
    dataset,
    packages,
    how="left",
    on=["country", "brand"],
)
dataset.head(n=2)

In [None]:
# Replace the textual `presentation` column by its integer label `package_id`.
dataset = (
    dataset
    .assign(package_id=lambda frame: encode_variable(frame["presentation"]))
    .drop(columns=["presentation"])
)
dataset.head(n=2)

## Add distribution channel rates

In [None]:
# Reshape to one row per (country, brand) and one column per channel;
# combinations without data are filled with 0.
# NOTE: `channels` is overwritten, so this cell cannot be re-run without
# reloading the raw table first (the `channel` column is gone afterwards).
channels = channels.pivot_table(
    index=["country", "brand"],
    columns=["channel"],
).fillna(0)
# Flatten the two-level column labels into single "<first>_<second>" names.
channels.columns = [f"{labels[0]}_{labels[1]}" for labels in channels.columns]
channels.head(n=2)

In [None]:
# Left-join the pivoted channel columns on (country, brand); any missing value
# in the resulting frame is replaced by 0.
dataset_with_channels = pd.merge(
    dataset,
    channels,
    how="left",
    on=["country", "brand"],
)
dataset = dataset_with_channels.fillna(0)
dataset.head(n=2)

## Add therapeutic area

In [None]:
# Left-join the therapeutic-area table; it is keyed by brand only.
dataset = pd.merge(dataset, therapeutic, how="left", on="brand")
dataset.head(n=2)

In [None]:
# Replace the textual `therapeutic_area` column by its integer label.
dataset = (
    dataset
    .assign(therapeutic_id=lambda frame: encode_variable(frame["therapeutic_area"]))
    .drop(columns=["therapeutic_area"])
)
dataset.head(n=2)

# Add last month encoded

In [None]:
# Keep only the rows with month_num == -1 and the three columns needed below.
# The index is reset so it runs 0..n-1, which the positional concat in the
# next cell relies on.
is_last_month = volume["month_num"] == -1
volume_last_month = (
    volume
    .loc[is_last_month, ["country", "brand", "month_name"]]
    .reset_index(drop=True)
)

In [None]:
# Cyclical (sin, cos) encoding of the last month's name, one pair per row.
volume_last_month_encoded = pd.DataFrame(
    list(volume_last_month["month_name"].apply(encode_month)),
    columns=["last_month_sin", "last_month_cos"],
)
# Rebuild from the three base columns rather than from the whole frame, so
# that re-running this cell does not append a second copy of the
# last_month_sin / last_month_cos columns.
volume_last_month = pd.concat(
    [volume_last_month[["country", "brand", "month_name"]], volume_last_month_encoded],
    axis=1,
)
volume_last_month.head()

In [None]:
# Left-join the encoded last month on (country, brand); the raw month name is
# dropped afterwards, leaving only its sin/cos encoding.
dataset_with_month = pd.merge(
    dataset,
    volume_last_month,
    how="left",
    on=["country", "brand"],
)
dataset = dataset_with_month.drop(columns="month_name")
dataset.head(n=2)

# Save dataset

In [None]:
# Final look at the first ten rows before writing the file.
dataset.head(n=10)

In [None]:
# Create the destination folder if it is missing; `to_csv` does not create
# parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
dataset.to_csv(OUTPUT_PATH, index=False)