In [None]:
# Move two directories up so that the relative "data/..." paths defined in
# Settings resolve from there (presumably the project root -- confirm).
# The explicit %cd magic is used instead of a bare `cd`: bare magics rely on
# automagic, which IPython only applies to single-line cells and which stops
# working if a variable named `cd` exists.
# NOTE: not idempotent -- re-running this cell moves up two more levels.
%cd ../..

In [None]:
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Settings

In [None]:
# Folder holding the raw input CSVs (relative to the working directory).
DATA_RAW_PATH = "data/raw"

# Raw input files, all read from DATA_RAW_PATH.
VOLUME_FILE = "gx_volume.csv"
PACKAGE_FILE = "gx_package.csv"
THERAPEUTIC_FILE = "gx_therapeutic_area.csv"
GENERICS_FILE = "gx_num_generics.csv"
CHANNELS_FILE = "gx_panel.csv"

# Destination of the feature table written by the last cell.
OUTPUT_PATH = "data/features/extra_features.csv"

# Retrieve data

In [None]:
# Volume table: later cells read its `country`, `brand`, `volume` and
# `month_num` columns. The first CSV column is used as the row index.
volume = pd.read_csv(
    DATA_RAW_PATH + "/" + VOLUME_FILE,
    index_col=0,
)
volume.head(n=2)

In [None]:
# Generics table, first CSV column used as the row index.
# NOTE(review): `generics` is not referenced again in the visible part of
# this notebook.
generics = pd.read_csv(
    DATA_RAW_PATH + "/" + GENERICS_FILE,
    index_col=0,
)
generics.head(n=2)

In [None]:
# Package table: later cells join it on `country`/`brand` and read its
# `presentation` column. The first CSV column is used as the row index.
packages = pd.read_csv(
    DATA_RAW_PATH + "/" + PACKAGE_FILE,
    index_col=0,
)
packages.head(n=2)

In [None]:
# Channel/panel table, first CSV column used as the row index.
# NOTE(review): `channels` is not referenced again in the visible part of
# this notebook.
channels = pd.read_csv(
    DATA_RAW_PATH + "/" + CHANNELS_FILE,
    index_col=0,
)
channels.head(n=2)

In [None]:
# Therapeutic-area table: later cells join it on `brand` and read its
# `therapeutic_area` column. The first CSV column is used as the row index.
therapeutic = pd.read_csv(
    DATA_RAW_PATH + "/" + THERAPEUTIC_FILE,
    index_col=0,
)
therapeutic.head(n=2)

# Create base table

## List of countries and brands

In [None]:
# Base table: one row per distinct (country, brand) pair found in `volume`,
# with a fresh 0..n-1 index. Every feature below is left-merged onto it.
dataset = (
    volume.loc[:, ["country", "brand"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)
dataset.head(n=2)

# Create extra features

## Brands in country

In [None]:
# Number of `dataset` rows (i.e. brands) per country, scaled by the largest
# per-country count so the biggest country gets 1.0.
brands_in_country = (
    dataset.groupby("country")
    .size()
    .pipe(lambda counts: counts / counts.max())
    .rename("brands_in_country")
    .reset_index()
)
brands_in_country.head(n=2)

In [None]:
# Attach the scaled per-country brand count to every (country, brand) row.
dataset = pd.merge(dataset, brands_in_country, how="left", on="country")
dataset.head(n=2)

## Countries present

In [None]:
# Number of `dataset` rows (i.e. countries) per brand, scaled by the largest
# per-brand count so the most widespread brand gets 1.0.
countries_present = (
    dataset.groupby("brand")
    .size()
    .pipe(lambda n_countries: n_countries / n_countries.max())
    .rename("countries_present")
    .reset_index()
)
countries_present.head(n=2)

In [None]:
# Attach the scaled per-brand country count to every (country, brand) row.
dataset = pd.merge(dataset, countries_present, how="left", on="brand")
dataset.head(n=2)

## Max volume per country/brand

In [None]:
# Largest `volume` value of each (country, brand), scaled by the overall
# largest of those maxima.
volume_max = (
    volume.groupby(["country", "brand"])["volume"]
    .max()
    .rename("volume_max")
    .reset_index()
)
volume_max["volume_max"] = volume_max["volume_max"].div(volume_max["volume_max"].max())
volume_max.head(n=2)

In [None]:
# Attach the scaled maximum volume to its (country, brand) row.
dataset = pd.merge(dataset, volume_max, how="left", on=["country", "brand"])
dataset.head(n=2)

## Max volume per package

In [None]:
# Only these `packages` columns are read here.
package_keys = packages[["country", "brand", "presentation"]]

# Step 1: largest `volume` value observed for each presentation, taken over
# every (country, brand) that uses it. `volume` is joined to `packages`
# directly: routing it through `dataset` first (as before) added nothing to
# the maxima and made the cell depend on how often `dataset` had been merged.
presentation_max = (
    volume[["country", "brand", "volume"]]
    .merge(package_keys, on=["country", "brand"], how="left")
    .groupby("presentation")["volume"]
    .max()
    .rename("package_max_volume")
    .reset_index()
)

# Step 2: map that per-presentation maximum back onto each (country, brand).
# The result is collapsed to ONE row per (country, brand) -- keeping the
# largest value -- so that the left merge into `dataset` in the next cell can
# never duplicate rows if a pair is listed with several presentations.
# sort=False keeps the pairs in their order of first appearance in `packages`.
packages_max = (
    package_keys.merge(presentation_max, on="presentation", how="left")
    .groupby(["country", "brand"], sort=False)["package_max_volume"]
    .max()
    .reset_index()
)

# Scale by the overall maximum so the feature's largest value is 1.0.
packages_max["package_max_volume"] = packages_max["package_max_volume"] / packages_max["package_max_volume"].max()
packages_max.head(2)

In [None]:
# Attach the scaled per-package maximum volume to its (country, brand) row.
dataset = pd.merge(dataset, packages_max, how="left", on=["country", "brand"])
dataset.head(n=2)

## Number of brands for the same therapeutic area in country

In [None]:
# Count the (country, brand) rows that fall into each therapeutic area within
# each country, then scale by the largest such count.
therapeutic_num = (
    pd.merge(dataset[["country", "brand"]], therapeutic, how="left", on="brand")
    .groupby(["country", "therapeutic_area"])
    .size()
    .pipe(lambda counts: counts / counts.max())
    .rename("therapeutic_in_country")
    .reset_index()
)
therapeutic_num.head(n=2)

In [None]:
# Look up each brand's therapeutic area, use it (together with the country)
# to fetch the scaled count, then discard the helper `therapeutic_area`
# column again.
dataset = (
    pd.merge(dataset, therapeutic, how="left", on="brand")
    .merge(therapeutic_num, how="left", on=["country", "therapeutic_area"])
    .drop(columns=["therapeutic_area"])
)
dataset.head(n=2)

## Volume standard deviation per country/brand

In [None]:
# Standard deviation of `volume` within each (country, brand), scaled by the
# largest of those standard deviations.
volume_std = (
    volume.groupby(["country", "brand"])["volume"]
    .std()
    .rename("volume_std")
    .reset_index()
)
volume_std["volume_std"] = volume_std["volume_std"].div(volume_std["volume_std"].max())
volume_std.head(n=2)

In [None]:
# Attach the scaled volume standard deviation to its (country, brand) row.
dataset = pd.merge(dataset, volume_std, how="left", on=["country", "brand"])
dataset.head(n=2)

## Time to generics

In [None]:
# Count, for every (country, brand), the rows of `volume` whose `month_num`
# is negative. Summing a 0/1 flag (instead of filtering first and counting
# what is left) keeps pairs that have no negative month in the result with a
# count of 0, so they no longer turn into NaN when left-merged into `dataset`.
time_to_generics = (
    volume.assign(is_negative_month=(volume["month_num"] < 0).astype(int))
    .groupby(["country", "brand"])["is_negative_month"]
    .sum()
    .rename("time_to_generics")
    .reset_index()
)
# Scale by the largest count so the feature's largest value is 1.0.
time_to_generics["time_to_generics"] = time_to_generics["time_to_generics"] / time_to_generics["time_to_generics"].max()
time_to_generics.head(2)

In [None]:
# Attach the scaled negative-month count to its (country, brand) row.
dataset = pd.merge(dataset, time_to_generics, how="left", on=["country", "brand"])
dataset.head(n=2)

# Save dataset

In [None]:
# Final look at the assembled feature table before it is written to disk.
dataset.head(n=10)

In [None]:
# Make sure the destination folder exists: `to_csv` does not create missing
# directories and would otherwise fail when the folder is absent.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
# Write the feature table without the positional index column.
dataset.to_csv(OUTPUT_PATH, index=False)