In [None]:
cd ..

In [None]:
import pandas as pd

from src.utils import read_raw_file, save_dataset

# Settings

In [None]:
# Directories, relative to the working directory the notebook runs from.
DATA_RAW_PATH = "data/raw"
DATA_CLEAN_PATH = "data/clean"

# Input file names (used together with DATA_RAW_PATH below).
VOLUME_FILE = "gx_volume.csv"
GENERICS_FILE = "gx_num_generics.csv"
PACKAGE_FILE = "gx_package.csv"
PANEL_FILE = "gx_panel.csv"
AREA_FILE = "gx_therapeutic_area.csv"
SUBMISSION_FILE = "submission_template.csv"

# Output file names (used together with DATA_CLEAN_PATH below).
OUTPUT_NAME = "dataset_complete.csv"
OUTPUT_NAME_PROVIDED = "dataset_complete_provided.csv"

# Retrieve data

In [None]:
# Load the volume file from the raw-data directory through the project helper
# (parsing details live in src.utils.read_raw_file), then preview two rows.
volume = read_raw_file(DATA_RAW_PATH, VOLUME_FILE)
volume.head(2)

In [None]:
# The submission template is read with plain pd.read_csv instead of read_raw_file.
# NOTE(review): presumably the template needs none of the helper's processing — confirm.
submission = pd.read_csv(f"{DATA_RAW_PATH}/{SUBMISSION_FILE}")
submission.head(2)

In [None]:
# Load the generics-count file; it is later joined on (country, brand).
competitors = read_raw_file(DATA_RAW_PATH, GENERICS_FILE)
competitors.head(2)

In [None]:
# Load the package file; its `presentation` column is one-hot encoded further down.
packages = read_raw_file(DATA_RAW_PATH, PACKAGE_FILE)
packages.head(2)

In [None]:
# Load the panel file; it is pivoted by its `channel` column further down.
channels = read_raw_file(DATA_RAW_PATH, PANEL_FILE)
channels.head(2)

In [None]:
# Load the therapeutic-area file; it is later one-hot encoded and joined on `brand`.
bodyparts = read_raw_file(DATA_RAW_PATH, AREA_FILE)
bodyparts.head(2)

# Create base table

## Join provided and required datasets

In [None]:
# Remove the month label and flag every row as coming from the provided data
# (provided=True) rather than from the submission template (submission=False).
volume = (
    volume
    .drop(columns=["month_name"])
    .assign(provided=True, submission=False)
)
volume.head(2)

In [None]:
# Show the template as loaded, before its columns are reshaped in the next cell.
submission

In [None]:
# Reshape the template so it can be stacked with `volume`: discard the interval
# columns, treat `prediction` as `volume`, and flag the rows as submission rows.
submission = (
    submission
    .drop(columns=["pred_95_low", "pred_95_high"])
    .rename(columns={"prediction": "volume"})
    .assign(provided=False, submission=True)
)
# Fix the column order explicitly.
submission = submission[
    ["country", "brand", "volume", "month_num", "provided", "submission"]
]

In [None]:
# Stack provided and submission rows, then collapse to one row per
# (country, brand, month_num) by summing every remaining column.
# NOTE(review): the sum is also applied to the `provided` / `submission` flags,
# so later cells should not assume they are still strict booleans.
dataset = (
    pd.concat([volume, submission])
    .groupby(["country", "brand", "month_num"], as_index=False)
    .sum()
)
dataset.head(2)

## Add number of competitors

In [None]:
# Left join keeps every dataset row. fillna(0) then zeroes every missing value
# in the merged frame, not only the columns coming from `competitors`.
dataset = pd.merge(
    dataset,
    competitors,
    how="left",
    on=["country", "brand"],
).fillna(0)
dataset.head(2)

## Add type of drug

In [None]:
# One-hot encode `presentation` into `package_<value>` indicator columns.
packages = pd.get_dummies(
    data=packages,
    prefix="package",
    columns=["presentation"],
)
packages.head(2)

In [None]:
# Attach the package indicators per (country, brand); pairs without a match
# get 0 in every indicator via fillna(0), which applies to the whole frame.
dataset = pd.merge(
    dataset,
    packages,
    how="left",
    on=["country", "brand"],
).fillna(0)
dataset.head(2)

## Add distribution channel rates

In [None]:
# Spread the channels into columns, one row per (country, brand); combinations
# that do not occur are filled with 0.
channels = channels.pivot_table(
    index=["country", "brand"],
    columns=["channel"],
).fillna(0)
# Flatten the two-part column labels into single "<first>_<second>" strings.
channels.columns = [f"{label[0]}_{label[1]}" for label in channels.columns]
channels.head(2)

In [None]:
# Attach the per-channel columns; `channels` carries country/brand in its index,
# and the join keys are matched by name. Missing values become 0 frame-wide.
dataset = pd.merge(
    dataset,
    channels,
    how="left",
    on=["country", "brand"],
).fillna(0)
dataset.head(2)

## Add body part

In [None]:
# One-hot encode `therapeutic_area` into `bodypart_<value>` indicator columns.
bodyparts = pd.get_dummies(
    data=bodyparts,
    prefix="bodypart",
    columns=["therapeutic_area"],
)
bodyparts.head(2)

In [None]:
# Attach the therapeutic-area indicators. Unlike the other joins this one is
# keyed on `brand` alone. Missing values become 0 frame-wide.
dataset = pd.merge(
    dataset,
    bodyparts,
    how="left",
    on="brand",
).fillna(0)
dataset.head(2)

## Transform country and brand into dummies

In [None]:
# One-hot encode country and brand. With an empty prefix and an empty separator
# each indicator column is named after the bare category value.
dataset_dummies = pd.get_dummies(
    dataset[["country", "brand"]],
    columns=["country", "brand"],
    prefix="",
    prefix_sep="",
)
# The original country/brand columns are kept alongside the indicators.
dataset = pd.concat([dataset, dataset_dummies], axis=1)
dataset.head(2)

## Reorder columns

In [None]:
# Bookkeeping flags, identifiers and the target go first; every other column
# follows in its current order.
first_columns = ["provided", "submission", "country", "brand", "volume"]
last_columns = dataset.columns[~dataset.columns.isin(first_columns)].tolist()
dataset = dataset[[*first_columns, *last_columns]]
dataset.head(2)

# Save complete dataset

In [None]:
# Final look at the assembled frame before it is written out.
dataset.head(10)

In [None]:
# Hand the complete frame (provided and submission rows) to the project helper,
# passing the clean-data directory and output file name.
save_dataset(dataset, DATA_CLEAN_PATH, OUTPUT_NAME)

## Only provided

In [None]:
# Keep only the rows flagged as provided. The comparison stays explicit because
# `provided` went through a groupby-sum earlier in the notebook and may hold
# 0/1 rather than booleans, so it cannot be used directly as a mask.
dataset_provided = dataset[dataset["provided"] == True]
# DataFrame.drop returns a new frame, so the result must be assigned; otherwise
# the flag and raw identifier columns are still written to the output file.
dataset_provided = dataset_provided.drop(
    columns=["provided", "submission", "country", "brand"]
)
save_dataset(dataset_provided, DATA_CLEAN_PATH, OUTPUT_NAME_PROVIDED)