In [None]:
import pandas as pd

# Settings

In [None]:
# Directory that holds the raw CSV inputs. The path is relative, so it is
# resolved against the notebook's working directory.
DATA_RAW_PATH = "../data/raw"
# File names of the raw tables; read_file() joins each onto DATA_RAW_PATH.
# Number of competitors per brand and country.
GENERICS_FILE = "gx_num_generics.csv"
# Type of drug (pill, cream...).
PACKAGE_FILE = "gx_package.csv"
# Distribution of the drug over the distribution channels.
PANEL_FILE = "gx_panel.csv"
# Therapeutic area of the drug.
AREA_FILE = "gx_therapeutic_area.csv"
# Volume, the target variable.
VOLUME_FILE = "gx_volume.csv"

# Functions

In [None]:
def read_file(path, file):
    """Load one CSV file from a directory into a DataFrame.

    Args:
        path: Directory holding the file; joined to ``file`` with a ``/``.
        file: Name of the CSV file inside ``path``.

    Returns:
        The parsed ``pandas.DataFrame``, with the first CSV column used as
        the row index.
    """
    csv_location = "{}/{}".format(path, file)
    # index_col=0: the first column is the row index, not a data column.
    frame = pd.read_csv(csv_location, index_col=0)
    return frame

# Retrieve data

## Volume (target variable)

In [None]:
# Load the volume table (the target variable) from the raw-data directory.
volume = read_file(DATA_RAW_PATH, VOLUME_FILE)
# Report how many rows were loaded.
print(len(volume))
# Preview the first rows; as the last expression it is the cell's output.
volume.head()

## Number of competitors per brand and country

In [None]:
# Load the generics table (number of competitors per brand and country,
# per the section heading) from the raw-data directory.
generics = read_file(DATA_RAW_PATH, GENERICS_FILE)
# Report how many rows were loaded.
print(len(generics))
# Preview the first rows; as the last expression it is the cell's output.
generics.head()

## Type of drug (pill, cream...)

In [None]:
# Load the package table (type of drug: pill, cream..., per the section
# heading) from the raw-data directory.
package = read_file(DATA_RAW_PATH, PACKAGE_FILE)
# Report how many rows were loaded.
print(len(package))
# Preview the first rows; as the last expression it is the cell's output.
package.head()

## How the drug is distributed across the different distribution channels

In [None]:
# Load the panel table (distribution over channels, per the section
# heading) from the raw-data directory.
panel = read_file(DATA_RAW_PATH, PANEL_FILE)
# Report how many rows were loaded.
print(len(panel))
# Preview the first rows; as the last expression it is the cell's output.
panel.head()

## Drug's therapeutic area of effect

In [None]:
# Load the therapeutic-area table from the raw-data directory.
area = read_file(DATA_RAW_PATH, AREA_FILE)
# Report how many rows were loaded.
print(len(area))
# Preview the first rows; as the last expression it is the cell's output.
area.head()

# Exploratory data analysis

## Volume (target variable)

### Columns

In [None]:
# Show the first rows again to inspect which columns the volume table has.
volume.head()

### Number and size of countries

In [None]:
# Count the distinct values in the country column.
volume["country"].nunique()

In [None]:
# Number of volume rows per country.
volume["country"].value_counts()

### Number and size of brands

In [None]:
# Count the distinct values in the brand column.
volume["brand"].nunique()

In [None]:
# Number of volume rows per brand.
volume["brand"].value_counts()

### Distribution in months

In [None]:
# Number of volume rows for each month_name value.
volume["month_name"].value_counts()

### Months in which the generic is launched

In [None]:
# Restrict to rows with month_num == 0 (presumably the generic's launch
# month, per the section heading) and count each month_name among them.
volume.loc[volume["month_num"] == 0, "month_name"].value_counts()

## Packages

In [None]:
# Preview the first rows of the package table.
package.head()

In [None]:
# Number of rows in the package table.
len(package)

In [None]:
# Number of package rows for each presentation value.
package["presentation"].value_counts()

## Can a particular brand have different packages in different countries?

In [None]:
# For every brand, count how many distinct presentations it appears with,
# listing the brands with the most presentations first.
(
    package
    .groupby("brand")["presentation"]
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Show every package row belonging to the example brand "brand_478".
package.loc[package["brand"] == "brand_478"]

## Yes!