In [1]:
import pandas as pd

# Settings

In [2]:
# Relative directory containing the raw CSV files read below.
DATA_RAW_PATH = "../data/raw"
# Number of generics per country and brand.
GENERICS_FILE = "gx_num_generics.csv"
# Presentation (PILL, CREAM, ...) per country and brand.
PACKAGE_FILE = "gx_package.csv"
# Channel rate per country, brand and distribution channel.
PANEL_FILE = "gx_panel.csv"
# Therapeutic area of each brand.
AREA_FILE = "gx_therapeutic_area.csv"
# Volume per country, brand and month (the target variable).
VOLUME_FILE = "gx_volume.csv"

# Functions

In [3]:
def read_file(path, file):
    """Read one CSV file from a directory into a DataFrame.

    Args:
        path: Directory containing the file.
        file: Name of the CSV file inside ``path``.

    Returns:
        The parsed DataFrame, with the CSV's first column used as the index.
    """
    csv_location = f"{path}/{file}"
    frame = pd.read_csv(csv_location, index_col=0)
    return frame

# Retrieve data

## Volume (target variable)

In [4]:
# Load the volume table, print its row count, and preview the first rows.
volume = read_file(path=DATA_RAW_PATH, file=VOLUME_FILE)
print(volume.shape[0])
volume.head()

94954


Unnamed: 0,country,brand,volume,month_num,month_name
1,country_1,brand_3,18509088.6,-88,Jul
2,country_1,brand_3,19697508.0,-87,Aug
3,country_1,brand_3,18315721.8,-86,Sep
4,country_1,brand_3,19831199.4,-85,Oct
5,country_1,brand_3,18593281.8,-84,Nov


## Number of competitors per brand and country

In [5]:
# Load the generics table, print its row count, and preview the first rows.
generics = read_file(path=DATA_RAW_PATH, file=GENERICS_FILE)
print(generics.shape[0])
generics.head()

1078


Unnamed: 0,country,brand,num_generics
1,country_1,brand_3,3
2,country_1,brand_4,1
3,country_1,brand_10,6
4,country_1,brand_14,1
5,country_1,brand_18,1


## Type of drug (pill, cream...)

In [6]:
# Load the package table, print its row count, and preview the first rows.
package = read_file(path=DATA_RAW_PATH, file=PACKAGE_FILE)
print(package.shape[0])
package.head()

1078


Unnamed: 0,country,brand,presentation
1,country_1,brand_3,PILL
2,country_1,brand_4,PILL
3,country_1,brand_10,PILL
4,country_1,brand_14,PILL
5,country_1,brand_18,CREAM


## How the drug is distributed across different distribution channels

In [7]:
# Load the panel table, print its row count, and preview the first rows.
panel = read_file(path=DATA_RAW_PATH, file=PANEL_FILE)
print(panel.shape[0])
panel.head()

2371


Unnamed: 0,country,brand,channel,channel_rate
1,country_1,brand_3,B,1.189704
2,country_1,brand_3,D,98.810296
3,country_1,brand_4,B,0.090229
4,country_1,brand_4,D,99.909771
5,country_1,brand_10,B,1.015697


## Drug's therapeutic area of effect

In [8]:
# Load the therapeutic-area table, print its row count, and preview the first rows.
area = read_file(path=DATA_RAW_PATH, file=AREA_FILE)
print(area.shape[0])
area.head()

484


Unnamed: 0,brand,therapeutic_area
1,brand_1,Nervous_system
2,brand_2,Respiratory_and_Immuno_inflammatory
3,brand_3,Cardiovascular_Metabolic
4,brand_4,Cardiovascular_Metabolic
5,brand_5,Cardiovascular_Metabolic


# Exploratory data analysis

## Volume (target variable)

### Columns

In [9]:
# Show the first rows again to recall which columns the volume table has.
volume.head()

Unnamed: 0,country,brand,volume,month_num,month_name
1,country_1,brand_3,18509088.6,-88,Jul
2,country_1,brand_3,19697508.0,-87,Aug
3,country_1,brand_3,18315721.8,-86,Sep
4,country_1,brand_3,19831199.4,-85,Oct
5,country_1,brand_3,18593281.8,-84,Nov


### Number and size of countries

In [10]:
# Count the distinct countries present in the volume data.
volume["country"].nunique()

16

In [11]:
# Number of volume rows per country, most frequent first.
volume["country"].value_counts()

country_16    16057
country_7     14219
country_12    12704
country_3     11892
country_15    11871
country_1      9973
country_8      9217
country_4      7745
country_10      273
country_9       231
country_14      171
country_2       166
country_5       163
country_6       101
country_13       94
country_11       77
Name: country, dtype: int64

### Number and size of brands

In [12]:
# Count the distinct brands present in the volume data.
volume["brand"].nunique()

484

In [13]:
# Number of volume rows per brand, most frequent first.
volume["brand"].value_counts()

brand_438    1052
brand_187     991
brand_259     916
brand_228     820
brand_1       819
             ... 
brand_465      23
brand_406      16
brand_59       14
brand_355      13
brand_353      12
Name: brand, Length: 484, dtype: int64

### Distribution in months

In [14]:
# Number of volume rows per month name, most frequent first.
volume["month_name"].value_counts()

Jul    8293
Aug    8203
Sep    8174
Oct    8124
Nov    8030
Dec    7970
Jan    7859
Feb    7784
Mar    7763
Apr    7690
May    7633
Jun    7431
Name: month_name, dtype: int64

### Months in which the generic is launched

In [15]:
# Keep only the rows where month_num is 0 (the launch month, per this section's
# title) and count how often each month name occurs among them.
volume.loc[volume["month_num"] == 0, "month_name"].value_counts()

Jan    173
Jun    110
Jul    103
Aug    101
Nov     89
Feb     75
Apr     73
Oct     73
May     63
Dec     60
Sep     40
Mar     22
Name: month_name, dtype: int64

## Packages

In [16]:
# Preview the first rows of the package table.
package.head()

Unnamed: 0,country,brand,presentation
1,country_1,brand_3,PILL
2,country_1,brand_4,PILL
3,country_1,brand_10,PILL
4,country_1,brand_14,PILL
5,country_1,brand_18,CREAM


In [17]:
# Total number of rows in the package table.
package.shape[0]

1078

In [18]:
# Number of package rows for each presentation type, most frequent first.
package["presentation"].value_counts()

PILL         677
OTHER        233
INJECTION     91
EYE_DROP      34
CREAM         29
PATCH         13
INHALER        1
Name: presentation, dtype: int64

## Can a particular brand have different packages in different countries?

In [19]:
# For every brand, count how many distinct presentations it has across the
# table, listing the brands with the most presentations first.
(
    package
    .groupby("brand")["presentation"]
    .nunique()
    .sort_values(ascending=False)
)

brand
brand_478    3
brand_357    3
brand_194    3
brand_484    3
brand_1      2
            ..
brand_365    1
brand_366    1
brand_367    1
brand_369    1
brand_316    1
Name: presentation, Length: 484, dtype: int64

In [20]:
# Inspect one of the brands with three distinct presentations.
package.loc[package["brand"].eq("brand_478")]

Unnamed: 0,country,brand,presentation
113,country_1,brand_478,INJECTION
247,country_3,brand_478,PILL
503,country_7,brand_478,OTHER
603,country_8,brand_478,OTHER
885,country_15,brand_478,INJECTION
1075,country_16,brand_478,INJECTION


## Yes!