In [1]:
cd ../..

/home/xavier/projects/godatathon_2020


In [2]:
import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Settings

In [3]:
DATA_RAW_PATH = "data/raw"
VOLUME_FILE = "gx_volume.csv"
GENERICS_FILE = "gx_num_generics.csv"
PACKAGE_FILE = "gx_package.csv"
CHANNELS_FILE = "gx_panel.csv"
THERAPEUTIC_FILE = "gx_therapeutic_area.csv"
OUTPUT_PATH = "data/features/basic_features.csv"

# Functions

In [4]:
def encode_variable(variable):
    label_encoder = LabelEncoder()
    label_encoder.fit(variable)
    return label_encoder.transform(variable)

def encode_month(month_name):
    datetime_object = datetime.datetime.strptime(month_name, "%b")
    month_number = datetime_object.month
    month_sin = np.sin(2 * np.pi * month_number/12)
    month_cos = np.cos(2 * np.pi * month_number/12)
    return month_sin, month_cos

# Retrieve data

In [5]:
volume = pd.read_csv(f"{DATA_RAW_PATH}/{VOLUME_FILE}", index_col=0)
volume.head(2)

Unnamed: 0,country,brand,volume,month_num,month_name
1,country_1,brand_3,18509088.6,-88,Jul
2,country_1,brand_3,19697508.0,-87,Aug


In [6]:
generics = pd.read_csv(f"{DATA_RAW_PATH}/{GENERICS_FILE}", index_col=0)
generics.head(2)

Unnamed: 0,country,brand,num_generics
1,country_1,brand_3,3
2,country_1,brand_4,1


In [7]:
packages = pd.read_csv(f"{DATA_RAW_PATH}/{PACKAGE_FILE}", index_col=0)
packages.head(2)

Unnamed: 0,country,brand,presentation
1,country_1,brand_3,PILL
2,country_1,brand_4,PILL


In [8]:
channels = pd.read_csv(f"{DATA_RAW_PATH}/{CHANNELS_FILE}", index_col=0)
channels.head(2)

Unnamed: 0,country,brand,channel,channel_rate
1,country_1,brand_3,B,1.189704
2,country_1,brand_3,D,98.810296


In [9]:
therapeutic = pd.read_csv(f"{DATA_RAW_PATH}/{THERAPEUTIC_FILE}", index_col=0)
therapeutic.head(2)

Unnamed: 0,brand,therapeutic_area
1,brand_1,Nervous_system
2,brand_2,Respiratory_and_Immuno_inflammatory


# Create base table

## Create list of contries and brands

In [10]:
dataset = volume[["country", "brand"]].copy()
dataset.head(2)

Unnamed: 0,country,brand
1,country_1,brand_3
2,country_1,brand_3


## Encode country

In [11]:
dataset["country_id"] = encode_variable(dataset["country"])
dataset.head(2)

Unnamed: 0,country,brand,country_id
1,country_1,brand_3,0
2,country_1,brand_3,0


## Encode brand

In [12]:
dataset["brand_id"] = encode_variable(dataset["brand"])
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id
1,country_1,brand_3,0,222
2,country_1,brand_3,0,222


## Add number of generics

In [13]:
dataset = dataset.merge(generics, on=["country", "brand"], how="left").fillna(0)
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics
0,country_1,brand_3,0,222,3
1,country_1,brand_3,0,222,3


## Add encoded package

In [14]:
dataset = dataset.merge(packages, on=["country", "brand"], how="left")
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,presentation
0,country_1,brand_3,0,222,3,PILL
1,country_1,brand_3,0,222,3,PILL


In [15]:
dataset["package_id"] = encode_variable(dataset["presentation"])
dataset = dataset.drop(columns=["presentation"])
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id
0,country_1,brand_3,0,222,3,6
1,country_1,brand_3,0,222,3,6


## Add distribution channel rates

In [16]:
channels = pd.pivot_table(channels, columns=["channel"], index=["country", "brand"]).fillna(0)
channels.columns = [f"{c[0]}_{c[1]}" for c in channels.columns]
channels.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D
country,brand,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
country_1,brand_10,0.0,1.015697,0.0,98.984303
country_1,brand_102,0.0,0.109766,0.0,99.890234


In [17]:
dataset = dataset.merge(channels, on=["country", "brand"], how="left").fillna(0)
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D
0,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296
1,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296


## Add therapeutic area

In [18]:
dataset = dataset.merge(therapeutic, on="brand", how="left")
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D,therapeutic_area
0,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,Cardiovascular_Metabolic
1,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,Cardiovascular_Metabolic


In [19]:
dataset["therapeutic_id"] = encode_variable(dataset["therapeutic_area"])
dataset = dataset.drop(columns=["therapeutic_area"])
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D,therapeutic_id
0,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,2
1,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,2


# Add last month encoded

In [20]:
volume_last_month = volume[volume["month_num"] == -1].copy()
volume_last_month = volume_last_month.reset_index(drop=True)
volume_last_month = volume_last_month[["country", "brand", "month_name"]]

In [21]:
volume_last_month[["last_month_sin", "last_month_cos"]] = pd.DataFrame(list(volume_last_month["month_name"].apply(encode_month)), columns=["last_month_sin", "last_month_cos"])
volume_last_month.head()

Unnamed: 0,country,brand,month_name,last_month_sin,last_month_cos
0,country_1,brand_3,Oct,-0.8660254,0.5
1,country_1,brand_4,Oct,-0.8660254,0.5
2,country_1,brand_10,May,0.5,-0.866025
3,country_1,brand_14,Dec,-2.449294e-16,1.0
4,country_1,brand_18,Dec,-2.449294e-16,1.0


In [22]:
dataset = dataset.merge(volume_last_month, on=["country", "brand"], how="left")
dataset = dataset.drop(columns="month_name")
dataset.head(2)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D,therapeutic_id,last_month_sin,last_month_cos
0,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,2,-0.866025,0.5
1,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,2,-0.866025,0.5


# Drop Duplicates

In [23]:
dataset = dataset.drop_duplicates()

# Save dataset

In [24]:
dataset.head(10)

Unnamed: 0,country,brand,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,channel_rate_D,therapeutic_id,last_month_sin,last_month_cos
0,country_1,brand_3,0,222,3,6,0.0,1.189704,0.0,98.810296,2,-0.8660254,0.5
112,country_1,brand_4,0,333,1,6,0.0,0.090229,0.0,99.909771,2,-0.8660254,0.5
224,country_1,brand_10,0,1,6,6,0.0,1.015697,0.0,98.984303,6,0.5,-0.8660254
295,country_1,brand_14,0,45,1,6,0.0,1.118446,0.0,98.881554,2,-2.449294e-16,1.0
349,country_1,brand_18,0,89,1,0,0.0,1.118671,0.0,98.881329,3,-2.449294e-16,1.0
463,country_1,brand_20,0,112,2,3,0.0,99.403053,0.0,0.596947,1,0.5,-0.8660254
606,country_1,brand_23,0,145,1,6,0.0,0.339134,0.0,99.660866,2,0.5,-0.8660254
717,country_1,brand_25,0,167,1,6,0.0,1.310122,0.0,98.689878,8,-1.0,-1.83697e-16
780,country_1,brand_32,0,245,1,6,0.0,0.016193,0.0,99.983807,2,-2.449294e-16,1.0
831,country_1,brand_46,0,400,11,6,0.0,8.178941,0.0,91.821059,7,-2.449294e-16,1.0


In [25]:
dataset.to_csv(OUTPUT_PATH, index=False)