#### Libraries

In [1]:

%%javascript
utils.load_extension('collapsible_headings/main')
utils.load_extension('hide_input/main')
utils.load_extension('autosavetime/main')
utils.load_extension('execute_time/ExecuteTime')
utils.load_extension('code_prettify/code_prettify')
utils.load_extension('scroll_down/main')
utils.load_extension('jupyter-js-widgets/extension')

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

from sklearn.metrics import roc_auc_score

In [3]:
def plot_feature_importance(
    columnas, model_features, columns_ploted=10, model_name="Catboost"
):
    """
    This method is yet non-tested

    This function receives a set of columns feeded to a model, and the importance of each of feature.
    Returns a graphical visualization

    Call it fot catboost pipe example:
    plot_feature_importance(pipe_best_estimator[:-1].transform(X_tr).columns,pipe_best_estimator.named_steps['cb'].get_feature_importance(),20)

    Call it for lasso pipe example:
    plot_feature_importance(pipe_best_estimator[:-1].transform(X_tr).columns,np.array(pipe_best_estimator.named_steps['clf'].coef_.squeeze()),20)
    """

    feature_importance = pd.Series(index=columnas, data=np.abs(model_features))
    n_selected_features = (feature_importance > 0).sum()
    print(
        "{0:d} features, reduction of {1:2.2f}%".format(
            n_selected_features,
            (1 - n_selected_features / len(feature_importance)) * 100,
        )
    )
    plt.figure()
    feature_importance.sort_values().tail(columns_ploted).plot(
        kind="bar", figsize=(18, 6)
    )
    plt.title("Feature Importance for {}".format(model_name))
    plt.show()

In [4]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


## Joins

### Generic

In [5]:
generic = pd.read_csv("gx_num_generics.csv").drop(columns="Unnamed: 0")
generic.head(1)

FileNotFoundError: [Errno 2] No such file or directory: 'gx_num_generics.csv'

### Package

In [None]:
package = pd.read_csv("gx_package.csv").drop(columns="Unnamed: 0")
package.head()

In [None]:
package.presentation.unique()

In [None]:
package.country.nunique()

In [None]:
package.brand.nunique()

In [None]:
package.brand.value_counts()

### Panel

In [None]:
panel = pd.read_csv("gx_panel.csv").drop(columns="Unnamed: 0")

panel.head(2)

In [None]:
panel.brand.nunique()

In [None]:
panel.channel.unique()

### Therapeutic

In [None]:
therapeutic_area = pd.read_csv("gx_therapeutic_area.csv").drop(columns="Unnamed: 0")
therapeutic_area.head(1)

In [None]:
therapeutic_area.therapeutic_area.nunique()

### Volume

In [None]:
volume = pd.read_csv("gx_volume.csv").drop(columns="Unnamed: 0")
volume.head(1)

In [None]:
volume[(volume.country == "country_1") & (volume.brand == "brand_3")]

### Subm

In [None]:
subm = pd.read_csv("submission_template.csv")
subm

In [None]:
pd.merge(
    volume,
    subm,
    left_on=["country", "brand", "month_num"],
    right_on=["country", "brand", "month_num"],
)

In [None]:
594 / 4584

## Full

In [None]:
volume

In [None]:
generic

In [None]:
a = pd.merge(
    volume,
    generic,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)

In [None]:
full = pd.merge(
    volume,
    generic,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)


# package
full = pd.merge(
    full,
    package,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)
full

In [None]:
panel

In [None]:
panel.groupby(["country", "brand", "channel"], as_index=False).agg(
    ["min", "max", "sum", "mean", "median"]
)

In [None]:
full

In [None]:
# generic
full = pd.merge(
    volume,
    generic,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)


# package
full = pd.merge(
    full,
    package,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)

# panel
full = pd.merge(
    full, panel, how="left", left_on=["country", "brand"], right_on=["country", "brand"]
)
full.shape

In [None]:
# generic
full = pd.merge(
    volume,
    generic,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)


# package
full = pd.merge(
    full,
    package,
    how="left",
    left_on=["country", "brand"],
    right_on=["country", "brand"],
)

# panel
full = pd.merge(
    full, panel, how="left", left_on=["country", "brand"], right_on=["country", "brand"]
)

# therapeutic
full = pd.merge(
    full, therapeutic_area, how="left", left_on=["brand"], right_on=["brand"]
)
full.head(1)

In [None]:
full.shape

## Adversarial Trainning

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from category_encoders.m_estimate import MEstimateEncoder

In [None]:
adv = pd.read_csv("data/gx_merged.csv")
adv = adv.drop(
    columns=[
        "month_name",
        "volume",
        #'brand','B','C','D','num_generics'
    ]
)

In [None]:
adv["random"] = np.random.random(adv.shape[0])

In [None]:
me = MEstimateEncoder()

In [None]:
X = adv.drop(columns=["test"])
y = adv.test

In [None]:
X = me.fit_transform(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
cb = CatBoostClassifier(iterations=100, verbose=0)
cb.fit(X_train, y_train)

In [None]:
plot_feature_importance(X.columns, cb.get_feature_importance())

In [None]:
roc_auc_score(y_test, cb.predict(X_test))

In [None]:
X.columns

In [None]:
adv

## Splitting

In [None]:
df = pd.read_csv("data/gx_merged.csv")

# Take out test
df = df[df.test == 0]

# Create our unique index variable
df["count_brand"] = df["country"].astype(str) + "-" + df["brand"]

# Unique index
lista = df["count_brand"].unique()
df["count_brand"].nunique()

In [None]:
# Get the ones that have not 24months
a = pd.DataFrame(df.groupby(["country", "brand"]).month_num.max()).reset_index()
a = a[a.month_num < 23]
a["count_brand"] = a["country"].astype(str) + "-" + a["brand"]
deformed = a.count_brand.unique()

In [None]:
buenos = list(set(lista) - set(list(deformed)))

In [None]:
split = int(len(buenos) * 0.75)
split_train_list = buenos[:split]
split_valid_list = buenos[split:]

In [None]:
len(split_train_list)

In [None]:
len(split_valid_list)

In [None]:
train_split = df[df["count_brand"].isin(split_train_list)]
valid_split = df[df["count_brand"].isin(split_valid_list)]

train_split = train_split[["country", "brand"]]
valid_split = valid_split[["country", "brand"]]

train_split.shape

train_split.drop_duplicates().to_csv("data/train_split_noerror.csv", index=False)
valid_split.drop_duplicates().to_csv("data/valid_split.csv", index=False)

In [None]:
split_train_split_deformed = list(set((split_train_list + list(deformed))))

In [None]:
train_split = df[df["count_brand"].isin(split_train_split_deformed)]


train_split = train_split[["country", "brand"]]


train_split.drop_duplicates().to_csv("data/train_split.csv", index=False)

In [None]:
576 / 768

In [None]:
len(buenos)

In [None]:
pd.read_csv("data/train_split.csv").shape

In [None]:
pd.read_csv("data/valid_split.csv").shape

In [None]:
pd.read_csv("data/train_split_noerror.csv").shape

### Split test

In [None]:
df = pd.read_csv("data/gx_merged.csv")

# Take out test
df = df[df.test == 1]

# Create our unique index variable
df["count_brand"] = df["country"].astype(str) + "-" + df["brand"]

# Unique index
lista = df["count_brand"].unique()
df["count_brand"].nunique()

In [None]:
split_test_list = lista

In [None]:
test_split = df[df["count_brand"].isin(split_test_list)]

test_split = test_split[["country", "brand"]]

In [None]:
test_split.drop_duplicates().to_csv("data/test_split.csv", index=False)