In [1]:
%config Completer.use_jedi = False

In [2]:
cd NN_files/

/Users/cmougan/Desktop/Novartis2021/NN_files


In [3]:
from torch.utils.data import TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use("seaborn")
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import mean_squared_error
from gauss_rank_scaler import GaussRankScaler

from nnet import ReadDataset, Net
import time
from loss_functions import interval_score_loss

from sklearn.pipeline import Pipeline

tic = time.time()


In [4]:
import random
from eda.checker import check_train_test

random.seed(0)

sales_train = pd.read_csv("../data/data_raw/sales_train.csv")
df_full = pd.read_csv("../data/split.csv")
df_region = pd.read_csv("../data/data_raw/regions.csv")
regions_hcps = pd.read_csv("../data/data_raw/regions_hcps.csv")
activity_features = pd.read_csv("../data/features/activity_features.csv")
brands_3_12 = pd.read_csv("../data/features/brand_3_12_market_features_lagged.csv")
rte_basic = pd.read_csv("../data/features/rte_basic_features.csv").drop(
    columns=["sales", "validation"]
)

market_size = pd.read_csv("../data/market_size.csv")

# For reproducibility
random.seed(0)
VAL_SIZE = 38
SUBMISSION_NAME = "empty_extractor_target_encoder"
RETRAIN = True

# %% Training weights
market_size = (
    market_size
    .assign(weight=lambda x: 100 / x['sales'])
    .rename(columns={"sales": 'market_size'})
)

market_size

# %% Add region data
df_feats = df_full.merge(df_region, on="region", how="left")
df_feats = pd.merge(left=df_feats, right=regions_hcps, how="left", on="region")
df_feats = df_feats.merge(
    activity_features, on=["month", "region", "brand"], how="left"
)
df_feats = df_feats.merge(rte_basic, on=["month", "region", "brand"], how="left")
df_feats = df_feats.merge(brands_3_12, on=["month", "region"], how="left")
df_feats["whichBrand"] = np.where(df_feats.brand == "brand_1", 1, 0)

df_feats = df_feats.merge(market_size, on='region', how="left")

df_feats['month_brand'] = df_feats.month + '_' + df_feats.brand

# drop sum variables
cols_to_drop = ["region", "sales", "validation", "market_size", "weight"]

# %% Split train val test
X_train = df_feats.query("validation == 0").drop(columns=cols_to_drop)
y_train = df_feats.query("validation == 0").sales
weights_train = df_feats.query("validation == 0").weight

X_val = df_feats.query("validation == 1").drop(columns=cols_to_drop)
y_val = df_feats.query("validation == 1").sales

X_full = df_feats.query("validation.notnull()", engine="python").drop(
    columns=cols_to_drop
)
y_full = df_feats.query("validation.notnull()", engine="python").sales
weights_full = df_feats.query("validation.notnull()", engine="python").weight

X_test = df_feats.query("validation.isnull()", engine="python").drop(
    columns=cols_to_drop
)
y_test = df_feats.query("validation.isnull()", engine="python").sales



In [5]:
from category_encoders import TargetEncoder
from sklego.preprocessing import ColumnSelector
from sktools import IsEmptyExtractor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

In [6]:
select_cols = [
    'whichBrand',
    # 'Internal medicine',
    # 'Pediatrician',
    # 'null_tiers_Internal medicine',
    'count',
    'inverse_tier_f2f',
    'hcp_distinct_Internal medicine / pneumology',
    'sales_brand_3',
    'sales_brand_3_market',
    'sales_brand_12_market',
    'month_brand',
    'month',
]

In [7]:
model = LinearRegression()
pipe = Pipeline(
        [   
            ("te", TargetEncoder(cols=["month_brand", "month", "brand"])),
            ("selector", ColumnSelector(columns=select_cols)),
            ("empty", IsEmptyExtractor()),
            ("imputer", SimpleImputer(strategy="median")), 
            ("lgb", model)
        ]
    )


In [8]:
pipe.fit(X_train,y_train);

In [9]:
X_train = pipe[:-1].transform(X_train)
X_val = pipe[:-1].transform(X_val)


In [10]:
scaler = GaussRankScaler()
scaler.fit(X_train,y_train)

GaussRankScaler()

In [11]:
X_train

array([[  1.,  77.,  27., ...,   1.,   1.,   1.],
       [  0.,  15.,  18., ...,   0.,   0.,   1.],
       [  1.,  77.,  27., ...,   1.,   1.,   1.],
       ...,
       [  0.,  53.,  32., ...,   0.,   0.,   1.],
       [  1., 312.,  90., ...,   0.,   0.,   0.],
       [  0., 317.,  86., ...,   0.,   0.,   0.]])

In [12]:
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [13]:
# Use gpu if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Neural Network
nnet = Net(X_train.shape[1]).to(device)

In [14]:
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train.values)
X_val = torch.tensor(X_val)
y_val = torch.tensor(y_val.values)

In [15]:
data_set = TensorDataset(X_train, y_train)
train_batches = DataLoader(data_set, batch_size=128, shuffle=False)

In [16]:
# Loss function
criterion = nn.MSELoss()

# Optimizer
optimizer = optim.Adam(
    nnet.parameters(), lr=0.0000001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.0001
)


# Train the net
loss_per_iter = []
loss_per_batch = []


# Train the net
losses = []
auc_train = []
auc_test = []

# hyperparameteres
n_epochs = 100


In [17]:
for epoch in range(n_epochs):

    i = 0
    for inputs, labels in train_batches:
        i = i + 1

        X = inputs.to(device)
        y = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forwarde
        outputs = nnet(X.float())

        # Compute diff

        loss = interval_score_loss(outputs, y.float())

        # Compute gradient
        loss.backward()

        # update weights
        optimizer.step()

        # Save loss to plot

        losses.append(loss.item())


    auc_train.append(loss.detach().numpy())
    pred = nnet(torch.tensor(X_val.float()))
    auctest = interval_score_loss(pred,y_val).detach().numpy()
    auc_test.append(auctest)

    # Figure
    plt.figure()
    plt.plot(auc_train, label="train")
    plt.plot(auc_test, label="test")
    plt.legend()
            #plt.ylim([0, 3000])
    plt.savefig("output/auc_NN.png")
    plt.savefig("output/auc_NN.svg", format="svg")
    plt.close()
    

print("Elapsed time: ", np.abs(tic - time.time()))
print("done")

  pred = nnet(torch.tensor(X_val.float()))


Elapsed time:  26.49464702606201
done


In [18]:
nnet(X_train.float())

tensor([[ 2.1955,  0.0367],
        [-0.4875, -0.5762],
        [ 2.1234, -0.7024],
        ...,
        [ 0.1152,  0.0806],
        [-0.0432, -0.6146],
        [ 0.7388,  0.9459]], grad_fn=<SqueezeBackward0>)