#### Libraries

In [1]:
%config Completer.use_jedi = False

In [2]:
%%javascript
utils.load_extension("collapsible_headings/main")
utils.load_extension("hide_input/main")
utils.load_extension("autosavetime/main")
utils.load_extension("execute_time/ExecuteTime")
utils.load_extension("code_prettify/code_prettify")
utils.load_extension("scroll_down/main")
utils.load_extension("jupyter-js-widgets/extension")

<IPython.core.display.Javascript object>

In [3]:
cd NN_files/

/Users/cmougan/Desktop/Novartis2021/NN_files


In [4]:
import sys
sys.path.append("../")
from metrics.metric_participants import ComputeMetrics

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import TargetEncoder
import seaborn as sns

plt.style.use("seaborn")
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import mean_squared_error

from nnet import ReadDataset, Net,ResNet
import time
from loss_functions import interval_score_loss

from pytorch_tabnet.tab_model import TabNetClassifier
from gauss_rank_scaler import GaussRankScaler
tic = time.time()


In [6]:
def curation_post(data):
    """Repair inverted prediction intervals and recompute the point prediction.

    Works on a copy of ``data``. Every row where ``pred_95_high < pred_95_low``
    gets its two bounds swapped; afterwards ``prediction`` is (over)written with
    the midpoint of the two bounds.

    The swap is done positionally (boolean mask + ``np.where``) rather than by
    index label, so frames with repeated index labels are handled row by row.
    Rows containing NaN in either bound are left untouched, because the
    comparison is False for them.

    Args:
        data: DataFrame with numeric ``pred_95_low`` and ``pred_95_high`` columns.

    Returns:
        A new DataFrame with ordered bounds and a ``prediction`` column.
        ``data`` itself is not modified.
    """
    aux = data.copy()

    low = aux["pred_95_low"].to_numpy()
    high = aux["pred_95_high"].to_numpy()
    inverted = (aux["pred_95_high"] < aux["pred_95_low"]).to_numpy()

    # Build both repaired columns before assigning either one, so the second
    # np.where never reads an already-swapped value.
    fixed_low = np.where(inverted, high, low)
    fixed_high = np.where(inverted, low, high)
    aux["pred_95_low"] = fixed_low
    aux["pred_95_high"] = fixed_high

    # Sanity check: nothing should still be inverted after the swap.
    still_inverted = aux["pred_95_high"] < aux["pred_95_low"]
    if still_inverted.any():
        print('If errors they should appear: ')
        print(aux[still_inverted].shape)

    aux["prediction"] = np.mean([fixed_low, fixed_high], axis=0)
    return aux


In [8]:
def postprocess_submission(submission_df, solve_submission_issues=True):
    """Overwrite predictions with known volumes and enforce low <= prediction <= high.

    Reads ``../data/gx_volume.csv`` and left-joins it onto ``submission_df`` on
    (country, brand, month_num). Wherever a ``volume`` is found, the prediction
    becomes that volume and the interval becomes ``volume -/+ 0.01``.

    Args:
        submission_df: DataFrame with columns ``country``, ``brand``,
            ``month_num``, ``pred_95_low``, ``prediction`` and ``pred_95_high``.
        solve_submission_issues: when True, validate the interval ordering and
            move any prediction that falls outside its interval to 0.01 inside
            the violated bound.

    Returns:
        DataFrame with the join keys followed by ``pred_95_low``,
        ``prediction`` and ``pred_95_high``.

    Raises:
        ValueError: if ``solve_submission_issues`` is True and some row has
            ``pred_95_low > pred_95_high``.
    """
    join_on = ["country", "brand", "month_num"]
    keep = join_on + ["volume"]

    df_vol = pd.read_csv("../data/gx_volume.csv").loc[:, keep]

    both_ds = submission_df.merge(
        df_vol,
        on=join_on,
        how="left",
    )

    # Rows whose true volume is already known: use it instead of the model output.
    known = both_ds["volume"].notnull()
    known_volume = both_ds.loc[known, "volume"].to_numpy()
    both_ds.loc[known, "prediction"] = known_volume
    both_ds.loc[known, "pred_95_high"] = known_volume + 0.01
    both_ds.loc[known, "pred_95_low"] = known_volume - 0.01

    final_cols = join_on + ["pred_95_low", "prediction", "pred_95_high"]

    # Explicit copy: the frame is assigned into below, and writing into a
    # slice of ``both_ds`` would trigger SettingWithCopyWarning.
    final_df = both_ds.loc[:, final_cols].copy()

    if solve_submission_issues:

        if (final_df.pred_95_low > final_df.pred_95_high).any():
            # Raising a bare string is itself a TypeError; raise a real exception.
            raise ValueError("Stop please, upper < lower")

        cond_lower_mean = final_df.pred_95_low > final_df.prediction
        if cond_lower_mean.any():
            print("Solving lower > mean")
            final_df.loc[cond_lower_mean, "prediction"] = (
                final_df.loc[cond_lower_mean, "pred_95_low"] + 0.01
            )

        cond_upper_mean = final_df.prediction > final_df.pred_95_high
        if cond_upper_mean.any():
            print("Solving upper < mean")
            final_df.loc[cond_upper_mean, "prediction"] = (
                final_df.loc[cond_upper_mean, "pred_95_high"] - 0.01
            )

    return final_df


In [9]:
def my_metric(pred, lower, upper):
    """Evaluate point predictions and interval bounds via ``compute_metrics``.

    The ground truth and auxiliary inputs are read from notebook-level globals
    (``val_y_raw``, ``val_offset``, ``val_x_orig``, ``avg_volumes``).
    NOTE(review): neither those globals nor ``compute_metrics`` are defined in
    the cells visible here; confirm they exist before calling.

    Args:
        pred: point predictions.
        lower: lower interval bounds.
        upper: upper interval bounds.

    Returns:
        Tuple of the first two elements of the ``compute_metrics`` result.
    """
    scores = compute_metrics(
        preds=pred,
        lower=lower,
        upper=upper,
        y=val_y_raw,
        offset=val_offset,
        X=val_x_orig,
        avg_volumes=avg_volumes,
    )
    first_score = scores[0]
    second_score = scores[1]
    return first_score, second_score

In [10]:
import random
from eda.checker import check_train_test

# For reproducibility: seed every RNG the notebook relies on, not just the
# stdlib one. DataLoader shuffling and network initialisation draw from torch.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

sales_train = pd.read_csv("../data/data_raw/sales_train.csv")
df_full = pd.read_csv("../data/split.csv")
df_region = pd.read_csv("../data/data_raw/regions.csv")
regions_hcps = pd.read_csv("../data/data_raw/regions_hcps.csv")
activity_features = pd.read_csv("../data/features/activity_features.csv")
brands_3_12 = pd.read_csv("../data/features/brand_3_12_market_features_lagged.csv")
rte_basic = pd.read_csv("../data/features/rte_basic_features.csv").drop(
    columns=["sales", "validation"]
)

market_size = pd.read_csv("../data/market_size.csv")

VAL_SIZE = 38
SUBMISSION_NAME = "empty_extractor_target_encoder"
RETRAIN = True

# %% Training weights
# weight = 100 / sales, computed before the column is renamed to market_size.
market_size = (
    market_size
    .assign(weight=lambda x: 100 / x['sales'])
    .rename(columns={"sales": 'market_size'})
)

# %% Add region data
df_feats = df_full.merge(df_region, on="region", how="left")
df_feats = pd.merge(left=df_feats, right=regions_hcps, how="left", on="region")
df_feats = df_feats.merge(
    activity_features, on=["month", "region", "brand"], how="left"
)
df_feats = df_feats.merge(rte_basic, on=["month", "region", "brand"], how="left")
df_feats = df_feats.merge(brands_3_12, on=["month", "region"], how="left")
df_feats["whichBrand"] = np.where(df_feats.brand == "brand_1", 1, 0)

df_feats = df_feats.merge(market_size, on='region', how="left")

df_feats['month_brand'] = df_feats.month + '_' + df_feats.brand

# drop sum variables
cols_to_drop = ["region", "sales", "validation", "market_size", "weight"]

# %% Split train val test
# The `validation` flag selects the split: 0 = train, 1 = validation,
# null = test. Each subset is filtered once and reused.
train_rows = df_feats.query("validation == 0")
X_train = train_rows.drop(columns=cols_to_drop)
y_train = train_rows.sales
weights_train = train_rows.weight

val_rows = df_feats.query("validation == 1")
X_val = val_rows.drop(columns=cols_to_drop)
y_val = val_rows.sales

full_rows = df_feats.query("validation.notnull()", engine="python")
X_full = full_rows.drop(columns=cols_to_drop)
y_full = full_rows.sales
weights_full = full_rows.weight

test_rows = df_feats.query("validation.isnull()", engine="python")
X_test = test_rows.drop(columns=cols_to_drop)
y_test = test_rows.sales



### Categorical

In [11]:
from category_encoders import TargetEncoder
from sklego.preprocessing import ColumnSelector
from sktools import IsEmptyExtractor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [12]:
# Columns kept by the pipeline's ColumnSelector step. The entries commented
# out below are candidates that are currently excluded.
select_cols = [
    'whichBrand',
    # 'Internal medicine',
    # 'Pediatrician',
    # 'null_tiers_Internal medicine',
    'count',
    'inverse_tier_f2f',
    'hcp_distinct_Internal medicine / pneumology',
    'sales_brand_3',
    'sales_brand_3_market',
    'sales_brand_12_market',
    'month_brand',
    'month',
]

In [13]:
# Preprocessing chain followed by a linear regressor. Only the preprocessing
# part is reused afterwards (via pipe[:-1]) to build the network inputs.
# NOTE(review): the final step is keyed "lgb" although it holds a
# LinearRegression; the key is kept as-is.
model = LinearRegression()
pipe = Pipeline(
    steps=[
        ("te", TargetEncoder(cols=["month_brand", "month", "brand"])),
        ("selector", ColumnSelector(columns=select_cols)),
        ("empty", IsEmptyExtractor()),
        ("imputer", SimpleImputer(strategy="median")),
        ("lgb", model),
    ]
)


In [14]:
# Fit the whole pipeline on the training split only; the trailing `;`
# suppresses the estimator repr in the cell output.
pipe.fit(X_train,y_train);

In [15]:
# Apply every step except the final regressor (pipe[:-1]) to get model inputs.
# NOTE: this overwrites X_train / X_val in place, so the cell is not
# idempotent. Re-running it feeds already-transformed data back into the
# pipeline; re-run the split cell first.
X_train = pipe[:-1].transform(X_train)
X_val = pipe[:-1].transform(X_val)


### Scaling

In [16]:
# Fit the scaler on the (already preprocessed) training features only.
scaler = GaussRankScaler()
scaler.fit(X_train,y_train)

GaussRankScaler()

In [17]:
# Scale both splits with the train-fitted scaler.
# NOTE: overwrites X_train / X_val again, so this cell is not idempotent either.
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [18]:
class ReadDataset(Dataset):
    """In-memory dataset pairing a feature DataFrame with a target array.

    NOTE(review): this definition shadows the ``ReadDataset`` imported from
    ``nnet`` earlier in the notebook.
    """

    def __init__(self, XX, yy):
        """
        Args:
            XX (pandas.DataFrame): feature rows; accessed positionally via ``.iloc``.
            yy: targets indexable by integer position (e.g. a numpy array),
                one entry per row of ``XX``.
        """
        self.X = XX
        self.y = yy

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    def __shape__(self):
        """Return the number of feature columns (used to size the network input)."""
        return self.X.shape[1]

    def __getitem__(self, idx):
        """Return ``[features, target]`` for position ``idx``.

        Args:
            idx: integer position, list of positions, or a torch.Tensor of them.

        Returns:
            Two-element list: the feature values as a numpy array and the
            matching target.
        """
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        return [self.X.iloc[idx].values, self.y[idx]]

## NN

### Preprocess

In [19]:
# Wrap the preprocessed splits. NOTE: `testset` is built from the validation
# split (X_val / y_val), not from the held-out test rows.
trainset = ReadDataset(pd.DataFrame(X_train),y_train.values)
testset = ReadDataset(pd.DataFrame(X_val),y_val.values)


# Data loaders
# Mini-batches of 100 rows, reshuffled every epoch. Only the training set gets
# a loader.
trainloader = DataLoader(trainset, batch_size=100, shuffle=True)


In [20]:

# Full training split as tensors (overwrites the array versions of X_train / y_train).
X_train = torch.tensor(trainset.X.values)
y_train = torch.tensor(trainset.y)



# NOTE: despite their names, X_test / y_test hold the *validation* split
# (testset was built from X_val / y_val). They also overwrite the X_test /
# y_test created in the split cell from the rows with a null `validation` flag.
X_test = torch.tensor(testset.X.values)
y_test = torch.tensor(testset.y)

In [21]:
# Use the first CUDA GPU if available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [22]:
# Neural network: ResNet (imported from nnet) built with the number of feature
# columns as its constructor argument, then moved to the selected device.
nnet = ResNet(trainset.__shape__()).to(device)

In [31]:
# Optimizer: Adam with a small learning rate (1e-5). Weight decay is
# currently disabled (see the commented-out argument).
optimizer = optim.Adam(
    nnet.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08,  # weight_decay=0.0001
)

### Training

In [None]:
# Containers for training diagnostics. Re-running this cell resets the history.
# NOTE(review): loss_per_iter / loss_per_batch are not written to by the
# training loop in this notebook.
loss_per_iter = []
loss_per_batch = []


# Filled by the training loop:
#   losses     - loss of every mini-batch
#   auc_train  - training loss sampled at each evaluation step (a loss, not an AUC)
#   auc_test   - validation loss at each evaluation step (a loss, not an AUC)
#   metric_val / unc_val - first / second element of the ComputeMetrics result
losses = []
auc_train = []
auc_test = []
metric_val = []
unc_val = []


In [34]:

# Hyperparameters
n_epochs = 100
eval_every = 5  # evaluate, plot and checkpoint every `eval_every` epochs

# Loop-invariant validation inputs, built once instead of at every evaluation.
val_rows = df_feats.query("validation == 1")
val_keys = val_rows.loc[:, ["month", "region", "brand"]]
ground_truth_val = val_rows.loc[:, ["month", "region", "brand", "sales"]]
# Validation tensors must live on the same device as the network.
X_eval = X_test.float().to(device)
y_eval = y_test.float().to(device)

for epoch in range(n_epochs):

    for inputs, labels in trainloader:
        X = inputs.to(device)
        y = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = nnet(X.float())

        # Interval-score loss between the predicted bounds and the targets
        loss = interval_score_loss(outputs, y.float())

        # Compute gradient
        loss.backward()

        # Update weights
        optimizer.step()

        # Save loss to plot
        losses.append(loss.item())

    if epoch % eval_every == 0:
        print(epoch)
        # Training loss of the last mini-batch of this epoch.
        auc_train.append(loss.item())

        # One forward pass in eval mode with autograd off. The point
        # prediction and both bounds then come from the same output, and any
        # dropout / batch-norm layers run in inference mode.
        nnet.eval()
        with torch.no_grad():
            pred = nnet(X_eval)
            auc_test.append(interval_score_loss(pred, y_eval).item())
        nnet.train()

        pred = pred.cpu()
        preds = torch.mean(pred, dim=1).numpy()
        lower = pred[:, 0].numpy()
        upper = pred[:, 1].numpy()

        val_preds_df = val_keys.assign(sales=preds, lower=lower, upper=upper)

        res = ComputeMetrics(val_preds_df.fillna(0), sales_train, ground_truth_val)
        print(res)

        metric_val.append(res[0])
        unc_val.append(res[1])

        # Learning-curve figure, overwritten at every evaluation step.
        fig, ax = plt.subplots()
        ax.plot(auc_train, label="train")
        ax.plot(auc_test, label="test")
        ax.plot(metric_val, label="Metric")
        ax.plot(unc_val, label="Uncertainty")
        ax.set_xlabel(f"evaluation step (every {eval_every} epochs)")
        ax.set_ylabel("value")
        ax.set_title("Training diagnostics")
        ax.legend()
        ax.set_ylim([0, 200])
        fig.savefig("output/auc_NN.png")
        fig.savefig("output/auc_NN.svg", format="svg")
        plt.close(fig)

        # Checkpoint. With n_epochs = 100 this writes epochs 0, 5, ..., 95.
        path = "output/weights" + str(epoch) + ".pt"
        torch.save(nnet.state_dict(), path)

print("Elapsed time: ", time.time() - tic)
print("done")

0
(44.917991668810934, 152.6603901697268)
5
(44.92483150454373, 152.6524620848051)
10
(44.90500162184589, 152.63166762212677)
15
(44.992089541383066, 152.6528160025033)
20
(44.915178884036514, 152.60940857305727)
25
(44.90979941463607, 152.59701335301145)
30
(44.848592242900914, 152.56300478738092)
35
(44.88603464731981, 152.5687273746578)
40
(44.86706725932372, 152.54818912835512)
45
(44.7988124236224, 152.5058869306503)
50
(44.84391457182361, 152.51040995495285)
55
(44.863748528438784, 152.51401935215316)
60
(44.837740698308956, 152.49499142282244)
65
(44.827804711566316, 152.47626091796826)
70
(44.872486835314945, 152.48333247386958)
75
(44.74739716627189, 152.4188359902757)
80
(44.82409348427972, 152.44208587152025)
85
(44.82232914953445, 152.42188784986388)
90
(44.84146415484183, 152.4195402613783)
95
(44.747119247159766, 152.37521250753494)
Elapsed time:  1016.5806739330292
done


In [33]:
# (n_samples, n_features) of the training tensor.
X_train.shape

torch.Size([3164, 12])

## Predictions Ensemble

In [None]:
def curation_post(data):
    """Repair inverted prediction intervals and recompute the point prediction.

    NOTE(review): this cell redefines ``curation_post``, which is already
    defined near the top of the notebook; keep a single definition.

    Works on a copy of ``data``. Every row where ``pred_95_high < pred_95_low``
    gets its two bounds swapped; afterwards ``prediction`` is (over)written with
    the midpoint of the two bounds.

    The swap is done positionally (boolean mask + ``np.where``) rather than by
    index label, so frames with repeated index labels are handled row by row.
    Rows containing NaN in either bound are left untouched, because the
    comparison is False for them.

    Args:
        data: DataFrame with numeric ``pred_95_low`` and ``pred_95_high`` columns.

    Returns:
        A new DataFrame with ordered bounds and a ``prediction`` column.
        ``data`` itself is not modified.
    """
    aux = data.copy()

    low = aux["pred_95_low"].to_numpy()
    high = aux["pred_95_high"].to_numpy()
    inverted = (aux["pred_95_high"] < aux["pred_95_low"]).to_numpy()

    # Build both repaired columns before assigning either one, so the second
    # np.where never reads an already-swapped value.
    fixed_low = np.where(inverted, high, low)
    fixed_high = np.where(inverted, low, high)
    aux["pred_95_low"] = fixed_low
    aux["pred_95_high"] = fixed_high

    # Sanity check: nothing should still be inverted after the swap.
    still_inverted = aux["pred_95_high"] < aux["pred_95_low"]
    if still_inverted.any():
        print('If errors they should appear: ')
        print(aux[still_inverted].shape)

    aux["prediction"] = np.mean([fixed_low, fixed_high], axis=0)
    return aux


In [None]:
def predict_withNN(NN, data):
    """Predict on the notebook-level ``X_test`` tensor and attach the results to ``data``.

    The network is run once, in eval mode and with autograd disabled, so the
    point prediction (mean of the two outputs) and both bounds come from the
    same forward pass. The model's previous train/eval mode is restored.

    NOTE(review): the ``(x + 1) * offset`` rescaling applied in
    ``submission_predict`` is intentionally not applied here (it was disabled
    in the original cell); confirm that the metric expects unscaled values.

    Args:
        NN: torch module whose output has the lower bound in column 0 and the
            upper bound in column 1.
        data: DataFrame with one row per row of ``X_test``; it is copied, not
            modified.

    Returns:
        Copy of ``data`` with ``prediction``, ``pred_95_low`` and
        ``pred_95_high`` columns, after ``curation_post``.
    """
    # Run on whichever device the model's parameters live on.
    model_device = next(NN.parameters()).device
    was_training = NN.training
    NN.eval()
    try:
        with torch.no_grad():
            out = NN(X_test.float().to(model_device)).cpu()
    finally:
        NN.train(was_training)

    preds = torch.mean(out, dim=1).numpy()
    lower = out[:, 0].numpy()
    upper = out[:, 1].numpy()

    print(my_metric(preds, lower, upper))

    aux_data = data.copy()

    aux_data["prediction"] = preds
    aux_data["pred_95_low"] = lower
    aux_data["pred_95_high"] = upper

    aux_data = curation_post(aux_data)
    return aux_data

In [None]:
# Rebuild the architecture and load the checkpoint saved at epoch 60.
nnet60 = ResNet(trainset.__shape__()).to(device)
nnet60.load_state_dict(torch.load("output/weights60.pt",map_location=torch.device('cpu')))

In [None]:
# Rebuild the architecture and load the checkpoint saved at epoch 80.
nnet80 = ResNet(trainset.__shape__()).to(device)
nnet80.load_state_dict(torch.load("output/weights80.pt",map_location=torch.device('cpu')))

In [None]:
# Rebuild the architecture and load the checkpoint saved at epoch 95.
nnet95 = ResNet(trainset.__shape__()).to(device)
nnet95.load_state_dict(torch.load("output/weights95.pt",map_location=torch.device('cpu')))

In [None]:
# NOTE(review): the training loop in this notebook runs range(100) and saves
# every 5th epoch, so the last checkpoint it writes is weights95.pt.
# weights100.pt must come from a different or longer run; confirm it exists.
nnet100 = ResNet(trainset.__shape__()).to(device)
nnet100.load_state_dict(torch.load("output/weights100.pt",map_location=torch.device('cpu')))

In [None]:
# NOTE(review): the variable is named nnet120 but the file loaded is
# weights140.pt; one of the two is probably a typo. That checkpoint is also
# beyond the epochs written by the 100-epoch training loop in this notebook.
nnet120 = ResNet(trainset.__shape__()).to(device)
nnet120.load_state_dict(torch.load("output/weights140.pt",map_location=torch.device('cpu')))

In [None]:
# NOTE(review): weights125.pt is beyond the epochs written by the 100-epoch
# training loop in this notebook; confirm the file exists.
nnet125 = ResNet(trainset.__shape__()).to(device)
nnet125.load_state_dict(torch.load("output/weights125.pt",map_location=torch.device('cpu')))

In [None]:
# Validation predictions from each checkpoint, keyed by country / brand / month_num.
# NOTE(review): val_x_orig is not defined in the cells visible in this notebook.
n_60 = predict_withNN(nnet60,val_x_orig[['country','brand','month_num']])
n_80 = predict_withNN(nnet80,val_x_orig[['country','brand','month_num']])
n_95 = predict_withNN(nnet95,val_x_orig[['country','brand','month_num']])
n_100 = predict_withNN(nnet100,val_x_orig[['country','brand','month_num']])




In [None]:
# Validation ensemble: start from a copy of n_100 (for the key columns) and
# replace each prediction column with the element-wise mean over the four
# checkpoints.
n_val_final = n_100.copy()

for _column in ("prediction", "pred_95_low", "pred_95_high"):
    n_val_final[_column] = np.mean(
        [_member[_column] for _member in (n_60, n_80, n_95, n_100)],
        axis=0,
    )

In [None]:
# Score the averaged validation predictions with my_metric.
my_metric(n_val_final.prediction,
         n_val_final.pred_95_low,
         n_val_final.pred_95_high)

In [None]:
# Persist the validation ensemble (index not written).
n_val_final.to_csv('output/valid_ensemble.csv',index=False)

In [None]:
# The point prediction and both bounds must come from the same network. The
# original took the mean from `nnet` but the bounds from `nnet125`, which can
# place the prediction outside its own interval. One forward pass of nnet125
# now feeds all three.
# NOTE(review): confirm nnet125 (not nnet) is the intended model.
with torch.no_grad():
    _out125 = nnet125(X_test.float().to(device)).cpu()

preds = torch.mean(_out125, dim=1).numpy()
lower = _out125[:, 0].numpy()
upper = _out125[:, 1].numpy()

# Rescale the outputs with the validation offset.
preds = (preds+1)*val_offset
lower = (lower+1)*val_offset
upper = (upper+1)*val_offset

In [None]:
# Work on a copy: a plain `val_preds = val_x` only aliases the frame, so the
# prediction columns below would be silently added to val_x as well.
val_preds = val_x.copy()

val_preds["prediction"] = preds
val_preds["pred_95_low"] = lower
val_preds["pred_95_high"] = upper

## Predict Test

In [None]:
def submission_predict(NN):
    """Build a submission frame from one network's predictions on the test features.

    Reads ``../data/submission_template.csv``, predicts on the notebook-level
    ``test_x`` with a single forward pass (eval mode, autograd off), rescales
    with ``test_offset``, clips at zero, repairs inverted intervals with
    ``curation_post`` and finally applies ``postprocess_submission``.

    Args:
        NN: torch module whose output has the lower bound in column 0 and the
            upper bound in column 1.

    Returns:
        DataFrame as returned by ``postprocess_submission``, with
        ``pred_95_low``, ``prediction`` and ``pred_95_high`` clipped to be
        non-negative.
    """
    submission_df = pd.read_csv("../data/submission_template.csv")

    # One forward pass on the model's own device; the previous train/eval mode
    # is restored afterwards.
    model_device = next(NN.parameters()).device
    features = torch.tensor(test_x.values).float().to(model_device)
    was_training = NN.training
    NN.eval()
    try:
        with torch.no_grad():
            out = NN(features).cpu()
    finally:
        NN.train(was_training)

    preds = torch.mean(out, dim=1).numpy()
    lower = out[:, 0].numpy()
    upper = out[:, 1].numpy()

    preds = (preds + 1) * test_offset
    lower = (lower + 1) * test_offset
    upper = (upper + 1) * test_offset

    submission_df["pred_95_low"] = np.maximum(lower, 0)
    submission_df["pred_95_high"] = np.maximum(upper, 0)
    submission_df["prediction"] = np.maximum(preds, 0)
    submission_df = curation_post(submission_df)

    # Ordering invariants that should hold after curation. Each is reported
    # separately so a failure says which one broke and for how many rows.
    checks = [
        (
            "pred_95_high < pred_95_low",
            submission_df["pred_95_high"] < submission_df["pred_95_low"],
        ),
        (
            "prediction > pred_95_high",
            submission_df["prediction"] > submission_df["pred_95_high"],
        ),
        (
            "prediction < pred_95_low",
            submission_df["prediction"] < submission_df["pred_95_low"],
        ),
    ]
    for label, violated in checks:
        n_bad = int(violated.sum())
        if n_bad > 0:
            print(f"WARNING: ERROR, please debug ({label} on {n_bad} rows)")
            print(submission_df[violated].head())

    submission_df = postprocess_submission(submission_df)

    submission_df["pred_95_low"] = np.maximum(submission_df.pred_95_low, 0)
    submission_df["pred_95_high"] = np.maximum(submission_df.pred_95_high, 0)
    submission_df["prediction"] = np.maximum(submission_df.prediction, 0)
    return submission_df

In [None]:
# Test-set submission frames, one per checkpoint.
pred60 = submission_predict(nnet60)
pred80 = submission_predict(nnet80)
pred95 = submission_predict(nnet95)
pred100 = submission_predict(nnet100)

In [None]:
# Test ensemble: start from a copy of pred60 (for the key columns) and replace
# each prediction column with the element-wise mean over the four checkpoints.
pred_final = pred60.copy()

for _column in ("prediction", "pred_95_low", "pred_95_high"):
    pred_final[_column] = np.mean(
        [_member[_column] for _member in (pred60, pred80, pred95, pred100)],
        axis=0,
    )

In [None]:
# Summary statistics of the ensembled submission, as a quick sanity check.
pred_final.describe()

In [None]:
# NOTE(review): the file name says "noPost", but pred60 comes from
# submission_predict, which does apply postprocess_submission. Confirm the name.
pred60.to_csv("../submissions/pred60_noPost.csv", index=False)

In [None]:
# Write the four-checkpoint ensemble submission (index not written).
pred_final.to_csv("../submissions/pred_final_few.csv", index=False)

In [None]:
# NOTE(review): train_x is not defined in the cells visible in this notebook.
train_x.columns

In [None]:
# Scratch comparison: first of two merged-lag feature files ("D" variant).
a = pd.read_csv('../data/gx_merged_lags_monthsD.csv')

In [None]:
# Scratch comparison: second merged-lag feature file ("M" variant).
b = pd.read_csv('../data/gx_merged_lags_monthsM.csv')

In [None]:
# Inspect one lag feature from the "D" file.
a['last_before_3_after_0']

In [None]:
# Count rows where the two versions of the feature differ within the "M" file.
# NaN != NaN is True, so rows that are missing in both columns are counted too.
np.sum(b['last_before_3_after_0_vMarc'] != b['last_before_3_after_0'])

In [None]:
# Element-wise equality between the two frames, summed.
# NOTE(review): `a == b` needs identically labelled frames, and NaN == NaN is
# False, so missing values never count as matches.
np.sum(a==b)

In [None]:
# (rows, columns) of the "D" file.
a.shape