## Prepare and read data

In [1]:
import gc
import os
import time
import random
import itertools
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device(
    "cuda:0" if torch.cuda.is_available() else "cpu")

pd.set_option("max_columns", 300)
pd.set_option("max_rows", 500)

In [3]:
def init_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

In [4]:
# LR Schedulers
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def get_learning_rate(optimizer):
    lr = []
    for param_group in optimizer.param_groups:
        lr += [param_group['lr']]
    assert(len(lr) == 1)
    lr = lr[0]
    return lr


class NullScheduler():
    def __init__(self, lr=0.01):
        super(NullScheduler, self).__init__()
        self.lr = lr
        self.cycle = 0

    def __call__(self, time):
        return self.lr

    def __str__(self):
        string = "NullScheduler\n" \
            + "lr={0:0.5f}".format(self.lr)
        return string


class ManualScheduler():
    def __init__(self, lr=0.01, lr_decay=0.9):
        super(ManualScheduler, self).__init__()
        self.lr_list = [lr * (lr_decay ** i) for i in range(100)]
        self.cycle = 0

    def __call__(self, time):
        if time < len(self.lr_list):
            return self.lr_list[time]
        else:
            return self.lr_list[-1]

    def __str__(self):
        string = "ManualScheduler\n" \
            + "lr={0:0.5f}".format(self.lr_list[0])
        return string


class CosineAnnealingScheduler():
    def __init__(self, eta_min=0.0001, eta_max=0.002, cycle=100, repeat=False):
        super(CosineAnnealingScheduler, self).__init__()
        self.cycle = cycle
        self.eta_min = eta_min
        self.eta_max = eta_max
        self.lr = self.eta_min + (self.eta_max - self.eta_min)
        self.repeat = repeat

    def __call__(self, epoch):
        if self.repeat:
            self.lr = self.eta_min + \
                (self.eta_max - self.eta_min) \
                * (1 + math.cos(math.pi * epoch / self.cycle)) / 2
        else:
            if epoch <= self.cycle:
                self.lr = self.eta_min + \
                    (self.eta_max - self.eta_min) \
                    * (1 + math.cos(math.pi * epoch / self.cycle)) / 2
            else:
                self.lr = self.eta_min
        return self.lr

    def __str__(self):
        string = 'CosineAnealingScheduler\n' \
                + 'lr=%0.5f ' % (self.lr)
        return strin

In [5]:
class EarlyStopping:
    """
    ref: https://github.com/Bjarten/early-stopping-pytorch
    """
    def __init__(self, patience=2, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.best_model_savepath = None
        self.best_oof_preds = None

    def __call__(self, val_loss, model, save_name, oof_preds=None):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, save_name)
        elif score < self.best_score:
            self.counter += 1
            # print(f'EarlyStopping counter: {self.counter} '
            #      'out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, save_name)
            self.best_model_savepath = save_name
            self.counter = 0
            if oof_preds is not None:
                self.best_oof_preds = oof_preds

    def save_checkpoint(self, val_loss, model, save_name):
        if self.verbose:
            print(f'Validation loss decreased ('
                  '{self.val_loss_min:.5f} --> {val_loss:.5f}'
                  ').  Saving model ...')
            print("Save model: {}".format(save_name))
        torch.save(model.state_dict(), save_name)
        self.val_loss_min = val_loss
    
    def get_best_filepath(self):
        return self.best_model_savepath

    def get_best_oof_preds(self):
        return self.best_oof_preds

In [6]:
init_seed(2020)

In [7]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [8]:
def feature_cache(reset=False):
    def _feature_cache(func):
        def wrapper(train_df, test_df, *args):
            func_name = func.__name__
            train_feat_path = Path("../feature") / f"train_{func_name}.pkl"
            test_feat_path = Path("../feature") / f"test_{func_name}.pkl"
            # if feature exists, load feature
            if train_feat_path.exists() and test_feat_path.exists() and not reset:
                train_feats = pd.read_pickle(train_feat_path).reset_index(drop=True)
                test_feats = pd.read_pickle(test_feat_path).reset_index(drop=True)
                train_df = pd.concat([train_df, train_feats], axis=1)
                test_df = pd.concat([test_df, test_feats], axis=1)
            # if not exists, make feature and save as pickle
            else:
                before_cols = train_df.columns.tolist()
                train_df, test_df = func(train_df, test_df, *args)
                after_cols = train_df.columns.tolist()
                new_cols = [c for c in after_cols if c not in before_cols]
                train_feats = train_df[new_cols]
                test_feats = test_df[new_cols]
                train_feats.to_pickle(train_feat_path)
                test_feats.to_pickle(test_feat_path)            
            return train_df, test_df
        return wrapper

    return _feature_cache

## Feature Engineering

In [9]:
def get_basic_importance_cols(use_num=50):
    # basicなモデルのimportanceを読み込み
    importance_df = pd.read_csv("../data/importance/003_importance.csv")
    imp_feats = importance_df["feature"].iloc[:use_num].tolist()
    return imp_feats

In [10]:
def get_multi_cat_cols(train_df):
    tmp = train_df.iloc[:1000]
    multi_cols = []
    for c in train_df.columns:
        sep_num = tmp[c].astype(str).fillna("").str.contains(";").sum()
        if sep_num > 10:
            multi_cols.append(c)
    return multi_cols

In [11]:
original_cols = train_df.columns.tolist()

In [12]:
multi_cat_cols = get_multi_cat_cols(train_df)

In [13]:
nume_cols = [
    c for c in list(np.setdiff1d(original_cols, multi_cat_cols))
    if c not in ["Salary", "No"] and "float" in train_df[c].dtype.name
]

In [14]:
cat_cols = [c for c in train_df.columns if c not in multi_cat_cols + nume_cols + ["Salary", "No"]]

In [15]:
len(original_cols), len(cat_cols), len(nume_cols), len(multi_cat_cols)

(128, 65, 40, 21)

### Multi-category encoding

In [16]:
for c in tqdm(multi_cat_cols):
    binarizer = MultiLabelBinarizer()
    train_multi_srs = train_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    test_multi_srs = test_df[c].map(lambda x: x.split(";") if x is not np.nan else [])
    train_arr = binarizer.fit_transform(train_multi_srs)
    test_arr = binarizer.transform(test_multi_srs)
    feat_cols = [f"ohe_{c}_{val}" for val in binarizer.classes_]
    train_feat_df = pd.DataFrame(train_arr, columns=feat_cols, dtype=np.int8)
    test_feat_df = pd.DataFrame(test_arr, columns=feat_cols, dtype=np.int8)
    all_feat_df = pd.concat([train_feat_df, test_feat_df], axis=0, ignore_index=True)
    train_df = pd.concat([train_df, train_feat_df], axis=1)
    test_df = pd.concat([test_df, test_feat_df], axis=1)
    # ohe_featureはcategoryとnumerical両方として扱う
    nume_cols += feat_cols

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




### category encoding

In [17]:
new_cat_cols = []

imp_feats = get_basic_importance_cols(use_num=30)
multi_new_cols = [c for c in cat_cols if c in imp_feats]

for col_a, col_b in tqdm(list(itertools.combinations(multi_new_cols, 2))):
    new_col = col_a + "__" + col_b
    train_df[new_col] = train_df[col_a].astype(str) + "__" + train_df[col_b].astype(str)
    test_df[new_col] = test_df[col_a].astype(str) + "__" + test_df[col_b].astype(str)
    new_cat_cols.append(new_col)

cat_cols += new_cat_cols

HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




In [18]:
for c in tqdm(cat_cols):
    train_df[c], uniques = pd.factorize(train_df[c])
    train_df[c] += 1
    test_df[c] = uniques.get_indexer(test_df[c]) + 1

HBox(children=(FloatProgress(value=0.0, max=156.0), HTML(value='')))




In [19]:
len(cat_cols)

156

In [20]:
use_cols = [c for c in train_df.columns if c not in multi_cat_cols + ["Salary", "No"]]
print(len(use_cols))

512


## Scaling

In [21]:
scaler = StandardScaler()
train_df["Salary"] = scaler.fit_transform(train_df["Salary"].values.reshape(-1, 1))

In [22]:
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

## Training

In [23]:
class QuevicoDataset(Dataset):
    def __init__(self, df, cat_cols, nume_cols, mode="train"):
        print("Make Dataset")
        self.mode = mode
        self.length = len(df)
        ##############
        # sales
        ##############
        self.df = df.copy()
        self.cat_cols = cat_cols
        self.nume_cols = nume_cols

    def __getitem__(self, idx):
        # category
        cat_ids = self.df.iloc[idx][self.cat_cols].values.astype(int)
        nume_x = self.df.iloc[idx][self.nume_cols].values.astype(np.float32)

        target = self.df.iloc[idx]["Salary"] if self.mode == "train" else 0
        
        return cat_ids, nume_x, target

    def __len__(self):
        return self.length

In [24]:
data = QuevicoDataset(train_df, cat_cols, nume_cols, mode="train")

Make Dataset


In [25]:
data.__getitem__(0)

(array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]),
 array([ 1.,  4.,  5.,  6.,  2.,  3.,  7.,  1.,  4.,  3., 10.,  5.,  9.,
        11.,  2.,  7.,  6.,  8.,  9.,  4.,  7.,  8.,  3.,  6., 10.,  2.,
         1.,  5.,  1.,  2.,  4.,  3.,  5.,  1.,  2.,  6.,  3.,  5.,  7.,
         4.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         0.,  1.,  1.,  0., 

In [26]:
class NNModel(nn.Module):
    def __init__(self, cat_cols, nume_cols, df):
        super(NNModel, self).__init__()

        n_hidden = 128
        self.cat_cols = cat_cols
        self.emb_dict = {}
        total_cat_hidden = 0
        for i, c in enumerate(cat_cols):
            cat_nunique = df[c].max()
            n_cat_hidden = 16 if cat_nunique > 64 else 8
            self.emb_dict[i] = nn.Embedding(cat_nunique+10, n_cat_hidden)
            total_cat_hidden += n_cat_hidden
        
        self.reg_layer = nn.Sequential(
            nn.Linear(
                len(nume_cols) + total_cat_hidden, 1024
            ),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(
                1024, 512
            ),
            nn.BatchNorm1d(512),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(
                512, 128
            ),
            nn.BatchNorm1d(128),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(
                128, 1
            ),
        )
    
       
    def forward(self, cat_x, nume_x):
        batch_size = cat_x.size(0)
        # level, store_id, state_id, item_id, dept_id, cat_id, month, wday
        emb_list = []
        for cat_i, c in enumerate(self.cat_cols):
            embedder = self.emb_dict[cat_i]
            emb = embedder(cat_x[:, cat_i])
            emb_list.append(emb)
        # concat nume and cate embedded feature
        concat_feature = torch.cat(emb_list, 1)
        concat_feature = concat_feature.view([batch_size, -1])
        concat_feature = torch.cat([concat_feature, nume_x], 1)
        # regression layer
        pred_y = self.reg_layer(concat_feature)
        return pred_y
    
    def predict(self, cate_x, nume_x):
        with torch.no_grad():
            return self.forward(cate_x, nume_x)

In [27]:
def training(train_df, valid_df, cat_cols, nume_cols, debug=False):

    ##################
    # Config
    ##################
    config = Config()


    ##################
    # dataset & dataloader
    ##################
    # dataset
    train_dataset = QuevicoDataset(
        train_df, cat_cols, nume_cols, mode="train"
    )
    valid_dataset = QuevicoDataset(
        valid_df, cat_cols, nume_cols, mode="train"
    )

    ##################
    # criterion
    ##################
    criterion = torch.nn.MSELoss()

    ##################
    # archtecture
    ##################
    model = NNModel(cat_cols, nume_cols, train_df)
    model.to(device)

    ##################
    # lr scheduler
    ##################
    scheduler = NullScheduler(lr=config.learning_rate)
    # scheduler = ManualScheduler(lr=config.learning_rate, lr_decay=config.lr_decay)

    #scheduler = CosineAnnealingScheduler(
    #    eta_min=ca_eta_min,
    #    eta_max=ca_eta_max,
    #    cycle=ca_cycle,
    #    repeat=False
    #)
    
    ##################
    # Optimiizer
    ##################
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=scheduler(0)
    )

    ##################
    # data loader
    ##################
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=0)
    valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=0)
    # dataloaders
    dataloaders_dict = {
        "train": train_loader,
        "valid": valid_loader,
    }

    ##################
    # early stopping
    ##################
    early_stopping = EarlyStopping(
        patience=config.n_early_stopping_patience,
        verbose=False
    )

    ##################
    # train epoch loop
    ##################
    # iteration and loss count
    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    num_epochs = config.n_epoch
    valid_period = config.valid_period
    val_rmse_score_list = []
    val_wrmsse_score_list = []
    best_oof_preds = None

    print(f"Optimizer\n  {optimizer}")
    print(f"Scheduler\n  {scheduler}")
    print(f"Batchsize\n  {config.batch_size}")
    print("** start training here! **")
    print("                      |----- VALID ----------------|------ TRAIN -------")
    print("rate     iter   epoch |  loss  metric   wrmsse |  loss    | time  ")
    print("-------------------------------------------------------------------------------------")

    for epoch in range(num_epochs+1):
        t_epoch_start = time.time()
        val_pred_list = []
        val_true_list = []
        for phase in ['train', 'valid']:
            if phase == 'train':
                lr = scheduler(epoch)
                if lr < 0:
                    break
                adjust_learning_rate(optimizer, lr)
                model.train()
            else:  # valid
                if epoch % valid_period == 0:
                    model.eval()
                else:
                    continue

            # get batch data loop
            for iter_i, (cate_x, nume_x, targets) \
                    in enumerate(dataloaders_dict[phase]):
                cate_x = torch.LongTensor(cate_x)
                cate_x = cate_x.to(device)
                nume_x = torch.FloatTensor(nume_x)
                nume_x = nume_x.to(device)
                targets = targets.float()
                targets = targets.to(device)
                # zero grad
                optimizer.zero_grad()
                # train
                with torch.set_grad_enabled(phase == 'train'):
                    y_pred = model(cate_x, nume_x)
                    # calculate loss
                    loss = criterion(
                        y_pred.reshape(-1),
                        targets.reshape(-1),
                    )
                    if phase == 'train':
                        print(f"\r{iter_i*config.batch_size} / {len(train_dataset)}", end='')
                        loss.backward()  
                        optimizer.step()
                        epoch_train_loss += loss.item()
                        iteration += 1
                    elif phase == "valid":
                        print(f"\r{iter_i*config.batch_size} / {len(valid_dataset)}", end='')
                        epoch_val_loss += loss.item()
                        if epoch > 0 and (epoch % valid_period == 0):
                            pred = model.predict(cate_x, nume_x)
                            val_pred_list.append(pred.cpu().numpy())
                            val_true_list.append(targets.cpu().numpy())

            print("", end="")
        if epoch > 0 and epoch % valid_period == 0:
            val_preds = np.concatenate(val_pred_list, axis=0)
            val_true = np.concatenate(val_true_list, axis=0)
            
            val_score = np.sqrt(mean_squared_error(
                scaler.inverse_transform(val_true),
                scaler.inverse_transform(val_preds)
            ))


        t_epoch_finish = time.time()
        elapsed_time = t_epoch_finish - t_epoch_start
        lr = get_learning_rate(optimizer)
        if epoch > 0 and epoch % valid_period == 0:
            print(f"\r", end="")
            print(
                "{0:1.5f}  {1:4d}  {2:3d}  | {3:4.3f} {4:1.5f}  {5:1.5f}  {6:4.1f} {7:4.3f}"
                .format(
                    lr,
                    iteration,
                    epoch,
                    epoch_val_loss,
                    val_score,
                    0, # val_wrmsse_score,
                    epoch_train_loss,
                    elapsed_time),
            )

            ######################
            # early stopping
            ######################
            if 0:
                model_save_path = os.path.join(
                    "./", f"checkpoint_epoch{epoch}_val{val_rmse_score:.4f}.pth"
                )
                early_stopping(val_rmse_score, model, model_save_path, oof_preds=denormalized_val_preds)
                if early_stopping.early_stop:
                    print("******** Early stopping ********")
                    print(f"Best Score: {early_stopping.best_score*(-1)}")
                    # load best model parameter
                    best_model_save_path = early_stopping.get_best_filepath()
                    model.load_state_dict(
                        torch.load(
                            best_model_save_path,
                            map_location=lambda storage,
                            loc: storage
                        )
                    )
                    best_oof_preds = early_stopping.get_best_oof_preds()
                    break
            

        t_epoch_start = time.time()
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

    return model, best_oof_preds

In [28]:
class Config:
    batch_size = 256
    n_early_stopping_patience = 10
    n_epoch = 50
    valid_period = 1
    learning_rate = 0.00005
    lr_decay = 0.9

In [29]:
_train_df, _valid_df = train_test_split(train_df, test_size=0.2, shuffle=True, random_state=2020)

In [30]:
gc.collect()

69

In [31]:
model, oof_preds = training(_train_df, _valid_df, cat_cols, nume_cols)

Make Dataset
Make Dataset
Optimizer
  Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0
)
Scheduler
  NullScheduler
lr=0.00005
Batchsize
  256
** start training here! **
                      |----- VALID ----------------|------ TRAIN -------
rate     iter   epoch |  loss  metric   wrmsse |  loss    | time  
-------------------------------------------------------------------------------------
0.00005   213    1  | 7.613 23498.68750  0.00000  49.1 110.741
0.00005   319    2  | 7.176 22812.63672  0.00000  43.1 103.628
0.00005   425    3  | 7.070 22637.06641  0.00000  40.0 99.301
0.00005   531    4  | 6.925 22400.08984  0.00000  38.0 99.589
0.00005   637    5  | 6.810 22211.05078  0.00000  35.1 110.859
0.00005   743    6  | 6.699 22020.19141  0.00000  33.4 109.200
0.00005   849    7  | 6.678 21982.56250  0.00000  31.7 116.680
0.00005   955    8  | 6.565 21800.01172  0.00000  30.3 102.113
0.00005  1061    9  | 6.487 21675.6

In [32]:
oof_score = np.sqrt(mean_squared_error(train_df["Salary"], oof_preds))
oof_score

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
len(use_cols)

## Prediction

In [None]:
test_pred = np.zeros(len(test_df))

for model in models:
    test_pred += model.predict(test_df[use_cols]) / len(models)

In [None]:
test_pred

In [None]:
sub_df = pd.read_csv("../input/submit.csv")

In [None]:
sub_df["Salary"] = test_pred

In [None]:
# sub_df.to_csv("../predict/024_col_shuffle_20186.csv", index=False)