In [2]:
! pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl (527 kB)
[K     |████████████████████████████████| 527 kB 4.2 MB/s 
[?25hCollecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 43.7 MB/s 
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.1 MB/s 
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 49.2 MB/s 
[?25hCollecting setuptools==59.5.0
  Downloading setuptools-59.5.0-py3-none-any.whl (952 kB)
[K     |████████████████████████████████| 952 kB 44.4 MB/s 
Collecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.7.2-py3-none-any.whl (397 kB

In [3]:
import random

import numpy as np
import pandas as pd

PAD = 0
MASK = 1


def map_column(df: pd.DataFrame, col_name: str):
    """
    Maps column values to integers
    :param df:
    :param col_name:
    :return:
    """
    values = sorted(list(df[col_name].unique()))
    mapping = {k: i + 2 for i, k in enumerate(values)}
    inverse_mapping = {v: k for k, v in mapping.items()}

    df[col_name + "_mapped"] = df[col_name].map(mapping)

    return df, mapping, inverse_mapping


def get_context(df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 1):
    """
    Create a training / validation samples
    Validation samples are the last horizon_size rows
    :param df:
    :param split:
    :param context_size:
    :param val_context_size:
    :return:
    """
    if split == "train":
        # 10 -> 3
        end_index = random.randint(3, df.shape[0] - val_context_size)
        
    elif split in ["val", "test"]:
        end_index = df.shape[0]
    else:
        raise ValueError

    start_index = max(0, end_index - context_size)

    context = df[start_index:end_index]

    return context


def pad_arr(arr: np.ndarray, expected_size: int = 30):
    """
    Pad top of array when there is not enough history
    :param arr:
    :param expected_size:
    :return:
    """
    arr = np.pad(arr, [(expected_size - arr.shape[0], 0), (0, 0)], mode="edge")
    return arr


def pad_list(list_integers, history_size: int, pad_val: int = PAD, mode="left"):
    """

    :param list_integers:
    :param history_size:
    :param pad_val:
    :param mode:
    :return:
    """

    if len(list_integers) < history_size:
        if mode == "left":
            list_integers = [pad_val] * (history_size - len(list_integers)) + list_integers
        else:
            list_integers = list_integers + [pad_val] * (history_size - len(list_integers))

    return list_integers


def df_to_np(df, expected_size=30):
    arr = np.array(df)
    arr = pad_arr(arr, expected_size=expected_size)
    return arr


def genome_mapping(genome):
    genome.sort_values(by=["movieId", "tagId"], inplace=True)
    movie_genome = genome.groupby("movieId")["relevance"].agg(list).reset_index()

    movie_genome = {a: b for a, b in zip(movie_genome['movieId'], movie_genome['relevance'])}

    return movie_genome



In [4]:
from typing import Optional

import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.nn import functional as F


def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):

    _, predicted = torch.max(y_pred, 1)

    y_true = torch.masked_select(y_true, mask)
    predicted = torch.masked_select(predicted, mask)

    acc = (y_true == predicted).double().mean()

    return acc


def masked_ce(y_pred, y_true, mask):

    loss = F.cross_entropy(y_pred, y_true, reduction="none")

    loss = loss * mask

    return loss.sum() / (mask.sum() + 1e-8)


class Recommender(pl.LightningModule):
    def __init__(
        self,
        vocab_size,
        channels=128,
        cap=0,
        mask=1,
        dropout=0.4,
        lr=1e-4,
    ):
        super().__init__()

        self.cap = cap
        self.mask = mask

        self.lr = lr
        self.dropout = dropout
        self.vocab_size = vocab_size

        self.item_embeddings = torch.nn.Embedding(
            self.vocab_size, embedding_dim=channels
        )

        self.input_pos_embedding = torch.nn.Embedding(512, embedding_dim=channels)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=6)

        self.linear_out = Linear(channels, self.vocab_size)

        self.do = nn.Dropout(p=self.dropout)

    def encode_src(self, src_items):
        src_items = self.item_embeddings(src_items)

        batch_size, in_sequence_len = src_items.size(0), src_items.size(1)
        pos_encoder = (
            torch.arange(0, in_sequence_len, device=src_items.device)
            .unsqueeze(0)
            .repeat(batch_size, 1)
        )
        pos_encoder = self.input_pos_embedding(pos_encoder)

        src_items += pos_encoder

        src = src_items.permute(1, 0, 2)

        src = self.encoder(src)

        return src.permute(1, 0, 2)

    def forward(self, src_items):

        src = self.encode_src(src_items)

        out = self.linear_out(src)

        return out

    def training_step(self, batch, batch_idx):
        src_items, y_true = batch

        y_pred = self(src_items)

        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        src_items = src_items.view(-1)
        mask = src_items == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log("train_loss", loss)
        self.log("train_accuracy", accuracy)

        return loss

    def validation_step(self, batch, batch_idx):
        src_items, y_true = batch

        y_pred = self(src_items)

        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        src_items = src_items.view(-1)
        mask = src_items == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log("valid_loss", loss)
        self.log("valid_accuracy", accuracy)

        return loss

    def test_step(self, batch, batch_idx):
        src_items, y_true = batch

        y_pred = self(src_items)

        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        src_items = src_items.view(-1)
        mask = src_items == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log("test_loss", loss)
        self.log("test_accuracy", accuracy)

        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "valid_loss",
        }


In [5]:
import random

import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

# from recommender.models import Recommender
# from recommender.data_processing import get_context, pad_list, map_column, MASK


def mask_list(l1, p=0.8):

    l1 = [a if random.random() < p else MASK for a in l1]

    return l1


def mask_last_elements_list(l1, val_context_size: int = 1):

    l1 = l1[:-val_context_size] + mask_list(l1[-val_context_size:], p=0.5)

    return l1


class Dataset(torch.utils.data.Dataset):
    def __init__(self, groups, grp_by, split, history_size=120):
        self.groups = groups
        self.grp_by = grp_by
        self.split = split
        self.history_size = history_size

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        group = self.groups[idx]

        df = self.grp_by.get_group(group)

        context = get_context(df, split=self.split, context_size=self.history_size)

        trg_items = context["article_id_mapped"].tolist()

        if self.split == "train":
            src_items = mask_list(trg_items)
        else:
            src_items = mask_last_elements_list(trg_items)

        pad_mode = "left" if random.random() < 0.5 else "right"
        trg_items = pad_list(trg_items, history_size=self.history_size, mode=pad_mode)
        src_items = pad_list(src_items, history_size=self.history_size, mode=pad_mode)

        src_items = torch.tensor(src_items, dtype=torch.long)

        trg_items = torch.tensor(trg_items, dtype=torch.long)

        return src_items, trg_items


def train(
    data_csv_path: str,
    log_dir: str = "/content/drive/MyDrive/colab_data/kaggle_H&M/" + "logger/recommender_logs",
    model_dir: str = "/content/drive/MyDrive/colab_data/kaggle_H&M/" + "saved_model/recommender_models",
    batch_size: int = 32,
    epochs: int = 2000,
    history_size: int = 120,
):
    data = pd.read_csv(data_csv_path)

    data.sort_values(by="t_dat", inplace=True)

    data, mapping, inverse_mapping = map_column(data, col_name="article_id")

    grp_by_train = data.groupby(by="customer_id")

    groups = list(grp_by_train.groups)

    train_data = Dataset(
        groups=groups,
        grp_by=grp_by_train,
        split="train",
        history_size=history_size,
    )
    val_data = Dataset(
        groups=groups,
        grp_by=grp_by_train,
        split="val",
        history_size=history_size,
    )

    print("len(train_data)", len(train_data))
    print("len(val_data)", len(val_data))

    train_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=0,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=batch_size,
        num_workers=0,
        shuffle=False,
    )

#################
    model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )

#     model = Recommender(
#         vocab_size=len(mapping) + 2,
#         lr=1e-4,
#         dropout=0.3,
#     )
#     model.eval()
#     # model.load_state_dict(torch.load(model_path)["state_dict"])
#     model.load_state_dict(torch.load(model_path, map_location=device)["state_dict"])
# ##################

    logger = TensorBoardLogger(
        save_dir=log_dir,
    )

    checkpoint_callback = ModelCheckpoint(
        monitor="valid_loss",
        mode="min",
        dirpath=model_dir,
        filename="recommender",
    )

    trainer = pl.Trainer(
        max_epochs=epochs,
        # tpu_cores=8,
        gpus=1,
        logger=logger,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    result_val = trainer.test(test_dataloaders=val_loader)

    output_json = {
        "val_loss": result_val[0]["test_loss"],
        "best_model_path": checkpoint_callback.best_model_path,
    }

    print(output_json)

    return output_json





In [6]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)

device = torch.device('cuda:0' if USE_CUDA else 'cpu')
print('학습을 진행하는 기기:',device)

train(
    data_csv_path='/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_upto5.csv',
    epochs=20
)

True
학습을 진행하는 기기: cuda:0
len(train_data) 26053
len(val_data) 26053


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type               | Params
-----------------------------------------------------------
0 | item_embeddings     | Embedding          | 2.3 M 
1 | input_pos_embedding | Embedding          | 65.5 K
2 | encoder             | TransformerEncoder | 3.6 M 
3 | linear_out          | Linear             | 2.3 M 
4 | do                  | Dropout            | 0     
-----------------------------------------------------------
8.3 M     Trainable params
0         Non-trainable params
8.3 M     Total params
33.219    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /content/drive/MyDrive/colab_data/kaggle_H&M/saved_model/recommender_models/recommender-v4.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/drive/MyDrive/colab_data/kaggle_H&M/saved_model/recommender_models/recommender-v4.ckpt


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': 0.0029407146592774036, 'test_loss': 8.894183158874512}
--------------------------------------------------------------------------------
{'val_loss': 8.894183158874512, 'best_model_path': '/content/drive/MyDrive/colab_data/kaggle_H&M/saved_model/recommender_models/recommender-v4.ckpt'}


{'best_model_path': '/content/drive/MyDrive/colab_data/kaggle_H&M/saved_model/recommender_models/recommender-v4.ckpt',
 'val_loss': 8.894183158874512}

In [17]:
model

NameError: ignored

In [None]:
|## item 소비 sequence가 5 이상인 소비자 대상 필터링 해서 val을 2개 추론 하는 걸로 설정

In [7]:
# import pandas as pd


df = pd.read_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_upto5.csv')
df.sort_values(by="t_dat", inplace=True)
data, mapping, inverse_mapping = map_column(df, col_name="article_id")
grp_by_train = data.groupby(by="customer_id")
groups = list(grp_by_train.groups)


data[data['customer_id'].isin(grp_by_train.count().query(' article_id > 5').index)]

Unnamed: 0.1,Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_mapped
0,31266893,2020-09-08,49315380b9663f6df3a56da27bbf92438e227a7b510bc5...,824767002,0.013542,2,8498
1485,31269782,2020-09-08,5df4a576fc458f3b0fd24e450f3018fc8a2d9114413f51...,934211004,0.047441,2,17990
1484,31269772,2020-09-08,5ded2b09666ced6638157802129afa63693780afa6c137...,900279001,0.038966,2,15626
1483,31269771,2020-09-08,5ded2b09666ced6638157802129afa63693780afa6c137...,931769001,0.038966,2,17892
1482,31269770,2020-09-08,5ddfd940534361d7fee3086a50c824e602b380d81e2464...,901950002,0.059305,2,15719
...,...,...,...,...,...,...,...
244617,31788069,2020-09-22,fdb8bb2d51ad87761de9463ac9543ba608627fe833073b...,931769004,0.041356,2,17894
244618,31788082,2020-09-22,fdc53a7ef3a228d6101c8463d2c2bd2ed5eb126ec0461f...,706016062,0.033881,2,3196
244619,31788130,2020-09-22,fe1c283ab1d025ab45be18fc2160a2a0133cbef7073e53...,804992014,0.025407,2,7012
244589,31788165,2020-09-22,fe99a0069d6b3c64c2707d0ce53b9311540917471d82df...,867969008,0.033881,2,12103


In [8]:
data[data['customer_id'].isin(grp_by_train.count().query(' article_id > 5').index)]

Unnamed: 0.1,Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_mapped
0,31266893,2020-09-08,49315380b9663f6df3a56da27bbf92438e227a7b510bc5...,824767002,0.013542,2,8498
1485,31269782,2020-09-08,5df4a576fc458f3b0fd24e450f3018fc8a2d9114413f51...,934211004,0.047441,2,17990
1484,31269772,2020-09-08,5ded2b09666ced6638157802129afa63693780afa6c137...,900279001,0.038966,2,15626
1483,31269771,2020-09-08,5ded2b09666ced6638157802129afa63693780afa6c137...,931769001,0.038966,2,17892
1482,31269770,2020-09-08,5ddfd940534361d7fee3086a50c824e602b380d81e2464...,901950002,0.059305,2,15719
...,...,...,...,...,...,...,...
244617,31788069,2020-09-22,fdb8bb2d51ad87761de9463ac9543ba608627fe833073b...,931769004,0.041356,2,17894
244618,31788082,2020-09-22,fdc53a7ef3a228d6101c8463d2c2bd2ed5eb126ec0461f...,706016062,0.033881,2,3196
244619,31788130,2020-09-22,fe1c283ab1d025ab45be18fc2160a2a0133cbef7073e53...,804992014,0.025407,2,7012
244589,31788165,2020-09-22,fe99a0069d6b3c64c2707d0ce53b9311540917471d82df...,867969008,0.033881,2,12103


In [15]:
data[data['customer_id'].isin(grp_by_train.count().query(' article_id > 5').index)].to_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_data_BERT_9.csv', index = False)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# val_start_date = '2020-09-16'

# train_data = data.query(f"t_dat < '{val_start_date}'").reset_index(drop=True)
# valid_data = data.query(f"t_dat >= '{val_start_date}'").reset_index(drop=True)

# train_data.to_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_data.csv', index = False)
# valid_data.to_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/val_data2.csv', index = False)

## Test Section

In [49]:
import pandas as pd

data_csv_path = '/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_data_BERT_9.csv'
# data_csv_path = '/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_upto5.csv'
articles_path = "/content/drive/MyDrive/colab_data/kaggle_H&M/data/articles.csv"

model_path = "/content/drive/MyDrive/colab_data/kaggle_H&M/saved_model/recommender_models/recommender-v2.ckpt"

In [50]:
data = pd.read_csv(data_csv_path)
articles = pd.read_csv(articles_path)

In [51]:
data.sort_values(by="t_dat", inplace=True)
data, mapping, inverse_mapping = map_column(data, col_name="article_id")
grp_by_train = data.groupby(by="customer_id")

random.sample(list(grp_by_train.groups), k=5)

['22b0dc426fe14989f30e2e7a6aad4eba1fb82803ce9ceda91f4e70b422667a0a',
 '12fbf7dd721c03b057a0b4c1c5d6378b6649c2c00c03e13f9dfc298d70f062ed',
 'c069cec7196d0f43ab475a6d6a7739cb40295d20e83698e82c2e8abec0bee607',
 'ee7640de35eecf623304ad0948e0bed6ab3b6bc20e5819e0e9e8d71db40466d7',
 'c9bd63ea9c60627f494162906513dc990b443c9a04ee1d76a87cc8cd5cb6804e']

In [52]:
device = torch.device('cuda:0')

model = Recommender(
        vocab_size=len(mapping) + 2,
        lr=1e-4,
        dropout=0.3,
    )
model.eval()
# model.load_state_dict(torch.load(model_path)["state_dict"])
model.load_state_dict(torch.load(model_path, map_location=device)["state_dict"])

<All keys matched successfully>

In [53]:
article_to_idx = {a: mapping[b] for a, b in zip(articles.article_id.tolist(), articles.article_id.tolist()) if b in mapping}
idx_to_article = {v: k for k, v in article_to_idx.items()}

In [27]:
def predict(list_articles, model, article_to_idx, idx_to_article):
    
    ids = [PAD] * (120 - len(list_articles) - 1) + [article_to_idx[a] for a in list_articles] + [MASK]
    
    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    # print(src)
    with torch.no_grad():
        prediction = model(src)
    
    masked_pred = prediction[0, -1].numpy()
    
    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
    sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    # print(sorted_predicted_ids)
    return [idx_to_article[a] for a in sorted_predicted_ids[:12] if a in idx_to_article]


In [None]:
model

### set test set

In [41]:
val_df = pd.read_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/val_data.csv')

In [None]:
val_df.customer_id[2]

'aa3296cffa5601318f833c50128ef616a1e39c191d20eef60fa0762e78980400'

In [54]:
total_count = 0

for i in range(900):

    # target customer 세팅
    target_customer_id = val_df.customer_id[i]
    list_articles = data.query(f' customer_id == "{target_customer_id}"').article_id.values

    # 모델 질의 
    top_article = predict(list_articles, model, article_to_idx, idx_to_article)
    # label, predict
    labels = val_df[val_df.customer_id == target_customer_id].article_id.values
    predicts = articles[articles.article_id.isin(top_article)].article_id.values

    score = 0
    

    for label in labels:
        for pred in predicts:

            if label == pred:
                score += 1

    print(score)

    total_count += score

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
1
1
1
1
1
1
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
2
2
0
1
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [55]:
total_count

95

In [19]:
len(predicts)

12

## For Submit

In [40]:
sub_df = pd.read_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/sample_submission.csv')

In [74]:
for idx, row in sub_df.iterrows():
    
    target_customer_id = row['customer_id']

    sub_df.iloc[idx]['prediction']

    list_articles = data.query(f' customer_id == "{target_customer_id}"').article_id.values
    
    # 모델 질의 
    top_article = predict(list_articles, model, article_to_idx, idx_to_article)
    # label, predict
    labels = val_df[val_df.customer_id == target_customer_id].article_id.values
    predicts = articles[articles.article_id.isin(top_article)].article_id.values
    predicts = np.array2string(predicts, separator=' ').strip("\n""[""]")

    sub_df.iloc[idx]['prediction'] = predicts


KeyboardInterrupt: ignored

In [75]:
sub_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,448509014 751471001 751471043 762846006 762846...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,448509014 751471001 751471043 762846006 762846...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,448509014 751471001 751471043 762846006 762846...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,448509014 751471001 751471043 762846006 762846...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,448509014 751471001 751471043 762846006 762846...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...




In [63]:
np.array2string(predicts, separator=' ').strip("\n""[""]")

'448509014 751471001 751471043 762846006 762846027 865799006 896169005\n 909370001 915529003 918292001 918522001 936622001'

In [None]:
total_count = 0

for i in range(300):

    # target customer 세팅
    target_customer_id = val_df.customer_id[i]
    list_articles = data.query(f' customer_id == "{target_customer_id}"').article_id.values

    # 모델 질의 
    top_article = predict(list_articles, model, article_to_idx, idx_to_article)
    # label, predict
    labels = val_df[val_df.customer_id == target_customer_id].article_id.values
    predicts = articles[articles.article_id.isin(top_article)].article_id.values

    score = 0
    

    for label in labels:
        for pred in predicts:

            if label == pred:
                score += 1

    print(score)

    total_count += score

In [41]:
sub_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [83]:
# import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/transactions_raw.csv')
df.sort_values(by="t_dat", inplace=True)


In [87]:
df = df[-500000:]

In [88]:
# import pandas as pd
# df = pd.read_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/transactions_raw.csv')
# df.sort_values(by="t_dat", inplace=True)
data, mapping, inverse_mapping = map_column(df, col_name="article_id")
grp_by_train = data.groupby(by="customer_id")
groups = list(grp_by_train.groups)


data[data['customer_id'].isin(grp_by_train.count().query(' article_id > 5').index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,article_id_mapped
31266893,2020-09-08,49315380b9663f6df3a56da27bbf92438e227a7b510bc5...,824767002,0.013542,2,11194
31266939,2020-09-08,497266ddb42590bcca8e9d7ab2744a1ed3c09201515fab...,855893002,0.030492,1,14078
31266940,2020-09-08,497266ddb42590bcca8e9d7ab2744a1ed3c09201515fab...,200182001,0.013542,1,75
31266941,2020-09-08,497266ddb42590bcca8e9d7ab2744a1ed3c09201515fab...,751471001,0.033881,1,6019
31266942,2020-09-08,497f10b50acefdb2d004e257a2d38704115c8656681fd1...,909884001,0.033881,2,20857
...,...,...,...,...,...,...
31766420,2020-09-22,545e6a5e2b085bed6c1395affbaba18e75a8e230d50270...,826498003,0.033881,1,11388
31766419,2020-09-22,545e6a5e2b085bed6c1395affbaba18e75a8e230d50270...,910601002,0.042356,1,20974
31766418,2020-09-22,545e6a5e2b085bed6c1395affbaba18e75a8e230d50270...,678942055,0.016932,1,3247
31766497,2020-09-22,54e8ebd39543b5a4d69c3e7d79977558d2a606e6540ba0...,928210002,0.067780,2,22197


In [90]:
data[data['customer_id'].isin(grp_by_train.count().query(' article_id > 5').index)].to_csv('/content/drive/MyDrive/colab_data/kaggle_H&M/data/train_upto5.csv')