In [10]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [17]:
train_df = pd.read_parquet("train_df.parquet")
test_df = pd.read_parquet("test_df.parquet")

train_data = np.load("train_data.npy")
test_data = np.load("test_data.npy")

In [18]:
for col in ["pkgname", "ver", "slotid", "mediaid", "material"]:
    train_df[col] = train_df[col].str.replace("b'", "").str.replace("'", "")
    test_df[col] = test_df[col].str.replace("b'", "").str.replace("'", "")

In [19]:
for col in ["pkgname", "ver", "slotid", "mediaid", "material"]:
    lbl = LabelEncoder()
    lbl.fit(train_df[col].tolist() + test_df[col].tolist())

    train_df[col] = lbl.transform(train_df[col])
    test_df[col] = lbl.transform(test_df[col])

In [21]:
for col in ["pkgname", "ver", "slotid", "mediaid", "material"]:
    print(col, train_df[col].max(), test_df[col].max())

pkgname 262 262
ver 4014 4013
slotid 5999 5999
mediaid 101 101
material 277 299


In [23]:
train_df["label"].value_counts()

0              2199327
1                20210
adx_slot_id          0
Name: label, dtype: int64

In [326]:
train_data.shape

(2219537, 240)

In [374]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchmetrics.functional import auc, auroc
torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MLP(nn.Module):
    def __init__(self, category_dict, layers=[45+240, 32], dropout=False):
        super().__init__()
        print(category_dict)
        self.category_dict = category_dict
        self.embedding_dict = {
            key: torch.nn.Embedding(
                self.category_dict[key]+1, int(np.log2(self.category_dict[key]))
            ).to(device)
            for key in category_dict.keys()
        }

        self.fc_layers = torch.nn.ModuleList()
        for _, (in_size, out_size) in enumerate(zip(layers[:-1], layers[1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size).to(device))
        self.output_layer = torch.nn.Linear(layers[-1], 1).to(device)

    def forward(self, feed_dict, embed_dict):
        embedding_feet = {
            key: self.embedding_dict[key](feed_dict[key])
            for key in self.category_dict.keys()
        }

        x = torch.cat(list(embedding_feet.values()), 1)
        x = torch.cat([x, embed_dict], 1)
        for idx, _ in enumerate(range(len(self.fc_layers))):
            x = self.fc_layers[idx](x)
            x = F.relu(x)
            x = F.dropout(x)
        logit = self.output_layer(x)
        return logit

    def predict(self, feed_dict):
        for key in feed_dict:
            if type(feed_dict[key]) != type(None):
                feed_dict[key] = torch.from_numpy(feed_dict[key]).to(
                    dtype=torch.long, device="cpu"
                )
        output_scores = self.forward(feed_dict)
        return output_scores

In [375]:
category_dict = {
    "pkgname": 263,
    "ver": 4015,
    "slotid": 6000,
    "mediaid": 102,
    "material": 300,
}

model = MLP(category_dict)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.008)


{'pkgname': 263, 'ver': 4015, 'slotid': 6000, 'mediaid': 102, 'material': 300}


In [376]:
example_cate_feat = {key: torch.tensor([0, 1]).long().to(device) for key in category_dict.keys()}
model(example_cate_feat, torch.zeros(2, 240).to(device))

tensor([[0.1038],
        [0.2315]], device='cuda:0', grad_fn=<AddmmBackward>)

In [377]:
class MLPDataset(Dataset):
    def __init__(self, cate_df, embed_feat, label=None, train=True):
        self.cate_df = cate_df
        self.embed_feat = embed_feat
        self.label = label
        self.train = train

    def __getitem__(self, index):
        cate_feat = {}
        # for col in self.cate_df.columns:
        #     cate_feat[col] = self.cate_df[col].iloc[index]
        cate_feat = self.cate_df.iloc[index].to_dict()
        
        if self.train:
            label = self.label[index]
            return cate_feat, self.embed_feat[index],label
        else:
            return cate_feat, self.embed_feat[index]

    def __len__(self):
        return len(self.cate_df)

train_loader = torch.utils.data.DataLoader(
    MLPDataset(
        train_df.iloc[:-500000, 1:],
        train_data,
        train_df["label"].values[:-500000].astype(int),
    ),
    batch_size=3000,
    shuffle=True,
    num_workers=0,
)

val_loader = torch.utils.data.DataLoader(
    MLPDataset(
        train_df.iloc[-500000:, 1:],
        train_data,
        train_df["label"].values[-500000:].astype(int),
    ),
    batch_size=5000,
    shuffle=False,
    num_workers=0,
)

In [378]:
train_df.iloc[10].to_dict()

{'label': '0',
 'pkgname': 36,
 'ver': 3438,
 'slotid': 3048,
 'mediaid': 88,
 'material': 262}

In [379]:
%%time

for _ in train_loader:
    break

CPU times: user 398 ms, sys: 4.06 ms, total: 402 ms
Wall time: 319 ms


In [380]:
for _ in range(5):
    batch_idx = 0
    model.train()
    for batch_cate, batch_emb, batch_y in train_loader:
        for key in category_dict.keys():
            batch_cate[key] = batch_cate[key].long().to(device)

        batch_y = batch_y.float().to(device)
        batch_emb = batch_emb.to(device)
        pred = model(batch_cate, batch_emb).view(-1)
        # loss = loss_fn()
        pos_weight=torch.ones([len(batch_y)]) * 100
        pos_weight = pos_weight.to(device)
        loss = F.binary_cross_entropy_with_logits(pred, batch_y, 
                                           pos_weight = pos_weight)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print("{0}/{1}\t {2}/{3}".format(
                batch_idx, len(train_loader), loss.item(), 
                auc(batch_y, torch.sigmoid(pred), reorder=True).item()
            ))

        batch_idx += 1

    with torch.no_grad():
        model.eval()
        val_pred = []
        for batch_cate, batch_emb, batch_y in val_loader:
            for key in category_dict.keys():
                batch_cate[key] = batch_cate[key].long().to(device)
            batch_emb = batch_emb.to(device)
            pred = model(batch_cate, batch_emb).view(-1)
            pred = torch.sigmoid(pred).data.cpu().numpy()
            val_pred.append(pred)      
    print(roc_auc_score(val_loader.dataset.label.astype(int), np.hstack(val_pred)))

0/574	 1.3225828409194946/0.533422589302063
100/574	 0.8862520456314087/0.3743702471256256
200/574	 0.9831544756889343/0.3598862290382385
300/574	 0.8013485670089722/0.653688371181488
400/574	 0.8437849879264832/0.784477710723877
500/574	 0.8376685976982117/0.7145994305610657
0.7265792533080638
0/574	 0.634872555732727/0.7282460331916809
100/574	 0.7812550663948059/0.32703566551208496
200/574	 0.8246058821678162/0.703112781047821
300/574	 0.9505192637443542/0.7463701963424683
400/574	 0.6246465444564819/0.31641885638237
500/574	 0.6681424975395203/0.5754125714302063
0.6840086410876116
0/574	 0.8549034595489502/0.5854130387306213
100/574	 0.8255133628845215/0.7331435680389404
200/574	 0.7898080945014954/0.2889920473098755
300/574	 0.8434247970581055/0.45008161664009094
400/574	 0.8021948337554932/0.2798524498939514
500/574	 0.7587068676948547/0.4931146502494812
0.6508517459198844
0/574	 0.7204117774963379/0.3965621590614319
100/574	 0.6938701272010803/0.4912928342819214
200/574	 0.69961

In [368]:
test_loader = torch.utils.data.DataLoader(
    MLPDataset(
        test_df.iloc[:, :],
        test_data,
        None, False
    ),
    batch_size=5000,
    shuffle=False,
    num_workers=0,
)

with torch.no_grad():
    model.eval()
    test_pred = []
    for batch_cate, batch_emb in test_loader:
        for key in category_dict.keys():
            batch_cate[key] = batch_cate[key].long().to(device)
        batch_emb = batch_emb.to(device)
        pred = model(batch_cate, batch_emb).view(-1)
        pred = torch.sigmoid(pred).data.cpu().numpy()
        test_pred.append(pred)   

In [372]:
test_pred = np.hstack(test_pred)

In [373]:
pd.DataFrame({'predict': (test_pred > 0.2).astype(int)}).to_csv('sub.csv', index=None)