In [3]:
"""
Valid Score: 0.8408
Improved (Valid Score: 0.8320 --> 0.8408, Epoch: 3)

Valid Score: 0.8464
Improved (Valid Score: 0.8408 --> 0.8464, Epoch: 4)

Valid Score: 0.8481
Improved (Valid Score: 0.8464 --> 0.8481, Epoch: 5)

Valid Score: 0.8488
Improved (Valid Score: 0.8481 --> 0.8488, Epoch: 6)

Valid Score: 0.8514
Improved (Valid Score: 0.8488 --> 0.8514, Epoch: 7)

Valid Score: 0.8529
Improved (Valid Score: 0.8514 --> 0.8529, Epoch: 8)

Valid Score: 0.8532
Improved (Valid Score: 0.8529 --> 0.8532, Epoch: 9)
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import os

import torch

# Widen pandas' display limits for the DataFrame previews below.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Pin the GPU this process may see. CUDA_DEVICE_ORDER=PCI_BUS_ID makes the
# index used in CUDA_VISIBLE_DEVICES follow PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # RTX 2080 Ti
# os.environ["CUDA_VISIBLE_DEVICES"] = "1" # RTX 3090

# Root folder of the competition data. Defaults to the original workstation
# path; set the AI4CODE_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
data_dir = Path(os.environ.get("AI4CODE_DATA_DIR", '/workspace/Kaggle/AI4Code'))
# data_dir = Path('../input/AI4Code')

In [4]:
# Names of every CUDA device visible to this process, in device-index order.
available_gpus = list(
    map(torch.cuda.get_device_name, range(torch.cuda.device_count())))
available_gpus

['GeForce RTX 2080 Ti']

In [5]:
# Inference settings shared by the dataset and the data loader below.
batch_size = 32  # samples per DataLoader batch
num_workers = 8  # DataLoader worker processes
code_max_len = 23  # tokenizer max_length for each sampled code cell
md_max_len = 64  # tokenizer max_length for the markdown cell being scored
total_max_len = 512  # length the concatenated ids/mask are cut or padded to

In [6]:
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame of cells.

    Args:
        path: `pathlib.Path` of the notebook file; its stem becomes the
            value of the added `id` column.

    Returns:
        DataFrame whose index is named `cell_id`, with `cell_type` read as a
        categorical column, `source` read as strings, and a constant `id`
        column.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')


# Read every test notebook and stack them into one frame with columns
# (id, cell_id, cell_type, source), ordered by notebook id while keeping the
# original cell order inside each notebook.
paths_test = [*data_dir.joinpath('test').glob('*.json')]
notebooks_test = list(map(read_notebook, tqdm(paths_test, desc='Test NBs')))
test_df = pd.concat(notebooks_test)
test_df = test_df.set_index('id', append=True).swaplevel()
test_df = test_df.sort_index(level='id', sort_remaining=False).reset_index()

# Position of each cell among the cells of the same type in its notebook, and
# that position as a percentile, used as the initial prediction.
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

Test NBs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 269.96it/s]


In [7]:
# Preview the assembled test frame, including the rank-based `pred` column.
test_df

Unnamed: 0,id,cell_id,cell_type,source,rank,pred
0,0009d135ece78d,ddfd239c,code,"import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom s...",0,0.142857
1,0009d135ece78d,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf,1,0.285714
2,0009d135ece78d,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]\n\nlabels = df[""diagnosis""].factorize(['B','M'])[0...",2,0.428571
3,0009d135ece78d,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n # Scaling Data for testing\n ...",3,0.571429
4,0009d135ece78d,7f388a41,code,"# Ploting data with different columns\n#####################################\ncomparison_plot_maker(numerical_data[""...",4,0.714286
...,...,...,...,...,...,...
84,0010a919d60e4f,d3f5c397,markdown,We have 177 rows with missing `Age` and 687 rows with missing `Cabin`,34,1.000000
85,0028856e09c5b7,012c9d02,code,"sns.set()\nsns.pairplot(data1, 2.5)\nplt.show(); = size",0,0.333333
86,0028856e09c5b7,d22526d1,code,"types----------"")\n# is uniques----------"")\n# plt\nimport mis_val +\n = #https://pandas.pydata.org/pandas...",1,0.666667
87,0028856e09c5b7,3ae7ece3,code,"#correlation avoid map\nf,ax verbose 20), 18))\nsns.heatmap(data1.corr(), the annot=True, ; informations bins=50, '....",2,1.000000


In [8]:
# Additional code cells
def clean_code(cell):
    """Return `cell` as a string with literal backslash-n pairs turned into newlines."""
    text = str(cell)
    # Splitting on the two-character sequence and re-joining with a real
    # newline is equivalent to replacing every occurrence.
    return "\n".join(text.split("\\n"))


def sample_cells(cells, n):
    """Pick about `n` evenly spaced cells from `cells`.

    Every cell is first passed through `clean_code`. When there are at most
    `n` cells, all of them are returned, each cut to its first 200
    characters. Otherwise cells are taken at a stride of ``len(cells) / n``
    (positions rounded with `np.round`) and returned uncut; the last pick is
    overwritten with the final cell if that cell's text was not selected.
    """
    cleaned = list(map(clean_code, cells))
    total = len(cleaned)
    if n >= total:
        return [text[:200] for text in cleaned]

    stride = total / n
    picked = []
    cursor = 0
    # The cursor is accumulated (not recomputed as k * stride) so the rounded
    # positions match the float arithmetic used when features were built.
    while True:
        position = int(np.round(cursor))
        if position >= total:
            break
        picked.append(cleaned[position])
        cursor += stride

    assert cleaned[0] in picked
    if cleaned[-1] not in picked:
        picked[-1] = cleaned[-1]
    return picked


def get_features(df):
    """Collect per-notebook features from a frame of cells.

    Args:
        df: DataFrame with at least `id`, `cell_type`, `source` and `rank`
            columns.

    Returns:
        Dict keyed by notebook id. Each value is a dict with "total_code"
        and "total_md" (cell counts) and "codes" (the notebook's code cells
        passed through `sample_cells` with n=20).
    """
    ordered = df.sort_values("rank").reset_index(drop=True)
    features = {}
    for nb_id, nb_cells in tqdm(ordered.groupby("id")):
        code_cells = nb_cells[nb_cells.cell_type == "code"]
        md_cells = nb_cells[nb_cells.cell_type == "markdown"]
        features[nb_id] = {
            "total_code": code_cells.shape[0],
            "total_md": md_cells.shape[0],
            "codes": sample_cells(code_cells.source.values, 20),
        }
    return features

In [9]:
# Per-notebook features (cell counts and sampled code cells), keyed by notebook id.
test_fts = get_features(test_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1037.81it/s]


In [10]:
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch


class MarkdownModel(nn.Module):
    """Pretrained encoder followed by a single linear regression head.

    The head consumes the encoder's first-token hidden state concatenated
    with one extra scalar feature per sample and outputs one value.
    """

    def __init__(self, model_path):
        """Load the encoder from `model_path` and build the linear head.

        Args:
            model_path: name or path accepted by `AutoModel.from_pretrained`.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        # Head input = hidden state width + 1 extra feature. The width is read
        # from the encoder config instead of hard-coding 769; the fallback of
        # 768 keeps the original layer shape if the config lacks the field.
        hidden_size = getattr(self.model.config, "hidden_size", 768)
        self.top = nn.Linear(hidden_size + 1, 1)

    def forward(self, ids, mask, fts):
        """Score a batch.

        Args:
            ids: token ids passed to the encoder as `input_ids`.
            mask: attention mask passed to the encoder as `attention_mask`.
            fts: extra feature tensor concatenated along dim 1 with the
                first-token hidden state.

        Returns:
            Output of the linear head.
        """
        # Keywords guard against encoders whose positional order differs.
        hidden = self.model(input_ids=ids, attention_mask=mask)[0]
        first_token = hidden[:, 0, :]
        return self.top(torch.cat((first_token, fts), 1))


from torch.utils.data import DataLoader, Dataset


class MarkdownDataset(Dataset):
    """Dataset yielding one markdown cell plus code context from its notebook.

    Each item is ``(ids, mask, fts, target)``:
      * ids / mask: the tokenized markdown cell followed by the tokenized
        sampled code cells of the same notebook, cut or padded to
        `total_max_len`.
      * fts: one-element tensor with the notebook's markdown-cell ratio
        ``n_md / (n_md + n_code)`` (0 when the notebook has no cells).
      * target: one-element tensor holding the row's `pct_rank`.

    NOTE(review): compared with the earlier version of this class, the ratio
    now really uses the code-cell count (it previously read "total_md" twice,
    which made it a constant 0.5) and the attention mask is padded with 0
    rather than `pad_token_id`. A checkpoint trained with the old behaviour
    sees slightly different inputs; re-validate or retrain before relying on
    its score.
    """

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts,
                 code_max_len=None):
        """Store the frame, tokenizer and length limits.

        Args:
            df: frame with `id`, `source` and `pct_rank` columns.
            model_name_or_path: passed to `AutoTokenizer.from_pretrained`.
            total_max_len: final length of ids and mask.
            md_max_len: tokenizer max_length for the markdown cell.
            fts: dict keyed by notebook id with "total_md", "total_code" and
                "codes" entries.
            code_max_len: tokenizer max_length for each code cell. When
                omitted, the module-level `code_max_len` is used, which is
                what the class relied on implicitly before.
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        if code_max_len is None:
            # Backward compatible fallback to the notebook-level constant.
            code_max_len = globals()["code_max_len"]
        self.code_max_len = code_max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts

    @staticmethod
    def _concat_and_pad(head, tails, pad_value, length):
        """Join `head` with each tail minus its last position, then fit to `length`."""
        sequence = list(head)  # copy so the tokenizer output is not mutated
        for part in tails:
            sequence.extend(part[:-1])
        sequence = sequence[:length]
        sequence.extend([pad_value] * (length - len(sequence)))
        return sequence

    def __getitem__(self, index):
        row = self.df.iloc[index]
        notebook = self.fts[row.id]

        inputs = self.tokenizer.encode_plus(row.source,
                                            None,
                                            add_special_tokens=True,
                                            max_length=self.md_max_len,
                                            padding="max_length",
                                            return_token_type_ids=True,
                                            truncation=True)

        codes = [str(x) for x in notebook["codes"]]
        if codes:
            code_inputs = self.tokenizer.batch_encode_plus(
                codes,
                add_special_tokens=True,
                max_length=self.code_max_len,
                padding="max_length",
                truncation=True)
            code_ids = code_inputs['input_ids']
            code_mask = code_inputs['attention_mask']
        else:
            # Notebook without code cells: nothing to append, and the
            # tokenizer is not asked to encode an empty batch.
            code_ids, code_mask = [], []

        n_md = notebook["total_md"]
        n_code = notebook["total_code"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = self._concat_and_pad(inputs['input_ids'], code_ids,
                                   self.tokenizer.pad_token_id,
                                   self.total_max_len)
        # Attention-mask padding must be 0 ("do not attend"). pad_token_id is
        # a vocabulary id and is not 0 for every tokenizer, so using it here
        # could mark the padded tail as real tokens.
        mask = self._concat_and_pad(inputs['attention_mask'], code_mask, 0,
                                    self.total_max_len)

        assert len(ids) == self.total_max_len
        assert len(mask) == self.total_max_len

        return (torch.LongTensor(ids), torch.LongTensor(mask), fts,
                torch.FloatTensor([row.pct_rank]))

    def __len__(self):
        return self.df.shape[0]

In [11]:
def read_data(data):
    """Move a batch to the GPU and split it into ``(inputs, target)``.

    Every element of `data` is moved with `.cuda()`. All elements but the
    last form the `inputs` tuple; the last element is the target.
    """
    on_gpu = [item.cuda() for item in data]
    inputs = tuple(on_gpu[:-1])
    target = on_gpu[-1]
    return inputs, target


def validate(model, val_loader):
    """Run `model` over `val_loader` without gradients.

    Args:
        model: module called as ``model(*inputs)`` for each batch.
        val_loader: iterable of batches understood by `read_data`.

    Returns:
        ``(labels, preds)`` as flat NumPy arrays concatenated over batches.
    """
    model.eval()

    all_labels, all_preds = [], []
    progress = tqdm(val_loader, file=sys.stdout)

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)
            output = model(*inputs)

            all_preds.append(output.detach().cpu().numpy().ravel())
            all_labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(all_labels), np.concatenate(all_preds)


def predict(model_path, ckpt_path):
    """Score every markdown cell of `test_df` with a fine-tuned checkpoint.

    Args:
        model_path: encoder/tokenizer name or path given to `MarkdownModel`
            and `MarkdownDataset`.
        ckpt_path: file holding the state dict saved for `MarkdownModel`.

    Returns:
        Flat NumPy array of predictions, one per markdown row of `test_df`,
        in row order (the loader does not shuffle).

    Side effects:
        Adds a constant `pct_rank` column to the global `test_df`; the
        dataset needs that column to build its placeholder targets.
    """
    model = MarkdownModel(model_path)
    # Deserialize on the CPU so loading does not depend on which CUDA device
    # the checkpoint was saved from; the weights move to the GPU with the
    # model afterwards.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model = model.cuda()
    model.eval()

    test_df["pct_rank"] = 0
    test_ds = MarkdownDataset(
        test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True),
        md_max_len=md_max_len,
        total_max_len=total_max_len,
        model_name_or_path=model_path,
        fts=test_fts)
    test_loader = DataLoader(test_ds,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers,
                             pin_memory=False,
                             drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

In [8]:
# model_path = "../input/huggingface-bert-variants/distilbert-base-cased/distilbert-base-cased"
# ckpt_path = "../input/ai4codemodels/model.bin"
# y_test_1 = predict(model_path, ckpt_path)

In [12]:
# Encoder identifier handed to `from_pretrained`, and the fine-tuned weights
# stored under the data directory.
model_path = "microsoft/codebert-base-mlm"
ckpt_path = data_dir.joinpath("outputs", "codebert-base-mlm-v1",
                              "model_best_1epochs_0.8320.bin")

# model_path = "../input/codebert-base/codebert-base/"
# ckpt_path = "../input/ai4codemodelspublic/model.bin"

y_test_2 = predict(model_path, ckpt_path)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  cpuset_checked))


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.24it/s]


In [13]:
# y_test = (y_test_1 + y_test_2)/2
# Averaging with a second model is disabled; only y_test_2 is used.
y_test = y_test_2

In [14]:
# Replace the rank-based `pred` of markdown rows with the model output. The
# assignment is positional, so it relies on y_test being in the same order as
# the markdown rows of test_df (predict() builds its loader with shuffle=False).
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

In [15]:
# One row per notebook: its cell ids joined by spaces in ascending `pred` order.
sub_df = (
    test_df.sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(lambda cell_ids: " ".join(cell_ids))
    .reset_index()
    .rename(columns={"cell_id": "cell_order"})
)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,0a226b6a ddfd239c 8cb8d28a c6cd22db 1372ae9b e25aa9bd 90ed07ab ba55e576 39e937ec f9893819 7f388a41 2843a25a 06dbf8cf
1,0010483c12ba9b,7f270e34 54c7cab3 fe66203e 7844d5f8 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d
2,0010a919d60e4f,23607d04 b7578789 aafc3d23 80e077ec bbff12d4 584f6568 b190ebb4 d3f5c397 ed415c3c 8ce62db4 322850af 5115ebe5 5e8c5e7e...
3,0028856e09c5b7,012c9d02 eb293dfc d22526d1 3ae7ece3


In [13]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv("submission.csv", index=False)