# Task:
Predict the correct positions of the markdown cells in a Jupyter notebook, given its ordered code cells.

# Input:
- markdown cells text
    - unordered
- code cells text
    - must be ordered

# Output:
The original positions of the markdown cells in the Jupyter notebook

# Import and Load pipeline

In [1]:
import pathlib
# Root directory of the attached Kaggle model; the listing below shows it holds
# the `ai4code_parallel_bert` package and a `checkpoint` directory.
MODEL_ROOT = pathlib.Path("/kaggle/input/ai4code-parallel-bert/pytorch/deberta-v3-large/2")
# IPython shell escape: list the contents of MODEL_ROOT.
!ls $MODEL_ROOT

ai4code_parallel_bert  checkpoint


In [2]:
import os
# Set before the package import below. Presumably this silences the HuggingFace
# tokenizers parallelism/fork warning -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import sys
# Temporarily put MODEL_ROOT on sys.path so the package shipped inside the
# model directory can be imported.
sys.path.append(str(MODEL_ROOT))
import ai4code_parallel_bert
# Drop the last sys.path entry again (the one appended above, assuming the
# import did not append anything itself). As the last expression of the cell,
# the popped path is echoed by the notebook.
sys.path.pop(-1)

'/kaggle/input/ai4code-parallel-bert/pytorch/deberta-v3-large/2'

In [3]:
# Checkpoint location inside the model directory.
CHECKPOINT_PATH = MODEL_ROOT / "checkpoint/deberta-v3-large"
# Device string handed to the pipeline (first CUDA device).
DEVICE = "cuda:0"
# Third pipeline argument; presumably the batch size used when running BERT
# over the cells -- confirm against the `ai4code_parallel_bert` source.
BERT_PROCESS_BATCH_SIZE = 4

# Build the pipeline object that every later cell uses for prediction.
pipe = ai4code_parallel_bert.pipeline.Pipeline(CHECKPOINT_PATH, DEVICE, BERT_PROCESS_BATCH_SIZE)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Load competition data

In [4]:
# Re-imported so this cell can run on its own.
import pathlib
# Root directory of the AI4Code competition dataset.
COMPETITION_ROOT = pathlib.Path("/kaggle/input/AI4Code/")
# IPython shell escape: list the contents of COMPETITION_ROOT.
!ls $COMPETITION_ROOT

sample_submission.csv  test  train  train_ancestors.csv  train_orders.csv


In [5]:
# One notebook from the competition's test split, used as the running example.
SAMPLE_TEST_FILE = COMPETITION_ROOT / "test/0009d135ece78d.json"
# Load the notebook JSON through the package helper; the result is a mapping
# with the keys 'notebook', 'code' and 'markdown' (see the echoed output).
notebook_data = ai4code_parallel_bert.pipeline.load_ai4code_json_file(SAMPLE_TEST_FILE)
# Last expression of the cell: the notebook echoes the available keys.
notebook_data.keys()

dict_keys(['notebook', 'code', 'markdown'])

In [6]:
# Preview the first two code cells, then list the ids of all code cells.
# Each call prints its banner and its value on separate lines; the empty
# string argument produces the blank separator line between sections.
print(
    " ========== first code cell text ==========",
    notebook_data["code"]["texts"][0],
    "",
    sep="\n",
)
print(
    " ========== second code cell text ==========",
    notebook_data["code"]["texts"][1],
    "",
    sep="\n",
)
print(
    " ========== code cell ids ==========",
    notebook_data["code"]["cell_ids"],
    sep="\n",
)

import numpy as np # linear algebra
import pandas as pd # data processing,
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.impute import SimpleImputer


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
df

['ddfd239c', 'c6cd22db', '1372ae9b', '90ed07ab', '7f388a41', '2843a25a', '06dbf8cf']


In [7]:
# Preview two markdown cells (their order in the input is not meaningful),
# then list the ids of all markdown cells.
# Each call prints its banner and its value on separate lines; the empty
# string argument produces the blank separator line between sections.
print(
    " ========== a markdown cell text ==========",
    notebook_data["markdown"]["texts"][0],
    "",
    sep="\n",
)
print(
    " ========== another markdown cell text ==========",
    notebook_data["markdown"]["texts"][1],
    "",
    sep="\n",
)
print(
    " ========== markdown cell ids ==========",
    notebook_data["markdown"]["cell_ids"],
    sep="\n",
)

# Scaling Data ⚖
Let's scale the data so PCA can be applied

## Testing Plots >w>
Let's these mystery soliving plots! :O

['f9893819', 'ba55e576', '39e937ec', 'e25aa9bd', '0a226b6a', '8cb8d28a']


# Predict

In [8]:
import torch

# The pipeline takes, for both cell kinds, the cell texts and the matching
# cell ids; only these two fields of the loaded notebook are forwarded.
pipeline_input = {
    cell_kind: {
        "texts": notebook_data[cell_kind]["texts"],
        "cell_ids": notebook_data[cell_kind]["cell_ids"],
    }
    for cell_kind in ("code", "markdown")
}

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    out = pipe(pipeline_input)

print(" ========== keys of output dict ==========")
print(out.keys())

dict_keys(['prediction', 'probs', 'cosines', 'code_reprs', 'markdown_reprs'])


In [9]:
# Final ordering: code and markdown cell ids merged into one sequence.
print(
    " ========== predicted ordered cell_ids ==========",
    out["prediction"],
    sep="\n",
)

['0a226b6a', 'ddfd239c', '8cb8d28a', 'c6cd22db', '1372ae9b', 'e25aa9bd', '90ed07ab', 'ba55e576', '7f388a41', 'f9893819', '2843a25a', '39e937ec', '06dbf8cf']


In [10]:
# One row per markdown cell, one column per code-cell pair.
print(
    " ========== predicted probabilities ==========",
    out["probs"],
    sep="\n",
)

tensor([[4.4896e-05, 3.2192e-05, 1.0594e-03, 7.1733e-05, 1.5469e-05, 9.9742e-01,
         1.3352e-03, 1.7657e-05],
        [2.3167e-04, 7.8122e-05, 4.7290e-04, 7.9150e-02, 9.1836e-01, 2.2528e-04,
         1.0984e-03, 3.8792e-04],
        [7.5998e-04, 2.1722e-04, 1.1652e-04, 4.4985e-04, 5.6027e-04, 2.0520e-03,
         9.9424e-01, 1.6015e-03],
        [1.3921e-04, 8.4084e-04, 2.7190e-03, 9.9551e-01, 3.3452e-04, 1.8055e-04,
         1.3800e-04, 1.3298e-04],
        [9.9845e-01, 1.2712e-03, 8.2164e-05, 4.3098e-05, 3.4636e-06, 2.3869e-05,
         4.1500e-06, 1.2209e-04],
        [1.4749e-03, 9.9662e-01, 1.2392e-03, 1.6844e-04, 4.5129e-04, 4.0670e-05,
         1.5635e-06, 6.6760e-06]], device='cuda:0')


## Since the pipeline adds a begin-of-code-cell and an end-of-code-cell sentinel around the code cells, num_code_cell_pairs is one larger than num_code_cells.
NOTE: Here I use the strings "Begin-of-CodeCell" and "End-of-CodeCell" for ease of visualization. During actual training and inference, however, a single additional BERT special token, "[BLANK_CODE]", represents both of them. Do not pass the strings "Begin-of-CodeCell" or "End-of-CodeCell" to the BERT tokenizer.

In [11]:
# For display only: readable sentinels wrap the ordered code cells.
# The variant with the "[BLANK_CODE]" sentinel would be:
# codes_with_sentinel = ["[BLANK_CODE]"] + list(pipeline_input["code"]["texts"]) + ["[BLANK_CODE]"]
codes_with_sentinel = ["Begin-of-CodeCell", *pipeline_input["code"]["texts"], "End-of-CodeCell"]

# Every pair of neighbouring entries is one candidate slot for a markdown cell.
code_pairs = [
    [left_cell, right_cell]
    for left_cell, right_cell in zip(codes_with_sentinel, codes_with_sentinel[1:])
]

print("num_code_cells:", len(pipeline_input["code"]["texts"]))
print("num_code_cells_with_sentinel:", len(codes_with_sentinel))
print("num_code_cell_pairs:", len(code_pairs))
print()
print("num_markdown_cells:", len(pipeline_input["markdown"]["texts"]))
print()
print("shape of output probs(=[num_markdown_cells, num_code_cell_pairs]):", out["probs"].shape)

num_code_cells: 7
num_code_cells_with_sentinel: 9
num_code_cell_pairs: 8

num_markdown_cells: 6

shape of output probs(=[num_markdown_cells, num_code_cell_pairs]): torch.Size([6, 8])


# Check predicted positioning

In [12]:
# Index (into the markdown inputs) of the cell whose placement is inspected.
TARGET_MARKDOWN_I = 0

print(" ========== target markdown cell index ==========")
print(TARGET_MARKDOWN_I, end="\n\n")

print(" ========== target markdown cell text ==========")
target_markdown_cell = pipeline_input["markdown"]["texts"][TARGET_MARKDOWN_I]
print(target_markdown_cell, end="\n\n")

# Row of the probability matrix that belongs to the target markdown cell:
# one probability per code-cell pair.
print(" ========== probs ==========")
probs_for_target_markdown = out["probs"][TARGET_MARKDOWN_I]
for i, prob in enumerate(probs_for_target_markdown):
    print(f'{i}-th code-pair: {prob*100.0:5.02f}%')
print()

# The most probable pair is the slot the markdown cell is inserted into:
# it goes between the pair's first and second code cell.
print(" ========== insert to predicted position ==========")
selected_code_pair_i = probs_for_target_markdown.argmax().item()
pre_markdown_cell, post_markdown_cell = code_pairs[selected_code_pair_i]

print(" ---------- pre-markdown code_cell ----------", pre_markdown_cell, "", sep="\n")
print(" ---------- target markdown_cell ----------", target_markdown_cell, "", sep="\n")
print(" ---------- post-markdown code_cell ----------", post_markdown_cell, sep="\n")


0

# Scaling Data ⚖
Let's scale the data so PCA can be applied

0-th code-pair:  0.00%
1-th code-pair:  0.00%
2-th code-pair:  0.11%
3-th code-pair:  0.01%
4-th code-pair:  0.00%
5-th code-pair: 99.74%
6-th code-pair:  0.13%
7-th code-pair:  0.00%

 ---------- pre-markdown code_cell ----------
# Ploting data with different columns
#####################################
comparison_plot_maker(numerical_data["radius_mean"], numerical_data["radius_worst"], "Mean Radius vs Worst Radius", "Mean Radius", "Worst Radius")
comparison_plot_maker(numerical_data["perimeter_se"], numerical_data["perimeter_worst"], "S.D Perimeter vs Worst Perimeter", "S.D Perimeter", "Worst Perimeter")
comparison_plot_maker(numerical_data["compactness_mean"], numerical_data["compactness_se"], "Mean Compactness vs S.D Compactness", "Mean Compactness", "S.D Compactness")
comparison_plot_maker(numerical_data["smoothness_mean"], numerical_data["smoothness_worst"], "Mean Smoothness vs Worst Smoothness","Mean Smoothness", "

## The output has representation vectors for code cell pairs and markdown cells.

In [13]:
# Report the shape of the code-pair representations (one row per code-cell pair).
print("code-pair_reprs(=[num_pairs, repr_dim]):", out["code_reprs"].shape)
# Last expression of the cell: the notebook echoes the tensor itself.
out["code_reprs"]

code-pair_reprs(=[num_pairs, repr_dim]): torch.Size([8, 256])


tensor([[ 0.0507, -0.0952, -0.0227,  ...,  0.0543,  0.0070,  0.0838],
        [ 0.0105, -0.1007,  0.0046,  ...,  0.0105, -0.0048,  0.0334],
        [-0.0101,  0.0086, -0.0494,  ..., -0.0774,  0.0316,  0.0843],
        ...,
        [-0.0481,  0.0524, -0.0391,  ..., -0.0064, -0.0361,  0.0217],
        [-0.1147, -0.0101,  0.0007,  ...,  0.0097,  0.0057,  0.0456],
        [-0.0273, -0.0920, -0.0476,  ..., -0.0317,  0.0168,  0.0790]],
       device='cuda:0')

In [14]:
# Report the shape of the markdown representations (one row per markdown cell).
print("markdown_reprs(=[num_markdowns, repr_dim]):", out["markdown_reprs"].shape)
# Last expression of the cell: the notebook echoes the tensor itself.
out["markdown_reprs"]

markdown_reprs(=[num_markdowns, repr_dim]): torch.Size([6, 256])


tensor([[-0.0059,  0.0160, -0.0207,  ...,  0.0183, -0.0030, -0.0654],
        [-0.0787, -0.0384,  0.0820,  ..., -0.0433,  0.0243, -0.0063],
        [-0.1392, -0.0541,  0.0666,  ...,  0.0148, -0.0438, -0.0464],
        [-0.0809,  0.0125,  0.0241,  ..., -0.1171,  0.0241,  0.1333],
        [ 0.0766, -0.0293, -0.1278,  ...,  0.0680,  0.0494,  0.0891],
        [ 0.0181, -0.0692,  0.0147,  ...,  0.0345, -0.0260, -0.1126]],
       device='cuda:0')

# Head-less forwarding
If you only need representation vectors, use `pipe.get_network_output` with `reprs_only=True`.

In [15]:
from typing import List

code_texts: List[str] = notebook_data["code"]["texts"]
markdown_texts: List[str] = notebook_data["markdown"]["texts"]

# Ask only for the representation vectors; gradients are not needed here.
# `torch` is expected to be imported already by the prediction cell.
with torch.no_grad():
    network_out = pipe.get_network_output(
        code_texts,
        markdown_texts,
        reprs_only=True,
    )

for label, reprs in (
    ("code-pair_reps:", network_out.code_reprs),
    ("markdown_reps:", network_out.markdown_reprs),
):
    print(label, reprs.shape)

code-pair_reps: torch.Size([8, 256])
markdown_reps: torch.Size([6, 256])


------------

# Supplemental: step-by-step prediction

We can divide the pipeline process into tokenization and network-forwarding steps. This could be useful for further fine-tuning.

For more information, see the source code of `ai4code_parallel_bert`.

In [16]:
# Reproduce what `pipe(pipeline_input)` does, one step at a time:
# tokenize -> format -> forward -> merge cell ids into one predicted order.
bert_tokenizer = pipe.tokenizer

# Truncation limits taken from the pipeline's own configuration.
each_code_max_length = pipe.each_code_max_length
markdown_max_length = pipe.markdown_max_length

# tokenize
code_texts = notebook_data["code"]["texts"]
markdown_texts = notebook_data["markdown"]["texts"]

# Special tokens are left out here because `pipe.format_input` adds them.
code_tokenized = bert_tokenizer(code_texts, add_special_tokens=False, truncation=True, max_length=each_code_max_length)["input_ids"]
markdown_tokenized = bert_tokenizer(markdown_texts, add_special_tokens=False, truncation=True, max_length=markdown_max_length)["input_ids"]

# forward
code_inputs, markdown_inputs = pipe.format_input(code_tokenized, markdown_tokenized) # this adds special tokens and begin/end-of-code-cell, then make pairs of code cells.
with torch.no_grad():
    probs, cosines, code_reprs, markdown_reprs = pipe.network_forward(code_inputs, markdown_inputs, reprs_only=False)

# make ordered cell_ids
# Each markdown cell goes to its most probable code-cell pair.
preds = probs.argmax(1).tolist()
code_cell_ids = notebook_data["code"]["cell_ids"]
markdown_cell_ids = notebook_data["markdown"]["cell_ids"]

# One slot per code-cell pair. Slot 0 holds markdown cells placed before the
# first code cell; slot p (p >= 1) starts with code cell p-1, followed by the
# markdown cells predicted for pair p.
slots = [[] for _ in range(probs.shape[1])]
for code_i, code_cell_id in enumerate(code_cell_ids):
    slots[code_i + 1].append(code_cell_id)
for pred_position, markdown_cell_id in zip(preds, markdown_cell_ids):
    slots[pred_position].append(markdown_cell_id)
# Flatten in linear time; `sum(list_of_lists, [])` copies the growing result
# on every addition and is quadratic in the number of cells.
pred_order = [cell_id for slot in slots for cell_id in slot]

# Same layout as the dict returned by `pipe(pipeline_input)`.
output = {
    "prediction": pred_order,
    "probs": probs,
    "cosines": cosines,
    "code_reprs": code_reprs,
    "markdown_reprs": markdown_reprs,
}

print("probs:", probs.shape)
print("code-pair_reprs:", code_reprs.shape)
print("markdown_reprs:", markdown_reprs.shape)

probs: torch.Size([6, 8])
code-pair_reprs: torch.Size([8, 256])
markdown_reprs: torch.Size([6, 256])
