In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW 
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, AutoTokenizer

from sklearn.preprocessing import LabelEncoder

import polars as pl
import numpy as np

import json
from tqdm import tqdm
import datetime, time
import random
from math import ceil
import os
from copy import deepcopy as dc
from pprint import pp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the JSON-lines export: every non-empty line is one JSON object whose
# 'label' field is the target; the remaining fields are re-serialised to a
# JSON string that becomes the 'text' input.
data = []
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open('../../data/raw/2024-06-21-category-1-sorted-cplabels.json', encoding='utf-8') as file:
    # Stream line by line instead of materialising the whole file with readlines().
    for line in file:
        if not line.strip():
            # Tolerate blank lines (json.loads would raise on them).
            continue
        ob = json.loads(line)
        # The object was just parsed and is not shared, so 'label' can be popped
        # in place without a deep copy. Raises KeyError if the field is missing.
        label = ob.pop('label')
        data.append({
            'text': json.dumps(ob),
            'label': label,
        })

In [3]:
# Build a polars DataFrame from the list of {'text', 'label'} records.
df = pl.DataFrame(data)

In [4]:
# Persist the (text, label) frame as parquet.
df.write_parquet('../../data/prepared/matteo-label-sorted.parquet')

In [5]:
# Preview the first rows of the frame.
df.head()

text,label
str,i64
"""{""kind"": ""Event"", ""apiVersion""…",3694672
"""{""kind"": ""Event"", ""apiVersion""…",4176
"""{""kind"": ""Event"", ""apiVersion""…",61648
"""{""kind"": ""Event"", ""apiVersion""…",151632
"""{""kind"": ""Event"", ""apiVersion""…",3280976


In [6]:
# Distinct label values found in the data, and how many of them there are.
labels = df['label'].unique()
n_labels = len(labels)
labels

label
i64
-2
4144
4176
4368
4400
…
3297360
3694672
3695024
3702832


In [7]:
# Fit a LabelEncoder so each original label value is mapped to an integer id.
le = LabelEncoder()
le.fit(labels)
# Encoded id of every known class, aligned element-wise with le.classes_.
classes = le.transform(le.classes_)
print('Mapping classses/labels...')
# Build the lookup table column-wise with explicit names. The previous form
# (list of (label, id) tuples -> transpose() -> rename 'column_0'/'column_1')
# only worked as long as polars guessed the tuple orientation and generated
# exactly those column names.
le_labels_mapping = pl.DataFrame({
    'label': le.classes_,
    'm_label': classes,
})

Mapping classses/labels...


In [8]:
le_labels_mapping

label,m_label
i64,i64
-2,0
4144,1
4176,2
4368,3
4400,4
…,…
3297360,109
3694672,110
3695024,111
3702832,112


In [9]:
print('Creating one-hot encoding for each layer...')
labels_t = torch.tensor(classes, dtype=torch.long)
# one_hot on the 1-D id tensor already yields a (len(classes), n_labels)
# matrix. The earlier add-a-batch-dim + squeeze() round trip gave the same
# result for several labels but collapsed to a 0-d tensor when there was
# only one label, because squeeze() drops every size-1 dimension.
labels_one_hot = F.one_hot(labels_t, num_classes=n_labels)

# Store one one-hot row per mapping row in a new 'one_hot' column.
le_labels_mapping = le_labels_mapping.with_columns(
    pl.Series(labels_one_hot.numpy()).alias('one_hot')
)
le_labels_mapping

Creating one-hot encoding for each layer...


label,m_label,one_hot
i64,i64,"array[i64, 114]"
-2,0,"[1, 0, … 0]"
4144,1,"[0, 1, … 0]"
4176,2,"[0, 0, … 0]"
4368,3,"[0, 0, … 0]"
4400,4,"[0, 0, … 0]"
…,…,…
3297360,109,"[0, 0, … 0]"
3694672,110,"[0, 0, … 0]"
3695024,111,"[0, 0, … 0]"
3702832,112,"[0, 0, … 0]"


In [10]:
print('Joining labels/labels_text...')
# Attach the encoded id and one-hot vector to every row via the original
# label value, then rename so that 'label' holds the encoded id and
# 'label_name' keeps the original value.
column_renames = {'label': 'label_name', 'm_label': 'label'}
df = df.join(le_labels_mapping, on='label')
df = df.rename(column_renames)

Joining labels/labels_text...


In [11]:
# Persist the joined frame (text, label_name, label, one_hot) as parquet.
df.write_parquet('../../data/prepared/matteo-sorted-labels-complete-df.parquet')

In [12]:
# Display the joined frame.
df

text,label_name,label,one_hot
str,i64,i64,"array[i64, 114]"
"""{""kind"": ""Event"", ""apiVersion""…",3694672,110,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",4176,2,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",61648,65,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",151632,96,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",3280976,105,"[0, 0, … 0]"
…,…,…,…
"""{""kind"": ""Event"", ""apiVersion""…",61616,64,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",119216,86,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",119232,87,"[0, 0, … 0]"
"""{""kind"": ""Event"", ""apiVersion""…",119216,86,"[0, 0, … 0]"


In [13]:

class MNDataset(Dataset):
    """Dataset that tokenizes the ``text`` column and pairs it with its one-hot target.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        dataframe: Frame with a ``text`` column and a ``one_hot`` column whose
            cells expose ``to_numpy()``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, tokenizer, dataframe, max_len):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx``.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``,
            each a ``torch.long`` tensor.
        """
        text = self.df["text"][idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding="max_length"` alone pads to max_len; the legacy
            # `pad_to_max_length` flag was redundant with it and is deprecated,
            # so it is no longer passed.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        targets = self.df["one_hot"][idx].to_numpy()
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(targets, dtype=torch.long),
        }

In [14]:
class MatchingNetwork(nn.Module):
    """BERT encoder followed by dropout and a linear layer with one output per label.

    Args:
        n_labels: Number of output units of the final linear layer.
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, n_labels, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        # Attribute names l1/l2/l3 are kept so existing state_dict keys still match.
        self.l1 = BertModel.from_pretrained(model_name)
        self.l2 = nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so a checkpoint with a different hidden size works unchanged.
        self.l3 = nn.Linear(self.l1.config.hidden_size, n_labels)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and return the linear layer's raw outputs (no activation applied)."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (BERT's pooled output) is used here.
        _, pooled = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled))


In [16]:
# Tokenizer for the 'bert-base-uncased' checkpoint, the dataset wrapping the
# joined frame (sequences padded/truncated to 512 tokens), and a shuffled
# loader yielding batches of 16 in the main process.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
ds = MNDataset(tokenizer=tokenizer, dataframe=df, max_len=512)
dl = DataLoader(
    ds,
    batch_size=16,
    shuffle=True,
    num_workers=0,
)

In [17]:
# Pull a single batch from the DataLoader and print it to inspect the
# collated tensors, then stop iterating.
for i in dl:
    print(i)
    break

{'ids': tensor([[  101,  1063,  1000,  ...,     0,     0,     0],
        [  101,  1063,  1000,  ...,  2891,  1012,   102],
        [  101,  1063,  1000,  ...,  1010,  1000,   102],
        ...,
        [  101,  1063,  1000,  ...,  1000, 13045,   102],
        [  101,  1063,  1000,  ..., 16798,  2549,   102],
        [  101,  1063,  1000,  ...,  1011,  6134,   102]]), 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'targets': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
     

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Smoke test: push one batch through a bare BERT encoder.
# The model has to live on the same device as its inputs; previously only the
# inputs were moved, which fails with a device mismatch when CUDA is available.
bert = BertModel.from_pretrained('bert-base-uncased').to(device)

for batch in dl:
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)

    # return_dict=False makes the encoder return a tuple, unpacked here into two outputs.
    output1, output2 = bert(
        ids,
        attention_mask=mask,
        token_type_ids=token_type_ids,
        return_dict=False,
    )
    # Only the first batch is needed for the check.
    break
