In [23]:
%autosave 30

Autosaving every 30 seconds


In [24]:
cd /home/huawei123/kwx1991442/tests-ui-classification

/home/huawei123/kwx1991442/tests-ui-classification


## Imports

In [62]:
import json
import os
import glob
from collections import namedtuple
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import *

In [76]:
with open('config.json') as file:
    cfg = json.load(file)

# DATA_FOLDER = "data/rust/src/test/ui/"
DATA_FOLDER = cfg['DATA_FOLDER']
EXCLUDED_SUBDIRS = cfg['EXCLUDED_SUBDIRS']
CODEPARROT_FOLDER = cfg['CODEPARROT_FOLDER']

In [78]:
DATA_FOLDER, CODEPARROT_FOLDER

('data/rust/tests/ui/', 'data/codeparrot-small')

## Dataloader

In [33]:
Item = namedtuple("Item", "fname relpath")
"""Structure to store location of each file

fname (str) : filename without extension (filename.*)
relpath (str) : relative path to the file (tests/ui/{relpath}/filename.*)
"""

'Structure to store location of each file\n\nfname (str) : filename without extension (filename.*)\nrelpath (str) : relative path to the file (tests/ui/{relpath}/filename.*)\n'

In [75]:
def encode_whitespaces(text: str, start_extra_id: int, max_len: int):
    """Encode whitespaces with extra tokens"""
    for i in np.arange(max_len, 1, -1):
        token_id = start_extra_id + i - 2
        text = text.replace(" " * i, f"<extratoken_{token_id}>")
    return text

def decode_whitespaces(text: str, start_extra_id: int, max_len: int):
    """Decode the whitespace-encoded strings produced by encode_whitespaces"""
    for l in range(2, max_len + 1):
        token_id = start_extra_id - 2 + l
        token = f"<extratoken_{token_id}>"
        text = text.replace(token, ' ' * l)

class UITestsDataset(Dataset):
    data_folder: str
    mode: str
    items: List[Item]

    def __init__(self, tests_ui_folder: str = DATA_FOLDER, mode: str = 'train') -> None:
        super(UITestsDataset).__init__()
        self.data_folder = tests_ui_folder
        self.mode = mode
        self.items = self.get_items(tests_ui_folder, mode)

    def get_items(self, test_ui_folder: str, mode: str) -> List[Item]:
        items = []

        if mode == 'train':
            for path, _, files in os.walk(test_ui_folder):

                # Check that file is placed in a subdirectory
                if len(path) == len(test_ui_folder):
                    continue

                relpath = os.path.relpath(path, start=test_ui_folder)
                
                # Check that subdirectory should not be excluded
                if len(set(relpath.split(os.sep)) & set(EXCLUDED_SUBDIRS)) > 0:
                    continue

                for f in files:
                    fname, fext = os.path.splitext(f)

                    # Omit non .rs files
                    if fext != ".rs":
                        continue

                    new_item = Item(fname, relpath)
                    items.append(new_item)

        elif mode == 'infer':
            for path, _, files in os.walk(test_ui_folder):

                # Check that file is NOT placed in a subdirectory
                if len(path) != len(test_ui_folder):
                    continue

                for f in files:
                    fname, fext = os.path.splitext(f)

                    # Omit non .rs files
                    if fext != ".rs":
                        continue

                    new_item = Item(fname, '')
                    items.append(new_item)

                # Skip all other paths as they are in a subdirectories
                break
        else:
            return ValueError("Unknown `mode` passed to `get_items()` function. " 
            "Possible variants: `train` or `infer`. ")

        return items

    def item2text(self, item: Item) -> str:
        """Load and concatenate files for item
        
        {item.fname}.[rs,stderr,stdout]
        """
        search = os.path.join(self.data_folder, item.relpath, item.fname)
        extensions = ['.rs', '.stderr', '.stdout']

        text = []
        for fname in [f for f in glob.glob(f"{search}*") \
            if os.path.splitext(f)[1] in extensions]:

            with open(fname, 'r') as file:
                text.append(file.read())
        
        return '\n\n'.join(text)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index: int) -> Tuple[str, str]:
        item = self.items[index]

        text = self.item2text(item)
        label = item.relpath.split(os.sep)[0]

        return (text, label)

    @property
    def classes(self):
        return list(set([item.relpath.split(os.sep)[0] for item in self.items]))

In [57]:
train = UITestsDataset(mode='train')
infer = UITestsDataset(mode='infer')

In [50]:
len(train), len(infer)

(11327, 391)

In [58]:
len(train.classes), len(infer.classes)

(307, 1)

In [80]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(CODEPARROT_FOLDER)

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [83]:
# tokenizer.encode(train[1][0])

In [84]:
from tokenizers.tools import EncodingVisualizer

visualize = EncodingVisualizer(tokenizer._tokenizer)

In [86]:
visualize(code)