In [2]:
%autosave 30

Autosaving every 30 seconds


In [3]:
cd ..

/home/huawei123/kwx1991442/code-classification


# baseline 

In [4]:
import json
import os
import glob
from collections import namedtuple
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import *

In [5]:
with open('config.json') as file:
    cfg = json.load(file)

# DATA_FOLDER = "data/rust/src/test/ui/"
DATA_FOLDER = cfg['DATA_FOLDER']
EXCLUDED_SUBDIRS = cfg['EXCLUDED_SUBDIRS']
CODEPARROT_FOLDER = cfg['CODEPARROT_FOLDER']

In [6]:
Item = namedtuple("Item", "fname relpath")
"""Structure to store location of each file

fname (str) : filename without extension (filename.*)
relpath (str) : relative path to the file (tests/ui/{relpath}/filename.*)
"""

'Structure to store location of each file\n\nfname (str) : filename without extension (filename.*)\nrelpath (str) : relative path to the file (tests/ui/{relpath}/filename.*)\n'

In [17]:
import re
import tokenize
from io import BytesIO

class UITestsDataset(Dataset):
    data_folder: str
    mode: str
    items: List[Item]

    def __init__(self, tests_ui_folder: str = DATA_FOLDER, mode: str = 'train') -> None:
        super(UITestsDataset).__init__()
        self.data_folder = tests_ui_folder
        self.mode = mode
        self.items = self.get_items(tests_ui_folder, mode)

        self.compiled = re.compile(r'[A-Za-z_]+|-?\d+\.\d+|\d+|\W')
        # self.compiled = re.compile(r'[A-Za-z]+|-?\d+(?:\.\d+)?|(?:[^\w-]+|-(?!\d))+')


    def get_items(self, test_ui_folder: str, mode: str) -> List[Item]:
        items = []

        if mode == 'train':
            for path, _, files in os.walk(test_ui_folder):

                # Check that file is placed in a subdirectory
                if len(path) == len(test_ui_folder):
                    continue

                relpath = os.path.relpath(path, start=test_ui_folder)
                
                # Check that subdirectory should not be excluded
                if len(set(relpath.split(os.sep)) & set(EXCLUDED_SUBDIRS)) > 0:
                    continue

                for f in files:
                    fname, fext = os.path.splitext(f)

                    # Omit non .rs files
                    if fext != ".rs":
                        continue

                    new_item = Item(fname, relpath)
                    items.append(new_item)

        elif mode == 'infer':
            for path, _, files in os.walk(test_ui_folder):

                # Check that file is NOT placed in a subdirectory
                if len(path) != len(test_ui_folder):
                    continue

                for f in files:
                    fname, fext = os.path.splitext(f)

                    # Omit non .rs files
                    if fext != ".rs":
                        continue

                    new_item = Item(fname, '')
                    items.append(new_item)

                # Skip all other paths as they are in a subdirectories
                break
        else:
            return ValueError("Unknown `mode` passed to `get_items()` function. " 
            "Possible variants: `train` or `infer`. ")

        return items

    def item2text(self, item: Item) -> str:
        """Load and concatenate files for item
        
        {item.fname}.[rs,stderr,stdout]
        """
        search = os.path.join(self.data_folder, item.relpath, item.fname)
        extensions = ['.rs'] #, '.stderr', '.stdout']

        text = []
        for fname in [f for f in glob.glob(f"{search}*") \
            if os.path.splitext(f)[1] in extensions]:

            with open(fname, 'r') as file:
                text.append(file.read())
        
        return '\n\n'.join(text)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index: int) -> Tuple[np.ndarray, str]:
        item = self.items[index]
        text = self.item2text(item) #.split()
        label = item.relpath.split(os.sep)[0]
        return (text, label)

    @property
    def classes(self):
        return list(set([item.relpath.split(os.sep)[0] for item in self.items]))

    @property
    def vocab(self):
        vocab = set()
        for index in range(self.__len__()):
            words = self.__getitem__(index)[0]
            words = set(self.compiled.findall(words))
            # try:
            #     tokens = tokenize.tokenize(BytesIO(words.encode('utf-8')).readline)
            #     tokens = [token.string for token in tokens]
            vocab.update(words)
        return list(vocab)


In [18]:
t = UITestsDataset(DATA_FOLDER, 'train')

In [19]:
vocab = t.vocab

In [20]:
len(vocab)
# vocab[50:100]

27702

In [21]:
vocab

['VK',
 'Unnamed',
 'mem',
 '198',
 'cuviper',
 'uninferred',
 'Representing',
 'run_fut',
 'to_vector',
 '1957',
 'Heap',
 'Luma',
 '8624',
 'ParallelIterator',
 'FooX',
 'borrow_local_inline_done',
 'anon_const_non_local',
 '2263',
 'stable_in_unstable_std',
 'OtherStream',
 'is_es_identifier_start',
 'TRAIT_MAGIC',
 'clauses',
 '0165',
 '63151',
 'semis',
 'composition',
 '01000',
 'initializing',
 'Three',
 'assignment_to_field_projection',
 'entry_a',
 'ctpop',
 'Read',
 'SliceIndex',
 '2521',
 'TEST',
 'rc_foo',
 'Token',
 '1730',
 'implicit_ptr_trait',
 'equals_self_wrapper',
 '95327',
 'const_intrinsic_copy',
 'loaded',
 'PolyGood',
 '51',
 'forall',
 'test_op_assigns',
 'right',
 'impatient',
 'ExampleOtherTuple',
 'closable',
 'no_relationships_late',
 'HasMethod',
 'assign',
 'inherent_method_returning_unnameable_type',
 '1078',
 'bound_inv_a_b_vs_bound_inv_a',
 'generality',
 '1292',
 'key_value_expansion',
 'HOOK',
 'almost_swapped',
 'block_on',
 'Using',
 'tables',
 'OT'

Эта строка написана DD.MM.YYYY, а могла бы и DD.MM.YYYY


In [53]:
import re
match = re.findall(r"([a-z]+)([0-9]+)|([0-9]+)([a-z]+)", 'foofo21rew2342sdf', re.I)
# if match:
#     items = match.groups()
# print(items)

In [54]:
match

[('foofo', '21', '', ''), ('rew', '2342', '', '')]

In [67]:
re.split(r"([a-z]+)([0-9]+)", 'fdsf423() _) 32dsf234')

['', 'fdsf', '423', '() _) 32', 'dsf', '234', '']

In [104]:
# re.split(r"([a-z]+)([0-9]+)", s[:100])
print(re.split(r"([a-z]+)", ' '.join(s.split())))

['#![', 'crate', '_', 'type', '="', 'lib', '"] #![', 'feature', '(', 'c', '_', 'variadic', ')] ', 'pub', ' ', 'unsafe', ' ', 'extern', ' "C" ', 'fn', ' ', 'use', '_', 'vararg', '_', 'lifetime', '( ', 'x', ': ', 'usize', ', ', 'y', ': ... ) -> &', 'usize', ' { //~ ERROR ', 'missing', ' ', 'lifetime', ' ', 'specifier', ' &0 } ', 'pub', ' ', 'unsafe', ' ', 'extern', ' "C" ', 'fn', ' ', 'use', '_', 'normal', '_', 'arg', '_', 'lifetime', '(', 'x', ': &', 'usize', ', ', 'y', ': ...) -> &', 'usize', ' { // OK ', 'x', ' } ', 'error', '[E0106]: ', 'missing', ' ', 'lifetime', ' ', 'specifier', ' --> $DIR/', 'variadic', '-', 'ffi', '-6.', 'rs', ':7:6 | LL | ) -> &', 'usize', ' { | ^ ', 'expected', ' ', 'named', ' ', 'lifetime', ' ', 'parameter', ' | = ', 'help', ': ', 'this', ' ', 'function', "'", 's', ' ', 'return', ' ', 'type', ' ', 'contains', ' ', 'a', ' ', 'borrowed', ' ', 'value', ', ', 'but', ' ', 'there', ' ', 'is', ' ', 'no', ' ', 'value', ' ', 'for', ' ', 'it', ' ', 'to', ' ', 'be', ' '

In [107]:
re.split(r"(\W+)", ' '.join(s.split()))

['',
 '#![',
 'crate_type',
 '="',
 'lib',
 '"] #![',
 'feature',
 '(',
 'c_variadic',
 ')] ',
 'pub',
 ' ',
 'unsafe',
 ' ',
 'extern',
 ' "',
 'C',
 '" ',
 'fn',
 ' ',
 'use_vararg_lifetime',
 '( ',
 'x',
 ': ',
 'usize',
 ', ',
 'y',
 ': ... ) -> &',
 'usize',
 ' { //~ ',
 'ERROR',
 ' ',
 'missing',
 ' ',
 'lifetime',
 ' ',
 'specifier',
 ' &',
 '0',
 ' } ',
 'pub',
 ' ',
 'unsafe',
 ' ',
 'extern',
 ' "',
 'C',
 '" ',
 'fn',
 ' ',
 'use_normal_arg_lifetime',
 '(',
 'x',
 ': &',
 'usize',
 ', ',
 'y',
 ': ...) -> &',
 'usize',
 ' { // ',
 'OK',
 ' ',
 'x',
 ' } ',
 'error',
 '[',
 'E0106',
 ']: ',
 'missing',
 ' ',
 'lifetime',
 ' ',
 'specifier',
 ' --> $',
 'DIR',
 '/',
 'variadic',
 '-',
 'ffi',
 '-',
 '6',
 '.',
 'rs',
 ':',
 '7',
 ':',
 '6',
 ' | ',
 'LL',
 ' | ) -> &',
 'usize',
 ' { | ^ ',
 'expected',
 ' ',
 'named',
 ' ',
 'lifetime',
 ' ',
 'parameter',
 ' | = ',
 'help',
 ': ',
 'this',
 ' ',
 'function',
 "'",
 's',
 ' ',
 'return',
 ' ',
 'type',
 ' ',
 'contains',
 ' '

In [108]:
s

'#![crate_type="lib"]\n#![feature(c_variadic)]\n\npub unsafe extern "C" fn use_vararg_lifetime(\n    x: usize,\n    y: ...\n) -> &usize { //~ ERROR missing lifetime specifier\n    &0\n}\n\npub unsafe extern "C" fn use_normal_arg_lifetime(x: &usize, y: ...) -> &usize { // OK\n    x\n}\n\n\nerror[E0106]: missing lifetime specifier\n  --> $DIR/variadic-ffi-6.rs:7:6\n   |\nLL | ) -> &usize {\n   |      ^ expected named lifetime parameter\n   |\n   = help: this function\'s return type contains a borrowed value, but there is no value for it to be borrowed from\nhelp: consider using the `\'static` lifetime\n   |\nLL | ) -> &\'static usize {\n   |       +++++++\n\nerror: aborting due to previous error\n\nFor more information about this error, try `rustc --explain E0106`.\n'

In [172]:
s="""// note that these aux-build directives must be in this order
// aux-build:svh-uta-base.rs
// aux-build:svh-utb.rs
// aux-build:svh-uta-change-use-trait.rs
// normalize-stderr-test: "(crate `(\w+)`:) .*" -> "$1 $$PATH_$2"

//! "svh-uta-trait.rs" is checking that we detect a
//! change from `use foo::TraitB` to use `foo::TraitB` in the hash

extern crate uta;
extern crate utb; //~ ERROR: found possibly newer version of crate `uta` which `utb` depends

fn main() {
    utb::foo()
}"""

In [185]:
from io import BytesIO
import tokenize


# with open('/home/huawei123/kwx1991442/code-classification/data/classifui/src/main.rs', 'rb') as f:
# tokens = tokenize.tokenize(f.readline)
tokens = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline)
tokens = [token.string for token in tokens]