/
spec.py
177 lines (136 loc) · 11.2 KB
/
spec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# SPDX-FileCopyrightText: 2020 Hlib Babii <hlibbabii@gmail.com>
#
# SPDX-License-Identifier: Apache-2.0
import logging
import docopt_subcommands as dsc
from codeprep.cli.impl import handle_splitting, handle_learnbpe
from codeprep.config import app_name, version
# Module-level logger named after this module. None of the handlers visible in
# this file call it.
logger = logging.getLogger(__name__)
# NOTE: the docstring below is the docopt specification of the `nosplit`
# subcommand, not just help text: docopt_subcommands reads its usage pattern and
# "Options:" section at runtime. docopt needs the usage continuation lines to be
# indented, an empty line to terminate the usage section, and at least two
# spaces between an option and its description -- keep that layout when editing.
@dsc.command()
def nosplit_handler(args):
    """usage: {program} nosplit (-p <path> [-o <path-out>] | <text>) [-e <ext>]
        [--no-spaces] [--no-unicode] [--no-com] ( [--no-str] | [-L=<max-str-length>] [--full-strings] )
        [--calc-vocab] [--verbose]

    Preprocesses the dataset without splitting compound identifier.

    Options:
      -p, --path <path>                        Path to the dataset to be preprocessed.
      -o <path-out>, --output-path <path-out>  Directory to which the pre-preprocessed corpus is to be written. If not specified, equals to '<path>_preprocessed'.
      <text>                                   Text to be preprocessed.
      -e --ext <ext>                           In case of using --path argument:
                                                   Limits the set of input files to the files with the specified extension(s).
                                                   The format is the following: "py|java|...|js" If not specified, all the files are read.
                                               In case of passing text:
                                                   extension which a file containing source code written in this programming language would have,
                                                   e.g. 'java', 'py', 'js'.
      --no-spaces, -0                          Preserve newlines and tabs.
      --no-unicode, -U                         Replace words containing non-ascii characters with <non-en> placeholders.
      --no-com, -C                             Replace comments with <comment> placeholders.
      --no-str, -S                             Replace strings with <string> placeholders.
      --full-strings, -f                       Leave string literals as they are (without even splitting on whitespace characters)
      -L, --max-str-length=<max-str-length>    Replace string literal with `""` if its length including quotes exceeds `max_str_length`;
                                               equals to `sys.maxsize` if not specified.
      --calc-vocab -V                          Calculate vocabulary of the preprocessed dataset afterwards
      --verbose, -v                            Print logs with log level DEBUG and higher to stdout.
    """
    # The parsed arguments are forwarded untouched; presumably handle_splitting
    # works out the requested mode from `args` -- confirm in codeprep.cli.impl.
    handle_splitting(args)
# NOTE: the docstring below is the docopt specification of the `chars`
# subcommand, not just help text: docopt_subcommands reads its usage pattern and
# "Options:" section at runtime. docopt needs the usage continuation lines to be
# indented, an empty line to terminate the usage section, and at least two
# spaces between an option and its description -- keep that layout when editing.
@dsc.command()
def chars_handler(args):
    """usage: {program} chars (-p <path> [-o <path-out>] | <text>) [-e <ext>]
        [--no-spaces] [--no-unicode] [--no-com] [--no-str | -L=<max-str-length>]
        [--calc-vocab] [--verbose]

    Preprocesses the dataset by splitting identifiers into characters.

    Options:
      -p, --path <path>                        Path to the dataset to be preprocessed.
      -o <path-out>, --output-path <path-out>  Directory to which the pre-preprocessed corpus is to be written. If not specified, equals to '<path>_preprocessed'.
      <text>                                   Text to be preprocessed.
      -e --ext <ext>                           In case of using --path argument:
                                                   Limits the set of input files to the files with the specified extension(s).
                                                   The format is the following: "py|java|...|js" If not specified, all the files are read.
                                               In case of passing text:
                                                   extension which a file containing source code written in this programming language would have,
                                                   e.g. 'java', 'py', 'js'.
      --no-spaces, -0                          Preserve newlines and tabs.
      --no-unicode, -U                         Replace words containing non-ascii characters with <non-en> placeholders.
      --no-com, -C                             Replace comments with <comment> placeholders.
      --no-str, -S                             Replace strings with <string> placeholders.
      -L, --max-str-length=<max-str-length>    Replace string literal with `""` if its length including quotes exceeds `max_str_length`;
                                               equals to `sys.maxsize` if not specified.
      --calc-vocab -V                          Calculate vocabulary of the preprocessed dataset afterwards
      --verbose, -v                            Print logs with log level DEBUG and higher to stdout.
    """
    # The parsed arguments are forwarded untouched; presumably handle_splitting
    # works out the requested mode from `args` -- confirm in codeprep.cli.impl.
    handle_splitting(args)
# NOTE: the docstring below is the docopt specification of the `basic`
# subcommand, not just help text: docopt_subcommands reads its usage pattern and
# "Options:" section at runtime. docopt needs the usage continuation lines to be
# indented, an empty line to terminate the usage section, and at least two
# spaces between an option and its description -- keep that layout when editing.
@dsc.command()
def basic_handler(args):
    """usage: {program} basic (-p <path> [-o <path-out>] | <text>) [-e <ext>] [-n [-r [-s]]]
        [--no-spaces] [--no-unicode] [--no-case] [--no-com] [--no-str | -L=<max-str-length>]
        [--calc-vocab] [--verbose]

    Preprocesses the dataset by splitting compound identifiers according to CamelCase and snake_case conventions.

    Options:
      -p, --path <path>                        Path to the dataset to be preprocessed.
      -o <path-out>, --output-path <path-out>  Directory to which the pre-preprocessed corpus is to be written. If not specified, equals to '<path>_preprocessed'.
      <text>                                   Text to be preprocessed.
      -e --ext <ext>                           In case of using --path argument:
                                                   Limits the set of input files to the files with the specified extension(s).
                                                   The format is the following: "py|java|...|js" If not specified, all the files are read.
                                               In case of passing text:
                                                   extension which a file containing source code written in this programming language would have,
                                                   e.g. 'java', 'py', 'js'.
      --split-numbers, -n                      Split numbers into digits
      --ronin, -r                              Preprocesses the dataset splitting identifiers with Ronin algorithm: http://joss.theoj.org/papers/10.21105/joss.00653.
      --stem, -s                               Do stemming with Porter stemmer
      --no-spaces, -0                          Preserve newlines and tabs.
      --no-unicode, -U                         Replace words containing non-ascii characters with <non-en> placeholders.
      --no-case, -l                            Lowercase words and encode information about case in <Cap> <CAP> tokens.
      --no-com, -C                             Replace comments with <comment> placeholders.
      --no-str, -S                             Replace strings with <string> placeholders.
      -L, --max-str-length=<max-str-length>    Replace string literal with `""` if its length including quotes exceeds `max_str_length`;
                                               equals to `sys.maxsize` if not specified.
      --calc-vocab -V                          Calculate vocabulary of the preprocessed dataset afterwards
      --verbose, -v                            Print logs with log level DEBUG and higher to stdout.
    """
    # The parsed arguments are forwarded untouched; presumably handle_splitting
    # works out the requested mode from `args` -- confirm in codeprep.cli.impl.
    handle_splitting(args)
# NOTE: the docstring below is the docopt specification of the `bpe`
# subcommand, not just help text: docopt_subcommands reads its usage pattern and
# "Options:" section at runtime. docopt needs the usage continuation lines to be
# indented, an empty line to terminate the usage section, and at least two
# spaces between an option and its description -- keep that layout when editing.
@dsc.command()
def bpe_handler(args):
    """usage: {program} bpe (1k | 5k | 10k | <bpe-codes-id>) (-p <path> [-o <path-out>] | <text>) [-e <ext>]
        [--no-str | -L=<max-str-length>] [--no-com] [--no-spaces] [--no-unicode] [--calc-vocab] [--verbose]

    Preprocesses the dataset by splitting compound identifiers according to CamelCase and snake_case conventions,
    and applies byte-pair encoding (BPE) on top.

    Options:
      -p, --path <path>                        Path to the dataset to be preprocessed.
      -o <path-out>, --output-path <path-out>  Directory to which the pre-preprocessed corpus is to be written. If not specified, equals to '<path>_preprocessed'.
      <text>                                   Text to be preprocessed.
      -e --ext <ext>                           In case of using --path argument:
                                                   Limits the set of input files to the files with the specified extension(s).
                                                   The format is the following: "py|java|...|js" If not specified, all the files are read.
                                               In case of passing text:
                                                   extension which a file containing source code written in this programming language would have,
                                                   e.g. 'java', 'py', 'js'.
      --no-str, -S                             Replace strings with <string> placeholders.
      --no-com, -C                             Replace comments with <comment> placeholders.
      --no-spaces, -0                          Preserve newlines and tabs.
      --no-unicode, -U                         Replace words containing non-ascii characters with <non-en> placeholders.
      -L, --max-str-length=<max-str-length>    Replace string literal with `""` if its length including quotes exceeds `max_str_length`;
                                               equals to `sys.maxsize` if not specified.
      --calc-vocab -V                          Calculate vocabulary of the preprocessed dataset afterwards
      --verbose, -v                            Print logs with log level DEBUG and higher to stdout.
    """
    # The parsed arguments are forwarded untouched; presumably handle_splitting
    # works out the requested mode from `args` -- confirm in codeprep.cli.impl.
    handle_splitting(args)
# NOTE: the docstring below is the docopt specification of the `learn-bpe`
# subcommand, not just help text: docopt_subcommands reads its usage pattern and
# "Options:" section at runtime. docopt needs an empty line to terminate the
# usage section, indented option lines, and at least two spaces between an
# option and its description -- keep that layout when editing.
@dsc.command()
def bpelearn_handler(args):
    """usage: {program} learn-bpe <n-merges> -p <path> [-e <ext>] [--id <bpe-codes-id>] [--no-unicode | --bytes] [--word-end] [--legacy] [--verbose]

    Trains bpe codes on a specified corpus.

    Options:
      <n-merges>                               The number of BPE merges to compute
      -p, --path <path>                        Path to the dataset to be used to learn bpe codes.
      -e --ext <ext>                           Limits the set of input files to the files with the specified extension(s).
                                               The format is the following: "ext1|ext2|...|extN" If not specified, all the files are read.
      --id <bpe-codes-id>                      Give an id to bpe-codes. If not specified, will be assigned automatically based on the name of the directory bpe codes were learned from
      --no-unicode, -U                         Ignore words containing non-ascii characters.
      --bytes, -b                              Treat non-ascii characters as 2 bytes and do real byte-pair encoding.
      --word-end, -z                           Add a special character to the end of each word.
      --legacy                                 Parse using legacy parser (only files with extension “.java” will be processed)
      --verbose, -v                            Print logs with log level DEBUG and higher to stdout.
    """
    # The parsed arguments are forwarded untouched to the BPE-learning
    # implementation imported from codeprep.cli.impl.
    handle_learnbpe(args)
def parse_and_run(args):
    """Run the command-line interface on an explicit argument list.

    Hands ``args`` to ``dsc.main`` together with the application name and an
    ``'<app_name> <version>'`` banner string. ``exit_at_end=False`` is passed
    explicitly (presumably so that control returns to the caller instead of
    the process exiting -- confirm against the docopt_subcommands docs).

    :param args: command-line arguments, forwarded unchanged as ``argv``.
    :return: ``None``; whatever ``dsc.main`` returns is discarded.
    """
    version_banner = f'{app_name} {version}'
    dsc.main(app_name, version_banner, argv=args, exit_at_end=False)