In [1]:
from transformers import AutoTokenizer

# --- BERT (cased) tokenizer walkthrough -------------------------------------
# Fetch the pretrained "bert-base-cased" tokenizer and run it on one short
# English sentence. No `return_tensors` argument is passed, so the encoding
# holds plain Python lists (see the recorded output of this cell).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentence = "Hello, how are you"
encoded_input = tokenizer(sentence)

# Map every input id back to its token string so the ids can be read by eye.
tokens = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])

# 1) the raw encoding, 2) the token strings,
# 3) the vocabulary id stored for the token 'Hello'.
print(encoded_input)
print(tokens)
print(tokenizer.get_vocab()["Hello"])

{'input_ids': [101, 8667, 117, 1293, 1132, 1128, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Hello', ',', 'how', 'are', 'you', '[SEP]']
8667


In [8]:
from transformers import AutoTokenizer

# Load the CodeBERT tokenizer.
# NOTE: this rebinds the notebook-level name `tokenizer`, which an earlier
# cell bound to the "bert-base-cased" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Example input: a small Dart/Flutter source snippet (source code, not a
# natural-language sentence).
code = '''
import 'package:flutter/material.dart';
class MyApp extends StatelessWidget {
    @override
    Widget build(BuildContext context) {
        return MaterialApp(
            home: Scaffold(
            appBar: AppBar(title: Text('Flutter App')),
            body: Center(child: Text('Hello, world!')),
            ),
        );
    }
}
'''

# Tokenize the snippet; no `return_tensors` argument is passed.
encoded_input = tokenizer(code)

# Print the encoded inputs
print(encoded_input)

# Convert the input ids back to their token strings
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])

# Print the tokens
print(tokens)

# Ask the tokenizer for its token -> id mapping once and reuse it for both
# the single-token lookup and the size check, instead of calling
# `get_vocab()` twice in a row.
codebert_vocab = tokenizer.get_vocab()
print(codebert_vocab['def'])
print(len(codebert_vocab))

{'input_ids': [0, 50118, 41975, 128, 46181, 35, 4825, 12158, 73, 36739, 4, 417, 2013, 23500, 50118, 4684, 1308, 19186, 14269, 331, 1672, 48612, 25522, 50118, 1437, 1437, 1437, 787, 2137, 23167, 50118, 1437, 1437, 1437, 305, 20356, 1119, 1640, 36590, 48522, 5377, 43, 25522, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 671, 26188, 19186, 1640, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 184, 35, 2741, 3707, 279, 1640, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1553, 14507, 35, 3166, 14507, 1640, 14691, 35, 14159, 45803, 16197, 12158, 3166, 27645, 238, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 809, 35, 824, 1640, 14069, 35, 14159, 45803, 31414, 6, 232, 328, 27645, 238, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 31311, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 47162, 50118, 1437, 1437, 1437, 35524, 50118, 24303, 50118, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1

In [9]:
from utils.code_tokenizer import CodeTokenizer
from pygments.lexers import DartLexer
from utils.vocab import dart_vocab

# Project tokenizer used by the cells below. It is given a Pygments
# `DartLexer` instance, the imported `dart_vocab` as the language vocabulary,
# and a single framework-specific entry ('Scaffold').
flutter_code_tokenizer = CodeTokenizer(
    DartLexer(),
    language_vocab=dart_vocab,
    framework_vocab=["Scaffold"],
)

In [10]:
# Display (language_offset, framework_offset, total length) as one tuple.
# NOTE(review): presumably the offsets are the first ids assigned to the
# language and framework vocabularies -- confirm against CodeTokenizer.
(
    flutter_code_tokenizer.language_offset,
    flutter_code_tokenizer.framework_offset,
    len(flutter_code_tokenizer),
)

(28999, 29116, 29117)

In [11]:
# Dart/Flutter sample used by this cell and by the `tokenize` cell below.
# It contains a widget class plus a `main` function with `=`, `+=` and `++`
# statements.
# NOTE: this rebinds the notebook-level name `code` set by an earlier cell.
code = """
import 'package:flutter/material.dart';
class MyApp extends StatelessWidget {
    @override
    Widget build(BuildContext context) {
        return MaterialApp(
            home: Scaffold(
            appBar: AppBar(title: Text('Flutter App')),
            body: Center(child: Text('Hello, world!')),
            ),
        );
    }
}

void main() {
    int x = 5;
    x += 1;
    x++;
    runApp(MyApp());
}
"""
# Lex the sample with the tokenizer built above. Per the recorded output, each
# item is a (token type, text) pair such as (Token.Keyword, 'import').
# NOTE: this rebinds `tokens`, which an earlier cell used for a list of
# token strings.
tokens = flutter_code_tokenizer.lex_code(code)

# Print one lexed item per line for inspection.
for token in tokens:
    print(token)

(Token.Keyword, 'import')
(Token.Text.Whitespace, ' ')
(Token.Literal.String.Single, "'")
(Token.Literal.String.Single, 'package:flutter/material.dart')
(Token.Literal.String.Single, "'")
(Token.Punctuation, ';')
(Token.Text.Whitespace, '\n')
(Token.Keyword.Declaration, 'class')
(Token.Text.Whitespace, ' ')
(Token.Name.Class, 'MyApp')
(Token.Text.Whitespace, ' ')
(Token.Keyword.Declaration, 'extends')
(Token.Text.Whitespace, ' ')
(Token.Name, 'StatelessWidget')
(Token.Text.Whitespace, ' ')
(Token.Punctuation, '{')
(Token.Text.Whitespace, '\n')
(Token.Text.Whitespace, '    ')
(Token.Name.Decorator, '@override')
(Token.Text.Whitespace, '\n')
(Token.Text.Whitespace, '    ')
(Token.Name, 'Widget')
(Token.Text.Whitespace, ' ')
(Token.Name, 'build')
(Token.Punctuation, '(')
(Token.Name, 'BuildContext')
(Token.Text.Whitespace, ' ')
(Token.Name, 'context')
(Token.Punctuation, ')')
(Token.Text.Whitespace, ' ')
(Token.Punctuation, '{')
(Token.Text.Whitespace, '\n')
(Token.Text.Whitespace, '     

In [12]:
# Tokenize the Dart sample defined in the previous cell and materialise the
# result as a list right away, so it can be both counted and iterated.
# NOTE(review): in the recorded output no token is printed for `Scaffold`
# between `home: ` and `(` -- verify that framework-vocabulary tokens are not
# being dropped (or rendered as empty) by `tokenize`.
all_tokens = list(flutter_code_tokenizer.tokenize(code))

# Total number of tokens first, then one token per line.
print(len(all_tokens))
for token in all_tokens:
    print(token)

256
import
 
'
package
:
flutter
/
material
.
dart
'
;


class
 
My
##A
##pp
 
extends
 
State
##less
##W
##idge
##t
 
{


 
 
 
 
@
over
##ride


 
 
 
 
W
##idge
##t
 
build
(
B
##uild
##C
##onte
##x
##t
 
context
)
 
{


 
 
 
 
 
 
 
 
return
 
Material
##A
##pp
(


 
 
 
 
 
 
 
 
 
 
 
 
home
:
 
(


 
 
 
 
 
 
 
 
 
 
 
 
app
##B
##ar
:
 
A
##pp
##B
##ar
(
title
:
 
Text
(
'
F
##lut
##ter
A
##pp
'
)
)
,


 
 
 
 
 
 
 
 
 
 
 
 
body
:
 
Center
(
child
:
 
Text
(
'
Hello
,
world
!
'
)
)
,


 
 
 
 
 
 
 
 
 
 
 
 
)
,


 
 
 
 
 
 
 
 
)
;


 
 
 
 
}


}




void
 
main
(
)
 
{


 
 
 
 
int
 
x
 
=
 
5
;


 
 
 
 
x
 
+
=
 
1
;


 
 
 
 
x
+
+
;


 
 
 
 
run
##A
##pp
(
My
##A
##pp
(
)
)
;


}


