In [1]:
import os
import pathlib
import pickle
import re
import emoji
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from transformers import AlbertTokenizerFast, ElectraTokenizerFast, ElectraConfig, ElectraForMaskedLM, \
    BertTokenizerFast, ElectraTokenizer
from utils.datasets import load_helper_file, ensure_dataset

In [2]:
# --- Vocabulary -------------------------------------------------------------
# `vocabulary` is the union of three word lists loaded through the project's
# `load_helper_file` helper. Words found in it are left untouched by the
# row-processing step further down.
vocabulary_bert = set(load_helper_file('helper_bert_uncased_vocabulary'))
vocabulary_words = load_helper_file('custom_vocabulary_words')
vocabulary_extra = load_helper_file('custom_vocabulary_extra')
vocabulary = vocabulary_bert.union(vocabulary_words, vocabulary_extra)

# --- Emoji lookup -----------------------------------------------------------
# Despite its name this is a *set* of emoji strings (the name is kept because
# other cells refer to it). `emoji.UNICODE_EMOJI` (a per-language mapping) only
# exists in older releases of the `emoji` package; newer releases expose a
# single `emoji.EMOJI_DATA` mapping keyed by the emoji itself. Prefer the
# legacy attribute when present so existing environments behave exactly as
# before, and fall back to `EMOJI_DATA` otherwise.
if hasattr(emoji, 'UNICODE_EMOJI'):
    emoji_dict = {e for lang in emoji.UNICODE_EMOJI.values() for e in lang}
else:
    emoji_dict = set(emoji.EMOJI_DATA)

# --- Placeholder markup -----------------------------------------------------
# Maps the '@'-prefixed placeholders found in the processed tweets to the
# bracketed special tokens used in the tokenizer vocabulary.
tokenizer_custom = {
    '@HTAG': '[HTAG]',
    '@USR': '[USR]',
    '@CURR': '[CURR]',
    '@EMOJI': '[EMOJI]',
    '@URL': '[URL]',
    '@TIME': '[TIME]',
    '@DATE': '[DATE]',
    '@NUM': '[NUM]'
}

# Directory that receives the train/test/validate token files.
OUTPUT_PATH = '../../data/bitcoin_twitter_corpus'
print(len(vocabulary))

30918


In [4]:
# Removes the '@NUM' marker and any square brackets from a number placeholder,
# leaving only the numeric payload. Raw string: '\[' is not a valid str escape.
re_num = re.compile(r'(@NUM|\[|\])')


def _format_number(raw):
    """Render ``raw`` rounded to 3 decimals without trailing zeros; None if not numeric."""
    try:
        # `+ 0.0` turns a negative zero (e.g. from rounding -0.0001) into plain 0.
        value = round(float(raw), 3) + 0.0
    except ValueError:
        return None
    return ('%f' % value).rstrip('0').rstrip('.')


def process_row(text, *, vocab=None, custom_tokens=None, emojis=None):
    """Rewrite the placeholder markup of one tweet into tokenizer special tokens.

    The text is split on whitespace and each word is handled independently:

    * words contained in ``vocab`` that do not start with ``'@'`` are kept;
    * ``@NUM...`` becomes ``[NUM] <value>`` with the value rounded to three
      decimals and trailing zeros removed; if the payload is not a valid
      number it is kept verbatim after ``[NUM]`` instead of raising;
    * any other word starting with a key of ``custom_tokens`` becomes
      ``<special token> <payload>`` with the square brackets removed (no
      trailing space is emitted when the payload is empty);
    * a word contained in ``emojis`` becomes ``[EMOJI]``;
    * everything else is left unchanged.

    Args:
        text: Whitespace-separated tweet text.
        vocab: Container of known words. Defaults to the notebook-level
            ``vocabulary``.
        custom_tokens: Mapping from placeholder prefix to special token.
            Defaults to the notebook-level ``tokenizer_custom``.
        emojis: Container of emoji strings. Defaults to the notebook-level
            ``emoji_dict``.

    Returns:
        The processed words joined by single spaces.
    """
    # Resolve the defaults at call time so the notebook globals can be
    # (re)defined after this function without redefining it.
    vocab = vocabulary if vocab is None else vocab
    custom_tokens = tokenizer_custom if custom_tokens is None else custom_tokens
    emojis = emoji_dict if emojis is None else emojis

    words = text.split()
    for i, word in enumerate(words):
        if word in vocab and not word.startswith('@'):
            continue
        if word.startswith('@NUM'):
            payload = re_num.sub('', word)
            number = _format_number(payload)
            # Malformed numbers keep their raw payload rather than aborting
            # the whole corpus build with a ValueError.
            words[i] = ('[NUM] ' + (payload if number is None else number)).rstrip()
            continue
        if prefix := next(filter(word.startswith, custom_tokens), None):
            value = word.replace(prefix, '').replace(']', '').replace('[', '')
            words[i] = f'{custom_tokens[prefix]} {value}'.rstrip()
            continue
        if word in emojis:
            words[i] = '[EMOJI]'
    return ' '.join(words)

ensure_dataset(OUTPUT_PATH, delete=True)

# Rows are dealt out in repeating cycles of 20: the first 14 go to train, the
# next 3 to test and the last 3 to validate (a 70/15/15 split).
CYCLE_LENGTH = 20
TRAIN_END = 14
TEST_END = 17

slot = 0  # position of the current row inside the 20-row cycle
# Explicit UTF-8: the platform default encoding is not guaranteed to be able to
# represent every character of the tweets.
with open(os.path.join(OUTPUT_PATH, 'train.tokens'), 'w', encoding='utf-8') as f_train, \
        open(os.path.join(OUTPUT_PATH, 'test.tokens'), 'w', encoding='utf-8') as f_test, \
        open(os.path.join(OUTPUT_PATH, 'validate.tokens'), 'w', encoding='utf-8') as f_validate:
    # Sorted so the row -> split assignment is the same on every run; glob()
    # itself makes no ordering promise.
    files = sorted(pathlib.Path("../../data/bitcoin_twitter_processed/").glob("part_*.parquet"))
    for chunk, file in enumerate(files):
        print(f'Processing chunk: {chunk}')
        data = pd.read_parquet(file)
        for raw_text in data['text']:
            # One row per line, always newline-terminated. The separator must
            # not depend on the DataFrame index: that index is unrelated to
            # how many lines each output file has received, and would glue
            # rows together whenever it restarts at 0 in a later chunk.
            line = process_row(raw_text) + '\n'
            if slot < TRAIN_END:
                f_train.write(line)
            elif slot < TEST_END:
                f_test.write(line)
            else:
                f_validate.write(line)
            slot = (slot + 1) % CYCLE_LENGTH

Processing chunk: 0
Processing chunk: 1
Processing chunk: 2
Processing chunk: 3
Processing chunk: 4
Processing chunk: 5
Processing chunk: 6
Processing chunk: 7
Processing chunk: 8
Processing chunk: 9
Processing chunk: 10
Processing chunk: 11
Processing chunk: 12
Processing chunk: 13
Processing chunk: 14
Processing chunk: 15
Processing chunk: 16
Processing chunk: 17
Processing chunk: 18
Processing chunk: 19
Processing chunk: 20
Processing chunk: 21
Processing chunk: 22
Processing chunk: 23
Processing chunk: 24
Processing chunk: 25
Processing chunk: 26
Processing chunk: 27
Processing chunk: 28


In [53]:
# https://github.com/German-NLP-Group/german-transformer-training/blob/master/src/02_train_01.py#L10

# Gather every *.tokens file below the corpus directory (searched recursively)
# as plain strings, which is the form handed to the tokenizer trainer.
token_files = pathlib.Path(OUTPUT_PATH).glob("**/*.tokens")
paths = list(map(str, token_files))

# Untrained WordPiece tokenizer; accents are stripped during normalisation.
tokenizer = BertWordPieceTokenizer(strip_accents=True)

In [54]:
# Train the WordPiece vocabulary on the corpus files. The special tokens are
# listed first; the vocabulary dump further down shows that ids follow this
# order ([CLS]=0, [PAD]=1, ...).
# NOTE(review): [PAD] is therefore id 1, not 0 - make sure any model config
# built for this vocabulary sets its pad token id accordingly.
tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=1000,
    special_tokens=[
        "[CLS]",
        "[PAD]",
        "[SEP]",
        "[UNK]",
        "[MASK]",
        *tokenizer_custom.values(),
    ],
)
# The `tokenizers` API takes a plain list of tokens here; the
# {'additional_special_tokens': [...]} dict form belongs to the `transformers`
# tokenizers and is not accepted by this class.
tokenizer.add_special_tokens(list(tokenizer_custom.values()))

In [55]:
VOCAB_DIR = "../../data/vocab/bitcoin_twitter"
# Create the target directory up front so this cell also works on a fresh
# checkout where it does not exist yet.
os.makedirs(VOCAB_DIR, exist_ok=True)
tokenizer.save_model(VOCAB_DIR, "bitcoin_twitter")

['../../data/vocab/bitcoin_twitter/bitcoin_twitter-vocab.txt']

In [57]:
# Sample that still uses the raw '@'-prefixed placeholders. It is kept for
# reference only and is not encoded below (previously it was assigned to
# `text` and immediately overwritten).
text_raw_markup = r"new antminer s9j - 14 . @NUM 5.0 t from @USR[bitmain] mining sha256 ( @HTAG[btc] @CURR[chainlink] @CURR[dogecoin] @CURR[bitcoin_cash] ) with hashrate @NUM 1.45 th / @NUM 1650 w added to tracking list s9j"
# Sample written with the bracketed special tokens of the trained vocabulary.
text = r"my [HTAG] cryptocurrency portfolio was [NUM] 80 percent [CURR] bitcoin until [USR] elonmusk [HTAG] tesla bought [NUM] 150000000 usd worth of [CURR] bitcoin this morning . my portfolio is [NUM] 9.87 percent [CURR] bitcoin now"
# Encode once and reuse the result for both the printout and `aa`.
aa = ' '.join(tokenizer.encode(text).tokens)
print(aa)

my [HTAG] cryptocurrency portfolio was [NUM] 80 percent [CURR] bitcoin until [USR] elonmusk [HTAG] tesla bought [NUM] 15000000 ##0 usd worth of [CURR] bitcoin this morning . my portfolio is [NUM] 9 . 87 percent [CURR] bitcoin now


In [58]:
# Load the trained vocabulary into a `transformers` tokenizer. The custom
# placeholders are registered as special tokens at construction time, so the
# tokenizer keeps them intact from the start and the cells below no longer
# depend on a later `add_special_tokens` call having been executed first.
tokenizer = ElectraTokenizer(
    vocab_file='../../data/vocab/bitcoin_twitter/bitcoin_twitter-vocab.txt',
    additional_special_tokens=list(tokenizer_custom.values()),
)
tokenizer.get_vocab()

{'[CLS]': 0,
 '[PAD]': 1,
 '[SEP]': 2,
 '[UNK]': 3,
 '[MASK]': 4,
 '[HTAG]': 5,
 '[USR]': 6,
 '[CURR]': 7,
 '[EMOJI]': 8,
 '[URL]': 9,
 '[TIME]': 10,
 '[DATE]': 11,
 '[NUM]': 12,
 '!': 13,
 '"': 14,
 '#': 15,
 '$': 16,
 '%': 17,
 '&': 18,
 "'": 19,
 '(': 20,
 ')': 21,
 '*': 22,
 '+': 23,
 ',': 24,
 '-': 25,
 '.': 26,
 '/': 27,
 '0': 28,
 '1': 29,
 '2': 30,
 '3': 31,
 '4': 32,
 '5': 33,
 '6': 34,
 '7': 35,
 '8': 36,
 '9': 37,
 ':': 38,
 ';': 39,
 '<': 40,
 '=': 41,
 '>': 42,
 '?': 43,
 '@': 44,
 '[': 45,
 '\\': 46,
 ']': 47,
 '^': 48,
 '_': 49,
 'a': 50,
 'b': 51,
 'c': 52,
 'd': 53,
 'e': 54,
 'f': 55,
 'g': 56,
 'h': 57,
 'i': 58,
 'j': 59,
 'k': 60,
 'l': 61,
 'm': 62,
 'n': 63,
 'o': 64,
 'p': 65,
 'q': 66,
 'r': 67,
 's': 68,
 't': 69,
 'u': 70,
 'v': 71,
 'w': 72,
 'x': 73,
 'y': 74,
 'z': 75,
 '{': 76,
 '|': 77,
 '}': 78,
 '~': 79,
 '¡': 80,
 '¢': 81,
 '£': 82,
 '¤': 83,
 '¥': 84,
 '¦': 85,
 '§': 86,
 '©': 87,
 'ª': 88,
 '¬': 89,
 '®': 90,
 '°': 91,
 '±': 92,
 '²': 

In [66]:
# id -> token lookup built from the tokenizer vocabulary.
rev_vocab = {i: s for s, i in tokenizer.get_vocab().items()}
# Sample that still uses the raw '@'-prefixed placeholders. It is kept for
# reference only and is not encoded below (previously it was assigned to
# `text` and immediately overwritten).
text_raw_markup = r"  [CURR]  new antminer s9j - 14 . @num 5.0 t from @USR[bitmain] mining sha256 (@CURR[chainlink] @CURR[dogecoin] @CURR[bitcoin_cash] ) with hashrate @NUM 1.45 th / @NUM 1650 w added to tracking list s9j"
# Sample written with the bracketed special tokens of the trained vocabulary.
text = r"my [HTAG] cryptocurrency portfolio was [NUM] 80 percent [CURR] bitcoin until [USR] elonmusk [HTAG] tesla bought [NUM] 150000000 usd worth of [CURR] bitcoin this morning . my portfolio is [NUM] 9.87 percent [CURR] bitcoin now"
# Map the encoded ids back to their tokens to inspect the segmentation.
' '.join(rev_vocab[token_id] for token_id in tokenizer.encode(text))

'[CLS] my [HTAG] cryptocurrency portfolio was [NUM] 80 percent [CURR] bitcoin until [USR] elonmusk [HTAG] tesla bought [NUM] 15000000 ##0 usd worth of [CURR] bitcoin this morning . my portfolio is [NUM] 9 . 87 percent [CURR] bitcoin now [SEP]'

In [64]:
# Register the bracketed placeholder tokens as additional special tokens of
# the `transformers` tokenizer; the call returns how many tokens were newly
# added to the vocabulary.
custom_special_tokens = list(tokenizer_custom.values())
tokenizer.add_special_tokens({'additional_special_tokens': custom_special_tokens})

0