In [None]:
# Show which GPU (if any) this runtime has been given before doing any work.
!nvidia-smi

In [None]:
!pip install wandb transformers
!pip install git+https://github.com/huggingface/datasets.git@master

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/6f/6b/4aacb6a29a52c2b2c27afe8ece383d0235a2ac8ec96b7257486f4e4328ea/wandb-0.10.20-py2.py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 8.7MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 48.2MB/s 
Collecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/15/74/59016eecaefa52443cd69cbb50e01851fa8bf3d9526771e2fae60ac6270c/sentry_sdk-0.20.3-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 47.3MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl
Collecting configparser>=3.8.1


In [None]:
# Mount Google Drive at /content/drive; the next cell reads the corpus archive
# and the vocabulary file from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unpack the tweet corpus archive from Drive into the current working directory.
!tar -xvf  '/content/drive/MyDrive/Projects/bigbrain/bitcoin_twitter_corpus.tar.gz' -C '.'
# Copy the tokenizer vocabulary file into the current working directory as well.
!cp /content/drive/MyDrive/Projects/bigbrain/bitcoin_twitter-vocab.txt ./bitcoin_twitter-vocab.txt 

bitcoin_twitter_corpus/
bitcoin_twitter_corpus/test.tokens
bitcoin_twitter_corpus/train.tokens
bitcoin_twitter_corpus/validate.tokens


In [None]:
import os, math
import torch
import wandb
from torch import nn
from datasets import load_dataset, load_from_disk
from transformers import ElectraTokenizer, TextDataset, ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, \
    set_seed, DataCollatorForLanguageModeling, PreTrainedModel, PreTrainedTokenizer, Trainer, ElectraTokenizerFast, \
    TrainingArguments, EvaluationStrategy
from transformers.tokenization_utils_base import PaddingStrategy

In [None]:
# Authenticate with Weights & Biases; prompts for an API key when none is stored.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Make the first GPU the default CUDA device and report which card it is.
device = 'cuda:0'
torch.cuda.set_device(device)
active_gpu_index = torch.cuda.current_device()
print(torch.cuda.get_device_name(active_gpu_index))

Tesla P100-PCIE-16GB


In [None]:

# Placeholder markers mapped to the bracketed special tokens that are
# registered with the tokenizer (see `additional_tokens`).
tokenizer_custom = {
    '@HTAG': '[HTAG]',
    '@USR': '[USR]',
    '@CURR': '[CURR]',
    '@EMOJI': '[EMOJI]',
    '@URL': '[URL]',
    '@TIME': '[TIME]',
    '@DATE': '[DATE]',
    '@NUM': '[NUM]'
}
additional_tokens = list(tokenizer_custom.values())

# The setup cell unpacks the corpus archive and copies the vocabulary file into
# the current working directory, so the data paths have to point there.
DATASET_DIR = './bitcoin_twitter_corpus'
VOCAB_FILE = './bitcoin_twitter-vocab.txt'
TRAIN_DS = os.path.join(DATASET_DIR, 'train.tokens')
TEST_DS = os.path.join(DATASET_DIR, 'test.tokens')
VALIDATE_DS = os.path.join(DATASET_DIR, 'validate.tokens')
DATASET_PRESAVE_DIR = './bitcoin_twitter_tokenized'

# Output directory for checkpoints and the final model.
model_path = './bitcoin_twitter'
# Fixed token length of every example; also the size of the position embeddings.
seq_length = 256
# NOTE(review): accum_multipler, batch_size, epochs, warmup_ratio and block_size
# are not referenced by the cells visible in this notebook — confirm whether
# they are still needed or should be wired into the TrainingArguments.
accum_multipler = 1
batch_size = 128
epochs = 1
warmup_ratio = 0.06
lr = 5e-4
# Must equal the tokenizer's vocabulary size (asserted after the tokenizer is built).
vocab_size = 16537
block_size = 200
seed = 1337

# Seed the random number generators up front so runs are repeatable.
set_seed(seed)

In [None]:
# Load the tokenizer from the project vocabulary file and register the corpus
# placeholder tokens as additional special tokens.
tokenizer = ElectraTokenizerFast(vocab_file=VOCAB_FILE)
num_added_tokens = tokenizer.add_special_tokens({
    'additional_special_tokens': list(tokenizer_custom.values())
})

# `tokenizer.vocab_size` does not count tokens added after loading, so compare
# the full tokenizer length instead: any token id >= `vocab_size` would index
# past the end of the models' embedding matrices.
assert len(tokenizer) == vocab_size, (
    "tokenizer holds %d tokens (%d added on top of the vocab file) but vocab_size is %d"
    % (len(tokenizer), num_added_tokens, vocab_size)
)

In [None]:
# Split name -> text file holding that split of the corpus.
split_files = {
    'train': TRAIN_DS,
    'test': TEST_DS,
    'validate': VALIDATE_DS,
}
dataset = load_dataset("text", data_files=split_files, cache_dir='./cache')

Using custom data configuration default-f5345d1c90082ae1


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to ./cache/text/default-f5345d1c90082ae1/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset text downloaded and prepared to ./cache/text/default-f5345d1c90082ae1/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57. Subsequent calls will reuse this data.


In [None]:
def tokenize_function(examples):
    """Tokenize one batch of raw text lines into fixed-length model inputs.

    Each entry of ``examples['text']`` is truncated or padded so that it is
    exactly ``seq_length`` tokens long.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=PaddingStrategy.MAX_LENGTH,
        max_length=seq_length,
    )
    return tokenizer(examples['text'], **tokenizer_options)


# Shuffle every split, then tokenize in large batches.
shuffled_dataset = dataset.shuffle()
tokenized_datasets = shuffled_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=10000,
    writer_batch_size=10000,
)

HBox(children=(FloatProgress(value=0.0, max=559.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [None]:
class CombinedModel(nn.Module):
    """Joint generator/discriminator wrapper for ELECTRA-style pre-training.

    For every batch the wrapper:

    1. selects a subset of ordinary (non-special, non-padding) tokens and
       replaces most of them with the mask token,
    2. trains the generator to recover the original tokens at the selected
       positions,
    3. substitutes the generator's most likely tokens at those positions, and
    4. trains the discriminator to tell, for every token, whether it differs
       from the original input.

    The two networks share their input word embeddings.
    """

    # Fraction of the maskable tokens of a sequence selected for prediction.
    MASK_PROBABILITY = 0.2
    # Upper bound on the number of selected positions per sequence.
    MAX_PREDICTIONS_PER_SEQ = 30
    # Share of the selected positions whose token is actually replaced by the
    # mask token; the remaining selected tokens are left unchanged.
    MASK_REPLACE_PROBABILITY = 0.85

    # Annotations are quoted so that defining the class does not require the
    # transformers base classes to be resolvable at definition time.
    def __init__(self, discriminator: "PreTrainedModel", generator: "PreTrainedModel",
                 tokenizer: "PreTrainedTokenizer"):
        """Store both networks and tie their input embeddings.

        Args:
            discriminator: model returning ``(loss, per-token logits, ...)``
                when called with binary ``labels``.
            generator: masked-LM model returning ``(loss, vocabulary logits, ...)``
                when called with ``labels`` (``-100`` marks ignored positions).
            tokenizer: tokenizer providing the special-token ids used for masking.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.discriminator = discriminator
        self.generator = generator

        # Move to the GPU when there is one instead of failing on CPU-only hosts.
        if torch.cuda.is_available():
            self.discriminator.cuda()
            self.generator.cuda()

        # Embeddings are shared
        self.discriminator.set_input_embeddings(self.generator.get_input_embeddings())

    def _unmaskable_token_ids(self):
        """Return the ids of tokens that must never be selected for masking."""
        tok = self.tokenizer
        candidate_ids = [
            tok.cls_token_id,
            tok.sep_token_id,
            tok.mask_token_id,
            tok.pad_token_id,
            *tok.additional_special_tokens_ids,
        ]
        # A tokenizer may leave some special tokens undefined (id is None).
        return [token_id for token_id in candidate_ids if token_id is not None]

    @staticmethod
    def mask_inputs(
            input_ids: torch.Tensor,
            mask_token_id, mask_probability,
            tokens_to_ignore,
            max_predictions_per_seq,
            proposal_distribution=1.0
    ):
        """Select positions to predict and build the masked generator input.

        Args:
            input_ids: integer tensor of token ids, sequence on the last axis.
            mask_token_id: id written into the positions that get masked.
            mask_probability: fraction of the maskable tokens of each sequence
                to select.
            tokens_to_ignore: iterable of token ids that may never be selected
                (``None`` entries are skipped).
            max_predictions_per_seq: upper bound on selected positions per
                sequence; also the size of the last axis of the returned positions.
            proposal_distribution: scalar or tensor broadcastable to
                ``input_ids`` holding relative sampling weights.

        Returns:
            ``(masked_input_ids, masked_lm_positions)``. ``masked_lm_positions``
            has ``max_predictions_per_seq`` entries on its last axis; unused
            slots hold ``0``. Index 0 of ``masked_input_ids`` is never
            modified, so ``0`` acts as a harmless sentinel.
        """
        device = input_ids.device
        sequence_length = input_ids.shape[-1]
        leading_shape = input_ids.shape[:-1]

        # True where a token is allowed to be selected.
        maskable = torch.ones_like(input_ids, dtype=torch.bool)
        for token_id in tokens_to_ignore:
            if token_id is not None:
                maskable &= input_ids.ne(token_id)

        # Per-sequence number of positions to select:
        # 1 <= floor(n_maskable * mask_probability) <= max_predictions_per_seq.
        num_to_mask = (maskable.sum(dim=-1, keepdim=True).float() * mask_probability).long()
        num_to_mask = num_to_mask.clamp(min=1, max=max_predictions_per_seq)

        # Weighted sampling without replacement via Gumbel top-k: add Gumbel
        # noise to the log-weights and keep the highest scores. Tokens with
        # weight 0 get a score of -inf and are filtered out below.
        sample_weights = proposal_distribution * maskable.float()
        uniform_noise = torch.rand(input_ids.shape, device=device)
        uniform_noise.clamp_(min=torch.finfo(uniform_noise.dtype).tiny)  # avoid log(0)
        gumbel_noise = -torch.log(-torch.log(uniform_noise))
        scores = torch.log(sample_weights) + gumbel_noise

        num_candidates = min(max_predictions_per_seq, sequence_length)
        candidate_scores, candidate_positions = scores.topk(num_candidates, dim=-1)

        # Keep the first `num_to_mask` candidates of each row, and only those
        # that are genuinely maskable (finite score).
        rank = torch.arange(num_candidates, device=device)
        selected = (rank < num_to_mask) & torch.isfinite(candidate_scores)

        # Unselected slots collapse onto index 0.
        masked_lm_positions = candidate_positions * selected.long()

        # Only part of the selected positions is actually replaced by the mask token.
        replace_with_mask = selected & (
            torch.rand(selected.shape, device=device) < CombinedModel.MASK_REPLACE_PROBABILITY
        )
        replace_with_mask_positions = candidate_positions * replace_with_mask.long()

        # Sequences shorter than max_predictions_per_seq: pad the position
        # tensors with the sentinel so the output width stays constant.
        missing = max_predictions_per_seq - num_candidates
        if missing > 0:
            filler = torch.zeros(*leading_shape, missing, dtype=torch.long, device=device)
            masked_lm_positions = torch.cat([masked_lm_positions, filler], dim=-1)
            replace_with_mask_positions = torch.cat([replace_with_mask_positions, filler], dim=-1)

        # Replace the input IDs with masks on given positions
        masked_input_ids = input_ids.scatter(-1, replace_with_mask_positions, mask_token_id)

        # Updates to index 0 should be ignored
        masked_input_ids[..., 0] = input_ids[..., 0]

        return masked_input_ids, masked_lm_positions

    @staticmethod
    def gather_positions(
            sequence,
            positions
    ):
        """Pick the feature vectors found at ``positions`` in each sequence.

        Args:
            sequence: tensor of shape ``(batch, sequence_length, dimension)``.
            positions: integer tensor of shape ``(batch, n)`` with indices into
                the sequence axis.

        Returns:
            Tensor of shape ``(batch, n, dimension)``.
        """
        dimension = sequence.shape[-1]
        index = positions.long().to(sequence.device).unsqueeze(-1).expand(-1, -1, dimension)
        return sequence.gather(1, index)

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            labels=None
    ):
        """Run one generator + discriminator pass and return the summed loss.

        Args:
            input_ids: original (unmasked) token ids; required.
            attention_mask, token_type_ids, position_ids, head_mask: passed
                through to both networks.
            inputs_embeds: accepted for signature compatibility but not used,
                because masking operates on token ids.
            labels: accepted so callers may pass it, but ignored; the training
                targets of both networks are derived from ``input_ids``.

        Returns:
            ``(total_loss, (discriminator_predictions, generator_output),
            (fake_tokens, masked_input_ids))`` where
            ``discriminator_predictions`` is a nested list of 0/1 decisions
            (1 = judged "replaced").

        Raises:
            ValueError: if ``input_ids`` is ``None``.
        """
        if input_ids is None:
            raise ValueError("CombinedModel.forward requires `input_ids`")

        # `Tensor.to` returns a new tensor, so the result has to be kept.
        device = next(self.generator.parameters()).device
        input_ids = input_ids.to(device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        if position_ids is not None:
            position_ids = position_ids.to(device)

        masked_input_ids, masked_lm_positions = self.mask_inputs(
            input_ids, self.tokenizer.mask_token_id, self.MASK_PROBABILITY,
            self._unmaskable_token_ids(),
            self.MAX_PREDICTIONS_PER_SEQ
        )

        # The generator is only supervised at the selected positions; every
        # other position (including the index-0 sentinel) is ignored via -100.
        generator_labels = torch.full_like(input_ids, -100)
        generator_labels.scatter_(-1, masked_lm_positions, input_ids.gather(-1, masked_lm_positions))
        generator_labels[..., 0] = -100

        # Keyword arguments keep each tensor in its intended slot.
        generator_loss, generator_output = self.generator(
            input_ids=masked_input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=generator_labels
        )[:2]

        # Put the generator's most likely token at every selected position.
        fake_logits = self.gather_positions(generator_output, masked_lm_positions)
        fake_argmaxes = fake_logits.argmax(-1)
        fake_tokens = masked_input_ids.scatter(-1, masked_lm_positions, fake_argmaxes)
        fake_tokens[..., 0] = input_ids[..., 0]

        # Binary discriminator target: 1 where the token differs from the
        # original, 0 where it is unchanged (which includes correct generator
        # guesses).
        discriminator_labels = (fake_tokens != input_ids).long()

        discriminator_loss, discriminator_output = self.discriminator(
            input_ids=fake_tokens,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=discriminator_labels
        )[:2]

        # Positive logit -> 1 ("replaced"), negative logit -> 0 ("original").
        discriminator_predictions = torch.round((torch.sign(discriminator_output) + 1) / 2).int().tolist()

        total_loss = discriminator_loss + generator_loss
        return (
            total_loss,
            (discriminator_predictions, generator_output),
            (fake_tokens, masked_input_ids)
        )

    def save_pretrained(self, directory):
        """Save both networks under ``directory/generator`` and ``directory/discriminator``."""
        generator_path = os.path.join(directory, "generator")
        discriminator_path = os.path.join(directory, "discriminator")

        os.makedirs(generator_path, exist_ok=True)
        os.makedirs(discriminator_path, exist_ok=True)

        self.generator.save_pretrained(generator_path)
        self.discriminator.save_pretrained(discriminator_path)

In [None]:
# Generator and discriminator are built with identical architecture settings,
# so the hyper-parameters are spelled out once and reused for both configs.
shared_architecture = dict(
    embedding_size=128,
    hidden_size=256,
    intermediate_size=1024,
    max_position_embeddings=seq_length,
    num_attention_heads=4,
    num_hidden_layers=12,
    vocab_size=vocab_size,
)

generator_config = ElectraConfig(**shared_architecture)
discriminator_config = ElectraConfig(**shared_architecture)

In [None]:
# Build both networks from their configs and wrap them for joint pre-training.
generator = ElectraForMaskedLM(config=generator_config)
discriminator = ElectraForPreTraining(config=discriminator_config)
model = CombinedModel(
    discriminator=discriminator,
    generator=generator,
    tokenizer=tokenizer,
)

In [None]:
# Collator that turns tokenized examples into batches. Its own MLM masking is
# switched off (mlm=False); CombinedModel.forward masks the inputs itself via
# mask_inputs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, mlm_probability=0
)

In [None]:
# Training configuration: evaluate every 10000 steps, log to Weights & Biases
# and restore the best checkpoint once training has finished.
# NOTE(review): num_train_epochs and per_device_train_batch_size are literals
# that differ from `epochs` / `batch_size` in the configuration cell — confirm
# which values are intended.
arguments = TrainingArguments(
    output_dir=model_path,
    do_train=True,
    evaluation_strategy=EvaluationStrategy.STEPS,
    eval_steps = 10000,
    prediction_loss_only=True,
    learning_rate=lr,
    report_to=["wandb"],
    load_best_model_at_end=True,
    num_train_epochs=20,
    per_device_train_batch_size=124
)

# Initialize our Trainer. `args` has to be passed explicitly; without it the
# Trainer falls back to default TrainingArguments and every setting above is
# silently ignored.
trainer = Trainer(
    model=model,
    args=arguments,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validate'],
)

In [None]:
# Training
trainer.train(model_path=model_path)
# Persist the trained combined module through the Trainer.
trainer.save_model()
# Additionally write the generator and the discriminator as two separate
# checkpoints (model_path/generator and model_path/discriminator) through
# CombinedModel.save_pretrained, which nothing else in the notebook calls.
model.save_pretrained(model_path)

Step,Training Loss
500,-19292.046
1000,-88279.544
1500,-213950.688
2000,-415296.832
2500,-702703.232
3000,-1077721.856
3500,-1554782.336
4000,-2148500.992
4500,-2940645.632
5000,-3753366.272


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-97e05c5759bc>", line 2, in <module>
    trainer.train(model_path=model_path)
  File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 925, in train
    for step, inputs in enumerate(epoch_iterator):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 435, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 475, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 

KeyboardInterrupt: ignored

In [None]:
# Evaluation
results = {}
print("*** Evaluate ***")

eval_output = trainer.evaluate()

# Trainer.evaluate() reports the loss as "eval_loss"; fall back to the bare
# "loss" key for outputs that do not use the prefix.
eval_loss = eval_output["eval_loss"] if "eval_loss" in eval_output else eval_output["loss"]
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    # exp() of a very large loss does not fit into a float.
    perplexity = float("inf")
result = {"perplexity": perplexity}

# Print every metric and mirror it into a small text report.
output_eval_file = "eval_results_lm.txt"
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        # print() does not interpolate %-style arguments; format explicitly.
        print("  %s = %s" % (key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

results.update(result)