## Data Cleaning

In [34]:
import os
import re
from nltk.corpus import brown
# Rebuild the Brown corpus as plain text: one sentence per line, with the
# tokens of each sentence joined by single spaces.
words = brown.sents()
raw_text = '\n'.join(' '.join(tokens) for tokens in words)

# Detokenisation rules as (pattern, replacement) pairs. They are applied
# strictly in this order, because later rules operate on the output of the
# earlier ones. The doubled-comma rule is applied twice, so a run of up to
# four consecutive commas collapses to a single comma.
cleanup_rules = [
    (r' +', ' '),               # collapse runs of spaces
    (r' ([.,!?;:])', r'\1'),    # drop the space before punctuation
    (r'`` ', r'"'),             # `` followed by a space -> "
    (r" ''", r'"'),             # '' preceded by a space -> "
    (r'``', r'"'),              # any remaining `` -> "
    (r"''", r'"'),              # any remaining '' -> "
    (r'\.\.\n', r'.\n'),        # doubled full stop at end of line
    (r'\?\?', '?'),
    ('!!', '!'),
    ('\' ', '\''),              # drop the space after an apostrophe
    (' \'', '\''),              # drop the space before an apostrophe
    (r'\( ', '('),
    (r' \)', ')'),
    (r'\[ ', '['),
    (r' \]', ']'),
    (r';;', ';'),
    (r'::', ':'),
    (r'\,\,', ','),
    (r'\,\,', ','),
    (r'\'\'', '\' \''),         # re-separate two adjacent apostrophes
    (r'""', '"'),
]
for pattern, replacement in cleanup_rules:
    raw_text = re.sub(pattern, replacement, raw_text)

## Folds

In [39]:
# Distribute the cleaned lines over N_FOLDS files in round-robin order:
# line j is written to fold j % N_FOLDS.
os.makedirs('brown_data', exist_ok=True)

lines = raw_text.split('\n')

N_FOLDS = 10

for fold in range(N_FOLDS):
    with open(f'brown_data/brown_{fold}.txt', 'w+', encoding='utf-8') as fold_file:
        # lines[fold::N_FOLDS] picks exactly the indices j with
        # j % N_FOLDS == fold, in their original order.
        fold_file.writelines(line + '\n' for line in lines[fold::N_FOLDS])

## Tokenization

In [24]:
from transformers import (
    GPT2Tokenizer,
    BertTokenizer,
    RobertaTokenizer,
    XLMRobertaTokenizer,
)

# Keyword arguments shared by every tokenizer call in this notebook.
settings = dict(
    padding='longest',
    max_length=128,
    truncation=True,
    return_tensors='pt',
    return_token_type_ids=False,
)

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as GPT-2's padding token.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Smoke-test each tokenizer on the same two-sentence batch. Only the last
# expression of the cell is displayed by the notebook.
sample_batch = ['Hello, world!', 'How are you my dear friends?']
gpt2_tokenizer(sample_batch, **settings)
bert_tokenizer(sample_batch, **settings)
roberta_tokenizer(sample_batch, **settings)
xlmroberta_tokenizer(sample_batch, **settings)

{'input_ids': tensor([[    0, 35378,     4,  8999,    38,     2,     1,     1,     1,     1],
        [    0, 11249,   621,   398,   759,     8,   147, 23902,    32,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Training and Validation

In [9]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import random
from train_model import train_and_test
from model import get_model, get_optimizer, get_scheduler

# Seed the torch, NumPy and stdlib random generators with the same value.
seed = 42

for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(seed)

# Hyperparameters for the training cell, which forwards the whole mapping to
# get_model(**hparams) and get_optimizer(model, hparams).
hparams = dict(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Tied to the tokenizer's max_length setting.
    n_positions=settings['max_length'],
    n_embd=64,
    n_layer=4,
    n_head=4,
    resid_pdrop=0.05,
    embd_pdrop=0.05,
    attn_pdrop=0.05,
    summary_first_dropout=0.05,
    # NOTE(review): fixed ids, identical for every tokenizer — confirm they
    # match what get_model expects.
    bos_token_id=0,
    eos_token_id=1,
    batch_size=4,
    sequence_length=64,
    epochs=4,
    learning_rate=0.001,
    warmup_steps=100,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    log_interval=100,
)

# Cross-validation over the fold files: for every tokenizer and every fold,
# build a model with get_model, train it on the other folds and evaluate it on
# the held-out fold via train_and_test.
for tokenizer, save_name in zip(
    [gpt2_tokenizer, bert_tokenizer, roberta_tokenizer, xlmroberta_tokenizer],
    ['gpt2', 'bert', 'roberta', 'xlmroberta']
):

    hparams['vocab_size'] = tokenizer.vocab_size

    def collate_fn(data, _tokenizer=tokenizer):
        """Tokenize a list of raw text lines into one batch.

        The tokenizer is bound as a default argument so the function keeps
        using the tokenizer of this iteration even if it outlives the loop.
        """
        return _tokenizer(data, **settings)

    for split in range(N_FOLDS):

        # Held-out fold. NOTE(review): readlines() keeps the trailing '\n' of
        # every line, so it is part of the text handed to the tokenizer.
        with open(f'brown_data/brown_{split}.txt', 'r', encoding='utf-8') as f:
            test_data = f.readlines()

        # All remaining folds form the training set; each file is closed as
        # soon as it has been read.
        train_data = []
        for i in range(N_FOLDS):
            if i == split:
                continue
            with open(f'brown_data/brown_{i}.txt', 'r', encoding='utf-8') as f:
                train_data.extend(f.readlines())

        train_loader = DataLoader(
            train_data,
            batch_size=hparams['batch_size'],
            shuffle=True,
            collate_fn=collate_fn
        )

        test_loader = DataLoader(
            test_data,
            batch_size=hparams['batch_size'],
            shuffle=False,
            collate_fn=collate_fn
        )

        # The model must exist before the optimizer is created from it:
        # building the optimizer first either raises NameError (first
        # iteration) or binds it to the model of the previous iteration.
        model = get_model(**hparams)
        optimizer = get_optimizer(model, hparams)

        train_and_test(
            model,
            train_loader,
            test_loader,
            optimizer=optimizer,
            epochs=hparams['epochs'],
            log_interval=hparams['log_interval'],
            save_name=save_name + f'_{split}',
            scheduler=get_scheduler(optimizer, hparams['warmup_steps']),
            device=hparams['device'],
            entropy=0
        )