# Important Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Loading Data

In [5]:
CUR_DATASET = "LIAR-PLUS"

In [6]:
train_dataset = pd.read_csv(f'../data/{CUR_DATASET}/train2.tsv', sep='\t', header = None)
valid_dataset = pd.read_csv(f'../data/{CUR_DATASET}/val2.tsv', sep='\t', header = None)
test_dataset = pd.read_csv(f'../data/{CUR_DATASET}/test2.tsv', sep='\t', header = None)
liar_dataset = pd.concat([train_dataset, valid_dataset, test_dataset], axis = 0).reset_index(drop = True)

In [7]:
liar_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,1.0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,2.0,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,3.0,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,4.0,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [8]:
liar_dataset = liar_dataset.iloc[:, [2, 3, 15]]
liar_dataset = liar_dataset.rename(columns = {2: 'label', 3: 'statements', 15: 'justification'})
liar_dataset.head()

Unnamed: 0,label,statements,justification
0,false,Says the Annies List political group supports ...,That's a premise that he fails to back up. Ann...
1,half-true,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe..."
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",Obama said he would have voted against the ame...
3,false,Health care reform legislation is likely to ma...,The release may have a point that Mikulskis co...
4,half-true,The economic turnaround started at the end of ...,"Crist said that the economic ""turnaround start..."


In [9]:
liar_dataset['label'] = liar_dataset['label'].replace({
    'false' : 0,
    'barely-true' : 0,
    'pants-fire' : 0,
    'half-true' : 1,
    'mostly-true' : 1,
    'true' : 1
})


# Some More EDA

In [10]:
liar_dataset.isnull().sum()

label              2
statements         2
justification    101
dtype: int64

In [11]:
liar_dataset = liar_dataset.dropna()

In [12]:
labels = liar_dataset["label"].values.astype(int)

# Data Preprocessing and Data Preparation

### Removing stopwords

In [13]:
stop_words = stopwords.words('english')

In [14]:
if_stopwords = True

### Preprocessing

In [15]:
# from num2words import num2words

In [16]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
}

In [17]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [18]:
liar_dataset["statements"] = liar_dataset["statements"].apply(preprocess_text)
liar_dataset["justification"] = liar_dataset["justification"].apply(preprocess_text)
liar_dataset.head(10)

Unnamed: 0,label,statements,justification
0,0.0,Says the Annies List political group supports ...,Thats premise that he fails to back up Annies ...
1,1.0,When did the decline of coal start It started ...,Surovell said the decline of coal started when...
2,1.0,Hillary Clinton agrees with John McCain by vot...,Obama said he would have voted against the ame...
3,0.0,Health care reform legislation is likely to ma...,The release may have point that Mikulskis comm...
4,1.0,The economic turnaround started at the end of ...,Crist said that the economic turnaround starte...
5,1.0,The Chicago Bears have had more starting quart...,But Vos specifically used the word fired which...
6,0.0,Jim Dunnam has not lived in the district he re...,But determining that would take significant de...
7,1.0,Im the only person on this stage who has worke...,However it was not that bill but another one s...
8,1.0,However it took 195 million in Oregon Lottery ...,But Johnson is correct that many other factors...
9,1.0,Says GOP primary opponents Glenn Grothman and ...,Considering that the 532 million figure covers...


In [19]:
if if_stopwords:
    for col in ["statements", "justification"]:
        
        liar_dataset[col] = liar_dataset[col].str.lower().str.replace("’", "'")
        liar_dataset[col] = liar_dataset[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


### Lemmatization and Stemming

In [20]:
if_lemmatize = True

In [21]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    for col in ["statements", "justification"]:
        liar_dataset[col] = liar_dataset[col].str.lower().str.replace("’", "'")
        liar_dataset[col] = liar_dataset[col].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
train_text = (liar_dataset["justification"]).values

# Model Initialization

In [24]:
from tqdm import tqdm

In [25]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [27]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [29]:
print(' Original: ', train_text[0])
print('Tokenized: ', tokenizer.tokenize(train_text[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text[0])))

 Original:  thats premise fails back annies list make bone comfortable candidate oppose restriction lateterm abortion year backing two house candidate voted limit
Tokenized:  ['that', '##s', 'premise', 'fails', 'back', 'annie', '##s', 'list', 'make', 'bone', 'comfortable', 'candidate', 'oppose', 'restriction', 'late', '##ter', '##m', 'abortion', 'year', 'backing', 'two', 'house', 'candidate', 'voted', 'limit']
Token IDs:  [2008, 2015, 18458, 11896, 2067, 8194, 2015, 2862, 2191, 5923, 6625, 4018, 15391, 16840, 2397, 3334, 2213, 11324, 2095, 5150, 2048, 2160, 4018, 5444, 5787]


In [30]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # Use the 124-layer, 1024-hidden, 16-heads, 340M parameters BERT model with an uncased vocab.
    num_labels = len(np.unique(labels)), 
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [31]:
max_len = 0
len_limit = 512
LIMIT = 100_000

indices = []
train_text_filtered = []

for i, text in enumerate(tqdm(train_text)):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
    if len(input_ids) <= LIMIT:
        train_text_filtered.append(text)
        indices.append(i)
print('Max sentence length: ', max_len)

  0%|                                                                                                             | 0/12692 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2075 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12692/12692 [00:04<00:00, 2595.49it/s]

Max sentence length:  2075





In [32]:
labels_filtered = labels[indices]
labels_filtered.shape

(12692,)

In [33]:
# https://www.kaggle.com/code/jeongwonkim10516/nlp-fake-news-with-bert-99-55-top1/notebook

def tokenize_map(sentence, labs='None'):
    
    """A function for tokenize all of the sentences and map the tokens to their word IDs."""
    
    global labels
    
    input_ids = []
    attention_masks = []

    # For every sentence...
    
    for text in tqdm(sentence):
        #   "encode_plus" will:
        
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            truncation='longest_first', # Activate and control truncation
                            max_length = len_limit,           # Max length according to our text data.
                            padding = 'max_length', # Pad & truncate all sentences.
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list. 
        
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.
        labels = torch.tensor(labels)
        return input_ids, attention_masks, labels
    
    return input_ids, attention_masks

In [34]:
train_text_filtered = np.array(train_text)

In [35]:
train_text.shape, train_text_filtered.shape,

((12692,), (12692,))

In [36]:
input_ids, attention_masks, labels_filtered = tokenize_map(train_text_filtered, labels_filtered)
# test_input_ids, test_attention_masks= tokenize_map(test_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12692/12692 [00:06<00:00, 2095.14it/s]
  if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.


## Train and Validation Dataset

In [37]:
import transformers

In [38]:
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f87cf656f50>

In [39]:
type(attention_masks)

torch.Tensor

In [40]:
labels_filtered.shape, input_ids.shape

(torch.Size([12692]), torch.Size([12692, 512]))

In [41]:
labels_filtered = labels[indices]
labels_filtered.shape

torch.Size([12692])

In [42]:
dataset = TensorDataset(input_ids, attention_masks, labels_filtered)

In [43]:
torch.manual_seed(seed)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_size, val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [44]:
# Train DataLoader
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Validation DataLoader
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
)

In [45]:
# # Test DataLoader

# test_data = TensorDataset(test_input_ids, test_attention_masks)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Hyperparameters

In [46]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon
            )

In [47]:
num_epochs = 10
total_num_steps = len(train_dataloader) * num_epochs

## Metrics

In [48]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0, average="weighted")

## Training

In [49]:
import time

In [50]:
training_stats = []
validations_labels_ep = []
actual_labels_ep = []

total_t0 = time.time()
for i in range(0, num_epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(i + 1, num_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 50 == 0 and not step == 0:
            elapsed = time.time() - t0
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = time.time() - t0

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoc h took: {:}'.format(training_time))
    
    print('')
    print('Running Validation...')

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            loss = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]

            logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[1]
            
        # Accumulate the validation loss.
        
        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:
        
        total_eval_accuracy += accuracy(logits, label_ids)
        total_eval_f1 += flat_f1_score(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))
    
    # Report the final f1 score for this validation run.
    
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.5f}'.format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    
    
    # Measure how long the validation run took:
    
    validation_time = time.time() - t0
    
    print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


Training...
----- Epoch 1 / 10 -----
  Batch    50  of  2,539.    Elapsed: 30.044180393218994.
  Batch   100  of  2,539.    Elapsed: 47.41488337516785.
  Batch   150  of  2,539.    Elapsed: 65.71474361419678.
  Batch   200  of  2,539.    Elapsed: 86.07025980949402.
  Batch   250  of  2,539.    Elapsed: 107.20276212692261.
  Batch   300  of  2,539.    Elapsed: 128.29850673675537.
  Batch   350  of  2,539.    Elapsed: 149.46811199188232.
  Batch   400  of  2,539.    Elapsed: 168.64443969726562.
  Batch   450  of  2,539.    Elapsed: 186.27076125144958.
  Batch   500  of  2,539.    Elapsed: 205.1120102405548.
  Batch   550  of  2,539.    Elapsed: 224.3102264404297.
  Batch   600  of  2,539.    Elapsed: 242.73338222503662.
  Batch   650  of  2,539.    Elapsed: 260.4591865539551.
  Batch   700  of  2,539.    Elapsed: 278.120854139328.
  Batch   750  of  2,539.    Elapsed: 295.5653636455536.
  Batch   800  of  2,539.    Elapsed: 313.0636169910431.
  Batch   850  of  2,539.    Elapsed: 330.48

  Batch 1,800  of  2,539.    Elapsed: 620.9998738765717.
  Batch 1,850  of  2,539.    Elapsed: 638.2208471298218.
  Batch 1,900  of  2,539.    Elapsed: 655.5163276195526.
  Batch 1,950  of  2,539.    Elapsed: 672.7127277851105.
  Batch 2,000  of  2,539.    Elapsed: 689.9254727363586.
  Batch 2,050  of  2,539.    Elapsed: 707.0995705127716.
  Batch 2,100  of  2,539.    Elapsed: 724.3317391872406.
  Batch 2,150  of  2,539.    Elapsed: 741.5776093006134.
  Batch 2,200  of  2,539.    Elapsed: 758.8762373924255.
  Batch 2,250  of  2,539.    Elapsed: 776.1166050434113.
  Batch 2,300  of  2,539.    Elapsed: 793.3723268508911.
  Batch 2,350  of  2,539.    Elapsed: 810.6678447723389.
  Batch 2,400  of  2,539.    Elapsed: 827.9577043056488.
  Batch 2,450  of  2,539.    Elapsed: 845.1657009124756.
  Batch 2,500  of  2,539.    Elapsed: 862.3688201904297.

  Average training loss: 0.69
  Training epoc h took: 875.6558184623718

Running Validation...
  Accuracy: 0.54948
  F1: 0.43847
  Validation Lo

  Batch   850  of  2,539.    Elapsed: 293.04201197624207.
  Batch   900  of  2,539.    Elapsed: 310.22705721855164.
  Batch   950  of  2,539.    Elapsed: 327.42700147628784.
  Batch 1,000  of  2,539.    Elapsed: 344.6767451763153.
  Batch 1,050  of  2,539.    Elapsed: 361.8604984283447.
  Batch 1,100  of  2,539.    Elapsed: 379.1516270637512.
  Batch 1,150  of  2,539.    Elapsed: 396.44611501693726.
  Batch 1,200  of  2,539.    Elapsed: 413.6300287246704.
  Batch 1,250  of  2,539.    Elapsed: 430.8571240901947.
  Batch 1,300  of  2,539.    Elapsed: 448.09167528152466.
  Batch 1,350  of  2,539.    Elapsed: 465.30126094818115.
  Batch 1,400  of  2,539.    Elapsed: 482.550005197525.
  Batch 1,450  of  2,539.    Elapsed: 499.82032322883606.
  Batch 1,500  of  2,539.    Elapsed: 517.0909638404846.
  Batch 1,550  of  2,539.    Elapsed: 534.3000452518463.
  Batch 1,600  of  2,539.    Elapsed: 551.5086090564728.
  Batch 1,650  of  2,539.    Elapsed: 568.7263233661652.
  Batch 1,700  of  2,539.

  Accuracy: 0.50564
  F1: 0.50695
  Validation Loss: 2.39207
  Validation took: 96.51109838485718

Training...
----- Epoch 9 / 10 -----
  Batch    50  of  2,539.    Elapsed: 17.5398690700531.
  Batch   100  of  2,539.    Elapsed: 35.4463894367218.
  Batch   150  of  2,539.    Elapsed: 53.48884391784668.
  Batch   200  of  2,539.    Elapsed: 71.28428435325623.
  Batch   250  of  2,539.    Elapsed: 89.5705201625824.
  Batch   300  of  2,539.    Elapsed: 107.22225999832153.
  Batch   350  of  2,539.    Elapsed: 125.2859263420105.
  Batch   400  of  2,539.    Elapsed: 142.77438831329346.
  Batch   450  of  2,539.    Elapsed: 160.56399655342102.
  Batch   500  of  2,539.    Elapsed: 178.40036296844482.
  Batch   550  of  2,539.    Elapsed: 196.6806836128235.
  Batch   600  of  2,539.    Elapsed: 214.4763641357422.
  Batch   650  of  2,539.    Elapsed: 232.31122493743896.
  Batch   700  of  2,539.    Elapsed: 250.1335563659668.
  Batch   750  of  2,539.    Elapsed: 267.7003936767578.
  Batch

In [51]:
model.save_pretrained(f"./models/bert_{CUR_DATASET}_regexp_stopwords_{if_stopwords}_lemmatization_{if_lemmatize}_binary")