# Important Imports

In [1]:
from platform import python_version

print(python_version())

3.11.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import re

In [4]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [6]:
import random

In [7]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

['won',
 'yourselves',
 "don't",
 "that'll",
 "you'd",
 'theirs',
 'doesn',
 "needn't",
 'with',
 'that']

Consider removing some stop words like _no_, _yes_, etc.

# Loading Data

In [8]:
DATASETS = {
    "FakeNews": "",
    "ISOT": ""
}

CUR_DATASET = "FakeNews"

In [9]:
train_dataset = pd.read_csv(f"./data/{CUR_DATASET}/train.csv.zip")
test_dataset = pd.read_csv(f"./data/{CUR_DATASET}/test.csv.zip")

In [10]:
train_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [11]:
labels = train_dataset["label"].values

In [12]:
whole_dataset = pd.concat([train_dataset, test_dataset])

# Some More EDA

In [13]:
train_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [14]:
train_dataset.isnull().sum() / train_dataset.shape[0]

id        0.000000
title     0.026827
author    0.094087
text      0.001875
label     0.000000
dtype: float64

In [15]:
whole_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      26000 non-null  int64  
 1   title   25320 non-null  object 
 2   author  23540 non-null  object 
 3   text    25954 non-null  object 
 4   label   20800 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB


# Data Preprocessing and Data Preparation

In [16]:
train_dataset = train_dataset.fillna("null data")
test_dataset = test_dataset.fillna("null data")

### Removing stopwords

In [17]:
stop_words = stopwords.words('english')

In [18]:
if_stopwords = True

In [19]:
if if_stopwords:
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [20]:
train_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,house dem aide: even see comey's letter jason ...,Darrell Lucus,house dem aide: even see comey's letter jason ...,1
1,1,"flynn: hillary clinton, big woman campus - bre...",Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0
2,2,truth might get fired,Consortiumnews.com,"truth might get fired october 29, 2016 tension...",1
3,3,15 civilians killed single us airstrike identi...,Jessica Purkiss,videos 15 civilians killed single us airstrike...,1
4,4,iranian woman jailed fictional unpublished sto...,Howard Portnoy,print iranian woman sentenced six years prison...,1


### Preprocessing

In [21]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [22]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

Consider removing some of the stopwords.

In [23]:
STOP_WORDS = [preprocessing_text_fn["no_punctuation"](word) for word in stop_words]
random.sample(stop_words, 20)

["it's",
 'there',
 'the',
 'why',
 'yourselves',
 'aren',
 'further',
 'few',
 'isn',
 'shan',
 'because',
 'their',
 'will',
 'ourselves',
 'these',
 'ain',
 'out',
 'its',
 'whom',
 'too']

In [24]:
def tokenize_without_stopwords(text, stop_words=STOP_WORDS):
    word_tokens = word_tokenize(text)
    filtered_sequence = [word for word in word_tokens if not word.lower() in stop_words]
    return filtered_sequence

In [25]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,id,title,author,text,label
0,0,house dem aide even see comeys letter jason ch...,Darrell Lucus,house dem aide even see comeys letter jason ch...,1
1,1,flynn hillary clinton big woman campus breitbart,Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0
2,2,truth might get fired,Consortiumnews.com,truth might get fired october tension intellig...,1
3,3,civilians killed single us airstrike identified,Jessica Purkiss,videos civilians killed single us airstrike id...,1
4,4,iranian woman jailed fictional unpublished sto...,Howard Portnoy,print iranian woman sentenced six years prison...,1
5,5,jackie mason hollywood would love trump bombed...,Daniel Nussbaum,trying times jackie mason voice reason in week...,0
6,6,life life luxury elton johns favorite shark pi...,null data,ever wonder britains iconic pop pianist gets l...,1
7,7,benoît hamon wins french socialist partys pres...,Alissa J. Rubin,paris france chose idealistic traditional cand...,0
8,8,excerpts draft script donald trumps qampa blac...,null data,donaldtrump scheduled make highly anticipated ...,0
9,9,backchannel plan ukraine russia courtesy trump...,Megan Twohey and Scott Shane,week michaelflynn resigned national security a...,0


In [26]:
test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)
test_dataset.head(10)

Unnamed: 0,id,title,author,text
0,20800,Specter of Trump Loosens Tongues if Not Purse ...,David Streitfeld,PALO ALTO Calif After years of scorning the po...
1,20801,Russian warships ready to strike terrorists ne...,null data,Russian warships ready to strike terrorists ne...
2,20802,NoDAPL Native American Leaders Vow to Stay All...,Common Dreams,Videos NoDAPL Native American Leaders Vow to S...
3,20803,Tim Tebow Will Attempt Another Comeback This T...,Daniel Victor,If at first you dont succeed trydifferent spor...
4,20804,Keiser Report Meme Wars E,Truth Broadcast Network,mins ago Views Comments Likes For the first t...
5,20805,Trump is USAs antique hero Clinton will be nex...,null data,Trump is USAs antique hero Clinton will be nex...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,Sunday on NBCs Meet the Press House Minority L...
7,20807,Weekly Featured Profile Randy Shannon,Trevor Loudon,You are here Home Articles of the Bound Weekly...
8,20808,Urban Population Booms Will Make Climate Chang...,null data,Urban Population Booms Will Make Climate Chang...
9,20809,null data,cognitive dissident,dont we have the receipt


### Lemmatization and Stemming

In [27]:
if_lemmatize = True

In [30]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marneusz/nltk_data...


In [32]:
# train_dataset["title"] = train_dataset["title"].apply(tokenize_without_stopwords)
# train_dataset["text"] = train_dataset["text"].apply(tokenize_without_stopwords)
# train_dataset.head(10)

In [33]:
# test_dataset["title"] = test_dataset["title"].apply(tokenize_without_stopwords)
# test_dataset["text"] = test_dataset["text"].apply(tokenize_without_stopwords)
# test_dataset.head(10)

In [34]:
# train_text = train_dataset['text'].values
train_text = (train_dataset['title'] + " " + train_dataset['text']).values
test_text = (test_dataset['title'] + " " + test_dataset['text']).values

In [35]:
labels = train_dataset['label'].values

In the Kaggle competition the best scores were obtained by using only 'author' and 'title' features. Let's take a look if it's possible to train BERT using text.

# Model Initialization

In [36]:
from tqdm import tqdm

In [37]:
import torch
if torch.cuda.is_available():    
    device = torch.device('cuda')    
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [38]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [39]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score

In [40]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

In [41]:
print(' Original: ', train_text[0])
print('Tokenized: ', tokenizer.tokenize(train_text[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text[0])))

 Original:  house dem aide even see comeys letter jason chaffetz tweeted house dem aide even see comeys letter jason chaffetz tweeted darrell lucus october subscribe jason chaffetz stump american fork utah image courtesy michael jolley available creative commonsby license apology keith olbermann doubt worst person world weekfbi director james comey according house democratic aide look like also know secondworst person well turn comey sent nowinfamous letter announcing fbi looking email may related hillary clinton email server ranking democrat relevant committee hear comey found via tweet one republican committee chairman know comey notified republican chairman democratic ranking member house intelligence judiciary oversight committee agency reviewing email recently discovered order see contained classified information long letter went out oversight committee chairman jason chaffetz set political world ablaze tweet fbi dir informed me the fbi learned existence email appear pertinent inv

In [42]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', # Use the 124-layer, 1024-hidden, 16-heads, 340M parameters BERT model with an uncased vocab.
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [43]:
max_len = 0
len_limit = 512
LIMIT = 100_000

indices = []
train_text_filtered = []

for i, text in enumerate(tqdm(train_text)):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
    if len(input_ids) <= LIMIT:
        train_text_filtered.append(text)
        indices.append(i)
print('Max sentence length: ', max_len)

  0%|                                                                                                                                    | 0/20800 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20800/20800 [01:18<00:00, 265.23it/s]

Max sentence length:  113463





In [44]:
labels_filtered = labels[indices]
labels_filtered.shape

(20799,)

In [45]:
# https://www.kaggle.com/code/jeongwonkim10516/nlp-fake-news-with-bert-99-55-top1/notebook

def tokenize_map(sentence, labs='None'):
    
    """A function for tokenize all of the sentences and map the tokens to their word IDs."""
    
    global labels
    
    input_ids = []
    attention_masks = []

    # For every sentence...
    
    for text in tqdm(sentence):
        #   "encode_plus" will:
        
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            truncation='longest_first', # Activate and control truncation
                            max_length = len_limit,           # Max length according to our text data.
                            padding = 'max_length', # Pad & truncate all sentences.
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list. 
        
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        
        attention_masks.append(encoded_dict['attention_mask'])
        
    # Convert the lists into tensors.
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.
        labels = torch.tensor(labels)
        return input_ids, attention_masks, labels
    
    return input_ids, attention_masks

In [46]:
train_text_filtered = np.array(train_text_filtered)

In [47]:
train_text.shape, train_text_filtered.shape,

((20800,), (20799,))

In [48]:
input_ids, attention_masks, labels_filtered = tokenize_map(train_text_filtered, labels_filtered)
# test_input_ids, test_attention_masks= tokenize_map(test_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20799/20799 [01:21<00:00, 254.28it/s]
  if labs != 'None': # Setting this for using this definition for both train and test data so labels won't be a problem in our outputs.


## Train and Validation Dataset

In [49]:
import transformers

In [50]:
seed = 10
transformers.set_seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fb24c116850>

In [51]:
type(attention_masks)

torch.Tensor

In [52]:
labels_filtered.shape, input_ids.shape

(torch.Size([20800]), torch.Size([20799, 512]))

In [53]:
labels_filtered = labels[indices]
labels_filtered.shape

torch.Size([20799])

In [54]:
dataset = TensorDataset(input_ids, attention_masks, labels_filtered)

In [55]:
torch.manual_seed(seed)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_size, val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [56]:
# Train DataLoader
batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

# Validation DataLoader
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
)

In [57]:
# # Test DataLoader

# test_data = TensorDataset(test_input_ids, test_attention_masks)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## Hyperparameters

In [58]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon
            )

In [59]:
num_epochs = 5
total_num_steps = len(train_dataloader) * num_epochs

## Metrics

In [60]:
def accuracy(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return accuracy_score(labels_flat, pred_flat)

def flat_f1_score(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = labels.flatten()
    

    return f1_score(labels_flat, pred_flat, zero_division=0)

## Training

In [61]:
import time

In [62]:
training_stats = []
validations_labels_ep = []
actual_labels_ep = []

total_t0 = time.time()
for i in range(0, num_epochs):
    print('')
    print('Training...')
    print('----- Epoch {:} / {:} -----'.format(i + 1, num_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 50 == 0 and not step == 0:
            elapsed = time.time() - t0
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        

        loss = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     labels=b_labels)[0]
        logits = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                       labels=b_labels)[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = time.time() - t0

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoc h took: {:}'.format(training_time))
    
    print('')
    print('Running Validation...')

    t0 = time.time()
    
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            loss = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[0]

            logits = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)[1]
            
        # Accumulate the validation loss.
        
        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches:
        
        total_eval_accuracy += accuracy(logits, label_ids)
        total_eval_f1 += flat_f1_score(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Accuracy: {0:.5f}'.format(avg_val_accuracy))
    
    # Report the final f1 score for this validation run.
    
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print('  F1: {0:.5f}'.format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    
    
    # Measure how long the validation run took:
    
    validation_time = time.time() - t0
    
    print('  Validation Loss: {0:.5f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch.
    
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


Training...
----- Epoch 1 / 5 -----
  Batch    50  of  4,160.    Elapsed: 18.190900802612305.
  Batch   100  of  4,160.    Elapsed: 35.57329249382019.
  Batch   150  of  4,160.    Elapsed: 52.82647728919983.
  Batch   200  of  4,160.    Elapsed: 70.12784576416016.
  Batch   250  of  4,160.    Elapsed: 87.47257828712463.
  Batch   300  of  4,160.    Elapsed: 104.73115277290344.
  Batch   350  of  4,160.    Elapsed: 122.05042028427124.
  Batch   400  of  4,160.    Elapsed: 139.36284804344177.
  Batch   450  of  4,160.    Elapsed: 156.6610827445984.
  Batch   500  of  4,160.    Elapsed: 174.0032639503479.
  Batch   550  of  4,160.    Elapsed: 191.36982226371765.
  Batch   600  of  4,160.    Elapsed: 208.6692774295807.
  Batch   650  of  4,160.    Elapsed: 226.00262069702148.
  Batch   700  of  4,160.    Elapsed: 243.2904953956604.
  Batch   750  of  4,160.    Elapsed: 260.5849645137787.
  Batch   800  of  4,160.    Elapsed: 277.8481729030609.
  Batch   850  of  4,160.    Elapsed: 295.218

  Batch 2,850  of  4,160.    Elapsed: 1049.5795638561249.
  Batch 2,900  of  4,160.    Elapsed: 1067.3617701530457.
  Batch 2,950  of  4,160.    Elapsed: 1085.8282976150513.
  Batch 3,000  of  4,160.    Elapsed: 1105.411389350891.
  Batch 3,050  of  4,160.    Elapsed: 1123.8294179439545.
  Batch 3,100  of  4,160.    Elapsed: 1142.4479703903198.
  Batch 3,150  of  4,160.    Elapsed: 1161.8522758483887.
  Batch 3,200  of  4,160.    Elapsed: 1179.780639886856.
  Batch 3,250  of  4,160.    Elapsed: 1197.7894141674042.
  Batch 3,300  of  4,160.    Elapsed: 1217.3441450595856.
  Batch 3,350  of  4,160.    Elapsed: 1236.9851546287537.
  Batch 3,400  of  4,160.    Elapsed: 1256.1564092636108.
  Batch 3,450  of  4,160.    Elapsed: 1275.4608218669891.
  Batch 3,500  of  4,160.    Elapsed: 1293.1530985832214.
  Batch 3,550  of  4,160.    Elapsed: 1310.5729389190674.
  Batch 3,600  of  4,160.    Elapsed: 1328.1260006427765.
  Batch 3,650  of  4,160.    Elapsed: 1345.5357766151428.
  Batch 3,700  o

  Batch 1,300  of  4,160.    Elapsed: 484.4105224609375.
  Batch 1,350  of  4,160.    Elapsed: 503.1155822277069.
  Batch 1,400  of  4,160.    Elapsed: 522.0715494155884.
  Batch 1,450  of  4,160.    Elapsed: 540.4148154258728.
  Batch 1,500  of  4,160.    Elapsed: 559.7886364459991.
  Batch 1,550  of  4,160.    Elapsed: 578.3877775669098.
  Batch 1,600  of  4,160.    Elapsed: 597.8031077384949.
  Batch 1,650  of  4,160.    Elapsed: 617.293134689331.
  Batch 1,700  of  4,160.    Elapsed: 635.2435967922211.
  Batch 1,750  of  4,160.    Elapsed: 653.6979978084564.
  Batch 1,800  of  4,160.    Elapsed: 672.6571073532104.
  Batch 1,850  of  4,160.    Elapsed: 690.9754354953766.
  Batch 1,900  of  4,160.    Elapsed: 709.8436243534088.
  Batch 1,950  of  4,160.    Elapsed: 727.242707490921.
  Batch 2,000  of  4,160.    Elapsed: 746.8545088768005.
  Batch 2,050  of  4,160.    Elapsed: 766.3234102725983.
  Batch 2,100  of  4,160.    Elapsed: 784.6837890148163.
  Batch 2,150  of  4,160.    Elap

  Batch 4,100  of  4,160.    Elapsed: 1424.5584454536438.
  Batch 4,150  of  4,160.    Elapsed: 1441.8161218166351.

  Average training loss: 0.02
  Training epoc h took: 1445.2066023349762

Running Validation...
  Accuracy: 0.99375
  F1: 0.94318
  Validation Loss: 0.04326
  Validation took: 153.01307678222656


In [63]:
model.save_pretrained("./models/bert_regexp_stopwords_lemmatization")