In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from nltk.corpus import stopwords
import string
import re
from bs4 import BeautifulSoup
import torch
import nltk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/emil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df_fake = pd.read_csv("input_data/Fake.csv")
df_true = pd.read_csv("input_data/True.csv")

In [4]:
df_fake["label"] = 0
df_true["label"] = 1

In [5]:
# Removing last 10 rows for manual testing
df_fake_manual_testing = df_fake.tail(50)
for i in range(23480,23430,-1):
    df_fake.drop([i], axis = 0, inplace = True)
    
    
df_true_manual_testing = df_true.tail(50)
for i in range(21416,21366,-1):
    df_true.drop([i], axis = 0, inplace = True)

In [6]:
df_fake_manual_testing["label"] = 0
df_true_manual_testing["label"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fake_manual_testing["label"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true_manual_testing["label"] = 1


In [7]:
df_manual_testing = pd.concat([df_fake_manual_testing,df_true_manual_testing], axis = 0)
df_manual_testing.to_csv("manual_testing.csv")

In [8]:
df = pd.concat([df_fake, df_true], axis =0 )

In [9]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [10]:
len(df)

44798

In [11]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [12]:
df.dropna(inplace=True)

In [13]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [14]:
len(df)

44798

In [16]:
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

def preprocess_text(text):

    def remove_reuters_prefix(text):
        pattern = r'^[\s\S]*?\(reuters\) - '
        return re.sub(pattern, '', text)

    def strip_html(text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    def remove_square_brackets(text):
        return re.sub('\[[^]]*\]', '', text)

    def remove_urls(text):
        return re.sub(r'http\S+', '', text)

    def remove_stopwords(text):
        final_text = []
        for i in text.split():
            if i.strip().lower() not in stop:
                final_text.append(i.strip())
        return " ".join(final_text)

    text = text.lower()
    text = remove_reuters_prefix(text)
    text = strip_html(text)
    text = remove_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)

    return text


In [17]:
df['text'] = df['text'].apply(preprocess_text)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [18]:
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [19]:
df = df.filter(items=['text', 'label'])

In [20]:
df.head()

Unnamed: 0,text,label
0,donald trump wish americans happy new year lea...,0
1,house intelligence committee chairman devin nu...,0
2,"friday, revealed former milwaukee sheriff davi...",0
3,"christmas day, donald trump announced would ba...",0
4,pope francis used annual christmas day message...,0


In [21]:
df = df.sample(frac = 1)
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [22]:
df.head()

Unnamed: 0,text,label
0,black protesters white police would killed sec...,0
1,colombia armed forces authorized launch air ra...,1
2,great idea! sheriff joe arpaio considering run...,0
3,share everyone! good reason (with exception ob...,0
4,opponents proposal create u.s. border tax impo...,1


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
def tokenize_function(data):
    return tokenizer(data['text'], padding="max_length", truncation=True)

In [24]:
dataset = Dataset.from_pandas(df)

In [25]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 44798/44798 [02:56<00:00, 254.28 examples/s]


In [26]:
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [27]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [28]:
device

'cuda'

In [29]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [31]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [32]:
training_args = TrainingArguments(
    output_dir='./results',  # Directory for saving model checkpoints and other results
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=50,  # Log every 50 steps
    save_total_limit=3,  # Limit the total amount of checkpoints
    save_steps=2_000,  # Save checkpoint every 2,000 steps
    evaluation_strategy="steps",  # Evaluation strategy to use
    eval_steps=2_000,  # Evaluate every 2,000 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="f1",  # Use F1 score to determine the best model
)



In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

                                                      
 15%|█▍        | 2000/13440 [27:08<2:18:30,  1.38it/s]

{'eval_loss': 0.011703373864293098, 'eval_accuracy': 0.9969866071428571, 'eval_precision': 0.9969869197785193, 'eval_recall': 0.9969866071428571, 'eval_f1': 0.9969866514822618, 'eval_runtime': 235.8809, 'eval_samples_per_second': 37.985, 'eval_steps_per_second': 4.748, 'epoch': 0.45}


 15%|█▌        | 2050/13440 [27:46<2:18:53,  1.37it/s]  

{'loss': 0.0266, 'grad_norm': 0.04126088693737984, 'learning_rate': 4.401081916537868e-05, 'epoch': 0.46}


 16%|█▌        | 2100/13440 [28:23<2:19:07,  1.36it/s]

{'loss': 0.0503, 'grad_norm': 0.01036617998033762, 'learning_rate': 4.381761978361669e-05, 'epoch': 0.47}


 16%|█▌        | 2150/13440 [29:00<2:22:45,  1.32it/s]

{'loss': 0.0343, 'grad_norm': 0.00663467962294817, 'learning_rate': 4.3624420401854714e-05, 'epoch': 0.48}


 16%|█▋        | 2200/13440 [29:37<2:16:26,  1.37it/s]

{'loss': 0.0342, 'grad_norm': 0.011663831770420074, 'learning_rate': 4.3431221020092735e-05, 'epoch': 0.49}


 17%|█▋        | 2250/13440 [30:13<2:10:38,  1.43it/s]

{'loss': 0.0666, 'grad_norm': 0.007911188527941704, 'learning_rate': 4.3238021638330764e-05, 'epoch': 0.5}


 17%|█▋        | 2300/13440 [30:48<2:06:13,  1.47it/s]

{'loss': 0.0256, 'grad_norm': 0.00293835811316967, 'learning_rate': 4.304482225656878e-05, 'epoch': 0.51}


 17%|█▋        | 2350/13440 [31:22<2:07:10,  1.45it/s]

{'loss': 0.0001, 'grad_norm': 0.001901215990073979, 'learning_rate': 4.28516228748068e-05, 'epoch': 0.52}


 18%|█▊        | 2400/13440 [31:57<2:07:02,  1.45it/s]

{'loss': 0.0264, 'grad_norm': 1160.1175537109375, 'learning_rate': 4.265842349304482e-05, 'epoch': 0.54}


 18%|█▊        | 2450/13440 [32:32<2:10:32,  1.40it/s]

{'loss': 0.0579, 'grad_norm': 0.006658140569925308, 'learning_rate': 4.246522411128285e-05, 'epoch': 0.55}


 19%|█▊        | 2500/13440 [33:06<2:03:08,  1.48it/s]

{'loss': 0.0601, 'grad_norm': 0.00860655028373003, 'learning_rate': 4.2272024729520865e-05, 'epoch': 0.56}


 19%|█▉        | 2550/13440 [33:41<2:07:18,  1.43it/s]

{'loss': 0.0163, 'grad_norm': 0.0047186920419335365, 'learning_rate': 4.2078825347758886e-05, 'epoch': 0.57}


 19%|█▉        | 2600/13440 [34:16<2:04:56,  1.45it/s]

{'loss': 0.0382, 'grad_norm': 0.0037789372727274895, 'learning_rate': 4.188562596599691e-05, 'epoch': 0.58}


 20%|█▉        | 2650/13440 [34:50<2:04:19,  1.45it/s]

{'loss': 0.0332, 'grad_norm': 0.002206839621067047, 'learning_rate': 4.1692426584234936e-05, 'epoch': 0.59}


 20%|██        | 2700/13440 [35:25<2:02:54,  1.46it/s]

{'loss': 0.0227, 'grad_norm': 3.453540325164795, 'learning_rate': 4.149922720247296e-05, 'epoch': 0.6}


 20%|██        | 2750/13440 [35:59<2:07:11,  1.40it/s]

{'loss': 0.0737, 'grad_norm': 0.029167981818318367, 'learning_rate': 4.130602782071097e-05, 'epoch': 0.61}


 21%|██        | 2800/13440 [36:32<1:51:37,  1.59it/s]

{'loss': 0.0331, 'grad_norm': 0.01474771834909916, 'learning_rate': 4.1112828438948994e-05, 'epoch': 0.62}


 21%|██        | 2850/13440 [37:04<1:51:27,  1.58it/s]

{'loss': 0.032, 'grad_norm': 0.014380394481122494, 'learning_rate': 4.091962905718702e-05, 'epoch': 0.64}


 22%|██▏       | 2900/13440 [37:36<1:50:12,  1.59it/s]

{'loss': 0.0219, 'grad_norm': 0.006894794758409262, 'learning_rate': 4.0726429675425044e-05, 'epoch': 0.65}


 22%|██▏       | 2950/13440 [38:10<2:02:36,  1.43it/s]

{'loss': 0.0374, 'grad_norm': 0.011370779015123844, 'learning_rate': 4.053323029366306e-05, 'epoch': 0.66}


 22%|██▏       | 3000/13440 [38:42<1:57:06,  1.49it/s]

{'loss': 0.0661, 'grad_norm': 0.0701204165816307, 'learning_rate': 4.034003091190108e-05, 'epoch': 0.67}


 23%|██▎       | 3050/13440 [39:18<2:02:16,  1.42it/s]

{'loss': 0.0294, 'grad_norm': 0.011096499860286713, 'learning_rate': 4.014683153013911e-05, 'epoch': 0.68}


 23%|██▎       | 3100/13440 [39:55<2:06:49,  1.36it/s]

{'loss': 0.0065, 'grad_norm': 0.005870483350008726, 'learning_rate': 3.995363214837713e-05, 'epoch': 0.69}


 23%|██▎       | 3150/13440 [40:29<1:52:00,  1.53it/s]

{'loss': 0.0382, 'grad_norm': 0.0057111987844109535, 'learning_rate': 3.9760432766615145e-05, 'epoch': 0.7}


 24%|██▍       | 3200/13440 [41:04<2:03:01,  1.39it/s]

{'loss': 0.0685, 'grad_norm': 0.025007987394928932, 'learning_rate': 3.956723338485317e-05, 'epoch': 0.71}


 24%|██▍       | 3250/13440 [41:41<2:06:49,  1.34it/s]

{'loss': 0.003, 'grad_norm': 0.004142228048294783, 'learning_rate': 3.9374034003091195e-05, 'epoch': 0.73}


 25%|██▍       | 3300/13440 [42:18<2:06:58,  1.33it/s]

{'loss': 0.0364, 'grad_norm': 0.004840391222387552, 'learning_rate': 3.9180834621329217e-05, 'epoch': 0.74}


 25%|██▍       | 3350/13440 [42:55<2:05:32,  1.34it/s]

{'loss': 0.0196, 'grad_norm': 0.033741455525159836, 'learning_rate': 3.898763523956723e-05, 'epoch': 0.75}


 25%|██▌       | 3400/13440 [43:32<2:04:06,  1.35it/s]

{'loss': 0.0124, 'grad_norm': 0.07082712650299072, 'learning_rate': 3.879443585780525e-05, 'epoch': 0.76}


 26%|██▌       | 3450/13440 [44:09<2:04:55,  1.33it/s]

{'loss': 0.0469, 'grad_norm': 0.002446320140734315, 'learning_rate': 3.860123647604328e-05, 'epoch': 0.77}


 26%|██▌       | 3500/13440 [44:45<2:02:49,  1.35it/s]

{'loss': 0.0094, 'grad_norm': 0.001972583355382085, 'learning_rate': 3.84080370942813e-05, 'epoch': 0.78}


 26%|██▋       | 3550/13440 [45:22<2:00:52,  1.36it/s]

{'loss': 0.0009, 'grad_norm': 0.011396575719118118, 'learning_rate': 3.821483771251932e-05, 'epoch': 0.79}


 27%|██▋       | 3600/13440 [45:58<1:56:46,  1.40it/s]

{'loss': 0.0874, 'grad_norm': 0.005708309821784496, 'learning_rate': 3.802163833075734e-05, 'epoch': 0.8}


 27%|██▋       | 3650/13440 [46:34<2:00:43,  1.35it/s]

{'loss': 0.0019, 'grad_norm': 0.003343229880556464, 'learning_rate': 3.782843894899537e-05, 'epoch': 0.81}


 28%|██▊       | 3700/13440 [47:11<1:57:38,  1.38it/s]

{'loss': 0.0427, 'grad_norm': 0.01296034175902605, 'learning_rate': 3.763523956723339e-05, 'epoch': 0.83}


 28%|██▊       | 3750/13440 [47:48<2:01:27,  1.33it/s]

{'loss': 0.0146, 'grad_norm': 0.006794216576963663, 'learning_rate': 3.744204018547141e-05, 'epoch': 0.84}


 28%|██▊       | 3800/13440 [48:25<2:00:04,  1.34it/s]

{'loss': 0.0382, 'grad_norm': 0.006701276171952486, 'learning_rate': 3.7248840803709425e-05, 'epoch': 0.85}


 29%|██▊       | 3850/13440 [49:02<1:58:45,  1.35it/s]

{'loss': 0.0421, 'grad_norm': 0.032422393560409546, 'learning_rate': 3.7055641421947454e-05, 'epoch': 0.86}


 29%|██▉       | 3900/13440 [49:39<1:55:34,  1.38it/s]

{'loss': 0.0158, 'grad_norm': 0.003955509513616562, 'learning_rate': 3.6862442040185475e-05, 'epoch': 0.87}


 29%|██▉       | 3950/13440 [50:16<1:57:47,  1.34it/s]

{'loss': 0.0246, 'grad_norm': 0.011270631104707718, 'learning_rate': 3.66692426584235e-05, 'epoch': 0.88}


 30%|██▉       | 4000/13440 [50:54<1:57:38,  1.34it/s]

{'loss': 0.0205, 'grad_norm': 0.003182325279340148, 'learning_rate': 3.647604327666151e-05, 'epoch': 0.89}


                                                      
 30%|██▉       | 4000/13440 [54:50<1:57:38,  1.34it/s]

{'eval_loss': 0.00838431715965271, 'eval_accuracy': 0.9986607142857142, 'eval_precision': 0.998661165510803, 'eval_recall': 0.9986607142857142, 'eval_f1': 0.9986607404938024, 'eval_runtime': 235.8643, 'eval_samples_per_second': 37.988, 'eval_steps_per_second': 4.748, 'epoch': 0.89}


 30%|███       | 4050/13440 [55:28<1:53:14,  1.38it/s]  

{'loss': 0.0447, 'grad_norm': 0.004140955395996571, 'learning_rate': 3.628284389489954e-05, 'epoch': 0.9}


 31%|███       | 4100/13440 [56:04<1:54:00,  1.37it/s]

{'loss': 0.0338, 'grad_norm': 8.40139389038086, 'learning_rate': 3.608964451313756e-05, 'epoch': 0.92}


 31%|███       | 4150/13440 [56:42<1:58:28,  1.31it/s]

{'loss': 0.0106, 'grad_norm': 0.00241656182333827, 'learning_rate': 3.589644513137558e-05, 'epoch': 0.93}


 31%|███▏      | 4200/13440 [57:20<1:52:42,  1.37it/s]

{'loss': 0.0184, 'grad_norm': 0.02831336483359337, 'learning_rate': 3.57032457496136e-05, 'epoch': 0.94}


 32%|███▏      | 4250/13440 [57:57<1:49:17,  1.40it/s]

{'loss': 0.0305, 'grad_norm': 0.019439697265625, 'learning_rate': 3.5510046367851626e-05, 'epoch': 0.95}


 32%|███▏      | 4300/13440 [58:33<1:48:10,  1.41it/s]

{'loss': 0.052, 'grad_norm': 0.04795429855585098, 'learning_rate': 3.531684698608965e-05, 'epoch': 0.96}


 32%|███▏      | 4350/13440 [59:09<1:47:40,  1.41it/s]

{'loss': 0.0327, 'grad_norm': 0.016046244651079178, 'learning_rate': 3.512364760432767e-05, 'epoch': 0.97}


 33%|███▎      | 4400/13440 [59:44<1:47:21,  1.40it/s]

{'loss': 0.0196, 'grad_norm': 0.020063884556293488, 'learning_rate': 3.4930448222565684e-05, 'epoch': 0.98}


 33%|███▎      | 4450/13440 [1:00:20<1:47:06,  1.40it/s]

{'loss': 0.0163, 'grad_norm': 0.0068003456108272076, 'learning_rate': 3.473724884080371e-05, 'epoch': 0.99}


 33%|███▎      | 4500/13440 [1:00:56<1:49:05,  1.37it/s]

{'loss': 0.0033, 'grad_norm': 0.004684485495090485, 'learning_rate': 3.4544049459041734e-05, 'epoch': 1.0}


 34%|███▍      | 4550/13440 [1:01:33<1:48:01,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0035074034240096807, 'learning_rate': 3.4350850077279756e-05, 'epoch': 1.02}


 34%|███▍      | 4600/13440 [1:02:09<1:51:09,  1.33it/s]

{'loss': 0.019, 'grad_norm': 0.008328988216817379, 'learning_rate': 3.415765069551777e-05, 'epoch': 1.03}


 35%|███▍      | 4650/13440 [1:02:46<1:46:37,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0021906367037445307, 'learning_rate': 3.39644513137558e-05, 'epoch': 1.04}


 35%|███▍      | 4700/13440 [1:03:23<1:48:10,  1.35it/s]

{'loss': 0.0001, 'grad_norm': 0.0013431616825982928, 'learning_rate': 3.377125193199382e-05, 'epoch': 1.05}


 35%|███▌      | 4750/13440 [1:03:59<1:46:20,  1.36it/s]

{'loss': 0.0001, 'grad_norm': 0.0017265656497329473, 'learning_rate': 3.357805255023184e-05, 'epoch': 1.06}


 36%|███▌      | 4800/13440 [1:04:36<1:45:19,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0013180042151361704, 'learning_rate': 3.3384853168469863e-05, 'epoch': 1.07}


 36%|███▌      | 4850/13440 [1:05:12<1:46:00,  1.35it/s]

{'loss': 0.0, 'grad_norm': 0.0011092444183304906, 'learning_rate': 3.3191653786707885e-05, 'epoch': 1.08}


 36%|███▋      | 4900/13440 [1:05:49<1:44:19,  1.36it/s]

{'loss': 0.0, 'grad_norm': 0.0007945583201944828, 'learning_rate': 3.2998454404945907e-05, 'epoch': 1.09}


 37%|███▋      | 4950/13440 [1:06:25<1:43:30,  1.37it/s]

{'loss': 0.0401, 'grad_norm': 0.0016780684236437082, 'learning_rate': 3.280525502318393e-05, 'epoch': 1.1}


 37%|███▋      | 5000/13440 [1:07:01<1:43:03,  1.36it/s]

{'loss': 0.0361, 'grad_norm': 0.010490736924111843, 'learning_rate': 3.261205564142195e-05, 'epoch': 1.12}


 38%|███▊      | 5050/13440 [1:07:38<1:41:18,  1.38it/s]

{'loss': 0.0414, 'grad_norm': 0.004964008461683989, 'learning_rate': 3.241885625965997e-05, 'epoch': 1.13}


 38%|███▊      | 5100/13440 [1:08:15<1:42:21,  1.36it/s]

{'loss': 0.0003, 'grad_norm': 0.006159558426588774, 'learning_rate': 3.222565687789799e-05, 'epoch': 1.14}


 38%|███▊      | 5150/13440 [1:08:52<1:42:52,  1.34it/s]

{'loss': 0.0167, 'grad_norm': 151.51600646972656, 'learning_rate': 3.2032457496136014e-05, 'epoch': 1.15}


 39%|███▊      | 5200/13440 [1:09:29<1:41:43,  1.35it/s]

{'loss': 0.017, 'grad_norm': 0.003202777821570635, 'learning_rate': 3.1839258114374036e-05, 'epoch': 1.16}


 39%|███▉      | 5250/13440 [1:10:05<1:38:37,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.002407362451776862, 'learning_rate': 3.164605873261206e-05, 'epoch': 1.17}


 39%|███▉      | 5300/13440 [1:10:41<1:38:38,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.0019770548678934574, 'learning_rate': 3.145285935085008e-05, 'epoch': 1.18}


 40%|███▉      | 5350/13440 [1:11:16<1:31:22,  1.48it/s]

{'loss': 0.0001, 'grad_norm': 0.0016414750134572387, 'learning_rate': 3.12596599690881e-05, 'epoch': 1.19}


 40%|████      | 5400/13440 [1:11:50<1:32:38,  1.45it/s]

{'loss': 0.046, 'grad_norm': 0.0022190490271896124, 'learning_rate': 3.106646058732612e-05, 'epoch': 1.21}


 41%|████      | 5450/13440 [1:12:25<1:35:35,  1.39it/s]

{'loss': 0.0001, 'grad_norm': 0.009225898422300816, 'learning_rate': 3.0873261205564144e-05, 'epoch': 1.22}


 41%|████      | 5500/13440 [1:13:01<1:36:22,  1.37it/s]

{'loss': 0.0239, 'grad_norm': 0.00698483781889081, 'learning_rate': 3.0680061823802165e-05, 'epoch': 1.23}


 41%|████▏     | 5550/13440 [1:13:37<1:33:58,  1.40it/s]

{'loss': 0.0001, 'grad_norm': 0.007882528938353062, 'learning_rate': 3.0486862442040187e-05, 'epoch': 1.24}


 42%|████▏     | 5600/13440 [1:14:13<1:35:24,  1.37it/s]

{'loss': 0.0214, 'grad_norm': 0.011859542690217495, 'learning_rate': 3.0293663060278212e-05, 'epoch': 1.25}


 42%|████▏     | 5650/13440 [1:14:50<1:34:32,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0020261616446077824, 'learning_rate': 3.0100463678516227e-05, 'epoch': 1.26}


 42%|████▏     | 5700/13440 [1:15:26<1:33:37,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.0013800404267385602, 'learning_rate': 2.990726429675425e-05, 'epoch': 1.27}


 43%|████▎     | 5750/13440 [1:16:02<1:32:36,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.0012907875934615731, 'learning_rate': 2.9714064914992273e-05, 'epoch': 1.28}


 43%|████▎     | 5800/13440 [1:16:38<1:31:59,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.0009089414379559457, 'learning_rate': 2.9520865533230298e-05, 'epoch': 1.29}


 44%|████▎     | 5850/13440 [1:17:13<1:31:08,  1.39it/s]

{'loss': 0.0311, 'grad_norm': 0.001074925297871232, 'learning_rate': 2.932766615146832e-05, 'epoch': 1.31}


 44%|████▍     | 5900/13440 [1:17:49<1:30:47,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.004112385679036379, 'learning_rate': 2.9134466769706338e-05, 'epoch': 1.32}


 44%|████▍     | 5950/13440 [1:18:25<1:32:04,  1.36it/s]

{'loss': 0.0, 'grad_norm': 0.0006637957994826138, 'learning_rate': 2.894126738794436e-05, 'epoch': 1.33}


 45%|████▍     | 6000/13440 [1:19:01<1:27:05,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0007188444142229855, 'learning_rate': 2.8748068006182384e-05, 'epoch': 1.34}


                                                        
 45%|████▍     | 6000/13440 [1:22:51<1:27:05,  1.42it/s]

{'eval_loss': 0.009827688336372375, 'eval_accuracy': 0.9985491071428572, 'eval_precision': 0.9985512518225274, 'eval_recall': 0.9985491071428572, 'eval_f1': 0.998549170207182, 'eval_runtime': 229.7173, 'eval_samples_per_second': 39.004, 'eval_steps_per_second': 4.876, 'epoch': 1.34}


 45%|████▌     | 6050/13440 [1:23:28<1:26:46,  1.42it/s]  

{'loss': 0.0168, 'grad_norm': 0.0006843885639682412, 'learning_rate': 2.8554868624420406e-05, 'epoch': 1.35}


 45%|████▌     | 6100/13440 [1:24:03<1:25:41,  1.43it/s]

{'loss': 0.0751, 'grad_norm': 0.005218070931732655, 'learning_rate': 2.8361669242658424e-05, 'epoch': 1.36}


 46%|████▌     | 6150/13440 [1:24:39<1:26:55,  1.40it/s]

{'loss': 0.0002, 'grad_norm': 0.01440390944480896, 'learning_rate': 2.8168469860896446e-05, 'epoch': 1.37}


 46%|████▌     | 6200/13440 [1:25:15<1:28:40,  1.36it/s]

{'loss': 0.0001, 'grad_norm': 0.0015132564585655928, 'learning_rate': 2.797527047913447e-05, 'epoch': 1.38}


 47%|████▋     | 6250/13440 [1:25:51<1:24:57,  1.41it/s]

{'loss': 0.0441, 'grad_norm': 0.01553453877568245, 'learning_rate': 2.7782071097372492e-05, 'epoch': 1.4}


 47%|████▋     | 6300/13440 [1:26:27<1:23:49,  1.42it/s]

{'loss': 0.0208, 'grad_norm': 0.011310472153127193, 'learning_rate': 2.758887171561051e-05, 'epoch': 1.41}


 47%|████▋     | 6350/13440 [1:27:02<1:27:29,  1.35it/s]

{'loss': 0.0348, 'grad_norm': 0.03468591719865799, 'learning_rate': 2.7395672333848532e-05, 'epoch': 1.42}


 48%|████▊     | 6400/13440 [1:27:38<1:22:52,  1.42it/s]

{'loss': 0.0004, 'grad_norm': 0.0019778255373239517, 'learning_rate': 2.7202472952086557e-05, 'epoch': 1.43}


 48%|████▊     | 6450/13440 [1:28:15<1:25:27,  1.36it/s]

{'loss': 0.0151, 'grad_norm': 0.001585328602232039, 'learning_rate': 2.700927357032458e-05, 'epoch': 1.44}


 48%|████▊     | 6500/13440 [1:28:51<1:23:25,  1.39it/s]

{'loss': 0.0001, 'grad_norm': 0.0015588682144880295, 'learning_rate': 2.6816074188562596e-05, 'epoch': 1.45}


 49%|████▊     | 6550/13440 [1:29:27<1:22:32,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.001398434629663825, 'learning_rate': 2.6622874806800618e-05, 'epoch': 1.46}


 49%|████▉     | 6600/13440 [1:30:03<1:20:27,  1.42it/s]

{'loss': 0.0537, 'grad_norm': 0.003005996346473694, 'learning_rate': 2.6429675425038643e-05, 'epoch': 1.47}


 49%|████▉     | 6650/13440 [1:30:38<1:21:26,  1.39it/s]

{'loss': 0.0091, 'grad_norm': 0.002766022691503167, 'learning_rate': 2.6236476043276665e-05, 'epoch': 1.48}


 50%|████▉     | 6700/13440 [1:31:14<1:19:27,  1.41it/s]

{'loss': 0.0332, 'grad_norm': 0.008017952553927898, 'learning_rate': 2.6043276661514683e-05, 'epoch': 1.5}


 50%|█████     | 6750/13440 [1:31:50<1:20:31,  1.38it/s]

{'loss': 0.02, 'grad_norm': 0.004357119556516409, 'learning_rate': 2.5850077279752704e-05, 'epoch': 1.51}


 51%|█████     | 6800/13440 [1:32:26<1:19:54,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.002452077576890588, 'learning_rate': 2.565687789799073e-05, 'epoch': 1.52}


 51%|█████     | 6850/13440 [1:33:01<1:17:25,  1.42it/s]

{'loss': 0.0001, 'grad_norm': 0.0014945589937269688, 'learning_rate': 2.546367851622875e-05, 'epoch': 1.53}


 51%|█████▏    | 6900/13440 [1:33:37<1:14:16,  1.47it/s]

{'loss': 0.0001, 'grad_norm': 0.005023638252168894, 'learning_rate': 2.5270479134466772e-05, 'epoch': 1.54}


 52%|█████▏    | 6950/13440 [1:34:13<1:19:18,  1.36it/s]

{'loss': 0.0285, 'grad_norm': 75.24444580078125, 'learning_rate': 2.507727975270479e-05, 'epoch': 1.55}


 52%|█████▏    | 7000/13440 [1:34:50<1:19:38,  1.35it/s]

{'loss': 0.0001, 'grad_norm': 0.0022704198490828276, 'learning_rate': 2.4884080370942815e-05, 'epoch': 1.56}


 52%|█████▏    | 7050/13440 [1:35:27<1:19:40,  1.34it/s]

{'loss': 0.0, 'grad_norm': 0.0010087962727993727, 'learning_rate': 2.4690880989180837e-05, 'epoch': 1.57}


 53%|█████▎    | 7100/13440 [1:36:04<1:16:27,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.0007063655648380518, 'learning_rate': 2.449768160741886e-05, 'epoch': 1.58}


 53%|█████▎    | 7150/13440 [1:36:40<1:19:46,  1.31it/s]

{'loss': 0.0, 'grad_norm': 0.0007690652273595333, 'learning_rate': 2.430448222565688e-05, 'epoch': 1.6}


 54%|█████▎    | 7200/13440 [1:37:16<1:13:33,  1.41it/s]

{'loss': 0.0135, 'grad_norm': 0.0009114729473367333, 'learning_rate': 2.4111282843894902e-05, 'epoch': 1.61}


 54%|█████▍    | 7250/13440 [1:37:52<1:13:16,  1.41it/s]

{'loss': 0.0483, 'grad_norm': 0.002124591264873743, 'learning_rate': 2.3918083462132923e-05, 'epoch': 1.62}


 54%|█████▍    | 7300/13440 [1:38:28<1:14:44,  1.37it/s]

{'loss': 0.0174, 'grad_norm': 0.003746192902326584, 'learning_rate': 2.3724884080370945e-05, 'epoch': 1.63}


 55%|█████▍    | 7350/13440 [1:39:05<1:14:31,  1.36it/s]

{'loss': 0.0508, 'grad_norm': 0.015553801320493221, 'learning_rate': 2.3531684698608966e-05, 'epoch': 1.64}


 55%|█████▌    | 7400/13440 [1:39:41<1:13:12,  1.37it/s]

{'loss': 0.0003, 'grad_norm': 0.004322574008256197, 'learning_rate': 2.3338485316846988e-05, 'epoch': 1.65}


 55%|█████▌    | 7450/13440 [1:40:17<1:12:59,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0023547187447547913, 'learning_rate': 2.314528593508501e-05, 'epoch': 1.66}


 56%|█████▌    | 7500/13440 [1:40:53<1:10:03,  1.41it/s]

{'loss': 0.0001, 'grad_norm': 0.002604213310405612, 'learning_rate': 2.295208655332303e-05, 'epoch': 1.67}


 56%|█████▌    | 7550/13440 [1:41:28<1:10:53,  1.38it/s]

{'loss': 0.0001, 'grad_norm': 0.002916746074333787, 'learning_rate': 2.2758887171561053e-05, 'epoch': 1.69}


 57%|█████▋    | 7600/13440 [1:42:04<1:07:25,  1.44it/s]

{'loss': 0.0185, 'grad_norm': 0.0010850856779143214, 'learning_rate': 2.2565687789799074e-05, 'epoch': 1.7}


 57%|█████▋    | 7650/13440 [1:42:40<1:09:41,  1.38it/s]

{'loss': 0.0258, 'grad_norm': 0.014043807983398438, 'learning_rate': 2.2372488408037096e-05, 'epoch': 1.71}


 57%|█████▋    | 7700/13440 [1:43:16<1:08:52,  1.39it/s]

{'loss': 0.0003, 'grad_norm': 0.004582550376653671, 'learning_rate': 2.2179289026275117e-05, 'epoch': 1.72}


 58%|█████▊    | 7750/13440 [1:43:52<1:08:11,  1.39it/s]

{'loss': 0.0211, 'grad_norm': 0.004590483382344246, 'learning_rate': 2.198608964451314e-05, 'epoch': 1.73}


 58%|█████▊    | 7800/13440 [1:44:27<1:05:59,  1.42it/s]

{'loss': 0.0001, 'grad_norm': 0.0021655368618667126, 'learning_rate': 2.179289026275116e-05, 'epoch': 1.74}


 58%|█████▊    | 7850/13440 [1:45:02<1:05:21,  1.43it/s]

{'loss': 0.0163, 'grad_norm': 0.0077921380288898945, 'learning_rate': 2.1599690880989182e-05, 'epoch': 1.75}


 59%|█████▉    | 7900/13440 [1:45:38<1:06:57,  1.38it/s]

{'loss': 0.0002, 'grad_norm': 0.0147707499563694, 'learning_rate': 2.1406491499227204e-05, 'epoch': 1.76}


 59%|█████▉    | 7950/13440 [1:46:14<1:07:13,  1.36it/s]

{'loss': 0.0175, 'grad_norm': 0.0020233753602951765, 'learning_rate': 2.1213292117465225e-05, 'epoch': 1.77}


 60%|█████▉    | 8000/13440 [1:46:51<1:07:35,  1.34it/s]

{'loss': 0.0026, 'grad_norm': 0.00811986904591322, 'learning_rate': 2.1020092735703247e-05, 'epoch': 1.79}


                                                        
 60%|█████▉    | 8000/13440 [1:50:43<1:07:35,  1.34it/s]

{'eval_loss': 0.011392113752663136, 'eval_accuracy': 0.9981026785714285, 'eval_precision': 0.9981095485318726, 'eval_recall': 0.9981026785714285, 'eval_f1': 0.9981025122885386, 'eval_runtime': 231.8203, 'eval_samples_per_second': 38.651, 'eval_steps_per_second': 4.831, 'epoch': 1.79}


 60%|█████▉    | 8050/13440 [1:51:22<1:05:27,  1.37it/s]  

{'loss': 0.0149, 'grad_norm': 0.0016723948065191507, 'learning_rate': 2.0826893353941268e-05, 'epoch': 1.8}


 60%|██████    | 8100/13440 [1:51:58<1:05:41,  1.35it/s]

{'loss': 0.0213, 'grad_norm': 0.003087674966081977, 'learning_rate': 2.063369397217929e-05, 'epoch': 1.81}


 61%|██████    | 8150/13440 [1:52:34<1:05:29,  1.35it/s]

{'loss': 0.0001, 'grad_norm': 0.0013052449794486165, 'learning_rate': 2.044049459041731e-05, 'epoch': 1.82}


 61%|██████    | 8200/13440 [1:53:11<1:05:43,  1.33it/s]

{'loss': 0.0545, 'grad_norm': 0.006218386348336935, 'learning_rate': 2.0247295208655333e-05, 'epoch': 1.83}


 61%|██████▏   | 8250/13440 [1:53:48<1:04:28,  1.34it/s]

{'loss': 0.006, 'grad_norm': 0.0012042834423482418, 'learning_rate': 2.0054095826893355e-05, 'epoch': 1.84}


 62%|██████▏   | 8300/13440 [1:54:25<1:05:05,  1.32it/s]

{'loss': 0.0241, 'grad_norm': 0.005999427754431963, 'learning_rate': 1.9860896445131376e-05, 'epoch': 1.85}


 62%|██████▏   | 8350/13440 [1:55:03<1:03:39,  1.33it/s]

{'loss': 0.0002, 'grad_norm': 0.003585174446925521, 'learning_rate': 1.9667697063369398e-05, 'epoch': 1.86}


 62%|██████▎   | 8400/13440 [1:55:39<1:01:20,  1.37it/s]

{'loss': 0.0231, 'grad_norm': 0.006280276458710432, 'learning_rate': 1.947449768160742e-05, 'epoch': 1.88}


 63%|██████▎   | 8450/13440 [1:56:15<59:22,  1.40it/s]  

{'loss': 0.0114, 'grad_norm': 0.005543829407542944, 'learning_rate': 1.928129829984544e-05, 'epoch': 1.89}


 63%|██████▎   | 8500/13440 [1:56:51<1:00:07,  1.37it/s]

{'loss': 0.0214, 'grad_norm': 0.0038920289371162653, 'learning_rate': 1.9088098918083462e-05, 'epoch': 1.9}


 64%|██████▎   | 8550/13440 [1:57:27<58:02,  1.40it/s]  

{'loss': 0.024, 'grad_norm': 0.0037076901644468307, 'learning_rate': 1.8894899536321484e-05, 'epoch': 1.91}


 64%|██████▍   | 8600/13440 [1:58:03<59:03,  1.37it/s]

{'loss': 0.0141, 'grad_norm': 0.0036813525948673487, 'learning_rate': 1.8701700154559505e-05, 'epoch': 1.92}


 64%|██████▍   | 8650/13440 [1:58:40<58:25,  1.37it/s]

{'loss': 0.0397, 'grad_norm': 0.006997312884777784, 'learning_rate': 1.8508500772797527e-05, 'epoch': 1.93}


 65%|██████▍   | 8700/13440 [1:59:17<57:35,  1.37it/s]

{'loss': 0.0005, 'grad_norm': 0.003691278398036957, 'learning_rate': 1.831530139103555e-05, 'epoch': 1.94}


 65%|██████▌   | 8750/13440 [1:59:53<57:01,  1.37it/s]

{'loss': 0.0132, 'grad_norm': 0.0027093517128378153, 'learning_rate': 1.812210200927357e-05, 'epoch': 1.95}


 65%|██████▌   | 8800/13440 [2:00:29<55:11,  1.40it/s]

{'loss': 0.0001, 'grad_norm': 0.0013485264498740435, 'learning_rate': 1.792890262751159e-05, 'epoch': 1.96}


 66%|██████▌   | 8850/13440 [2:01:05<55:55,  1.37it/s]

{'loss': 0.0329, 'grad_norm': 0.005324990022927523, 'learning_rate': 1.7735703245749617e-05, 'epoch': 1.98}


 66%|██████▌   | 8900/13440 [2:01:41<54:10,  1.40it/s]

{'loss': 0.0101, 'grad_norm': 0.0019242214038968086, 'learning_rate': 1.7542503863987635e-05, 'epoch': 1.99}


 67%|██████▋   | 8950/13440 [2:02:18<54:32,  1.37it/s]

{'loss': 0.0002, 'grad_norm': 0.0015023265732452273, 'learning_rate': 1.734930448222566e-05, 'epoch': 2.0}


 67%|██████▋   | 9000/13440 [2:02:54<52:44,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.0012417641701176763, 'learning_rate': 1.7156105100463678e-05, 'epoch': 2.01}


 67%|██████▋   | 9050/13440 [2:03:30<53:25,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0012029348872601986, 'learning_rate': 1.6962905718701703e-05, 'epoch': 2.02}


 68%|██████▊   | 9100/13440 [2:04:05<51:22,  1.41it/s]

{'loss': 0.0251, 'grad_norm': 0.001149741350673139, 'learning_rate': 1.676970633693972e-05, 'epoch': 2.03}


 68%|██████▊   | 9150/13440 [2:04:41<50:52,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0009577294113114476, 'learning_rate': 1.6576506955177746e-05, 'epoch': 2.04}


 68%|██████▊   | 9200/13440 [2:05:17<50:54,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.000960576580837369, 'learning_rate': 1.6383307573415764e-05, 'epoch': 2.05}


 69%|██████▉   | 9250/13440 [2:05:53<50:46,  1.38it/s]

{'loss': 0.0126, 'grad_norm': 0.0006073267431929708, 'learning_rate': 1.619010819165379e-05, 'epoch': 2.06}


 69%|██████▉   | 9300/13440 [2:06:28<49:35,  1.39it/s]

{'loss': 0.0253, 'grad_norm': 0.005082892719656229, 'learning_rate': 1.5996908809891807e-05, 'epoch': 2.08}


 70%|██████▉   | 9350/13440 [2:07:04<48:09,  1.42it/s]

{'loss': 0.0008, 'grad_norm': 0.0014793219743296504, 'learning_rate': 1.5803709428129832e-05, 'epoch': 2.09}


 70%|██████▉   | 9400/13440 [2:07:39<47:30,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0020658005960285664, 'learning_rate': 1.561051004636785e-05, 'epoch': 2.1}


 70%|███████   | 9450/13440 [2:08:15<47:44,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.003340383293107152, 'learning_rate': 1.5417310664605875e-05, 'epoch': 2.11}


 71%|███████   | 9500/13440 [2:08:51<47:14,  1.39it/s]

{'loss': 0.0114, 'grad_norm': 0.0024777064099907875, 'learning_rate': 1.5224111282843895e-05, 'epoch': 2.12}


 71%|███████   | 9550/13440 [2:09:27<46:35,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0010292943334206939, 'learning_rate': 1.5030911901081918e-05, 'epoch': 2.13}


 71%|███████▏  | 9600/13440 [2:10:03<45:54,  1.39it/s]

{'loss': 0.0077, 'grad_norm': 0.001118711312301457, 'learning_rate': 1.4837712519319938e-05, 'epoch': 2.14}


 72%|███████▏  | 9650/13440 [2:10:39<45:24,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0005728807882405818, 'learning_rate': 1.4644513137557962e-05, 'epoch': 2.15}


 72%|███████▏  | 9700/13440 [2:11:14<43:44,  1.42it/s]

{'loss': 0.0001, 'grad_norm': 0.000561264983844012, 'learning_rate': 1.4451313755795981e-05, 'epoch': 2.17}


 73%|███████▎  | 9750/13440 [2:11:50<44:15,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.00048265859368257225, 'learning_rate': 1.4258114374034005e-05, 'epoch': 2.18}


 73%|███████▎  | 9800/13440 [2:12:26<43:34,  1.39it/s]

{'loss': 0.0241, 'grad_norm': 0.001731249736621976, 'learning_rate': 1.4064914992272025e-05, 'epoch': 2.19}


 73%|███████▎  | 9850/13440 [2:13:02<42:54,  1.39it/s]

{'loss': 0.0241, 'grad_norm': 0.0010050362907350063, 'learning_rate': 1.3871715610510048e-05, 'epoch': 2.2}


 74%|███████▎  | 9900/13440 [2:13:37<42:16,  1.40it/s]

{'loss': 0.0001, 'grad_norm': 0.003370226128026843, 'learning_rate': 1.367851622874807e-05, 'epoch': 2.21}


 74%|███████▍  | 9950/13440 [2:14:13<41:06,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0026513258926570415, 'learning_rate': 1.3485316846986091e-05, 'epoch': 2.22}


 74%|███████▍  | 10000/13440 [2:14:49<41:08,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0008791492437012494, 'learning_rate': 1.3292117465224113e-05, 'epoch': 2.23}


                                                       
 74%|███████▍  | 10000/13440 [2:18:37<41:08,  1.39it/s]

{'eval_loss': 0.0038314524572342634, 'eval_accuracy': 0.9994419642857143, 'eval_precision': 0.9994421721491528, 'eval_recall': 0.9994419642857143, 'eval_f1': 0.9994419559488603, 'eval_runtime': 228.4443, 'eval_samples_per_second': 39.222, 'eval_steps_per_second': 4.903, 'epoch': 2.23}


 75%|███████▍  | 10050/13440 [2:19:15<40:30,  1.39it/s]   

{'loss': 0.0, 'grad_norm': 0.0007388900849036872, 'learning_rate': 1.3098918083462134e-05, 'epoch': 2.24}


 75%|███████▌  | 10100/13440 [2:19:51<39:56,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0007009866531006992, 'learning_rate': 1.2905718701700156e-05, 'epoch': 2.25}


 76%|███████▌  | 10150/13440 [2:20:26<38:33,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0005120745045132935, 'learning_rate': 1.2712519319938177e-05, 'epoch': 2.27}


 76%|███████▌  | 10200/13440 [2:21:02<38:40,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.0005866154097020626, 'learning_rate': 1.2519319938176199e-05, 'epoch': 2.28}


 76%|███████▋  | 10250/13440 [2:21:38<37:47,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0006719144294038415, 'learning_rate': 1.232612055641422e-05, 'epoch': 2.29}


 77%|███████▋  | 10300/13440 [2:22:13<37:31,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.00046496803406625986, 'learning_rate': 1.2132921174652242e-05, 'epoch': 2.3}


 77%|███████▋  | 10350/13440 [2:22:49<36:58,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0004136731440667063, 'learning_rate': 1.1939721792890263e-05, 'epoch': 2.31}


 77%|███████▋  | 10400/13440 [2:23:25<35:34,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0009705465054139495, 'learning_rate': 1.1746522411128285e-05, 'epoch': 2.32}


 78%|███████▊  | 10450/13440 [2:24:01<35:44,  1.39it/s]

{'loss': 0.0272, 'grad_norm': 0.004306999500840902, 'learning_rate': 1.1553323029366307e-05, 'epoch': 2.33}


 78%|███████▊  | 10500/13440 [2:24:36<34:55,  1.40it/s]

{'loss': 0.0001, 'grad_norm': 0.0006459119031205773, 'learning_rate': 1.1360123647604328e-05, 'epoch': 2.34}


 78%|███████▊  | 10550/13440 [2:25:12<34:38,  1.39it/s]

{'loss': 0.0001, 'grad_norm': 0.000521671783644706, 'learning_rate': 1.116692426584235e-05, 'epoch': 2.35}


 79%|███████▉  | 10600/13440 [2:25:47<33:56,  1.39it/s]

{'loss': 0.0, 'grad_norm': 0.0005488016176968813, 'learning_rate': 1.0973724884080371e-05, 'epoch': 2.37}


 79%|███████▉  | 10650/13440 [2:26:23<33:17,  1.40it/s]

{'loss': 0.0002, 'grad_norm': 0.00048612794489599764, 'learning_rate': 1.0780525502318393e-05, 'epoch': 2.38}


 80%|███████▉  | 10700/13440 [2:26:59<32:17,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0009732528124004602, 'learning_rate': 1.0587326120556414e-05, 'epoch': 2.39}


 80%|███████▉  | 10750/13440 [2:27:34<31:39,  1.42it/s]

{'loss': 0.0, 'grad_norm': 0.0003738639352377504, 'learning_rate': 1.0394126738794436e-05, 'epoch': 2.4}


 80%|████████  | 10800/13440 [2:28:10<32:13,  1.37it/s]

{'loss': 0.0124, 'grad_norm': 0.0006241571391001344, 'learning_rate': 1.0200927357032458e-05, 'epoch': 2.41}


 81%|████████  | 10850/13440 [2:28:47<31:35,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0005992588121443987, 'learning_rate': 1.0007727975270479e-05, 'epoch': 2.42}


 81%|████████  | 10900/13440 [2:29:23<30:59,  1.37it/s]

{'loss': 0.0214, 'grad_norm': 0.0049209981225430965, 'learning_rate': 9.8145285935085e-06, 'epoch': 2.43}


 81%|████████▏ | 10950/13440 [2:30:00<30:23,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.001156185520812869, 'learning_rate': 9.621329211746522e-06, 'epoch': 2.44}


 82%|████████▏ | 11000/13440 [2:30:36<29:40,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.00040417115087620914, 'learning_rate': 9.428129829984544e-06, 'epoch': 2.46}


 82%|████████▏ | 11050/13440 [2:31:12<28:18,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.001079428824596107, 'learning_rate': 9.234930448222565e-06, 'epoch': 2.47}


 83%|████████▎ | 11100/13440 [2:31:48<27:42,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0005588580388575792, 'learning_rate': 9.041731066460587e-06, 'epoch': 2.48}


 83%|████████▎ | 11150/13440 [2:32:24<27:11,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.00036471348721534014, 'learning_rate': 8.848531684698608e-06, 'epoch': 2.49}


 83%|████████▎ | 11200/13440 [2:33:00<27:14,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.00033004695433191955, 'learning_rate': 8.65533230293663e-06, 'epoch': 2.5}


 84%|████████▎ | 11250/13440 [2:33:37<26:37,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0003523774503264576, 'learning_rate': 8.462132921174652e-06, 'epoch': 2.51}


 84%|████████▍ | 11300/13440 [2:34:13<26:01,  1.37it/s]

{'loss': 0.0282, 'grad_norm': 0.007688111159950495, 'learning_rate': 8.268933539412673e-06, 'epoch': 2.52}


 84%|████████▍ | 11350/13440 [2:34:50<25:29,  1.37it/s]

{'loss': 0.0002, 'grad_norm': 0.0021747234277427197, 'learning_rate': 8.075734157650695e-06, 'epoch': 2.53}


 85%|████████▍ | 11400/13440 [2:35:25<24:12,  1.40it/s]

{'loss': 0.0001, 'grad_norm': 0.002160309813916683, 'learning_rate': 7.882534775888716e-06, 'epoch': 2.54}


 85%|████████▌ | 11450/13440 [2:36:02<24:10,  1.37it/s]

{'loss': 0.0215, 'grad_norm': 0.0020575784146785736, 'learning_rate': 7.689335394126738e-06, 'epoch': 2.56}


 86%|████████▌ | 11500/13440 [2:36:38<23:33,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.001611652784049511, 'learning_rate': 7.496136012364761e-06, 'epoch': 2.57}


 86%|████████▌ | 11550/13440 [2:37:15<22:20,  1.41it/s]

{'loss': 0.0001, 'grad_norm': 0.0009279755176976323, 'learning_rate': 7.302936630602783e-06, 'epoch': 2.58}


 86%|████████▋ | 11600/13440 [2:37:50<21:44,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.001089970930479467, 'learning_rate': 7.109737248840804e-06, 'epoch': 2.59}


 87%|████████▋ | 11650/13440 [2:38:26<21:06,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0016317114932462573, 'learning_rate': 6.916537867078826e-06, 'epoch': 2.6}


 87%|████████▋ | 11700/13440 [2:39:02<20:38,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0011718233581632376, 'learning_rate': 6.723338485316847e-06, 'epoch': 2.61}


 87%|████████▋ | 11750/13440 [2:39:37<20:27,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.0011165846372023225, 'learning_rate': 6.530139103554869e-06, 'epoch': 2.62}


 88%|████████▊ | 11800/13440 [2:40:14<19:57,  1.37it/s]

{'loss': 0.0006, 'grad_norm': 0.0006507558864541352, 'learning_rate': 6.3369397217928904e-06, 'epoch': 2.63}


 88%|████████▊ | 11850/13440 [2:40:50<19:17,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.000967451196629554, 'learning_rate': 6.143740340030912e-06, 'epoch': 2.65}


 89%|████████▊ | 11900/13440 [2:41:27<18:43,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.000602195446845144, 'learning_rate': 5.9505409582689335e-06, 'epoch': 2.66}


 89%|████████▉ | 11950/13440 [2:42:02<17:39,  1.41it/s]

{'loss': 0.0229, 'grad_norm': 0.000547204923350364, 'learning_rate': 5.757341576506955e-06, 'epoch': 2.67}


 89%|████████▉ | 12000/13440 [2:42:38<17:03,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.000711658678483218, 'learning_rate': 5.564142194744977e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 12000/13440 [2:46:29<17:03,  1.41it/s]

{'eval_loss': 0.006380924489349127, 'eval_accuracy': 0.9989955357142857, 'eval_precision': 0.9989974644914505, 'eval_recall': 0.9989955357142857, 'eval_f1': 0.9989954900155446, 'eval_runtime': 231.0128, 'eval_samples_per_second': 38.786, 'eval_steps_per_second': 4.848, 'epoch': 2.68}


 90%|████████▉ | 12050/13440 [2:47:08<16:53,  1.37it/s]   

{'loss': 0.0, 'grad_norm': 0.0007362915202975273, 'learning_rate': 5.370942812982998e-06, 'epoch': 2.69}


 90%|█████████ | 12100/13440 [2:47:44<16:13,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.0007855462608858943, 'learning_rate': 5.17774343122102e-06, 'epoch': 2.7}


 90%|█████████ | 12150/13440 [2:48:21<15:41,  1.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0004725979524664581, 'learning_rate': 4.984544049459041e-06, 'epoch': 2.71}


 91%|█████████ | 12200/13440 [2:48:57<15:06,  1.37it/s]

{'loss': 0.0258, 'grad_norm': 0.0012753185583278537, 'learning_rate': 4.791344667697063e-06, 'epoch': 2.72}


 91%|█████████ | 12250/13440 [2:49:33<14:06,  1.41it/s]

{'loss': 0.0001, 'grad_norm': 0.008993657305836678, 'learning_rate': 4.598145285935085e-06, 'epoch': 2.73}


 92%|█████████▏| 12300/13440 [2:50:09<13:31,  1.40it/s]

{'loss': 0.0152, 'grad_norm': 0.0026155251543968916, 'learning_rate': 4.404945904173107e-06, 'epoch': 2.75}


 92%|█████████▏| 12350/13440 [2:50:45<13:14,  1.37it/s]

{'loss': 0.0265, 'grad_norm': 0.0008781153592281044, 'learning_rate': 4.2117465224111284e-06, 'epoch': 2.76}


 92%|█████████▏| 12400/13440 [2:51:22<12:37,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0016253223875537515, 'learning_rate': 4.01854714064915e-06, 'epoch': 2.77}


 93%|█████████▎| 12450/13440 [2:51:57<11:41,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0009305955027230084, 'learning_rate': 3.8253477588871716e-06, 'epoch': 2.78}


 93%|█████████▎| 12500/13440 [2:52:33<11:06,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0007277940167114139, 'learning_rate': 3.6321483771251936e-06, 'epoch': 2.79}


 93%|█████████▎| 12550/13440 [2:53:09<10:33,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.0011194911785423756, 'learning_rate': 3.438948995363215e-06, 'epoch': 2.8}


 94%|█████████▍| 12600/13440 [2:53:45<10:12,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0008941338746808469, 'learning_rate': 3.2457496136012367e-06, 'epoch': 2.81}


 94%|█████████▍| 12650/13440 [2:54:22<09:34,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0010001275222748518, 'learning_rate': 3.0525502318392582e-06, 'epoch': 2.82}


 94%|█████████▍| 12700/13440 [2:54:58<08:47,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.000816216750536114, 'learning_rate': 2.85935085007728e-06, 'epoch': 2.83}


 95%|█████████▍| 12750/13440 [2:55:34<08:21,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0015054242685437202, 'learning_rate': 2.6661514683153014e-06, 'epoch': 2.85}


 95%|█████████▌| 12800/13440 [2:56:09<07:45,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0005954232183285058, 'learning_rate': 2.472952086553323e-06, 'epoch': 2.86}


 96%|█████████▌| 12850/13440 [2:56:46<07:10,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0006296301726251841, 'learning_rate': 2.2797527047913445e-06, 'epoch': 2.87}


 96%|█████████▌| 12900/13440 [2:57:22<06:32,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.000721085409168154, 'learning_rate': 2.0865533230293665e-06, 'epoch': 2.88}


 96%|█████████▋| 12950/13440 [2:57:59<05:55,  1.38it/s]

{'loss': 0.0, 'grad_norm': 0.0005392585881054401, 'learning_rate': 1.8933539412673882e-06, 'epoch': 2.89}


 97%|█████████▋| 13000/13440 [2:58:35<05:20,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0006015889230184257, 'learning_rate': 1.7001545595054098e-06, 'epoch': 2.9}


 97%|█████████▋| 13050/13440 [2:59:11<04:36,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0006105902139097452, 'learning_rate': 1.5069551777434314e-06, 'epoch': 2.91}


 97%|█████████▋| 13100/13440 [2:59:47<04:03,  1.40it/s]

{'loss': 0.0, 'grad_norm': 0.00038809963734820485, 'learning_rate': 1.313755795981453e-06, 'epoch': 2.92}


 98%|█████████▊| 13150/13440 [3:00:23<03:31,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.00042959905113093555, 'learning_rate': 1.1205564142194745e-06, 'epoch': 2.94}


 98%|█████████▊| 13200/13440 [3:01:00<02:54,  1.37it/s]

{'loss': 0.0, 'grad_norm': 0.0005086485180072486, 'learning_rate': 9.273570324574961e-07, 'epoch': 2.95}


 99%|█████████▊| 13250/13440 [3:01:36<02:18,  1.37it/s]

{'loss': 0.0192, 'grad_norm': 0.0004925589310005307, 'learning_rate': 7.341576506955178e-07, 'epoch': 2.96}


 99%|█████████▉| 13300/13440 [3:02:12<01:40,  1.39it/s]

{'loss': 0.0181, 'grad_norm': 0.0017725755460560322, 'learning_rate': 5.409582689335394e-07, 'epoch': 2.97}


 99%|█████████▉| 13350/13440 [3:02:48<01:03,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.00048207127838395536, 'learning_rate': 3.4775888717156104e-07, 'epoch': 2.98}


100%|█████████▉| 13400/13440 [3:03:24<00:28,  1.41it/s]

{'loss': 0.0, 'grad_norm': 0.0006188718252815306, 'learning_rate': 1.5455950540958268e-07, 'epoch': 2.99}


100%|██████████| 13440/13440 [3:03:52<00:00,  1.22it/s]

{'train_runtime': 11032.7217, 'train_samples_per_second': 9.745, 'train_steps_per_second': 1.218, 'train_loss': 0.023050309770903932, 'epoch': 3.0}





TrainOutput(global_step=13440, training_loss=0.023050309770903932, metrics={'train_runtime': 11032.7217, 'train_samples_per_second': 9.745, 'train_steps_per_second': 1.218, 'total_flos': 2.828812200597504e+16, 'train_loss': 0.023050309770903932, 'epoch': 3.0})

In [35]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

100%|██████████| 1120/1120 [03:49<00:00,  4.88it/s]

Evaluation results: {'eval_loss': 0.0038314524572342634, 'eval_accuracy': 0.9994419642857143, 'eval_precision': 0.9994421721491528, 'eval_recall': 0.9994419642857143, 'eval_f1': 0.9994419559488603, 'eval_runtime': 229.7196, 'eval_samples_per_second': 39.004, 'eval_steps_per_second': 4.876, 'epoch': 3.0}





In [36]:
model.save_pretrained("./fake_news_classifier")
tokenizer.save_pretrained("./fake_news_classifier")

('./fake_news_classifier/tokenizer_config.json',
 './fake_news_classifier/special_tokens_map.json',
 './fake_news_classifier/vocab.txt',
 './fake_news_classifier/added_tokens.json')