In [2]:
import pandas as pd
import optuna
import mlflow
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from mlflow import pytorch
from pprint import pformat
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModel
import gc

In [3]:
df_col = pd.read_table('collection.tsv')

In [4]:
df_col_20k = df_col.loc[:20000, :]

In [5]:
df_col_20k.columns

Index(['0', 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'], dtype='object')

In [6]:
df_col_20k

Unnamed: 0,0,The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.
0,1,The Manhattan Project and its atomic bomb help...
1,2,Essay on The Manhattan Project - The Manhattan...
2,3,The Manhattan Project was the name for a proje...
3,4,versions of each volume as well as complementa...
4,5,The Manhattan Project. This once classified ph...
...,...,...
19996,19997,The leaner the beef the faster this process ta...
19997,19998,A: It takes 24 to 72 hours for beef to digest ...
19998,19999,Digestion Essay The path of digestion begins i...
19999,20000,It takes 24 to 72 hours for beef to digest wit...


In [7]:
df_col_20k.rename(columns={'0': 'Index', 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.': 'Text'}, inplace=True)
df_col_20k.set_index('Index')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_20k.rename(columns={'0': 'Index', 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.': 'Text'}, inplace=True)


Unnamed: 0_level_0,Text
Index,Unnamed: 1_level_1
1,The Manhattan Project and its atomic bomb help...
2,Essay on The Manhattan Project - The Manhattan...
3,The Manhattan Project was the name for a proje...
4,versions of each volume as well as complementa...
5,The Manhattan Project. This once classified ph...
...,...
19997,The leaner the beef the faster this process ta...
19998,A: It takes 24 to 72 hours for beef to digest ...
19999,Digestion Essay The path of digestion begins i...
20000,It takes 24 to 72 hours for beef to digest wit...


In [8]:
df_col_20k['label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_20k['label'] = 1


In [9]:
df_col_20k.rename(columns={'label': 'labels'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_col_20k.rename(columns={'label': 'labels'}, inplace=True)


In [10]:
df_cb = pd.read_csv('crowdsourced.csv')

In [11]:
df_cb.shape

(22501, 10)

In [12]:
df_x = df_cb['Text']

In [13]:
df_cb_data = df_cb[['Text', 'Verdict']] 

In [14]:
df_cb_data.loc[df_cb_data['Verdict'] < 1, 'Verdict'] = 0

In [15]:
df_cb_data[df_cb_data['Verdict'] == 0].shape[0]

17088

In [16]:
df_cb_data[df_cb_data['Verdict'] == 1].shape[0]

5413

In [17]:
df_cb_data.rename(columns={'Verdict': 'labels'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cb_data.rename(columns={'Verdict': 'labels'}, inplace=True)


In [18]:
df_cb_data

Unnamed: 0,Text,labels
0,I think we've seen a deterioration of values.,0
1,I think for a while as a nation we condoned th...,0
2,"For a while, as I recall, it even seems to me ...",0
3,"So we've seen a deterioration in values, and o...",0
4,"We got away, we got into this feeling that val...",0
...,...,...
22496,You get shot walking to the store.,0
22497,I will do more for African-Americans and Latin...,0
22498,All she's done is talk to the African-American...,0
22499,"We are going to make America strong again, and...",0


In [19]:
df_merged = df_cb_data.append(df_col_20k, ignore_index=True)

  df_merged = df_cb_data.append(df_col_20k, ignore_index=True)


In [20]:
df_merged.drop('Index', axis=1)

Unnamed: 0,Text,labels
0,I think we've seen a deterioration of values.,0
1,I think for a while as a nation we condoned th...,0
2,"For a while, as I recall, it even seems to me ...",0
3,"So we've seen a deterioration in values, and o...",0
4,"We got away, we got into this feeling that val...",0
...,...,...
42497,The leaner the beef the faster this process ta...,1
42498,A: It takes 24 to 72 hours for beef to digest ...,1
42499,Digestion Essay The path of digestion begins i...,1
42500,It takes 24 to 72 hours for beef to digest wit...,1


In [23]:
df_merged.loc[42499, 'Text']

'Digestion Essay The path of digestion begins in the mouth when food is broken down mechanically, into smaller pieces, by the teeth and tongue. The complex carbohydrates found in the food are also broken down chemically with the help of the enzyme (ptyalin), found in saliva.'

In [20]:
from transformers import RobertaTokenizerFast


In [21]:
def tokenize_and_dataload_df(model_name, batch_size):
    if model_name == 'bert':
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    elif model_name == 'distilbert':
        tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    elif model_name == 'roberta':
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    else:
        tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
    dataset = Dataset.from_pandas(df_merged)
    dataset_torch = dataset.map(lambda e: tokenizer(e['Text'], truncation=True, max_length=512, padding='max_length'), batched=True)
    dataset_torch.set_format(type='torch', columns=['input_ids', 'labels'])
    
    train_size = int(0.8 * len(dataset_torch))
    test_size = len(dataset_torch) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset_torch, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    return train_loader, test_loader

In [22]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    train_set_size = len(train_loader.dataset)
    num_batches = len(train_loader)
    train_loss = 0.0
    correct = 0
    for batch_idx, data in enumerate(train_loader):
        x, target = data['input_ids'].to(device), data['labels'].to(device)
        optimizer.zero_grad()
        loss, logits = model(x, 
                             labels=target,
                            return_dict=False)
#         print(logits)
#         loss = F.nll_loss(logits, target)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        pred = logits.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        
        if batch_idx % 10 == 0:
            batch_size = len(x)
            print(f"Train Epoch: {epoch} [{batch_idx * batch_size}/{train_set_size} "
                  f"({100. * batch_idx / num_batches:.0f}%)]\tLoss: {loss.item():.6f}")
    
        del loss
        del logits
        gc.collect()
    accuracy = correct/train_set_size
    print(f'Train Accuracy after epoch {epoch}: {accuracy}')
    avg_train_loss = train_loss / num_batches

    return avg_train_loss, accuracy

In [23]:
def save_model(model, model_type, params, epoch, val_accuracy):
    params['current_epoch'] = epoch
    params['val_accuracy'] = val_accuracy
    now = datetime.now()
    original_stdout = sys.stdout
    accuracy = round(val_accuracy*100)
#     with open(f'best_params_{now.month}_{now.day}_{now.hour}_{now.minute}.txt', 'w') as f:
#         sys.stdout = f 
#         print(params)
#         sys.stdout = original_stdout
#         f.close()
#     torch.save(model.state_dict(), f'cd_cb_{model_type}_{accuracy}')
#     mlflow.pytorch.save_model(model, f'best_cd_{model_type}_{now.month}_{now.day}_{now.hour}_{now.minute}')
    mlflow.pytorch.log_model(model, f'cd_cb_{model_type}_epoch_{epoch}_acc_{accuracy}')

In [24]:
def validate(model, device, val_loader):
    model.eval()
    val_set_size = len(val_loader.dataset)
    val_num_batches = len(val_loader)
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for data in val_loader:
            x, target = data['input_ids'].to(device), data['labels'].to(device)
            loss, logits = model(x, 
                             labels=target,
                                return_dict=False)
#             val_loss += F.nll_loss(logits, target, reduction='sum').item()
            val_loss += loss.item()
            pred = logits.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    val_loss /= val_set_size
    accuracy = correct/val_set_size
    
    del loss
    del logits
    gc.collect()

    print(f"Validation set: Average loss: {val_loss:.4f}, Accuracy: {correct}/{val_set_size} "
          f"({100. * correct / val_set_size:.0f}%)\n")
    return val_loss, accuracy

In [25]:
# def suggest_hyperparameters(trial):
#     lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
#     optimizer_name = trial.suggest_categorical("optimizer_name", ["Adam", "AdamW"])
#     epochs = trial.suggest_int("epochs", 1, 10, step=1)
#     hidden_size = trial.suggest_categorical("hidden_size", [64, 128, 256, 512])
#     num_layers = trial.suggest_categorical("num_layers", [2, 3, 4])
#     dropout = trial.suggest_float("dropout", 0.1, 0.9, step=0.1)
#     model = trial.suggest_categorical("model", ['bert', 'distilbert', 'roberta'])
#     batch_size = trial.suggest_categorical("batch_size", [32, 64])
#     gamma = trial.suggest_float("gamma", 0.1, 0.9, step=0.1)
#     scheduler = trial.suggest_categorical("scheduler", ['step', 'exponential'])

#     return lr, optimizer_name, epochs, hidden_size, num_layers, dropout, model, batch_size, gamma, scheduler

In [26]:
def suggest_hyperparameters(trial):
    lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
    optimizer_name = trial.suggest_categorical("optimizer_name", ["Adam", "AdamW"])
    epochs = trial.suggest_int("epochs", 1, 5, step=1)
#     model = trial.suggest_categorical("model", ['bert', 'distilbert', 'roberta', 'albert'])
    model = trial.suggest_categorical("model", ['distilbert'])
    batch_size = trial.suggest_categorical("batch_size", [32])
    gamma = trial.suggest_float("gamma", 0.1, 0.9, step=0.1)
    scheduler = trial.suggest_categorical("scheduler", ['step', 'exponential'])

    return lr, optimizer_name, epochs, model, batch_size, gamma, scheduler

In [27]:
from transformers import BertTokenizer, BertModel, RobertaModel, DistilBertModel, RobertaForSequenceClassification

In [28]:
import sys

In [29]:
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification

In [30]:
def objective(trial):
    best_val_loss = float('Inf')
    best_val_accuracy = 0

    with mlflow.start_run():
#         lr, optimizer_name, epochs, hidden_size, num_layers, dropout, model_type, batch_size, gamma, sched = suggest_hyperparameters(trial)
        lr, optimizer_name, epochs, model_type, batch_size, gamma, sched = suggest_hyperparameters(trial)
        mlflow.log_params(trial.params)
        print(f'Trial params: {trial.params}')

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mlflow.log_param("device", device)
        
        if model_type == 'bert':
            train_loader, test_loader = tokenize_and_dataload_df('bert', batch_size)
#             bert = BertModel.from_pretrained('bert-base-cased').to(device)
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                                 num_labels = 2,
                                                                 output_attentions = False,
                                                                 output_hidden_states = False).to(device)
        elif model_type == 'distilbert':
            train_loader, test_loader = tokenize_and_dataload_df('distilbert', batch_size)
            model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                                         num_labels = 2,
                                                                         output_attentions = False,
                                                                         output_hidden_states = False).to(device)
        elif model_type == 'roberta':
            model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                                     num_labels = 2,
                                                                     output_attentions = False,
                                                                     output_hidden_states = False).to(device)
            train_loader, test_loader = tokenize_and_dataload_df('roberta', batch_size)
        
        else:
            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2',
                                                                     num_labels = 2,
                                                                     output_attentions = False,
                                                                     output_hidden_states = False).to(device)
            train_loader, test_loader = tokenize_and_dataload_df('albert', batch_size)

#         model = BERTGRUSentiment(bert,
#                          model_type,
#                          hidden_size,
#                          2,
#                          num_layers,
#                          True,
#                          dropout).to(device)

        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        if optimizer_name == "AdamW":
            optimizer = optim.AdamW(model.parameters(), lr=lr)
        
        if sched == 'step':
            scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
        else:
            scheduler = ExponentialLR(optimizer, gamma=gamma)
        
        for epoch in range(epochs):
            avg_train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
            avg_val_loss, val_accuracy = validate(model, device, test_loader)
            
            if avg_val_loss <= best_val_loss:
                best_val_loss = avg_val_loss

            mlflow.log_metric("avg_train_losses", avg_train_loss, step=epoch)
            mlflow.log_metric("avg_val_loss", avg_val_loss, step=epoch)
            mlflow.log_metric("train_accuracy", train_accuracy, step=epoch)
            mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
            
            scheduler.step()
            
            global global_val_accuracy
            if val_accuracy > global_val_accuracy:
                print(f'Saving model. Best validation accuracy: {val_accuracy}')
                save_model(model, model_type, trial.params, epoch, val_accuracy)
                global_val_accuracy = val_accuracy
    
    
    del model
    torch.cuda.empty_cache()
    gc.collect()
    
#     return best_val_loss
#     return avg_val_loss
    return val_accuracy

In [31]:
import random
import numpy as np
from optuna.samplers import TPESampler
from datetime import *

In [32]:
def train_best_model(best_trial_params):
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = best_trial_params['lr']
    optimizer_name = best_trial_params['optimizer_name']
    epochs = best_trial_params['epochs']
    model_type = best_trial_params['model']
    batch_size = best_trial_params['batch_size']
    gamma = best_trial_params['gamma']
    sched = best_trial_params['scheduler']
    
    if model_type == 'bert':
            train_loader, test_loader = tokenize_and_dataload_df('bert', batch_size)
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                                 num_labels = 2,
                                                                 output_attentions = False,
                                                                 output_hidden_states = False).to(device)
    elif model_type == 'distilbert':
        train_loader, test_loader = tokenize_and_dataload_df('distilbert', batch_size)
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                                     num_labels = 2,
                                                                     output_attentions = False,
                                                                     output_hidden_states = False).to(device)
    else:
        model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                                 num_labels = 2,
                                                                 output_attentions = False,
                                                                 output_hidden_states = False).to(device)
        train_loader, test_loader = tokenize_and_dataload_df('roberta', batch_size)
        
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    if optimizer_name == "AdamW":
        optimizer = optim.AdamW(model.parameters(), lr=lr)

    if sched == 'step':
        scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
    else:
        scheduler = ExponentialLR(optimizer, gamma=gamma)

    for epoch in range(epochs):
        avg_train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
        avg_val_loss, val_accuracy = validate(model, device, test_loader)

    best_val_accuracy = val_accuracy
    original_stdout = sys.stdout
    best_trial_params['val_accuracy'] = val_accuracy
    now = datetime.now()
    with open(f'best_params_{now.month}_{now.day}_{now.hour}_{now.minute}.txt', 'w') as f:
        sys.stdout = f 
        print(best_trial_params)
        sys.stdout = original_stdout
        f.close()
    torch.save(model.state_dict(), f'best_cd_{model_type}_{now.month}_{now.day}_{now.hour}_{now.minute}.pth')
            
        
    
    

In [33]:
def main():
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    best_val_accuracy = 0

    study = optuna.create_study(study_name="claim_detection", 
                                direction="maximize", 
                                sampler=TPESampler(seed=2))
    study.optimize(objective, n_trials=100)

    print("Best trial:")
    trial = study.best_trial

    print("  Trial number: ", trial.number)
    print("  Loss (trial value): ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    
#     train_best_model(trial.params.items())

In [34]:
global_val_accuracy = 0
main()


[32m[I 2023-04-29 10:29:50,681][0m A new study created in memory with name: claim_detection[0m


Trial params: {'lr': 2.0322854432411518e-05, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.4, 'scheduler': 'step'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.910973206670392
Validation set: Average loss: 0.0054, Accuracy: 7838/8501 (92%)

Saving model. Best validation accuracy: 0.9220091753911305




Train Accuracy after epoch 1: 0.9499132378459457
Validation set: Average loss: 0.0055, Accuracy: 7899/8501 (93%)

Saving model. Best validation accuracy: 0.929184801788025


Train Accuracy after epoch 2: 0.9682362283462251
Validation set: Average loss: 0.0061, Accuracy: 7910/8501 (93%)

Saving model. Best validation accuracy: 0.9304787672038584


[32m[I 2023-04-29 10:58:58,247][0m Trial 0 finished with value: 0.9304787672038584 and parameters: {'lr': 2.0322854432411518e-05, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.4, 'scheduler': 'step'}. Best is trial 0 with value: 0.9304787672038584.[0m


Trial params: {'lr': 7.207968815585904e-05, 'optimizer_name': 'Adam', 'epochs': 4, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.5, 'scheduler': 'exponential'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.906149819122967
Validation set: Average loss: 0.0057, Accuracy: 7845/8501 (92%)



Train Accuracy after epoch 1: 0.9499720596453045
Validation set: Average loss: 0.0052, Accuracy: 7905/8501 (93%)

Train Accuracy after epoch 2: 0.97611834946031
Validation set: Average loss: 0.0066, Accuracy: 7902/8501 (93%)



Train Accuracy after epoch 3: 0.9890591453192553
Validation set: Average loss: 0.0091, Accuracy: 7903/8501 (93%)



[32m[I 2023-04-29 11:37:46,986][0m Trial 1 finished with value: 0.9296553346665098 and parameters: {'lr': 7.207968815585904e-05, 'optimizer_name': 'Adam', 'epochs': 4, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.5, 'scheduler': 'exponential'}. Best is trial 0 with value: 0.9304787672038584.[0m


Trial params: {'lr': 3.575358516942407e-06, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'exponential'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.8812093761948178
Validation set: Average loss: 0.0055, Accuracy: 7876/8501 (93%)



Train Accuracy after epoch 1: 0.9270609687950354
Validation set: Average loss: 0.0050, Accuracy: 7922/8501 (93%)

Saving model. Best validation accuracy: 0.9318903658393131
Train Accuracy after epoch 2: 0.9362077585953354
Validation set: Average loss: 0.0048, Accuracy: 7948/8501 (93%)

Saving model. Best validation accuracy: 0.9349488295494648


[32m[I 2023-04-29 12:06:48,977][0m Trial 2 finished with value: 0.9349488295494648 and parameters: {'lr': 3.575358516942407e-06, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'exponential'}. Best is trial 2 with value: 0.9349488295494648.[0m


Trial params: {'lr': 1.5698549009697361e-06, 'optimizer_name': 'Adam', 'epochs': 1, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.6, 'scheduler': 'step'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.8570924384576925


[32m[I 2023-04-29 12:16:31,249][0m Trial 3 finished with value: 0.8971885660510528 and parameters: {'lr': 1.5698549009697361e-06, 'optimizer_name': 'Adam', 'epochs': 1, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.6, 'scheduler': 'step'}. Best is trial 2 with value: 0.9349488295494648.[0m


Validation set: Average loss: 0.0075, Accuracy: 7627/8501 (90%)

Trial params: {'lr': 4.580560474043079e-06, 'optimizer_name': 'AdamW', 'epochs': 2, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.6, 'scheduler': 'exponential'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.8915326019822947
Validation set: Average loss: 0.0059, Accuracy: 7823/8501 (92%)



Train Accuracy after epoch 1: 0.9283256374812505
Validation set: Average loss: 0.0056, Accuracy: 7858/8501 (92%)



[32m[I 2023-04-29 12:35:48,255][0m Trial 4 finished with value: 0.9243618397835549 and parameters: {'lr': 4.580560474043079e-06, 'optimizer_name': 'AdamW', 'epochs': 2, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.6, 'scheduler': 'exponential'}. Best is trial 2 with value: 0.9349488295494648.[0m


Trial params: {'lr': 1.4476979297784871e-05, 'optimizer_name': 'Adam', 'epochs': 1, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.7000000000000001, 'scheduler': 'step'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.9012087879768242
Validation set: Average loss: 0.0050, Accuracy: 7945/8501 (93%)



[32m[I 2023-04-29 12:45:30,663][0m Trial 5 finished with value: 0.9345959298906011 and parameters: {'lr': 1.4476979297784871e-05, 'optimizer_name': 'Adam', 'epochs': 1, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.7000000000000001, 'scheduler': 'step'}. Best is trial 2 with value: 0.9349488295494648.[0m


Trial params: {'lr': 0.00046618703556985193, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.4, 'scheduler': 'step'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.599747066262757
Validation set: Average loss: 0.0211, Accuracy: 5067/8501 (60%)



Train Accuracy after epoch 1: 0.598423575777183
Validation set: Average loss: 0.0211, Accuracy: 5067/8501 (60%)

Train Accuracy after epoch 2: 0.598423575777183
Validation set: Average loss: 0.0211, Accuracy: 5067/8501 (60%)



[32m[I 2023-04-29 13:14:31,139][0m Trial 6 finished with value: 0.596047523820727 and parameters: {'lr': 0.00046618703556985193, 'optimizer_name': 'AdamW', 'epochs': 3, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.4, 'scheduler': 'step'}. Best is trial 2 with value: 0.9349488295494648.[0m


Trial params: {'lr': 0.0007264850487520609, 'optimizer_name': 'Adam', 'epochs': 2, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'step'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.5951883768124467
Validation set: Average loss: 0.0210, Accuracy: 5109/8501 (60%)



Train Accuracy after epoch 1: 0.5971883179906473
Validation set: Average loss: 0.0211, Accuracy: 5109/8501 (60%)



[32m[I 2023-04-29 13:34:00,055][0m Trial 7 finished with value: 0.6009881190448183 and parameters: {'lr': 0.0007264850487520609, 'optimizer_name': 'Adam', 'epochs': 2, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'step'}. Best is trial 2 with value: 0.9349488295494648.[0m


Trial params: {'lr': 5.514824762681477e-06, 'optimizer_name': 'AdamW', 'epochs': 5, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'exponential'}


Map:   0%|          | 0/42502 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Train Accuracy after epoch 0: 0.8965030440281169
Validation set: Average loss: 0.0056, Accuracy: 7839/8501 (92%)



Train Accuracy after epoch 1: 0.9321490544395753
Validation set: Average loss: 0.0051, Accuracy: 7900/8501 (93%)

Train Accuracy after epoch 2: 0.9439428252110232


[33m[W 2023-04-29 14:02:48,889][0m Trial 8 failed with parameters: {'lr': 5.514824762681477e-06, 'optimizer_name': 'AdamW', 'epochs': 5, 'model': 'distilbert', 'batch_size': 32, 'gamma': 0.8, 'scheduler': 'exponential'} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "C:\Users\bibl1\anaconda3\envs\edutech\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\bibl1\AppData\Local\Temp\ipykernel_23128\822735883.py", line 61, in objective
    avg_val_loss, val_accuracy = validate(model, device, test_loader)
  File "C:\Users\bibl1\AppData\Local\Temp\ipykernel_23128\2436376795.py", line 15, in validate
    pred = logits.argmax(dim=1, keepdim=True)
KeyboardInterrupt
[33m[W 2023-04-29 14:02:48,890][0m Trial 8 failed with value None.[0m


KeyboardInterrupt: 