<a href="https://colab.research.google.com/github/dmavani25/DivineManuscriptsDB/blob/main/code/tokenization_and_baseline_model_train_val_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries for training, validating and testing the baseline model
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup
import time
import os
import re

In [5]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd

# URLs of the CSV files
data_urls = {
    "train": "https://raw.githubusercontent.com/DhyeyMavani2003/DeepSentiment/main/data/tweeteval/sentiment/csv/pre-token-train-data.csv",
    "validation": "https://raw.githubusercontent.com/DhyeyMavani2003/DeepSentiment/main/data/tweeteval/sentiment/csv/pre-token-val-data.csv",
    "test": "https://raw.githubusercontent.com/DhyeyMavani2003/DeepSentiment/main/data/tweeteval/sentiment/csv/pre-token-test-data.csv"
}

# Load and prepare datasets
def load_and_prepare_data(url, tokenizer):
    df = pd.read_csv(url)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(lambda examples: tokenizer(examples['Text'], truncation=True, padding="max_length", max_length=256), batched=True)
    dataset = dataset.map(lambda examples: {'labels': examples['Label_ID']}, batched=False)
    return dataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preparing datasets
datasets = {split: load_and_prepare_data(data_urls[split], tokenizer) for split in data_urls}

# Create DatasetDict
data = DatasetDict({
    'train': datasets['train'],
    'validation': datasets['validation'],
    'test': datasets['test']
})

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="epoch"      # evaluate each `logging_steps`
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=data['train'],         # training dataset
    eval_dataset=data['validation']      # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate(data['test'])
print(eval_results)




Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.7545,0.737275
2,0.6255,0.719672
3,0.4226,0.853378


{'eval_loss': 0.9884742498397827, 'eval_runtime': 196.2915, 'eval_samples_per_second': 62.58, 'eval_steps_per_second': 0.978, 'epoch': 3.0}
