In [9]:
# Importing necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split

# Load train and test datasets
train_essays = pd.read_excel('/kaggle/input/povdetecting/Blog POV Dataset Finalllllll.xlsx')


# Preprocessing the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_data(texts):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
class EssaysDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Ensure labels are a list of integers
        self.labels = labels if isinstance(labels, list) else labels.tolist()

    def __getitem__(self, idx):
        item = {key: val[idx].detach().clone() for key, val in self.encodings.items()}
        # Convert integer label to a tensor of dtype long
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Preprocess and split the train dataset
X_train, X_val, y_train, y_val = train_test_split(train_essays['Text'],train_essays['POV'], test_size=0.2)
train_encodings = preprocess_data(X_train.tolist())
val_encodings = preprocess_data(X_val.tolist())



In [3]:
#custom dataset 
train_dataset = EssaysDataset(train_encodings,y_train.tolist())
val_dataset = EssaysDataset(val_encodings,y_val.tolist())



In [5]:
# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.1227
20,1.126
30,1.0433
40,0.9919
50,0.909
60,0.8439
70,0.7091
80,0.5985
90,0.4818
100,0.4064


TrainOutput(global_step=150, training_loss=0.6080288827419281, metrics={'train_runtime': 7130.7407, 'train_samples_per_second': 0.337, 'train_steps_per_second': 0.021, 'total_flos': 631472202547200.0, 'train_loss': 0.6080288827419281, 'epoch': 10.0})

In [10]:
trainer.evaluate()

{'eval_loss': 0.313424289226532,
 'eval_runtime': 48.7889,
 'eval_samples_per_second': 1.23,
 'eval_steps_per_second': 0.164,
 'epoch': 10.0}

In [20]:
Input_blog="Bank managers looking to understand how Blockchain works in the banking industry can benefit greatly from incorporating this technology into their operations. Blockchain technology provides transaction immutability, transparency, and provenance, eliminating the need for trust enforcers and increasing transparency between market participants. By storing immutable records of ownership and enabling secure transfers of assets among distrusting parties, Blockchain enhances trust, transparency, and efficiency in the banking sector. This technology revolutionizes how money and transactions are managed, offering a more secure and cost effective way of conducting financial transactions."


In [21]:
# Preprocess the raw text input for prediction
test_encodings = preprocess_data([Input_blog])

# Create a dummy label for the single input
dummy_label = [2]  

# Create a Dataset object for the single input
test_dataset = EssaysDataset(test_encodings, dummy_label)

# Predict using the trained model
predictions = trainer.predict(test_dataset)

# Convert predictions to binary labels
pred_labels = predictions.predictions.argmax(axis=-1)

# Output the prediction
print("Predicted label:", pred_labels.item())

Predicted label: 2


In [29]:
if pred_labels ==0:
    print("First person perspective")
elif pred_labels == 1:
    print("Second person perspective")
else:
    print ("third person perspective")

third person perspective


In [27]:
#To save the model
trainer.save_model("./model")