In [20]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch
import json

# Load your dataset
with open('data.json', 'r') as file:
    data = json.load(file)

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Load pre-trained DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Tokenize the data
def tokenize_data(data):
    return tokenizer([item["subject"] for item in data], truncation=True, padding=True)

train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)

# Convert lists to PyTorch tensors
train_encodings = {key: torch.tensor(val) for key, val in train_encodings.items()}
val_encodings = {key: torch.tensor(val) for key, val in val_encodings.items()}

# Create PyTorch datasets
train_labels = torch.tensor([item["relevant"] for item in train_data], dtype=torch.long)
val_labels = torch.tensor([item["relevant"] for item in val_data], dtype=torch.long)
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels)
val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], val_labels)

# Define the training arguments
training_args = {
    'output_dir': './results',
    'num_train_epochs': 10,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'warmup_steps': 500,
    'weight_decay': 0.01,
}

# Create a PyTorch DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=training_args['per_device_train_batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=training_args['per_device_eval_batch_size'], shuffle=False)

# Training
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)
model.train()

for epoch in range(training_args['num_train_epochs']):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch}'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# Save the trained model
model.save_pretrained('./trained_model')
# Load the trained model for inference
model = DistilBertForSequenceClassification.from_pretrained('./trained_model')
model.to(device)  # Ensure the model is on the same device as input tensors

# Test on custom input
custom_input = "BTech result published"
tokenized_input = tokenizer(custom_input, return_tensors='pt')
input_ids = tokenized_input['input_ids'].to(device)
attention_mask = tokenized_input['attention_mask'].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

print(f'Predicted Class: {predicted_class}')



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 32/32 [00:02<00:00, 13.67it/s]
Epoch 1: 100%|██████████| 32/32 [00:02<00:00, 14.06it/s]
Epoch 2: 100%|██████████| 32/32 [00:02<00:00, 13.94it/s]
Epoch 3: 100%|██████████| 32/32 [00:02<00:00, 13.88it/s]
Epoch 4: 100%|██████████| 32/32 [00:02<00:00, 13.89it/s]
Epoch 5: 100%|██████████| 32/32 [00:02<00:00, 13.90it/s]
Epoch 6: 100%|██████████| 32/32 [00:02<00:00, 13.80it/s]
Epoch 7: 100%|██████████| 32/32 [00:02<00:00, 13.70it/s]
Epoch 8: 100%|██████████| 32/32 [00:02<00:00, 13.67it/s]
Epoch 9: 100%|██████████| 32/32 [00:02<00:00, 13.63it/s]


Predicted Class: 1


In [26]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the trained model for inference
model = DistilBertForSequenceClassification.from_pretrained('./trained_model')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Ensure the model is on the same device as input tensors

# Test on custom input
custom_input = "classes postponed"
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenized_input = tokenizer(custom_input, return_tensors='pt')
input_ids = tokenized_input['input_ids'].to(device)
attention_mask = tokenized_input['attention_mask'].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

print(f'Predicted Class: {predicted_class}')


Predicted Class: 1
