<a href="https://colab.research.google.com/github/dbarenas/nlps/blob/master/Bert_Question_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:

!pip install requests

In [43]:
import torch
import torch.nn as nn
import requests
from io import BytesIO
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

# Define the BERT-based question classification model
class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.dropout(pooled_output)
        logits = self.fc(output)
        return logits

# Define a custom dataset for question classification
# Define a custom dataset for question classification
class QuestionClassificationDataset(Dataset):
    def __init__(self, data_url, tokenizer):
        self.tokenizer = tokenizer
        self.sentences, self.labels = self._load_data(data_url)

    def _load_data(self, data_url):
        sentences = []
        labels = []
        response = requests.get(data_url)
        lines = response.content.decode(errors='ignore').splitlines()  # Handle encoding errors and ignore problematic lines
        for line in lines:
            parts = line.strip().split()
            label = parts[0]
            if ':' in label:
                label = label.split(':', 1)[1]  # Remove the label prefix if it is repeated
            sentence = ' '.join(parts[1:])
            sentences.append(sentence)
            labels.append(label)
        return sentences, labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        label = self.labels[index]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        label_id = self._get_label_id(label)
        return input_ids, attention_mask, label_id

    def _get_label_id(self, label):
        label_mapping = {
            'DESC:desc': 0,
            'ENTY:enty': 1,
            'ABBR:abbr': 2,
            'HUM:hum': 3,
            'LOC:loc': 4,
            'NUM:num': 5,
            'NUM:num': 6
        }
        return label_mapping.get(label, len(label_mapping))  # Return a unique ID for unknown labels


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set hyperparameters
# Set hyperparameters

batch_size = 16
lr = 2e-5
num_epochs = 5

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



# Load the training data
data_url = 'https://cogcomp.seas.upenn.edu/Data/QA/QC/train_1000.label'
train_dataset = QuestionClassificationDataset(data_url, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
num_classes = len(set(train_dataset.labels))  # Update the num_classes based on the unique labels in the training dataset

# ...

# Create the BERT-based question classification model
model = BertClassifier(num_classes)
model.to(device)

# ...

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=num_classes)  # Add ignore_index parameter to handle out-of-range labels
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


# Training loop
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (input_ids, attention_mask, labels) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')

# Save the trained model
torch.save(model.state_dict(), 'question_classifier.pt')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/5], Step [10/63], Loss: 2.6331
Epoch [1/5], Step [20/63], Loss: 1.3690
Epoch [1/5], Step [30/63], Loss: 0.4729
Epoch [1/5], Step [40/63], Loss: 0.2074
Epoch [1/5], Step [50/63], Loss: 0.1022
Epoch [1/5], Step [60/63], Loss: 0.0629
Epoch [2/5], Step [10/63], Loss: 0.0430
Epoch [2/5], Step [20/63], Loss: 0.0343
Epoch [2/5], Step [30/63], Loss: 0.0305
Epoch [2/5], Step [40/63], Loss: 0.0266
Epoch [2/5], Step [50/63], Loss: 0.0226
Epoch [2/5], Step [60/63], Loss: 0.0190
Epoch [3/5], Step [10/63], Loss: 0.0177
Epoch [3/5], Step [20/63], Loss: 0.0154
Epoch [3/5], Step [30/63], Loss: 0.0140
Epoch [3/5], Step [40/63], Loss: 0.0138
Epoch [3/5], Step [50/63], Loss: 0.0135
Epoch [3/5], Step [60/63], Loss: 0.0116
Epoch [4/5], Step [10/63], Loss: 0.0100
Epoch [4/5], Step [20/63], Loss: 0.0099
Epoch [4/5], Step [30/63], Loss: 0.0095
Epoch [4/5], Step [40/63], Loss: 0.0088
Epoch [4/5], Step [50/63], Loss: 0.0082
Epoch [4/5], Step [60/63], Loss: 0.0076
Epoch [5/5], Step [10/63], Loss: 0.0073


In [45]:
# Load the saved model
model = BertClassifier(num_classes)
model.load_state_dict(torch.load('question_classifier.pt'))
model.to(device)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [46]:


# Prepare the input sentence for prediction
input_sentence = "What is the capital of France?"
input_encoding = tokenizer.encode_plus(
    input_sentence,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)




In [47]:
# Make the prediction
with torch.no_grad():
    outputs = model(input_ids, attention_mask)
    predicted_labels = torch.argmax(outputs, dim=1)



In [48]:
# Map the predicted label index back to the original label
predicted_label = train_dataset.get_original_label(predicted_labels.item())

# Print the predicted label
print("Predicted label:", predicted_label)

AttributeError: ignored