In [1]:
from transformers import BertTokenizer, BertForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import json
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
with open('./intents_updated.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data['intents'])

# Prepare data dictionary
dic = {"tag": [], "patterns": [], "responses": []}
for i in range(len(df)):
    ptrns = df[df.index == i]['patterns'].values[0]
    rspns = df[df.index == i]['responses'].values[0]
    tag = df[df.index == i]['tag'].values[0]
    for j in range(len(ptrns)):
        dic['tag'].append(tag)
        dic['patterns'].append(ptrns[j])
        dic['responses'].append(rspns)

df = pd.DataFrame.from_dict(dic)

# Preprocessing function
def preprocess_text(s):
    s = re.sub('[^a-zA-Z\']', ' ', s).lower().strip()
    return s

df['patterns'] = df['patterns'].apply(preprocess_text)
df['tag'] = df['tag'].apply(preprocess_text)


In [None]:
# Encoding labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['tag'])
num_labels = len(np.unique(y_encoded))

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128

def encode_texts(texts, max_len):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=max_len, 
            pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

input_ids, attention_masks = encode_texts(df['patterns'], max_len)
labels = torch.tensor(y_encoded)

dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, batch_size=16)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:

# Model and Optimization
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training Loop with Accuracy
epochs = 30
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    correct = 0
    total = 0
    
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        b_labels = b_labels.long()
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == b_labels).sum().item()
        total += b_labels.size(0)

    avg_loss = total_train_loss / len(train_dataloader)
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.2f}, Accuracy: {accuracy:.2f}%")

# Prediction Function with Confidence Score
def predict_intent(text):
    encoded_dict = tokenizer.encode_plus(
        text, add_special_tokens=True, max_length=max_len,
        pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    
    logits = outputs.logits.cpu().numpy()
    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    confidence = np.max(probs)
    predicted_label_idx = np.argmax(logits, axis=1).flatten()
    predicted_label = label_encoder.inverse_transform(predicted_label_idx)[0]
    
    return predicted_label, confidence

# Response Selection
import random
def get_response(intent):
    possible_responses = df[df['tag'] == intent]['responses'].values[0]
    if isinstance(possible_responses, str):
        possible_responses = [possible_responses]
    if not possible_responses:
        return "I'm here for you, but I'm not sure I understood that. Can you share a bit more?"
    return random.choice(possible_responses)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Average Training Loss: 3.83
Epoch 2, Average Training Loss: 3.61
Epoch 3, Average Training Loss: 3.40
Epoch 4, Average Training Loss: 3.13
Epoch 5, Average Training Loss: 2.96
Epoch 6, Average Training Loss: 2.75
Epoch 7, Average Training Loss: 2.58
Epoch 8, Average Training Loss: 2.44


In [None]:
# Chatbot Loop with Debugging
print("Chatbot is ready! Type 'exit' to stop.")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        print("Chatbot: Goodbye!")
        break

    intent, confidence = predict_intent(user_input)  # Now returns both intent & confidence

    if confidence < 0.1:
        response = "I'm not completely sure what you mean. Could you rephrase?"
    else:
        response = get_response(intent)

    print(f"Chatbot: {response}")