In [None]:
import re

import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer

# Fetch the NLTK corpora this notebook needs: the stop-word list and WordNet
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

In [None]:
# Location of the ticket export, kept in one named constant so it is easy to repoint
TICKETS_CSV_PATH = "/content/cleaned_tickets_v4.csv"

tickets_df = pd.read_csv(TICKETS_CSV_PATH)
# Preview only the first rows instead of rendering the whole frame
tickets_df.head()

In [None]:
# Text cleaning and preprocessing function
def clean_text(text):
    """Normalize one ticket message for modelling.

    Lowercases the text, removes every character that is neither a word
    character nor whitespace, splits on whitespace, drops stop words and
    lemmatizes the remaining tokens.

    Tokens come from ``str.split`` rather than the module-level ``tokenizer``:
    that name is rebound to a BERT tokenizer in a later cell, so calling this
    function after that cell has run would otherwise silently produce wrong
    output. Punctuation has already been stripped at that point, so a plain
    whitespace split is sufficient.

    Args:
        text: Raw message. Anything that is not a ``str`` (for example the NaN
            pandas uses for a missing cell) is treated as an empty message.

    Returns:
        The surviving tokens joined by single spaces; ``''`` if none remain.

    Relies on the module-level ``lemmatizer`` and ``stop_words``.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    # Lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(token) for token in text.split() if token not in stop_words]
    return ' '.join(tokens)

# Module-level resources for the text-cleaning step.
# NOTE: the name `tokenizer` is bound again (to a BERT tokenizer) in a later cell.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = get_tokenizer("basic_english")

# Store a cleaned copy of every message in a new column, then display the frame
tickets_df['cleaned_message'] = tickets_df['message'].apply(clean_text)
tickets_df

In [None]:
# Encode the labels.
# The raw tag values are stashed in "tags_raw" the first time this cell runs, and the
# encoder is always fitted on that copy. Without this, re-running the cell would fit
# the encoder on already-encoded integers and inverse_transform could no longer
# recover the original tag names.
if "tags_raw" not in tickets_df.columns:
    tickets_df["tags_raw"] = tickets_df["tags"]
label_encoder = LabelEncoder()
tickets_df["tags"] = label_encoder.fit_transform(tickets_df["tags_raw"])

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text
def tokenize_text(text, tokenizer, max_length=20):
    """Run ``tokenizer`` on ``text`` with fixed-length padding and truncation.

    Args:
        text: Input handed to the tokenizer unchanged.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``.
        max_length: Value forwarded as the ``max_length`` option (default 20).

    Returns:
        Whatever ``tokenizer`` returns; PyTorch tensors are requested through
        ``return_tensors="pt"``.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)


# Custom Dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every item is padded/truncated to (default 20,
            the value that was previously hard-coded).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=20):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensor for example ``idx``."""
        tokens = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) drops dim 0 when it has size 1 (a single text is encoded here),
        # so the DataLoader can stack items into a batch itself.
        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Build the model
class TextClassificationModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output logits.
        pretrained_model_name: Checkpoint name passed to
            ``BertModel.from_pretrained`` (default ``"bert-base-uncased"``).
        dropout: Dropout probability applied to the [CLS] representation
            before the linear layer (default 0.3).
    """

    def __init__(self, num_classes, pretrained_model_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded encoder's config instead of
        # hard-coding 768, so checkpoints with another hidden size also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_classes`` logits per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is the [CLS] token's output
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        return self.linear(cls_output)

In [None]:
# Reproducibility and training hyperparameters, gathered in one place
SEED = 42
TEST_FRACTION = 0.2
BATCH_SIZE = 2
LEARNING_RATE = 2e-5

# Seed torch's global RNG so that shuffling, dropout and the classifier-head
# initialisation repeat from run to run (GPU kernels may still be non-deterministic).
torch.manual_seed(SEED)

# Prepare the data
X_train, X_test, y_train, y_test = train_test_split(
    tickets_df["cleaned_message"],
    tickets_df["tags"],
    test_size=TEST_FRACTION,
    random_state=SEED,
)

train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


# One output logit per class seen by the label encoder
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes)

# Training the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=3):
    """Train ``model`` on ``train_loader`` and print the mean loss of each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch_index in range(epochs):
        running_loss = 0
        for batch in train_loader:
            ids, mask, targets = (
                batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
            )

            # Clear gradients left over from the previous step, then forward/backward/update
            optimizer.zero_grad()
            batch_loss = criterion(model(ids, mask), targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch_index + 1}, Loss: {mean_loss}")


# Fine-tune for 10 epochs (overrides the function's default of 3)
train_model(model, train_loader, optimizer, criterion, device, epochs=10)

In [None]:
# Evaluate the model
def evaluate_model(model, test_loader, device):
    """Return the classification accuracy of ``model`` over ``test_loader``.

    The denominator is the number of examples actually yielded by
    ``test_loader``, so the result is correct for any loader passed in and
    does not depend on a module-level dataset variable.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per example.
        test_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        device: Device every batch tensor is moved to.

    Returns:
        Fraction of examples whose arg-max prediction equals the label, as a
        float in [0, 1]; ``0.0`` when the loader yields no examples.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    correct_predictions = 0
    total_examples = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_examples += labels.size(0)
    if total_examples == 0:
        return 0.0
    return correct_predictions / total_examples


# Score the trained model on the held-out loader and report the result
accuracy = evaluate_model(model=model, test_loader=test_loader, device=device)
print(f"Test Accuracy: {accuracy}")

In [None]:
# Predict on new data
def predict(model, tokenizer, text, device, max_length=20):
    """Predict class indices for ``text`` with ``model``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per input.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        text: Input handed to the tokenizer unchanged.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length forwarded to the tokenizer
            (default 20, the value that was previously hard-coded).

    Returns:
        NumPy array with the arg-max class index of every logits row.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    tokens = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        preds = outputs.argmax(dim=1)

    return preds.cpu().numpy()


# Two sample tickets to classify.
# NOTE(review): the training text went through clean_text, but these messages are
# passed to the model raw; confirm whether inference should apply the same cleaning.
new_messages = ["Can you provide a pricing quote?", "I have a feature suggestion."]

# predict returns an array per message; element 0 is that message's class index
predictions = []
for msg in new_messages:
    predictions.append(predict(model, tokenizer, msg, device)[0])

# Map the class indices back to the original tag values
predicted_labels = label_encoder.inverse_transform(predictions)

print(predicted_labels)