In [1]:
import pandas as pd
import altair as alt

In [2]:
df = pd.read_csv('ticket-helpdesk-multi-lang.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,queue,priority,language,subcategory,subject,text
0,ACCOUNTING,MEDIUM,EN,Customer Inquiries::Payments,Inquiry About Payment Method Update,"Dear Support Team,\n\nI would like to update t..."
1,ACCOUNTING,MEDIUM,DE,Employee Inquiries::Health and Safety,Mängel Gesundheitsbericht Anwendung,"Sehr geehrtes Support-Team, ich nutze Ihre Anw..."
2,SOFTWARE,LOW,EN,Crypto Wallets,Crypto Wallets Update Inquiry and Billing Info,"Good day, I hope everything is great on your e..."
3,ACCOUNTING,LOW,EN,Employee Inquiries::Staff Development,Possibility of Business Name Change on Next In...,"Hello team,\n\nI noticed there's a slight typo..."
4,HARDWARE,HIGH,EN,Temperature Sensor,High Priority: Temperature Sensor Not Powering Up,I urgently need assistance with my hardware te...


In [4]:
# Check for null values and incorrect data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   queue        399 non-null    object
 1   priority     399 non-null    object
 2   language     399 non-null    object
 3   subcategory  399 non-null    object
 4   subject      399 non-null    object
 5   text         399 non-null    object
dtypes: object(6)
memory usage: 18.8+ KB


In [9]:
# Explore the queue column
print(f"Unique values in queue: {df['queue'].unique()}")

print(df['queue'].value_counts().to_markdown(numalign="left", stralign="left"))

Unique values in queue: ['ACCOUNTING' 'SOFTWARE' 'HARDWARE']
| queue      | count   |
|:-----------|:--------|
| SOFTWARE   | 226     |
| ACCOUNTING | 91      |
| HARDWARE   | 82      |


In [11]:
# Explore the text column
df['text'].head()

0    Dear Support Team,\n\nI would like to update t...
1    Sehr geehrtes Support-Team, ich nutze Ihre Anw...
2    Good day, I hope everything is great on your e...
3    Hello team,\n\nI noticed there's a slight typo...
4    I urgently need assistance with my hardware te...
Name: text, dtype: object

In [19]:
print(df['text'].iloc[0])

Dear Support Team,

I would like to update the payment method linked to my account. I recently encountered an issue with my current payment method and would prefer to switch to a different one. Additionally, I have an outstanding invoice for which I need an updated version reflecting the new payment details.

Thank you for your prompt assistance.

Best regards,
Anthony Weber, Cust# 53212


In [31]:
# Convert texts to lower case

df['text'] = df['text'].str.lower()

In [16]:
# Inspect the disttribution of languages
language_counts = df['language'].value_counts().reset_index()
language_counts.columns = ['Language', 'Frequency']

chart = alt.Chart(language_counts).mark_bar().encode(
    x=alt.X('Language', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Frequency:Q'),
    tooltip=['Language', 'Frequency']
).properties(
    title='Distribution of Languages'
).interactive()

chart.display()

In [None]:
# Download model into path
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cwchang/text-classification-model-multilingual", cache_dir='.')
model = AutoModelForSequenceClassification.from_pretrained("cwchang/text-classification-model-multilingual", cache_dir='.')

BUILD THE DATASET

In [28]:
tokenizer = AutoTokenizer.from_pretrained("models--cwchang--text-classification-model-multilingual/snapshots/939b37821955f4846485f16ac4b18c962b2edc42")
model = AutoModelForSequenceClassification.from_pretrained("models--cwchang--text-classification-model-multilingual/snapshots/939b37821955f4846485f16ac4b18c962b2edc42",
                                                           num_labels=df['queue'].nunique(),
                                                           ignore_mismatched_sizes=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at models--cwchang--text-classification-model-multilingual/snapshots/939b37821955f4846485f16ac4b18c962b2edc42 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([150, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib # for saving the `queue` encoder
import torch

# Encode the `queue` column
label_encoder = LabelEncoder()
df['queue_encoded'] = label_encoder.fit_transform(df['queue'])

# Save the label encoder
joblib.dump(label_encoder, 'queue_encoder.joblib')

# Split the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['queue_encoded'].tolist(),
    test_size=0.2,
    random_state=42
)

# Tokenize the text
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Create a dataset object
class TicketDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
train_dataset = TicketDataset(train_encodings, train_labels)
val_dataset = TicketDataset(val_encodings, val_labels)

In [38]:
print(train_dataset[0]['input_ids'].shape)

torch.Size([177])


In [42]:
from tqdm import tqdm
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

# Set up the dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

# Move model to GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f'Training Epoch {epoch + 1}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f'Train loss: {train_loss / len(train_loader)}')

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
    print(f'Validation loss: {val_loss / len(val_loader)}')

  The optimizer for which to schedule the learning rate.
Training Epoch 1: 100%|██████████| 20/20 [02:48<00:00,  8.41s/it]


Train loss: 1.0605864554643631


Validation: 100%|██████████| 5/5 [00:02<00:00,  2.45it/s]


Validation loss: 0.9663825988769531


Training Epoch 2: 100%|██████████| 20/20 [02:03<00:00,  6.18s/it]


Train loss: 0.843424865603447


Validation: 100%|██████████| 5/5 [00:01<00:00,  3.85it/s]


Validation loss: 0.9905829071998596


Training Epoch 3: 100%|██████████| 20/20 [02:17<00:00,  6.89s/it]


Train loss: 0.5938209861516952


Validation: 100%|██████████| 5/5 [00:01<00:00,  3.46it/s]

Validation loss: 0.9304910182952881





In [50]:
label_encoder.inverse_transform([0])

array(['ACCOUNTING'], dtype=object)

In [43]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [45]:
tokenizer = AutoTokenizer.from_pretrained('./results')
model = AutoModelForSequenceClassification.from_pretrained('./results')