In [3]:
# necessary libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
warnings.filterwarnings('ignore')

In [7]:
#import the data
data = pd.read_csv("/content/ticket-helpdesk-multi-lang.csv")
data.head(5)

Unnamed: 0,queue,priority,language,subcategory,subject,text
0,ACCOUNTING,MEDIUM,EN,Customer Inquiries::Payments,Inquiry About Payment Method Update,"Dear Support Team,\n\nI would like to update t..."
1,ACCOUNTING,MEDIUM,DE,Employee Inquiries::Health and Safety,Mängel Gesundheitsbericht Anwendung,"Sehr geehrtes Support-Team, ich nutze Ihre Anw..."
2,SOFTWARE,LOW,EN,Crypto Wallets,Crypto Wallets Update Inquiry and Billing Info,"Good day, I hope everything is great on your e..."
3,ACCOUNTING,LOW,EN,Employee Inquiries::Staff Development,Possibility of Business Name Change on Next In...,"Hello team,\n\nI noticed there's a slight typo..."
4,HARDWARE,HIGH,EN,Temperature Sensor,High Priority: Temperature Sensor Not Powering Up,I urgently need assistance with my hardware te...


In [8]:
#inspect the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   queue        399 non-null    object
 1   priority     399 non-null    object
 2   language     399 non-null    object
 3   subcategory  399 non-null    object
 4   subject      399 non-null    object
 5   text         399 non-null    object
dtypes: object(6)
memory usage: 18.8+ KB


In [9]:
data['text'][0]

'Dear Support Team,\n\nI would like to update the payment method linked to my account. I recently encountered an issue with my current payment method and would prefer to switch to a different one. Additionally, I have an outstanding invoice for which I need an updated version reflecting the new payment details.\n\nThank you for your prompt assistance.\n\nBest regards,\nAnthony Weber, Cust# 53212'

In [10]:
data['subject'][0]

'Inquiry About Payment Method Update'

In [11]:
#check propotion of the features
data['queue'].value_counts(normalize=True), data['language'].value_counts(normalize=True)

(queue
 SOFTWARE      0.566416
 ACCOUNTING    0.228070
 HARDWARE      0.205514
 Name: proportion, dtype: float64,
 language
 EN    0.433584
 DE    0.223058
 ES    0.223058
 FR    0.120301
 Name: proportion, dtype: float64)

**There are more tickets for the software department and a higher portion of the tickets are in English.**
**This would require splitting the train and test sets proportionally to avoid imbalance and bias for the model.**

### Data Preprocessing

**The main subcategory for each ticket is needed to maintain some unifromity**

In [12]:
# Preprocess the subcategory column

def preprocess_subcategory(text):
  if "::" in text:
    text = text.split("::")[-1]
  return text

In [13]:
data['subcategory'] = data['subcategory'].apply(preprocess_subcategory)

**Combine subcategory, subject and text**

**Method**

subcategory: content ~ subject: content ~ email: content

In [14]:
# combine subcategory, subject and text into one variable that will be passed to the model
data['text'] = "Subcategory:"+ data['subcategory']+ " ~ "+ "Subject:"+ data['subject']+ " ~ "+ "Email:"+ data['text']

In [15]:
data = data.rename(columns= {"queue":"label"})

**Encode the Label values**

In [16]:
data['label'].unique()

array(['ACCOUNTING', 'SOFTWARE', 'HARDWARE'], dtype=object)

In [17]:
encoder = {"ACCOUNTING":0,
           "SOFTWARE":1,
           "HARDWARE":2}

data['label'] = data['label'].map(encoder)

**Split the data into train and test sets. Using the language and label columns to enusre the model receives balanced data in the train and test set**

In [18]:
data['stratify_column'] = data['language']+"_"+data['label'].astype(str)

In [19]:
train_set,test_set = train_test_split(data, test_size=0.2, stratify=data['stratify_column'], random_state=42)

In [20]:
train_set['label'].value_counts(), test_set['label'].value_counts()

(label
 1    180
 0     73
 2     66
 Name: count, dtype: int64,
 label
 1    46
 0    18
 2    16
 Name: count, dtype: int64)

In [21]:
train_set['language'].value_counts(), test_set['language'].value_counts()

(language
 EN    139
 DE     71
 ES     71
 FR     38
 Name: count, dtype: int64,
 language
 EN    34
 ES    18
 DE    18
 FR    10
 Name: count, dtype: int64)

**Select the required columns for training the model**

In [23]:
train_dataset, test_dataset = train_set[['label', 'text']], test_set[['label', 'text']]
train_dataset.to_csv('/content/train_mbl.csv', index=False)
test_dataset.to_csv('/content/test_mbl.csv', index=False)

In [None]:
train_dataset.head()

Unnamed: 0,label,text
55,2,Subcategory:Brother HL-L8360CDW ~ Subject:URGE...
243,2,Subcategory:Parabolic Antenna ~ Subject:Parabo...
202,2,Subcategory:Smart-Roboter-Mopp ~ Subject:Issue...
253,0,Subcategory:Feedback ~ Subject:Seigue Algunas ...
218,2,Subcategory:UPS ~ Subject:UPS gibt zufällige P...


### **Load the Data as a transformer dataset**

In [24]:
data_files={"train": "/content/train_mbl.csv",
            "test": "/content/test_mbl.csv"}

dataset = load_dataset("csv", data_files = data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 319
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 80
    })
})

In [26]:
dataset['train'][0]

{'label': 2,
 'text': "Subcategory:Brother HL-L8360CDW ~ Subject:URGENT: Brother HL-L8360CDW won't start up ~ Email:Hi Support, I urgently need help. My Brother HL-L8360CDW printer isn’t starting up. This is a critical issue as I need it for daily business operations. No lights are coming on and I've tried all troubleshooting steps. Please assist as soon as possible. Much appreciated!"}

### Tokenize the Text Field

`distilbert-base-multilingual-cased` model was used due to the presence of multiple languages and it [supports the languages](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages) present in the data.

In [27]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [28]:
#tokenizer function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [29]:
tokinized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/319 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [30]:
#padding data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation Function

In [31]:
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [32]:
#function to calculate metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # return metric.compute(predictions=predictions, references=labels)
    return accuracy.compute(predictions=predictions, references=labels)

### Train

In [33]:
#mapping the labels to the values for the model
id2label = {0: "ACCOUNTING", 1: "SOFTWARE", 2: "HARDWARE"}
label2id = {"ACCOUNTING":0, "SOFTWARE":1, "HARDWARE":2}

In [34]:
#load the model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased", num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
training_args = TrainingArguments(
    output_dir="/content/my_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokinized_dataset["train"],
    eval_dataset=tokinized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.970743,0.575


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.970743,0.575
2,No log,0.961671,0.575


TrainOutput(global_step=40, training_loss=0.9875455856323242, metrics={'train_runtime': 690.4111, 'train_samples_per_second': 0.924, 'train_steps_per_second': 0.058, 'total_flos': 21958415269740.0, 'train_loss': 0.9875455856323242, 'epoch': 2.0})

In [37]:
trainer.save_model("/content/trainer_model")

In [38]:
tokenizer.save_pretrained("/content/tokenizer")

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')

### Inference

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("/content/trainer_model")

In [39]:
dataset['train'][1]

{'label': 2,
 'text': 'Subcategory:Parabolic Antenna ~ Subject:Parabolantennen funktionieren nicht ~ Email:Meine kürzlich gekaufte Parabolic Antenna zeigt keine Reaktion mehr. Ich bin auf sie für meine Arbeit dringend angewiesen. Bitte helfen Sie mir so schnell wie möglich!'}

In [40]:
text = dataset['train'][1]['text']

In [41]:
classifier = pipeline("text-classification", model="/content/trainer_model")
classifier(text)

[{'label': 'SOFTWARE', 'score': 0.5752891302108765}]