Import the Dependencies

In [68]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [69]:
df = pd.read_csv('/content/helpdesk_customer_tickets.csv')

In [70]:
df = df[['body','tag_1']]

In [72]:
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['tag_1'])

In [73]:
df

Unnamed: 0,body,tag_1,labels
0,Sehr geehrtes Support-Team des Tech Online Sto...,Product Support,10
1,Le client signale des dÃ©connexions frÃ©quentes ...,Technical Support,17
2,"Problema con el sonido, manejando como devoluc...",Returns and Exchanges,11
3,"Cher support client,\n\nNotre client, <name>, ...",Technical Support,17
4,Caro Suporte ao Cliente da Firma de Consultori...,Urgent Issue,18
...,...,...,...
595,"Estimado Soporte de Servicios de TI,\n\nEstamo...",Technical Support,17
596,"Estimado equipo de soporte de servicios de TI,...",Technical Support,17
597,"Estimado soporte al cliente, La pantalla tÃ¡cti...",Technical Support,17
598,"Caro Suporte ao Cliente,\n\nNosso ServiÃ§o de C...",Technical Support,17


In [74]:
# Convert DataFrame to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[['body', 'labels']])

**Tokenization**

In [76]:
from transformers import DistilBertTokenizer

In [77]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [78]:
# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples['body'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [79]:
#Prepare the dataset for training
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


**Model Initialization**

In [80]:
from transformers import DistilBertForSequenceClassification

# Specify the number of unique labels
num_labels = len(df['labels'].unique())

# Load the DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [89]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir='./distilbert_ticket_classification',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,   # Adjust based on GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=200,
)



In [90]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Ideally, split into train/eval datasets
)

In [91]:
#Start Training
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.141386
2,No log,0.830612
3,1.133200,0.596252
4,1.133200,0.519665
5,1.133200,0.389072
6,0.626800,0.280994
7,0.626800,0.220781
8,0.308500,0.163654
9,0.308500,0.144702
10,0.308500,0.135445


TrainOutput(global_step=750, training_loss=0.588356803894043, metrics={'train_runtime': 130.0722, 'train_samples_per_second': 46.128, 'train_steps_per_second': 5.766, 'total_flos': 198764881920000.0, 'train_loss': 0.588356803894043, 'epoch': 10.0})

**Save the Model and Tokenizer**

In [92]:
# Save the fine-tuned model
trainer.save_model('distilbert_ticket_classification')

# Save the tokenizer
tokenizer.save_pretrained('distilbert_ticket_classification')


('distilbert_ticket_classification/tokenizer_config.json',
 'distilbert_ticket_classification/special_tokens_map.json',
 'distilbert_ticket_classification/vocab.txt',
 'distilbert_ticket_classification/added_tokens.json')

**Using the Fine Tuned Model**

In [97]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
classifier = pipeline("text-classification", model='distilbert_ticket_classification', tokenizer='./distilbert_ticket_classification')

# Example inference
sample_text = """
Dear Tech Online Store Support,

I am writing to request a return for my Samsung QLED QN90A due to a defective HDR feature. The issue was discovered within the return window. Please advise on the next steps for processing this return.

Thank you,
<name>
<acc_num>
<tel_num>
"""

sample_text2 = """
Caro Time de Suporte ao Cliente,

Recentemente, comprei uma licenÃƒÂ§a para o Adobe Photoshop 2024. No entanto, estou vendo cobranÃƒÂ§as inesperadas na minha fatura. VocÃƒÂª poderia fornecer mais detalhes ou esclarecer essas cobranÃƒÂ§as?

Obrigado,
<name>
<email>
"""

sample_text3 = """
Caro Time de Suporte ao Cliente,

Estou escrevendo para relatar problemas de login com o Microsoft Office 365 durante uma apresentaÃƒÂ§ÃƒÂ£o crÃƒÂ­tica de prÃƒÂ©-vendas. Nossos clientes nÃƒÂ£o conseguem fazer login, e isso estÃƒÂ¡ afetando nossa capacidade de mostrar o produto de forma eficaz. Precisamos de assistÃƒÂªncia imediata para resolver este assunto rapidamente.

Obrigado por sua compreensÃƒÂ£o e atenÃƒÂ§ÃƒÂ£o rÃƒÂ¡pida a este problema urgente.

Atenciosamente,

<name>
<company_name>
<tel_num>
<email>
"""
predictions = classifier(sample_text3)

# Decode the label
predicted_label = label_encoder.inverse_transform([int(predictions[0]['label'].split('_')[-1])])
print(f"Predicted label: {predicted_label[0]}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted label: Login Issue
