In [None]:
# !pip install torch transformers datasets evaluate accelerate scikit-learn langdetect

In [None]:
from datasets import load_dataset

# Fetch the customer-support ticket corpus; the result is indexed by split name in the next cell.
cust_tickets = load_dataset(path='Tobi-Bueck/customer-support-tickets')

In [None]:
# Materialise the 'train' split as a pandas DataFrame for exploration and filtering.
train_split = cust_tickets['train']
cust_tickets_df = train_split.to_pandas()

In [None]:
# Quick look at the first five rows of the raw ticket table.
print(cust_tickets_df.head(n=5))

In [None]:
# Shape of the subset whose dataset-provided 'language' column is 'en'.
is_english = cust_tickets_df['language'].eq('en')
print(cust_tickets_df.loc[is_english].shape)

In [None]:
# Keep only the columns used downstream. 'body' must stay first: the language
# detection loop reads it positionally (column 0).
kept_columns = ['body', 'type', 'language']
cust_sub = cust_tickets_df.loc[:, kept_columns].copy()

In [None]:
# First five rows of the three-column working frame.
print(cust_sub.head(n=5))

In [None]:
# Count missing values in each column before dropping incomplete rows.
missing_per_column = cust_sub.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that has a missing value in any of the kept columns.
cust_sub = cust_sub.dropna(axis=0, how='any')

In [None]:
from langdetect import DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException

# langdetect samples randomly internally; fixing the seed before the first
# detection makes repeated runs assign the same language to every ticket.
DetectorFactory.seed = 0


def _top_language(text):
    """Return the most probable language code for `text`.

    Falls back to 'unknown' when langdetect cannot extract any features
    (e.g. a body made only of digits or punctuation) or returns no candidate,
    so a single odd ticket does not abort the whole pass.
    """
    try:
        candidates = detect_langs(text)
    except LangDetectException:
        return 'unknown'
    # Candidates come back ordered best-first; read the code from the object
    # instead of parsing the list's string representation.
    return candidates[0].lang if candidates else 'unknown'


# One detected language code per row of cust_sub, in row order.
languages = [_top_language(body) for body in cust_sub['body']]

In [None]:
# Sanity check: as the cell's last expression, the container type of `languages` is displayed.
type(languages)

In [None]:
# Peek at the detected codes for the first ten tickets.
first_ten_codes = languages[:10]
print(first_ten_codes)

In [None]:
from collections import Counter

# Tally how many tickets were assigned to each detected language code.
counter = Counter()
counter.update(languages)

print(counter)

In [None]:
# Overwrite the dataset-provided 'language' column with the detected codes
# (one per row, same order as the rows).
cust_sub = cust_sub.assign(language=languages)

In [None]:
# First five rows, now carrying the detected language codes.
print(cust_sub.head(n=5))

In [None]:
# Keep only the tickets detected as English; copy so later column edits
# do not touch cust_sub.
english_rows = cust_sub['language'].eq('en')
cust_en = cust_sub.loc[english_rows].copy()

In [None]:
# (rows, columns) of the English-only frame.
n_rows, n_cols = cust_en.shape
print((n_rows, n_cols))

In [None]:
# First five English tickets.
print(cust_en.head(n=5))

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Learn the set of ticket types, then store each row's integer class id in 'label'.
label_encoder = LabelEncoder().fit(cust_en['type'])
cust_en['label'] = label_encoder.transform(cust_en['type'])

In [None]:
# Show the fitted class names; their positions in `classes_` are reused later
# as the integer ids when label2id / id2label are built.
print(label_encoder.classes_)

In [None]:
# preprocess_function reads the ticket body from a column named 'text', so rename it,
# then keep just the model input and its target.
cust_en = cust_en.rename(columns={'body': 'text'})
cust_en_set = cust_en.loc[:, ['text', 'label']].copy()

In [None]:
print(cust_en_set.head())

In [None]:
# Convert the pandas DataFrame to a Hugging Face Dataset
from datasets import Dataset

# After filtering, cust_en_set no longer has a plain 0..n-1 index. Without
# preserve_index=False that index would be carried into the dataset as an
# extra '__index_level_0__' column.
hf_cust = Dataset.from_pandas(cust_en_set, preserve_index=False)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split,
# and therefore the reported metrics, the same across kernel restarts.
hf_cust = hf_cust.train_test_split(test_size=0.2, seed=42)

In [None]:
# Preprocess
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint being fine-tuned.
checkpoint_name = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
def preprocess_function(examples):
    """Tokenise the 'text' field of a batch of examples with truncation enabled.

    No padding is requested here; `examples` must expose a 'text' entry that
    the module-level `tokenizer` can encode. Returns the tokenizer's output.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenise every split with `map`; batched=True hands the function many examples per call.
tokenized_cust = hf_cust.map(function=preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Evaluate
import evaluate

# Metric object used by compute_metrics to score predictions against labels.
accuracy = evaluate.load(path='accuracy')

In [None]:
# Define the metric callback: turn the raw model outputs into class predictions and score them against the labels with accuracy
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `accuracy` metric.

    `eval_pred` unpacks into (model outputs, reference labels). The predicted
    class of each example is the index of its largest output along axis 1.
    Returns whatever `accuracy.compute` returns.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Train - build the two lookup tables between class names and integer ids.
# An id is the position of the class name in label_encoder.classes_.
id2label = dict(enumerate(label_encoder.classes_))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Start Training
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label map rather than a hard-coded 4,
# so it always matches the classes actually present after the English filter.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Report the shape of each tokenised split, train first.
for split_name in ('train', 'test'):
    print(tokenized_cust[split_name].shape)

In [None]:
# Fine-tuning setup: hyperparameters first, then the Trainer that ties the model,
# data, collator and metric together, and finally the training run itself.
training_args = TrainingArguments(
    output_dir='./results',
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint on the same schedule (once per epoch)
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

train_split_tokenized = tokenized_cust['train']
eval_split_tokenized = tokenized_cust['test']

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    train_dataset=train_split_tokenized,
    eval_dataset=eval_split_tokenized,
)

trainer.train()

In [None]:
# Make Inference
import torch

text = "User cannot connect to VPN"

# Switch to evaluation mode so dropout is disabled and predictions are deterministic.
model.eval()

# Truncate as in training, and move the encoded inputs to the model's device:
# after training the model may sit on an accelerator while fresh tensors start on CPU.
inputs = tokenizer(text, return_tensors='pt', truncation=True).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class for the single input sentence.
pred = logits.argmax(dim=1).item()
print('Predicted label:', model.config.id2label[pred])