In [1]:
import os
import pandas as pd
import numpy as np
import torch
import tensorflow as tf

from typing import Optional, Union
from datasets import Dataset
from dataclasses import dataclass
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.trainer_callback import EarlyStoppingCallback, TrainerCallback, TrainerState, TrainerControl
from transformers import DataCollatorWithPadding
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import re

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def ignore_email(text):
    """Drop the first line of a ticket body that starts with ``'received'``.

    The check is made on ``str(text)``. When it matches, everything up to and
    including the first newline is discarded and the remainder is returned; if
    there is no newline the whole string is returned. Any value that does not
    match is handed back unchanged (it is not converted to ``str``).
    """
    body = str(text)
    if not body.startswith('received'):
        return text
    return body.split('\n', 1)[-1]
def t1_change(group):
    """Collapse an assignment-group name onto a canonical group name.

    Rules are tried from top to bottom and the first match wins, so their
    order matters (for example ``'SecurityZoom'`` is tested before
    ``'Security'``). A name that matches no rule is returned unchanged.
    """
    # Each rule: (substring that must be present,
    #             substring that must be absent, or None for no restriction,
    #             canonical name returned on a match)
    substring_rules = (
        ('FieldSupport', 'HDH', 'ITS-FieldSupport-Intake'),
        ('SecurityZoom', None, 'ITS-Zoom'),
        ('HDH', None, 'ITS-HDH'),
        ('RMP', None, 'ITS-RMP'),
        ('Library', None, 'ITS-Library'),
        ('Security', None, 'ITS-SecuritySOC'),
        ('FIS', 'Config', 'ITS-FIS'),
        ('Datacomm', None, 'ITS-Datacomm'),
        ('ServiceDesk', None, 'ITS-ServiceDesk'),
        ('SNOW', None, 'ITS-SNOW'),
    )
    for required, forbidden, canonical in substring_rules:
        if required in group and (forbidden is None or forbidden not in group):
            return canonical
    # The only exact-match rule: 'ITS-JAMF' is renamed, everything else passes through.
    return 'ITS-WorkstationLifecycle' if group == 'ITS-JAMF' else group

In [4]:
# Assignment groups whose tickets are removed from the data: the loading cell
# drops every row whose `assignment_group` appears in this list.
defunct = [
    'ITS-Directory',
    'ITS-ServiceDesk-MisroutedTicket',
    'ITS-MECM',
    'ITS-Billing',
    'ITS-Purchasing',
    'ITS-BADG',
    'ITS-WebTech',
    'ITS-Planning',
    'ITS-EcotimeSupport',
    'ITS-CCR-Support',
    'ITS-LaptopLoaners',
    'ITS-InTune',
    'ITS-MediaTeachLab',
    'ITS-UCSD-AH-Support',
    'ITS-StaffAdmin',
    'ITS-DataWarehouse',
    'ITS-EndpointStorekeeper',
    'ITS-MiddlewareIntegration',
    'ITS-TSMT',
    'ITS-StaffHR',
    'ITS-ChangeManagement',
    'ITS-FacilitiesMgmt',
    'ITS-ContinuityPlanning',
    'ITS-EnterpriseArchitecture',
    'ITS-LSS-Support',
    'ITS-CloudFinance',
    'ITS-MediaServicesMgmt',
    'ITS-ServiceMgmtOffice',
    'ITS-FinancialMainframe',
    'ITS-PatchMgmt',
    'ITS-WorkplaceTechServices',
    'ITS-ProblemManagement',
    'ITS-FIM-CAD',
]

In [5]:
# Name of the target (label) column. Later cells use it to select the label
# column, fit the label encoder and look up each example's class id.
to_predict = 'assignment_group'

In [6]:
# Load the raw ticket export. Bytes that cannot be decoded as UTF-8 are
# dropped (encoding_errors="ignore") and malformed rows only emit a warning.
df = pd.read_csv(
    "/kaggle/input/nophonetickets/sn_customerservice_case (1).csv",
    encoding="utf-8",
    encoding_errors="ignore",
    engine="c",
    on_bad_lines="warn",
)

# Keep a row only if its group is not in `defunct`, does not mention 'EVCATS'
# and does mention 'ITS'. na=False makes a missing group count as "no match",
# so such rows are filtered out instead of breaking `~` / boolean indexing
# with NaN values in the mask.
group_col = df['assignment_group']
keep_rows = (
    ~group_col.isin(defunct)
    & ~group_col.str.contains('EVCATS', na=False)
    & group_col.str.contains('ITS', na=False)
)
# .copy() detaches the filtered frame so the column assignments below write to
# an independent object and do not raise SettingWithCopyWarning.
df = df[keep_rows].copy()

# Merge sub-groups into their canonical group, then cut the first four
# characters. NOTE(review): the slice assumes every remaining name starts with
# the 4-character prefix 'ITS-' -- confirm against the data.
df['assignment_group'] = df['assignment_group'].apply(t1_change).str[4:]

# Build the model input text. Missing parts are replaced with '' BEFORE
# concatenating: NaN + str is NaN, so filling only afterwards would blank the
# entire text of a ticket that lacks just one of the two fields.
short_text = df['short_description'].fillna('').astype(str)
long_text = df['description'].apply(ignore_email).fillna('').astype(str)
df['combined'] = short_text + ' #### ' + long_text

# Only the text and the target column are needed downstream.
df = df[["combined", to_predict]]
#df.to_csv("/kaggle/working/processed_its_top_25.csv")

In [7]:
#filename = '/kaggle/working/processed_its_top_25.csv'
# Keep only tickets whose text is shorter than MAX_TEXT_CHARS characters.
# A missing text has length NaN, which compares as False, so those rows are
# dropped here too and no NaN filling is required afterwards.
# .copy() detaches the filtered frame so the assignment below does not raise
# SettingWithCopyWarning.
MAX_TEXT_CHARS = 600
df = df[df['combined'].str.len() < MAX_TEXT_CHARS].copy()

# Flatten line breaks to spaces. regex=False states explicitly that these are
# literal replacements (the default of `regex` differs between pandas versions).
df['combined'] = (
    df['combined']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
)

X = df["combined"].to_numpy()
le = LabelEncoder()
#y = le.fit(df["service_offering"])
# fit() returns the encoder itself, so `y` is the fitted LabelEncoder (not an
# array of encoded labels); the next cell calls y.transform(...).
y = le.fit(df[to_predict].astype(str))


In [8]:
# Distinct target values as strings, in order of first appearance.
offerings = df[to_predict].unique().astype(str)

# `y` is the object returned by le.fit(...); ask it for each class's integer id.
labels = y.transform(offerings)

# Lookup tables in both directions: class name -> id and id -> class name.
label2id = {name: idx for name, idx in zip(offerings, labels.tolist())}
id2label = {idx: name for idx, name in zip(labels.tolist(), offerings)}
train_set = df

In [9]:
def preprocess(example):
    """Tokenize one example and attach its integer class id.

    Reads the text from ``example['combined']`` and the class name from
    ``example[to_predict]``. Relies on the module-level ``tokenizer`` and
    ``label2id``. Returns the tokenizer output with an added ``'label'`` entry.
    """
    encoded = tokenizer(
        example['combined'],
        truncation=True,
        padding=True,
        max_length=1024,
    )
    class_name = example[to_predict]
    encoded['label'] = label2id[class_name]
    return encoded
###
#def compute_metrics(eval_pred):
#    predictions, labels = eval_pred
#    predictions = np.argmax(predictions, axis=1)
#    return {'accuracy': (predictions==labels).mean()}
###

In [10]:
# NOTE: despite the variable name, this path points at the DeBERTa-v3 *base*
# checkpoint directory. The name is kept because later cells reference it.
deberta_v3_large = '/kaggle/input/microsoftdeberta-v3-base/deberta-v3-base'
# Load the tokenizer from that checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Drop every row that still has a missing value in any column.
df = df.dropna()

def sampling_k_elements(group, k=1500, random_state=40):
    """Cap a group at ``k`` rows by random down-sampling.

    Args:
        group: pandas object (DataFrame or Series) holding one group's rows.
        k: Maximum number of rows to keep.
        random_state: Seed forwarded to ``group.sample``. It defaults to a
            fixed value (40, the same seed this notebook uses for its
            train/test split) so that re-running the notebook produces the
            same sample. Pass ``None`` for a fresh random draw on every call.

    Returns:
        ``group`` itself when it has fewer than ``k`` rows, otherwise a random
        sample of exactly ``k`` rows.
    """
    if len(group) < k:
        return group
    return group.sample(n=k, random_state=random_state)

# Cap every class with sampling_k_elements (default k), then discard the
# group index so the result has a plain 0..n-1 index.
balanced = (
    df.groupby(to_predict)
    .apply(sampling_k_elements)
    .reset_index(drop=True)
)


def map3(y_true, y_pred):
    """Mean reciprocal rank of the true label within the ranked predictions.

    ``y_true`` is reshaped to a column and compared against every column of
    ``y_pred``. Each row scores ``1 / rank`` of its first match (1-based), or
    ``0`` when the true label does not appear. Returns the mean over rows.
    """
    hit_mask = y_true.reshape(-1, 1) == y_pred
    row_has_hit = hit_mask.any(axis=1)
    first_hit_rank = hit_mask.argmax(axis=1) + 1
    # Rows without a hit get rank = inf so that rank ** -1 evaluates to 0.
    rank = np.where(row_has_hit, first_hit_rank, np.inf)
    return np.mean(rank ** (-1))

def compute_metrics(eval_pred):
    """Metric callback: score the three highest-ranked classes with ``map3``.

    ``eval_pred`` unpacks into ``(predictions, labels)``. ``predictions`` is
    treated as a 2-D array with one row per example and one score per class
    (presumably the model logits -- confirm against the Trainer output).
    Returns ``{'map3': value}``.
    """
    scores, true_ids = eval_pred
    # Negating the scores makes argsort list classes from highest to lowest score.
    ranked_classes = np.argsort(-scores, axis=1)
    top3 = ranked_classes[:, :3]
    return {'map3': map3(true_ids, top3)}

best_score = float('-inf')
#df.columns = ['Unnamed: 0', 'text', 'labels']
#train_df = df.sample(10000)[['text', 'labels']]
train_df = balanced[['combined', to_predict]]
trained_model_dir_path = '/kaggle/working'


#ds = Dataset.from_pandas(train_df[['text','labels']]).train_test_split(test_size=0.2, stratify_by_column="labels")
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['combined'],
    train_df[to_predict],
    test_size=0.2,
    random_state=40,
)

# Re-join text and label for each side and wrap the frames as datasets.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)

# Tokenize every example, then drop the raw label-name column; the integer
# class id added by preprocess() stays in the 'label' column.
tokenized_train = train_ds.map(preprocess).remove_columns([to_predict])
tokenized_valid = test_ds.map(preprocess).remove_columns([to_predict])

  0%|          | 0/39954 [00:00<?, ?ex/s]

  0%|          | 0/9989 [00:00<?, ?ex/s]

In [12]:
# Build the paths with os.path.join. Plain concatenation
# (trained_model_dir_path + 'tts') produced '/kaggle/workingtts', i.e. a
# directory next to the working directory instead of a folder inside it.
output_dir = os.path.join(trained_model_dir_path, 'tts')
best_model_dir = os.path.join(trained_model_dir_path, 'best')

training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate and checkpoint once per epoch, and reload the best checkpoint
    # (as ranked by `metric_for_best_model`) when training finishes.
    load_best_model_at_end=True,
    save_total_limit=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    warmup_ratio=0.2,
    learning_rate=4e-5,
    # 8 examples per device x 4 accumulation steps per optimizer update.
    gradient_accumulation_steps=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    report_to='none',
    weight_decay=0.03,
    lr_scheduler_type='linear',
    # 'map3' is the key of the dict returned by compute_metrics.
    metric_for_best_model='map3'
)

# Load the checkpoint with a classification head described by the label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    deberta_v3_large,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of the model checkpoint at /kaggle/input/microsoftdeberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequen

Epoch,Training Loss,Validation Loss,Map3
0,3.666,1.999156,0.6058
2,1.87,1.261506,0.755448
2,1.2115,0.96586,0.818317
4,0.9195,0.885627,0.837071
4,0.5749,0.858514,0.846581
6,0.4788,0.876107,0.849418
6,0.3865,0.874882,0.853989
7,0.3257,0.883924,0.855908




TrainOutput(global_step=4992, training_loss=1.0455395319522955, metrics={'train_runtime': 11248.2116, 'train_samples_per_second': 28.416, 'train_steps_per_second': 0.444, 'total_flos': 2.852832907234891e+16, 'train_loss': 1.0455395319522955, 'epoch': 7.99})

In [13]:
# Save the trainer's current model to the Kaggle working directory.
trainer.save_model("/kaggle/working/model")
