In [114]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb


from path_setup import setup_paths
setup_paths()

In [115]:
from dataset_transformer import BaseDatasetTransform
from base_model_train import BaseModelTrain
from model_evaluator import ModelEvaluator

In [116]:
df = pd.read_csv('../datasets/helpdesk_customer_tickets.csv')

In [117]:
df = df.drop(['id','answer','priority','tag_5','tag_6','tag_7','tag_8','tag_9'],axis=1)
df.head()

Unnamed: 0,subject,body,type,queue,language,business_type,tag_1,tag_2,tag_3,tag_4
0,Anfrage zu den Spezifikationen und Anpassungso...,Sehr geehrtes Support-Team des Tech Online Sto...,Request,Customer Service,de,Tech Online Store,Product Support,Sales Inquiry,Technical Guidance,General Inquiry
1,Déconnexions fréquentes et plantages,Le client signale des déconnexions fréquentes ...,Incident,Product Support,fr,Software Development Company,Technical Support,Software Bug,Service Disruption,System Crash
2,Problema de sonido Dell XPS,"Problema con el sonido, manejando como devoluc...",Problem,Returns and Exchanges,es,Tech Online Store,Returns and Exchanges,Product Support,Customer Service,Refund Request
3,Assistance requise pour la configuration du ta...,"Cher support client,\n\nNotre client, <name>, ...",Request,Product Support,fr,Software Development Company,Technical Support,Product Support,General Inquiry,Problem Resolution
4,Urgente: Assistência Imediata Necessária para ...,Caro Suporte ao Cliente da Firma de Consultori...,Incident,Human Resources,pt,IT Consulting Firm,Urgent Issue,Payroll Issue,Technical Support,Service Disruption


In [118]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import pandas as pd

from transformers import BertTokenizer, BertModel

model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)



In [119]:
from tqdm import tqdm

def get_embeddings(texts, batch_size):
    embeddings_list = []
    model.eval()  
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        
        inputs = {key: value.to(model.device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)

            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings_list.extend(batch_embeddings.cpu().numpy().tolist())
    
    return embeddings_list



In [120]:
descriptions_body = df['body'].tolist()
embeddings_body = get_embeddings(descriptions_body, 32)

Processing batches:   0%|          | 0/19 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 19/19 [01:39<00:00,  5.24s/it]


In [121]:
descriptions_subject = df['subject'].fillna('').tolist()
embeddings_subject = get_embeddings(descriptions_subject, 32)

Processing batches: 100%|██████████| 19/19 [00:22<00:00,  1.20s/it]


In [122]:
np.array(embeddings_body).shape

(600, 768)

In [123]:
col = [f'{i+1}_body' for i in range(len(embeddings_body[0]))]

df_body = pd.DataFrame(embeddings_subject, columns=col)

df = pd.concat([df, df_body], axis=1)


col = [f'{i+1}_subject' for i in range(len(embeddings_body[0]))]

df_subject = pd.DataFrame(embeddings_subject, columns=col)

df = pd.concat([df, df_subject], axis=1)
df.head()

Unnamed: 0,subject,body,type,queue,language,business_type,tag_1,tag_2,tag_3,tag_4,...,759_subject,760_subject,761_subject,762_subject,763_subject,764_subject,765_subject,766_subject,767_subject,768_subject
0,Anfrage zu den Spezifikationen und Anpassungso...,Sehr geehrtes Support-Team des Tech Online Sto...,Request,Customer Service,de,Tech Online Store,Product Support,Sales Inquiry,Technical Guidance,General Inquiry,...,-0.038035,0.281225,-0.415702,-1.028785,0.114163,0.279624,-0.325105,1.055942,0.672377,-0.182328
1,Déconnexions fréquentes et plantages,Le client signale des déconnexions fréquentes ...,Incident,Product Support,fr,Software Development Company,Technical Support,Software Bug,Service Disruption,System Crash,...,0.275617,-0.059634,-0.451603,0.134279,-0.07731,0.443718,0.068902,0.072667,0.323374,0.257339
2,Problema de sonido Dell XPS,"Problema con el sonido, manejando como devoluc...",Problem,Returns and Exchanges,es,Tech Online Store,Returns and Exchanges,Product Support,Customer Service,Refund Request,...,0.535669,0.026694,-0.129374,-0.520028,0.067733,0.278351,-0.092054,0.325286,0.016915,-0.101192
3,Assistance requise pour la configuration du ta...,"Cher support client,\n\nNotre client, <name>, ...",Request,Product Support,fr,Software Development Company,Technical Support,Product Support,General Inquiry,Problem Resolution,...,0.299054,-0.104939,-0.47085,-0.384595,0.215907,0.55264,0.281748,0.457512,0.292831,-0.14696
4,Urgente: Assistência Imediata Necessária para ...,Caro Suporte ao Cliente da Firma de Consultori...,Incident,Human Resources,pt,IT Consulting Firm,Urgent Issue,Payroll Issue,Technical Support,Service Disruption,...,0.278346,-0.213875,-0.330915,-0.925604,0.33191,0.336009,0.005618,-0.089741,0.257151,-0.071302


In [124]:
df = df.drop(['subject'	,'body'],axis=1)

In [125]:
bdf = BaseDatasetTransform(df,target='queue')

In [126]:
df,categorical_features = bdf.fit_transform()

Series([], dtype: int64)

В колонке(ах) ['queue'] нет пропущенных значений
-------------------------------------------
В наборе данных нет пропущенных значений
-------------------------------------------
Информация о колонках в датасете

Категориальные колонки:
Index(['type', 'language', 'business_type', 'tag_1', 'tag_2', 'tag_3',
       'tag_4'],
      dtype='object')

Числовые колонки:
Index(['1_body', '2_body', '3_body', '4_body', '5_body', '6_body', '7_body',
       '8_body', '9_body', '10_body',
       ...
       '759_subject', '760_subject', '761_subject', '762_subject',
       '763_subject', '764_subject', '765_subject', '766_subject',
       '767_subject', '768_subject'],
      dtype='object', length=1536)


In [127]:
X_train, X_test, y_train, y_test  = bdf.get_train_test_split()

Количество значений целевой переменной по категориям:
queue
Technical Support                  210
Product Support                     93
Customer Service                    85
IT Support                          77
Billing and Payments                46
Returns and Exchanges               41
Human Resources                     15
Service Outages and Maintenance     15
Sales and Pre-Sales                 13
General Inquiry                      5
Name: count, dtype: int64
Следует ли выполнить стратифицированное раздеение на обучающую и тестовую выборку? y/n
Разделение датасета выполнено успешно


In [128]:
lgbm_model = lgb.LGBMClassifier(verbose=-1)

columns_to_delete = []  

unique_classes = len(set(y_train))
print(unique_classes)

model_train = BaseModelTrain(
    model=lgbm_model,
    columns_to_delete=columns_to_delete,
    categorical_features=categorical_features,
    boosting_type='gbdt',
    objective='multiclass',  
    num_class=unique_classes, 
    metric='multi_error' 
)

10


In [129]:
metrics = ['accuracy', 'precision', 'recall', 'f1_macro']
evaluator = ModelEvaluator(model_trainer=model_train, metrics=metrics)

evaluator.fit(X_train, y_train)

In [130]:
evaluator.evaluate_to_dataframe(X_test, y_test)

Unnamed: 0,Metric,Score
0,accuracy,0.6
1,precision,0.607268
2,recall,0.463224
3,f1,0.495618
