In [1]:
# Import necessary libraries
from collections import Counter

import lightgbm as lgb
import pandas as pd
from datasets import load_dataset
from langdetect import detect_langs, DetectorFactory
from nltk import word_tokenize
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

In [2]:
# Load the customer-support tickets dataset from Hugging Face.
# The result is a DatasetDict, not yet a DataFrame.
df = load_dataset(path='Tobi-Bueck/customer-support-tickets')

In [3]:
# Confirm which container type load_dataset returned
print(df.__class__)

<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
# Convert the 'train' split of the Hugging Face DatasetDict to a pandas DataFrame.
# `df` is rebound from a DatasetDict to a DataFrame here, so the conversion is
# guarded: re-running this cell on its own no longer fails on df['train'].
if not isinstance(df, pd.DataFrame):
    df = df['train'].to_pandas()

In [5]:
# Preview the first 5 tickets of the raw DataFrame
print(df.head(n=5))

                                             subject  \
0                    Wesentlicher Sicherheitsvorfall   
1                                 Account Disruption   
2  Query About Smart Home System Integration Feat...   
3                  Inquiry Regarding Invoice Details   
4  Question About Marketing Agency Software Compa...   

                                                body  \
0  Sehr geehrtes Support-Team,\n\nich möchte eine...   
1  Dear Customer Support Team,\n\nI am writing to...   
2  Dear Customer Support Team,\n\nI hope this mes...   
3  Dear Customer Support Team,\n\nI hope this mes...   
4  Dear Support Team,\n\nI hope this message reac...   

                                              answer      type  \
0  Vielen Dank für die Meldung des kritischen Sic...  Incident   
1  Thank you for reaching out, <name>. We are awa...  Incident   
2  Thank you for your inquiry. Our products suppo...   Request   
3  We appreciate you reaching out with your billi...   Request

In [6]:
# Keep only the ticket body, its type and its language tag.
# .copy() gives an independent frame so later edits do not touch `df`.
selected_columns = ['body', 'type', 'language']
df_set = df[selected_columns].copy()

In [7]:
# Preview the first 5 rows of the three-column subset
print(df_set.head(n=5))

                                                body      type language
0  Sehr geehrtes Support-Team,\n\nich möchte eine...  Incident       de
1  Dear Customer Support Team,\n\nI am writing to...  Incident       en
2  Dear Customer Support Team,\n\nI hope this mes...   Request       en
3  Dear Customer Support Team,\n\nI hope this mes...   Request       en
4  Dear Support Team,\n\nI hope this message reac...   Problem       en


In [8]:
# Count missing values per column
print(df_set.isnull().sum())

body        2
type        0
language    0
dtype: int64


In [9]:
# Drop every row that has a missing value in any of the kept columns
df_set = df_set.dropna(axis='index', how='any')

In [10]:
# Verify that no missing values remain after dropping incomplete rows
print(df_set.isnull().sum())

body        0
type        0
language    0
dtype: int64


In [11]:
# Re-detect the language of every ticket body with langdetect instead of
# trusting the dataset's own `language` column.

# langdetect is non-deterministic unless seeded, so set the seed before detecting
DetectorFactory.seed = 9

# detect_langs returns candidate languages ordered from most to least probable.
# Read the language code from the top candidate's `.lang` attribute rather than
# parsing the string representation of the whole list.
languages = []
for text in df_set['body']:
    candidates = detect_langs(text)
    # Fall back to 'unknown' if no candidate is returned
    languages.append(candidates[0].lang if candidates else 'unknown')

In [12]:
# Spot-check the detected language codes for the first 10 tickets
print(languages[0:10])

['de', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'de', 'de']


In [13]:
# Tally how many tickets were assigned to each detected language
counter = Counter()
counter.update(languages)
print(counter)

Counter({'en': 33432, 'de': 14971, 'fr': 65, 'nl': 20, 'ca': 20, 'it': 19, 'pt': 16, 'af': 12, 'es': 9, 'da': 5, 'no': 4, 'sv': 4, 'id': 3, 'ro': 2, 'et': 1, 'cy': 1, 'fi': 1})


In [14]:
# Overwrite the dataset-provided language tags with the langdetect results
df_set = df_set.assign(language=languages)

In [15]:
# Preview the first 5 rows after replacing the language column
print(df_set.head(n=5))

                                                body      type language
0  Sehr geehrtes Support-Team,\n\nich möchte eine...  Incident       de
1  Dear Customer Support Team,\n\nI am writing to...  Incident       en
2  Dear Customer Support Team,\n\nI hope this mes...   Request       en
3  Dear Customer Support Team,\n\nI hope this mes...   Request       en
4  Dear Support Team,\n\nI hope this message reac...   Problem       en


In [16]:
# Keep only tickets detected as English ('en') and renumber the rows from 0
is_english = df_set['language'] == 'en'
it_ticks = df_set.loc[is_english].reset_index(drop=True)

In [17]:
# Show the shape and the first 5 rows of the English-only subset
print(it_ticks.shape, it_ticks.head(n=5), sep='\n')

(33432, 3)
                                                body      type language
0  Dear Customer Support Team,\n\nI am writing to...  Incident       en
1  Dear Customer Support Team,\n\nI hope this mes...   Request       en
2  Dear Customer Support Team,\n\nI hope this mes...   Request       en
3  Dear Support Team,\n\nI hope this message reac...   Problem       en
4  Dear Customer Support,\n\nI hope this message ...   Request       en


In [18]:
# Rename the modelling columns: body -> text, type -> label
column_names = {'body': 'text', 'type': 'label'}
it_ticks = it_ticks.rename(columns=column_names)

In [19]:
# Confirm that the columns are now called text / label
print(it_ticks.head(n=5))

                                                text     label language
0  Dear Customer Support Team,\n\nI am writing to...  Incident       en
1  Dear Customer Support Team,\n\nI hope this mes...   Request       en
2  Dear Customer Support Team,\n\nI hope this mes...   Request       en
3  Dear Support Team,\n\nI hope this message reac...   Problem       en
4  Dear Customer Support,\n\nI hope this message ...   Request       en


In [20]:
# Split every ticket text into NLTK tokens
word_tokens = list(map(word_tokenize, it_ticks['text']))

# Number of tokens produced for each ticket, in row order
len_tokens = [len(tokens) for tokens in word_tokens]

# Store the token count as an extra numeric feature
it_ticks['len_words'] = len_tokens

In [21]:
# Preview the frame with the new len_words feature
print(it_ticks.head(n=5))

                                                text     label language  \
0  Dear Customer Support Team,\n\nI am writing to...  Incident       en   
1  Dear Customer Support Team,\n\nI hope this mes...   Request       en   
2  Dear Customer Support Team,\n\nI hope this mes...   Request       en   
3  Dear Support Team,\n\nI hope this message reac...   Problem       en   
4  Dear Customer Support,\n\nI hope this message ...   Request       en   

   len_words  
0         92  
1         84  
2        102  
3        107  
4        110  


In [22]:
# List the distinct ticket categories the classifiers will predict
print(pd.unique(it_ticks['label']))

['Incident' 'Request' 'Problem' 'Change']


In [23]:
# Model inputs: the raw ticket text plus its token count
features = it_ticks.loc[:, ['text', 'len_words']]

# Model target: the ticket category
labels = it_ticks.loc[:, 'label']

In [24]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# No `stratify` argument is passed, so class proportions are not enforced per split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=9,
)

In [25]:
# Column-wise preprocessing:
#   - 'text' is vectorised into case-sensitive TF-IDF unigram and bigram features
#   - 'len_words' is passed through unchanged
text_step = ('vect', TfidfVectorizer(lowercase=False, ngram_range=(1, 2)), 'text')
length_step = ('len', 'passthrough', ['len_words'])

preprocessor = ColumnTransformer(transformers=[text_step, length_step])

In [26]:
# Create pipeline to chain preprocessing, scaling, and classifier steps together for LogisticRegression.
# Pipeline does not copy its steps, so give this pipeline its own unfitted clone
# of `preprocessor`. Otherwise every pipeline built from the same object shares
# one transformer, and fitting any of them refits it for all.
pipeline_logreg = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('scaler', MaxAbsScaler()),
    ('log_reg', LogisticRegression(max_iter=5000, random_state=9))
])

In [27]:
# Create pipeline to chain preprocessing, scaling, and classifier steps together for LightGBM Classifier.
# Pipeline does not copy its steps, so give this pipeline its own unfitted clone
# of `preprocessor`. Otherwise every pipeline built from the same object shares
# one transformer, and fitting any of them refits it for all.
pipeline_lgb = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('scaler', MaxAbsScaler()),
    ('lgb_clf', lgb.LGBMClassifier(random_state=9))
])

In [28]:
# Fit and evaluate the LogisticRegression pipeline

# Pipeline.fit returns the fitted pipeline, so train and predict in one chain
y_pred_logreg = pipeline_logreg.fit(X_train, y_train).predict(X_test)

# Share of test tickets whose predicted label equals the true label
acc_score_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
print('LogisticRegression Accuracy Score: {:.1%}'.format(acc_score_logreg))

LogisticRegression Accuracy Score: 91.2%


In [29]:
# Fit and evaluate the LightGBM classifier pipeline

# Pipeline.fit returns the fitted pipeline, so train and predict in one chain
y_pred_lgb = pipeline_lgb.fit(X_train, y_train).predict(X_test)

# Share of test tickets whose predicted label equals the true label
acc_score_lgb = accuracy_score(y_true=y_test, y_pred=y_pred_lgb)
print('LightGBMClassifier Accuracy Score: {:.1%}'.format(acc_score_lgb))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.207842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 479316
[LightGBM] [Info] Number of data points in the train set: 26745, number of used features: 11505
[LightGBM] [Info] Start training from score -2.279120
[LightGBM] [Info] Start training from score -0.900801
[LightGBM] [Info] Start training from score -1.536800
[LightGBM] [Info] Start training from score -1.286220
LightGBMClassifier Accuracy Score: 86.9%


In [30]:
# Show the first 10 true labels above the first 10 LogisticRegression predictions
print(y_test.values[:10], y_pred_logreg[:10], sep='\n')

['Problem' 'Request' 'Incident' 'Change' 'Problem' 'Incident' 'Request'
 'Change' 'Incident' 'Incident']
['Problem' 'Request' 'Incident' 'Change' 'Problem' 'Problem' 'Request'
 'Request' 'Incident' 'Problem']


In [31]:
# Build confusion matrix - LogisticRegression
# Rows are true labels and columns are predicted labels, both in sorted label
# order (Change, Incident, Problem, Request). Compute the matrix once and reuse it.
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
print(cm_logreg)

# Sanity check: the cell counts must add up to the number of test rows
print(cm_logreg.sum())

[[ 628    7    5   18]
 [   0 2572  184    4]
 [   0  353 1061    1]
 [   8    4    3 1839]]
6687


In [32]:
# Per-class precision, recall and F1 for the LogisticRegression predictions
print(classification_report(y_true=y_test, y_pred=y_pred_logreg))

              precision    recall  f1-score   support

      Change       0.99      0.95      0.97       658
    Incident       0.88      0.93      0.90      2760
     Problem       0.85      0.75      0.80      1415
     Request       0.99      0.99      0.99      1854

    accuracy                           0.91      6687
   macro avg       0.92      0.91      0.91      6687
weighted avg       0.91      0.91      0.91      6687



In [33]:
# Show the first 10 true labels above the first 10 LightGBM predictions
print(y_test.values[:10], y_pred_lgb[:10], sep='\n')

['Problem' 'Request' 'Incident' 'Change' 'Problem' 'Incident' 'Request'
 'Change' 'Incident' 'Incident']
['Problem' 'Request' 'Incident' 'Change' 'Problem' 'Problem' 'Request'
 'Request' 'Incident' 'Problem']


In [34]:
# Build confusion matrix - LGBMClassifier
# Rows are true labels and columns are predicted labels, both in sorted label
# order (Change, Incident, Problem, Request). Compute the matrix once and reuse it.
cm_lgb = confusion_matrix(y_test, y_pred_lgb)
print(cm_lgb)

# Sanity check: the cell counts must add up to the number of test rows
print(cm_lgb.sum())

[[ 614    6    4   34]
 [   0 2574  182    4]
 [   1  617  793    4]
 [  11    5    5 1833]]
6687


In [35]:
# Per-class precision, recall and F1 for the LightGBM predictions
print(classification_report(y_true=y_test, y_pred=y_pred_lgb))

              precision    recall  f1-score   support

      Change       0.98      0.93      0.96       658
    Incident       0.80      0.93      0.86      2760
     Problem       0.81      0.56      0.66      1415
     Request       0.98      0.99      0.98      1854

    accuracy                           0.87      6687
   macro avg       0.89      0.85      0.87      6687
weighted avg       0.87      0.87      0.86      6687



In [36]:
# Look up the true label of row 23222 of it_ticks. This is the English-only
# subset with a reset index, so 23222 is not a row number in the raw dataset.
# NOTE(review): this row was not checked against X_test, so it may belong to the
# training split - confirm before reading a match as evidence of generalisation.
print(it_ticks.loc[23222, 'label'])

Incident


In [37]:
# Build a one-row DataFrame holding the features of row 23222.
# A list indexer keeps the result a DataFrame with the original column dtypes;
# going through a Series (.iloc[i].to_frame().T) would turn every column into object.
test = it_ticks[['text', 'len_words']].iloc[[23222]]

In [38]:
# Show the single-row feature frame that is passed to both pipelines
print(test)

                                                    text len_words
23222  A brief account of the issue: Encrypted medica...        39


In [39]:
# Predict the label of the single-row frame with the LogisticRegression pipeline.
# predict returns an array with one entry per input row; print that one label.
prediction_logreg = pipeline_logreg.predict(test)
print(prediction_logreg.item(0))

Incident


The actual label and the LogisticRegression prediction match: both are `Incident`.

In [41]:
# Predict the label of the single-row frame with the LightGBM pipeline.
# predict returns an array with one entry per input row; print that one label.
prediction_lgb = pipeline_lgb.predict(test)
print(prediction_lgb.item(0))

Incident


The actual label and the LightGBM prediction match: both are `Incident`.