In [None]:
import matplotlib.pyplot as plt
import numpy as np

Figure 11a: Paper Distribution by Subject Area

In [None]:
# Figure 11a: pie chart of the papers split across seven labelled categories.
# Set Arial as the font
plt.rcParams['font.family'] = 'Arial'
plt.rcParams["font.size"] = '12'
plt.figure(figsize=(7, 7))

# Integer paper counts per label. The wedge fractions are derived from these
# counts instead of being typed as k/55, so the total cannot drift out of sync
# and the printed counts never depend on float rounding.
paper_counts = [2, 1, 2, 8, 8, 1, 33]
labels = ['Finance', 'Law', 'Marketing', 'Medicine', 'Social Science', 'Tourism', 'Computer Science']
total = sum(paper_counts)  # 55

# Largest wedge first; equal counts fall back to reverse-alphabetical label order.
paper_counts, labels = zip(*sorted(zip(paper_counts, labels), reverse=True))
sizes = tuple(count / total for count in paper_counts)

# Wedges below this percentage are labelled outside the pie instead of inside.
min_inside_pct = 5

def custom_autopct(pct):
    """Build the in-wedge label: rounded percentage with the paper count below it.

    pct is the wedge's share of the whole pie in percent, as supplied by
    plt.pie. Wedges under min_inside_pct get an empty string here because
    they are labelled outside the pie further down.
    """
    if pct < min_inside_pct:
        return ''
    val = int(round(pct * total / 100.0))
    return '{:.0f}%\n({:d})'.format(pct, val)

colors = ['#2664b6', 'orange', 'green', 'red', 'violet', 'brown', 'pink']

patches, texts, autotexts = plt.pie(sizes, autopct=custom_autopct, startangle=90, wedgeprops=dict(edgecolor='black'), colors=colors)

plt.axis('equal')

for autotext in autotexts:
    autotext.set_color('black')

# Label the small wedges just outside the circle. Wedges start at 90 degrees and
# run counter-clockwise (the plt.pie default), so the middle of wedge i sits at
# the cumulative fraction up to and including i, minus half of its own fraction.
label_distance = 1.05
cumulative = np.cumsum(sizes)
for i, (size, count) in enumerate(zip(sizes, paper_counts)):
    if size * 100 < min_inside_pct:
        angle = np.deg2rad(90 + 360 * (cumulative[i] - size / 2))
        x, y = np.cos(angle) * label_distance, np.sin(angle) * label_distance
        # Print the integer count itself rather than int(size * total), which
        # truncates and can come out one too low when the product is not exact.
        plt.text(x, y, '{:.0f}%\n({:d})'.format(size * 100, count), horizontalalignment='center', verticalalignment='bottom', fontsize=12)

plt.legend(patches, labels, loc="center left", bbox_to_anchor=(1, 0.5), frameon=False)
plt.show()

Figure 11b: Paper Distribution by Number of Methods Applied

In [None]:
# Figure 11b: horizontal bars showing how many papers applied a given number of methods.
# Use Arial at 12 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '12'})

number_of_methods = [1, 2, 3, 4, 5, 6]
number_of_papers = [37, 8, 4, 3, 1, 2]

fig, ax = plt.subplots(figsize=(7, 7))
ax.barh(number_of_methods, number_of_papers, color='#2664b6', edgecolor='k')
ax.set_xlabel('Number of Papers')
ax.set_ylabel('Number of Methods Applied')
# One tick per bar, and flip the axis so the bar for one method is at the top.
ax.set_yticks(number_of_methods)
ax.invert_yaxis()
plt.show()

Figure 12: Papers that Apply or Introduce MLI Methods

In [None]:
# Figure 12: papers per year that introduce a new method (bars) versus papers
# that apply a method (line).
# Set Arial as the font
plt.rcParams['font.family'] = 'Arial'
plt.rcParams["font.size"] = '14'

# (year, number of papers, category) records. Years in the plotted range that
# have no record keep a count of zero.
data = [
    (2010, 1, 'New Method'), (2010, 1, 'Application'),
    (2014, 1, 'New Method'), (2014, 1, 'Application'),
    (2016, 1, 'New Method'), (2016, 1, 'Application'),
    (2017, 1, 'New Method'), (2017, 1, 'Application'),
    (2018, 2, 'New Method'), (2018, 4, 'Application'),
    (2019, 2, 'New Method'), (2019, 5, 'Application'),
    (2020, 1, 'New Method'), (2020, 11, 'Application'),
    (2021, 4, 'New Method'), (2021, 9, 'Application'),
    (2022, 1, 'New Method'), (2022, 10, 'Application'),
    (2023, 2, 'New Method'), (2023, 12, 'Application'),
]

# Every year from 2010 through 2023 gets a slot on the x axis.
years = np.arange(2010, 2024)

new_method_counts = [0] * len(years)
application_counts = [0] * len(years)

# Map each category name to the list that accumulates its yearly totals.
counts_by_category = {
    'New Method': new_method_counts,
    'Application': application_counts,
}

for year, count, method in data:
    # Position of this year on the x axis; raises IndexError for a year outside `years`.
    index = np.where(years == year)[0][0]
    series = counts_by_category.get(method)
    if series is not None:  # records with any other category are ignored
        series[index] += count

plt.figure(figsize=(14, 5))
plt.bar(years, new_method_counts, label='New Method', align='center', color='#2664b6', edgecolor='k')
plt.plot(years, application_counts, label='Method Application', marker='s', linestyle='-', color='orange')
plt.xlabel('Year')
plt.ylabel('Number of Papers')
plt.legend(frameon=False)
plt.xticks(years, rotation=45)
plt.tight_layout()
# Show the figure explicitly, as every other figure cell in this notebook does.
plt.show()

Figure 21: Share of Unverified Tickets

In [None]:
# Figure 21: pie chart of verified versus unverified tickets.
# Set Arial as the font
plt.rcParams['font.family'] = 'Arial'
plt.rcParams["font.size"] = '18'
plt.figure(figsize=(7, 7))
colors = ['#2664b6', '#e10000']

# Integer ticket counts per label. The wedge fractions are derived from these
# counts instead of being typed as k/7075, so the total cannot drift out of
# sync and the printed counts never depend on float rounding.
ticket_counts = [6529, 546]
labels = ['Verified tickets', 'Unverified tickets']
total = sum(ticket_counts)  # 7075

# Largest wedge first; equal counts would fall back to reverse-alphabetical label order.
ticket_counts, labels = zip(*sorted(zip(ticket_counts, labels), reverse=True))
sizes = tuple(count / total for count in ticket_counts)

# Wedges below this percentage are labelled outside the pie instead of inside.
min_inside_pct = 5

def custom_autopct(pct):
    """Build the in-wedge label: rounded percentage with the ticket count below it.

    pct is the wedge's share of the whole pie in percent, as supplied by
    plt.pie. Wedges under min_inside_pct get an empty string here because
    they are labelled outside the pie further down.
    """
    if pct < min_inside_pct:
        return ''
    val = int(round(pct * total / 100.0))
    return '{:.0f}%\n({:d})'.format(pct, val)

patches, texts, autotexts = plt.pie(sizes, autopct=custom_autopct, startangle=90, wedgeprops=dict(edgecolor='black'), colors=colors)

plt.axis('equal')

for autotext in autotexts:
    autotext.set_color('black')

# Label small wedges just outside the circle. Wedges start at 90 degrees and run
# counter-clockwise (the plt.pie default), so the middle of wedge i sits at the
# cumulative fraction up to and including i, minus half of its own fraction.
# With the current counts both wedges are above the threshold, so nothing is
# drawn here; the loop only matters if the data changes.
label_distance = 1.05
cumulative = np.cumsum(sizes)
for i, (size, count) in enumerate(zip(sizes, ticket_counts)):
    if size * 100 < min_inside_pct:
        angle = np.deg2rad(90 + 360 * (cumulative[i] - size / 2))
        x, y = np.cos(angle) * label_distance, np.sin(angle) * label_distance
        # Print the integer count itself rather than int(size * total), which
        # truncates and can come out one too low when the product is not exact.
        plt.text(x, y, '{:.0f}%\n({:d})'.format(size * 100, count), horizontalalignment='center', verticalalignment='bottom', fontsize=12)

plt.legend(patches, labels, loc="center left", bbox_to_anchor=(1, 0.5), frameon=False)
plt.show()

Figure 22: Language Distribution in the Ticket Dataset

In [None]:
# Figure 22: number of tickets per detected language, with the count printed above each bar.
# Use Arial at 18 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '18'})

languages = ['German (de)', 'English (en)', 'Unknown']
counts = [4289, 2506, 280]

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(languages, counts, width=0.5, color='#2664b6', edgecolor='k')

# Centre each count horizontally on its bar and lift it 50 units above the bar top.
for bar, count in zip(bars, counts):
    x_center = bar.get_x() + bar.get_width() / 2
    y_text = bar.get_height() + 50
    ax.text(x_center, y_text, str(count), ha='center', va='bottom')

ax.set_ylabel('Number of Tickets')
# Fixed upper limit leaves room for the label above the tallest bar.
ax.set_ylim(0, 5000)
plt.show()

Figure 23: Accuracies for BERT Classifier Fine-tuned on Imbalanced Dataset for Product Classification Step in Stratified 5-Fold Cross-Validation

In [None]:
# Figure 23: accuracy of each of the five cross-validation folds, with the
# mean accuracy drawn as a dashed reference line.
# Use Arial at 18 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '18'})

folds = np.array([1, 2, 3, 4, 5])
accuracies = np.array([0.6160714285714286, 0.65625, 0.7053571428571429, 0.6674107142857143, 0.671875])

average_accuracy = accuracies.mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(folds, accuracies, width=0.5, color='#2664b6', edgecolor='k')
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)

# Print each fold's accuracy (two decimals) slightly above its bar.
for fold, acc in zip(folds, accuracies):
    ax.text(fold, acc + 0.02, f'{acc:.2f}', ha='center')

# The dashed line carries the only legend entry: the mean over all folds.
ax.axhline(y=average_accuracy, color='red', linestyle='--', label=f'Average: {average_accuracy:.2f}')
ax.legend(frameon=False)
plt.show()

Figure 25: Accuracies for Ensemble Classifier Trained on Imbalanced Dataset for Product Classification Step in Stratified 5-Fold Cross-Validation

In [None]:
# Figure 25: accuracy of each of the five cross-validation folds, with the
# mean accuracy drawn as a dashed reference line.
# Use Arial at 18 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '18'})

folds = np.array([1, 2, 3, 4, 5])
accuracies = np.array([0.7008928571428571, 0.7611607142857143, 0.7678571428571429, 0.7566964285714286, 0.8058035714285714])

average_accuracy = accuracies.mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(folds, accuracies, width=0.5, color='#2664b6', edgecolor='k')
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)

# Print each fold's accuracy (two decimals) slightly above its bar.
for fold, acc in zip(folds, accuracies):
    ax.text(fold, acc + 0.02, f'{acc:.2f}', ha='center')

# The dashed line carries the only legend entry: the mean over all folds.
ax.axhline(y=average_accuracy, color='red', linestyle='--', label=f'Average: {average_accuracy:.2f}')
ax.legend(frameon=False)
plt.show()

Figure 26: Mean Performance Metric Comparison for Imbalanced Dataset

In [None]:
# Figure 26: 2x2 grid with one panel per metric; each panel compares BERT with
# BERT + XGBoost across the three classification steps.
# Use Arial at 10 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '10'})

classification_step = ['Level', 'Department', 'Product']

# One value per classification step, in the order given above.
bert_accuracy = np.array([0.867043847241867, 0.8211963731927228, 0.6633928571428572])
bert_precision = np.array([0.8475437387295506, 0.8241677093128784, 0.6136126671472666])
bert_recall = np.array([0.8430813058961923, 0.7832091894387833, 0.5543349230911064])
bert_f1 = np.array([0.8445845757174062, 0.794574395325396, 0.5650926240247048])

stacked_bert_accuracy = np.array([0.8772277227722771, 0.8405227266965802, 0.7584821428571429])
stacked_bert_precision = np.array([0.860004055482614, 0.8336300424039518, 0.7253002863273899])
stacked_bert_recall = np.array([0.8519784048023503, 0.8240544097407099, 0.6886687750112287])
stacked_bert_f1 = np.array([0.8555326899051863, 0.8276617652423252, 0.6953693347983478])

# The three lists below are parallel: entry k of each belongs to metric k.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
data_bert = [bert_accuracy, bert_precision, bert_recall, bert_f1]
data_stacked_bert = [stacked_bert_accuracy, stacked_bert_precision, stacked_bert_recall, stacked_bert_f1]

fig, ax = plt.subplots(2, 2, figsize=(12, 8))
# Layout is fixed here, before anything is drawn, with generous padding between panels.
fig.tight_layout(pad=6.0)

group_positions = np.arange(len(classification_step))
bar_offset = 0.15  # half a bar width, so the two bars of a group touch at the tick

# ax.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for panel, metric, bert_values, stacked_values in zip(ax.flat, metrics, data_bert, data_stacked_bert):
    bars_bert = panel.bar(group_positions - bar_offset, bert_values, width=0.3, label='BERT', color='#2664b6', edgecolor='k')
    bars_stacked_bert = panel.bar(group_positions + bar_offset, stacked_values, width=0.3, label='BERT + XGBoost', color='#e10000', edgecolor='k')

    # Annotate every bar with its value, 3 points above the bar top.
    for bar in (*bars_bert, *bars_stacked_bert):
        height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        panel.annotate(f'{height:.2f}',
                       xy=(bar_center, height),
                       xytext=(0, 3),
                       textcoords='offset points',
                       ha='center', va='bottom')

    panel.set_title(metric)
    panel.set_xticks(group_positions)
    panel.set_xticklabels(classification_step)
    panel.set_ylim(0.3, 1)  # y axis starts at 0.3, not 0
    panel.legend(frameon=False)

plt.show()

Figure 27: Mean Performance Metric Comparison for Resampled Dataset

In [None]:
# Figure 27: 2x2 grid with one panel per metric; each panel compares BERT with
# BERT + XGBoost across the three classification steps.
# Use Arial at 10 pt for every text element.
plt.rcParams.update({'font.family': 'Arial', 'font.size': '10'})

classification_step = ['Level', 'Department', 'Product']

# One value per classification step, in the order given above.
resampled_bert_accuracy = np.array([0.8161244695898162, 0.6908924210544105, 0.45])
resampled_bert_precision = np.array([0.793363164743371, 0.6860667325823757, 0.46211484148512305])
resampled_bert_recall = np.array([0.7722236214340066, 0.7056539144301975, 0.5567869953926449])
resampled_bert_f1 = np.array([0.77702523797411, 0.6621061232264001, 0.4688547487903663])

resampled_stacked_bert_accuracy = np.array([0.7983026874115984, 0.723815076769674, 0.7272321428571429])
resampled_stacked_bert_precision = np.array([0.771957839894052, 0.6787873377916666, 0.6790555172976784])
resampled_stacked_bert_recall = np.array([0.8021236706788505, 0.7750762295231409, 0.7303113469610631])
resampled_stacked_bert_f1 = np.array([0.7795007439315865, 0.7041758970206256, 0.6864236835890545])

# The three lists below are parallel: entry k of each belongs to metric k.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
resampled_data_bert = [resampled_bert_accuracy, resampled_bert_precision, resampled_bert_recall, resampled_bert_f1]
resampled_data_stacked_bert = [resampled_stacked_bert_accuracy, resampled_stacked_bert_precision, resampled_stacked_bert_recall, resampled_stacked_bert_f1]

fig, ax = plt.subplots(2, 2, figsize=(12, 8))
# Layout is fixed here, before anything is drawn, with generous padding between panels.
fig.tight_layout(pad=6.0)

group_positions = np.arange(len(classification_step))
bar_offset = 0.15  # half a bar width, so the two bars of a group touch at the tick

# ax.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for panel, metric, bert_values, stacked_values in zip(ax.flat, metrics, resampled_data_bert, resampled_data_stacked_bert):
    bars_bert = panel.bar(group_positions - bar_offset, bert_values, width=0.3, label='BERT', color='#2664b6', edgecolor='k')
    bars_stacked_bert = panel.bar(group_positions + bar_offset, stacked_values, width=0.3, label='BERT + XGBoost', color='#e10000', edgecolor='k')

    # Annotate every bar with its value, 3 points above the bar top.
    for bar in (*bars_bert, *bars_stacked_bert):
        height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        panel.annotate(f'{height:.2f}',
                       xy=(bar_center, height),
                       xytext=(0, 3),
                       textcoords='offset points',
                       ha='center', va='bottom')

    panel.set_title(metric)
    panel.set_xticks(group_positions)
    panel.set_xticklabels(classification_step)
    panel.set_ylim(0.3, 1)  # y axis starts at 0.3, not 0
    panel.legend(frameon=False)

plt.show()