In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
import time
import json

# sklearn imports
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# classifier imports
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

# single random seed reused by the stochastic steps below
# (DataFrame.sample and ParameterSampler both receive it as random_state)
seed = 1

## Load training and validation data

In [None]:
# Read the label spreadsheets for the training and validation splits.
# The first spreadsheet column is read as the index and then turned back into
# a regular column, leaving each frame with a default 0..n-1 index.
load_path = "../data/clean/"
train_labels_file = load_path + "training_labels.xlsx"
val_labels_file = load_path + "validation_labels.xlsx"

df_train_temp = pd.read_excel(train_labels_file, index_col=0).reset_index()
df_val_temp = pd.read_excel(val_labels_file, index_col=0).reset_index()

In [None]:
# Pick the experiment code; it selects which vocab / tf-idf artefacts are loaded.
code = "authorization_1percent_401words"
# code = "full_5352words"

# All artefacts for one experiment share the same code in their file names.
interim_dir = "../data/interim/"
vocab_path = f"{interim_dir}vocabs/vocab_{code}.json"
tfidf_train_path = f"{interim_dir}tfidfs/tfidf_{code}_train.npz"
tfidf_val_path = f"{interim_dir}tfidfs/tfidf_{code}_val.npz"

with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

# sparse tf-idf matrices, one row per sample
tfidf_train, tfidf_val = load_npz(tfidf_train_path), load_npz(tfidf_val_path)

In [None]:
# show the summary repr of the training tf-idf matrix (shape, dtype, stored elements)
tfidf_train

<379328x401 sparse matrix of type '<class 'numpy.float64'>'
	with 14795009 stored elements in Compressed Sparse Row format>

In [None]:
# convert tfidf to DataFrame with labels
df_train_all = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
df_val_all = pd.DataFrame.sparse.from_spmatrix(tfidf_val)

# Row i of the tf-idf matrix is paired with row i of the label sheet. Check the
# row counts up front: a plain Series assignment aligns on the index and would
# silently produce NaN (or dropped) labels if the two sources ever disagreed.
assert len(df_train_all) == len(df_train_temp), (
    f"train tf-idf has {len(df_train_all)} rows but {len(df_train_temp)} labels"
)
assert len(df_val_all) == len(df_val_temp), (
    f"val tf-idf has {len(df_val_all)} rows but {len(df_val_temp)} labels"
)

# .values assigns by position, independent of either frame's index
df_train_all["label"] = df_train_temp["label"].values
df_val_all["label"] = df_val_temp["label"].values

In [None]:
# assign cluster labels (remove this step once clusters are finalized)
# clusters
# note: cluster names are not representative of the classes they contain
#
# Every label must appear in exactly one cluster. "account cancellation" and
# "incomplete installation" used to be listed twice; the reverse lookup built
# from this dict keeps the last occurrence, so they effectively belonged to
# "order related and payments" and "queries regarding website" respectively.
# The shadowed entries are removed so this dict states the mapping that is
# actually applied.
clusters = {
    "authorization": [
        "account security",
        "login issues",
        "forgot my password",
        "software update",
    ],
    "order related and payments": [
        "best buy credit card",
        "payment failed",
        "billing or charge disputes",
        "cancel order",
        "unauthorized charge or payment",
        "refund request",
        "fraud concerns",
        "return request",
        "cancellation of a plan subscription or membership",
        "account cancellation",
        "change or update order",
        "schedule order pickup",
        "change shipping time",
        "delivery tracking",
        "refund status",
        "change payment method",
        "payment method",
        "change shipping address",
        "delivery or parts of delivery items missing",
        "renewal of a plan subscription or membership",
        "reschedule delivery",
        "reschedule order pickup",
        "rewards or discounts",
        "schedule delivery",
        "trade in inquiry",
        "delivery delays",
    ],
    "warranty": [
        "check warranty coverage",
        "damaged product",
        "warranty claim",
        "reschedule repair",
        "device damaged",
        "lost or forgot items",
        "reschedule installation",
        "schedule repair",
        "screen issues",
        "software error",
        "software installation",
        "schedule installation",
        "troubleshooting",
        "performance issues",
        "defective product",
    ],
    "queries regarding website": [
        "employment or career inquiries",
        "website or app complaints",
        "incomplete installation",
        "miscellaneous inquiries",
        "network or connectivity issues",
        "customer feedback",
        "bad customer service",
    ],
    "product queries": [
        "price match",  # 6759
        "product availability and stock",  # 37972
        "product compatibility",  # 10897
        "product details inquiry",  # 42698
        "transfer call to the right department or store",  # 5869
    ],
}

# other dict
# For each cluster name, the labels that get collapsed into a single "other"
# class when that cluster is the one being modelled (see the `new_label`
# assignment further down). An empty list means no label of that cluster is
# collapsed.
other_dict = {
    "warranty": [
        "screen issues",
        "device damaged",
        "check warranty coverage",
        "lost or forgot items",
        "reschedule installation",
        "performance issues",
    ],
    "order related and payments": [
        "change shipping time",
        "best buy credit card",
        "payment failed",
        "account cancellation",
        "reschedule order pickup",
    ],
    "product queries": [],
    "queries regarding website": [
        "website or app complaints",
        "incomplete installation",
        "network or connectivity issues",
    ],
    "authorization": [],
}

# Invert `clusters` into a label -> cluster-name table. If a label is listed
# under more than one cluster, the cluster that comes last in `clusters` wins.
cluster_lookup_dict = {
    label: cluster_label
    for cluster_label, members in clusters.items()
    for label in members
}

# Tag every sample with its cluster; a label missing from the table raises KeyError.
for frame in (df_train_all, df_val_all):
    frame["cluster_label"] = frame["label"].apply(
        lambda label: cluster_lookup_dict[label]
    )

In [None]:
# Build the `new_label` target for the cluster currently being modelled.
cluster_name = "authorization"

# split each dataset into samples inside / outside the current cluster
in_cluster_train = df_train_all["cluster_label"] == cluster_name
df_train_clus = df_train_all[in_cluster_train].copy()
df_train_nonclus = df_train_all[~in_cluster_train].copy()

in_cluster_val = df_val_all["cluster_label"] == cluster_name
df_val_clus = df_val_all[in_cluster_val].copy()
df_val_nonclus = df_val_all[~in_cluster_val].copy()

# samples belonging to any other cluster share one catch-all class
for frame in (df_train_nonclus, df_val_nonclus):
    frame["new_label"] = "wrong_cluster"


def _collapse_minority(label):
    """Return "other" for labels listed in other_dict[cluster_name], else the label itself."""
    return "other" if label in other_dict[cluster_name] else label


# inside the current cluster, fold the listed minority classes into "other"
for frame in (df_train_clus, df_val_clus):
    frame["new_label"] = frame["label"].apply(_collapse_minority)

In [None]:
# peek at the training samples that fall outside the current cluster
df_train_nonclus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,394,395,396,397,398,399,400,label,cluster_label,new_label
0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,troubleshooting,warranty,wrong_cluster
1,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.026809,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,schedule installation,warranty,wrong_cluster
2,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,trade in inquiry,order related and payments,wrong_cluster
4,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,product availability and stock,product queries,wrong_cluster
5,0.0,0.0,0.000000,0.0,0.041691,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,schedule repair,warranty,wrong_cluster
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379323,0.0,0.0,0.000000,0.0,0.019798,0.0,0.0,0.0,0.051541,0.0,...,0.0,0.0,0.0,0.206185,0.0,0.033816,0.047898,rewards or discounts,order related and payments,wrong_cluster
379324,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,product details inquiry,product queries,wrong_cluster
379325,0.0,0.0,0.090262,0.0,0.055818,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,schedule repair,warranty,wrong_cluster
379326,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,troubleshooting,warranty,wrong_cluster


## Balance data and some preprocessing

In [None]:
# per-class sample counts inside the current cluster (training split)
df_train_clus["new_label"].value_counts()

new_label
login issues          3392
forgot my password    2579
account security      1701
software update        927
Name: count, dtype: int64

In [None]:
# number of training samples in the current cluster
len(df_train_clus)

8599

In [None]:
# out-of-cluster training samples were all given the single "wrong_cluster" label
df_train_nonclus["new_label"].value_counts()

new_label
wrong_cluster    370729
Name: count, dtype: int64

In [None]:
# No class balancing here: the in-cluster data is only topped up with a sample of
# "wrong_cluster" rows whose size is a fixed fraction of the in-cluster row count.
wrong_percent = 0.10
wrong_cluster_size_train = int(wrong_percent * len(df_train_clus))
wrong_cluster_size_val = int(wrong_percent * len(df_val_clus))

# seeded draws so the same out-of-cluster rows are picked on every run
train_negatives = df_train_nonclus.sample(
    n=wrong_cluster_size_train, random_state=seed
)
val_negatives = df_val_nonclus.sample(n=wrong_cluster_size_val, random_state=seed)

df_train = pd.concat([df_train_clus, train_negatives])
df_val = pd.concat([df_val_clus, val_negatives])

In [None]:
# Split each frame into the target vector (new_label) and the feature matrix
# (everything except the three label columns).
non_feature_cols = ["label", "cluster_label", "new_label"]

y_train = df_train["new_label"].values
y_val = df_val["new_label"].values

X_train = df_train.drop(columns=non_feature_cols).to_numpy()
X_val = df_val.drop(columns=non_feature_cols).to_numpy()

In [None]:
# Turn the string class names into integer ids. The encoder is fitted on the
# training labels only and then reused unchanged for the validation labels.
label_encoder = LabelEncoder()
y_train, y_val = (
    label_encoder.fit_transform(y_train),
    label_encoder.transform(y_val),
)

# arrays actually passed to the classifiers below
X_final = X_train
y_final = y_train

In [None]:
# class distribution of the encoded training labels
pd.Series(y_final).value_counts()

2    3392
1    2579
0    1701
3     927
4     859
Name: count, dtype: int64

## Train classifier

#### Naive Bayes

In [None]:
# Smoothing strengths to try for MultinomialNB, one single-key dict per candidate.
# (for a quick smoke run, shrink the tuple to e.g. (1.0, 0.5))
alpha_candidates = (10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.001)
param_grid = [dict(alpha=alpha) for alpha in alpha_candidates]

In [None]:
# hyperparameter tuning
# Fit one MultinomialNB per candidate in `param_grid`, score it on the validation
# set and collect one result dict per candidate in `tuning_results`.
tuning_results = []
num_params = len(param_grid)

# 1-based counter so the progress line reads "1/N" ... "N/N" instead of "0/N"
for i, params in enumerate(param_grid, start=1):
    # training, timed with a monotonic clock (unaffected by system clock changes)
    start_time = time.perf_counter()

    clf = MultinomialNB(**params)
    clf.fit(X_final, y_final)

    end_time = time.perf_counter()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    clf_report = classification_report(y_val, y_pred, output_dict=True)

    # save metrics ("time" is the fit duration in minutes)
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

Hyperparameter set 0/7 completed.
{'params': {'alpha': 10.0}, 'time': 0.001426676909128825, 'micro_f1_score': 0.5088757396449705, 'classification_report': {'0': {'precision': 0.625, 'recall': 0.18779342723004694, 'f1-score': 0.2888086642599278, 'support': 213}, '1': {'precision': 0.6650485436893204, 'recall': 0.4241486068111455, 'f1-score': 0.5179584120982987, 'support': 323}, '2': {'precision': 0.4440639269406393, 'recall': 0.9174528301886793, 'f1-score': 0.5984615384615385, 'support': 424}, '3': {'precision': 0.9696969696969697, 'recall': 0.27586206896551724, 'f1-score': 0.42953020134228187, 'support': 116}, '4': {'precision': 1.0, 'recall': 0.037383177570093455, 'f1-score': 0.07207207207207207, 'support': 107}, 'accuracy': 0.5088757396449705, 'macro avg': {'precision': 0.7407618880653859, 'recall': 0.3685280221530965, 'f1-score': 0.3813661776468238, 'support': 1183}, 'weighted avg': {'precision': 0.6388027329833728, 'recall': 0.5088757396449705, 'f1-score': 0.45655259507204043, 'sup

#### XGBoost

In [None]:
# Search space for XGBoost and a seeded random draw of configurations from it.
n_hyper_combinations = 100

param_grid = {
    "learning_rate": [0.7, 0.3, 0.1],
    # larger values of the next two -> more capacity, higher overfitting risk
    "n_estimators": [30, 50, 70],
    "max_depth": [5, 6, 7],
    # regularisation strengths: larger values push towards underfitting
    "lambda": [0.01, 0.1, 1.0],
    "alpha": [0.01, 0.1, 1.0],
    "subsample": [1.0, 0.5, 0.3, 0.1],
}

sampler = ParameterSampler(
    param_grid, n_iter=n_hyper_combinations, random_state=seed
)
param_list = list(sampler)

In [None]:
# restrict the search to a slice of the sampled configurations
# (0:100 is a no-op whenever the list holds at most 100 entries), then show them
param_list = param_list[0:100]
param_list

[{'subsample': 0.5,
  'n_estimators': 30,
  'max_depth': 6,
  'learning_rate': 0.1,
  'lambda': 0.1,
  'alpha': 0.1},
 {'subsample': 1.0,
  'n_estimators': 70,
  'max_depth': 6,
  'learning_rate': 0.7,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.5,
  'n_estimators': 30,
  'max_depth': 7,
  'learning_rate': 0.1,
  'lambda': 0.1,
  'alpha': 0.01},
 {'subsample': 0.3,
  'n_estimators': 30,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 0.01,
  'alpha': 1.0},
 {'subsample': 0.5,
  'n_estimators': 30,
  'max_depth': 7,
  'learning_rate': 0.7,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 1.0,
  'n_estimators': 70,
  'max_depth': 5,
  'learning_rate': 0.1,
  'lambda': 1.0,
  'alpha': 1.0},
 {'subsample': 0.1,
  'n_estimators': 70,
  'max_depth': 6,
  'learning_rate': 0.1,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 5,
  'learning_rate': 0.7,
  'lambda': 0.01,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 70,
  'max_de

In [None]:
# hyperparameter tuning
# Fit one XGBClassifier per configuration in `param_list`, score it on the
# validation set and collect one result dict per configuration in `tuning_results`.
# Results are appended as they finish, so an interrupted run keeps what was done.
tuning_results = []
num_params = len(param_list)

# 1-based counter so the progress line reads "1/N" ... "N/N" instead of "0/N"
for i, params in enumerate(param_list, start=1):
    # training, timed with a monotonic clock (unaffected by system clock changes)
    start_time = time.perf_counter()

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_final, y_final)

    end_time = time.perf_counter()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    clf_report = classification_report(y_val, y_pred, output_dict=True)

    # save metrics ("time" is the fit duration in minutes)
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

Hyperparameter set 0/100 completed.
{'params': {'subsample': 0.5, 'n_estimators': 30, 'max_depth': 6, 'learning_rate': 0.1, 'lambda': 0.1, 'alpha': 0.1}, 'time': 0.22447460095087687, 'micro_f1_score': 0.73541842772612, 'classification_report': {'0': {'precision': 0.7537688442211056, 'recall': 0.704225352112676, 'f1-score': 0.7281553398058254, 'support': 213}, '1': {'precision': 0.6925287356321839, 'recall': 0.7461300309597523, 'f1-score': 0.7183308494783903, 'support': 323}, '2': {'precision': 0.7250608272506083, 'recall': 0.7028301886792453, 'f1-score': 0.7137724550898202, 'support': 424}, '3': {'precision': 0.8782608695652174, 'recall': 0.8706896551724138, 'f1-score': 0.8744588744588744, 'support': 116}, '4': {'precision': 0.7272727272727273, 'recall': 0.7476635514018691, 'f1-score': 0.7373271889400922, 'support': 107}, 'accuracy': 0.73541842772612, 'macro avg': {'precision': 0.7553784007883685, 'recall': 0.7543077556651914, 'f1-score': 0.7544089415546005, 'support': 1183}, 'weighted

KeyboardInterrupt: 

In [None]:
# Persist the collected tuning results as JSON in the current working directory.
fname = "exp_divij_results0_cluster1_authorization.json"
with open(fname, mode="w") as results_file:
    json.dump(tuning_results, results_file)

### Save the best model

In [None]:
# Reorder the result list in place, highest micro-F1 first (ties keep their order).
tuning_results[:] = sorted(
    tuning_results,
    key=lambda result: result["micro_f1_score"],
    reverse=True,
)

In [None]:
# inspect the tuning results (sorted by micro-F1, best first, in the cell above)
tuning_results

[{'params': {'subsample': 1.0,
   'n_estimators': 30,
   'max_depth': 7,
   'learning_rate': 0.3,
   'lambda': 1.0,
   'alpha': 0.1},
  'time': 0.3365378816922506,
  'micro_f1_score': 0.7217268887846081,
  'classification_report': {'0': {'precision': 0.6374045801526718,
    'recall': 0.48405797101449277,
    'f1-score': 0.5502471169686985,
    'support': 345},
   '1': {'precision': 0.5333333333333333,
    'recall': 0.25,
    'f1-score': 0.3404255319148936,
    'support': 128},
   '2': {'precision': 0.8262032085561497,
    'recall': 0.9537037037037037,
    'f1-score': 0.8853868194842406,
    'support': 324},
   '3': {'precision': 0.7274368231046932,
    'recall': 0.8770402611534276,
    'f1-score': 0.7952639368524913,
    'support': 919},
   '4': {'precision': 0.738562091503268,
    'recall': 0.509009009009009,
    'f1-score': 0.6026666666666668,
    'support': 222},
   '5': {'precision': 0.6379310344827587,
    'recall': 0.5751295336787565,
    'f1-score': 0.6049046321525886,
    'supp

In [None]:
# Refit the best configuration found during tuning.
# The parameters are taken from `tuning_results` rather than pasted in by hand,
# so a fresh "Run All" trains (and later saves) the model that actually scored
# best, not an XGBClassifier with default hyperparameters.
if not tuning_results:
    raise ValueError("tuning_results is empty; run the hyperparameter search first")

# max() does not rely on the list having been sorted beforehand; on ties it
# returns the earliest entry
best_result = max(tuning_results, key=lambda result: result["micro_f1_score"])
params = dict(best_result["params"])  # assign a different dict here to override

clf = xgb.XGBClassifier(**params)

clf.fit(X_final, y_final)

# calculate metrics
y_pred = clf.predict(X_val)
micro_f1_score = f1_score(y_val, y_pred, average="micro")

In [None]:
# micro-averaged F1 of the refit classifier on the validation set
micro_f1_score

0.7217268887846081

In [None]:
import pickle

# Serialise the fitted classifier to disk.
# The file name below is a placeholder and has to be replaced before running.
save_path = "../data/results/models/"
model_file_path = save_path + "put_file_name_here"

with open(model_file_path, "wb") as model_file:
    pickle.dump(clf, model_file, protocol=3)