In [None]:
import json
import os
import pickle
import time

import pandas as pd
import numpy as np
from scipy.sparse import load_npz

# sklearn imports
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# classifier imports
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

# Random seed; passed as `random_state` to the DataFrame sampling and to
# ParameterSampler further down so those draws are repeatable.
seed = 1

## Load training and validation data

In [None]:
# Read the labelled training and validation spreadsheets; the cells below
# take the "label" column from each of them.
load_path = "../data/clean/"
train_labels_file = load_path + "training_labels.xlsx"
val_labels_file = load_path + "validation_labels.xlsx"

# The first spreadsheet column is parsed as the index and then moved back into
# an ordinary column, which leaves each frame with a fresh default index.
df_train_temp = pd.read_excel(train_labels_file, index_col=0)
df_train_temp = df_train_temp.reset_index()
df_val_temp = pd.read_excel(val_labels_file, index_col=0)
df_val_temp = df_val_temp.reset_index()

In [None]:
# The vocabulary and both TF-IDF matrices of a task share one identifying code
# that is embedded in their file names.
# Alternative run on the full vocabulary: code = "full_5352words"
code = "order related and payments_1percent_439words"

vocab_path = f"../data/interim/vocabs/vocab_{code}.json"
tfidf_train_path = f"../data/interim/tfidfs/tfidf_{code}_train.npz"
tfidf_val_path = f"../data/interim/tfidfs/tfidf_{code}_val.npz"

with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

# sparse matrices saved with scipy's save_npz format
tfidf_train, tfidf_val = load_npz(tfidf_train_path), load_npz(tfidf_val_path)

In [None]:
# Show the repr of the training TF-IDF matrix (shape, dtype, stored elements).
tfidf_train

<379328x439 sparse matrix of type '<class 'numpy.float64'>'
	with 15197811 stored elements in Compressed Sparse Row format>

In [None]:
# convert tfidf to DataFrame with labels
df_train_all = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
df_val_all = pd.DataFrame.sparse.from_spmatrix(tfidf_val)

# Labels belong to TF-IDF rows purely by position. Assigning a Series aligns on
# the index instead, so a row-count mismatch would silently produce NaN or
# dropped labels. Check the counts and assign positionally to fail loudly.
for split_name, features, labels in (
    ("train", df_train_all, df_train_temp),
    ("val", df_val_all, df_val_temp),
):
    if len(features) != len(labels):
        raise ValueError(
            f"{split_name}: TF-IDF matrix has {len(features)} rows "
            f"but the label file has {len(labels)} rows"
        )
    features["label"] = labels["label"].to_numpy()

In [None]:
# assign cluster labels (remove this step once clusters are finalized)
# clusters: maps a cluster name to the list of original labels it contains
# note: cluster names are not representative of the classes they contain
# NOTE(review): two labels appear under more than one cluster (marked below).
# The reverse lookup built after these dicts keeps the cluster that is listed
# last, so the earlier listing of each duplicated label has no effect.
clusters = {
    "authorization": [
        "account cancellation",  # NOTE(review): also under "order related and payments"
        "account security",
        "login issues",
        "forgot my password",
        "software update",
    ],
    "order related and payments": [
        "best buy credit card",
        "payment failed",
        "billing or charge disputes",
        "cancel order",
        "unauthorized charge or payment",
        "refund request",
        "fraud concerns",
        "return request",
        "cancellation of a plan subscription or membership",
        "account cancellation",  # NOTE(review): also under "authorization"
        "change or update order",
        "schedule order pickup",
        "change shipping time",
        "delivery tracking",
        "refund status",
        "change payment method",
        "payment method",
        "change shipping address",
        "delivery or parts of delivery items missing",
        "renewal of a plan subscription or membership",
        "reschedule delivery",
        "reschedule order pickup",
        "rewards or discounts",
        "schedule delivery",
        "trade in inquiry",
        "delivery delays",
    ],
    "warranty": [
        "check warranty coverage",
        "damaged product",
        "warranty claim",
        "reschedule repair",
        "device damaged",
        "incomplete installation",  # NOTE(review): also under "queries regarding website"
        "lost or forgot items",
        "reschedule installation",
        "schedule repair",
        "screen issues",
        "software error",
        "software installation",
        "schedule installation",
        "troubleshooting",
        "performance issues",
        "defective product",
    ],
    "queries regarding website": [
        "employment or career inquiries",
        "website or app complaints",
        "incomplete installation",  # NOTE(review): also under "warranty"
        "miscellaneous inquiries",
        "network or connectivity issues",
        "customer feedback",
        "bad customer service",
    ],
    "product queries": [
        "price match",  # 6759
        "product availability and stock",  # 37972
        "product compatibility",  # 10897
        "product details inquiry",  # 42698
        "transfer call to the right department or store",  # 5869
    ],
}

# other dict: per cluster, the labels that are collapsed into a single "other"
# class when that cluster is the one being modelled (see the "new_label"
# assignment in a later cell). An empty list means nothing is collapsed.
other_dict = {
    "warranty": [
        "screen issues",
        "device damaged",
        "check warranty coverage",
        "lost or forgot items",
        "reschedule installation",
        "performance issues",
    ],
    "order related and payments": [
        "change shipping time",
        "best buy credit card",
        "payment failed",
        "account cancellation",
        "reschedule order pickup",
    ],
    "product queries": [],
    "queries regarding website": [
        "website or app complaints",
        "incomplete installation",
        "network or connectivity issues",
    ],
    "authorization": [],
}

# create a reverse lookup dict,
# i.e. keys are labels and values are cluster labels
#
# A label may be listed under several clusters. The mapping keeps the cluster
# that lists it last (same result as a plain overwrite), but every collision is
# recorded and reported instead of being resolved silently.
cluster_lookup_dict = {}
duplicate_labels = {}  # label -> every cluster that lists it, in dict order
for cluster_label, v in clusters.items():
    for label in v:
        previous_cluster = cluster_lookup_dict.get(label)
        if previous_cluster is not None and previous_cluster != cluster_label:
            duplicate_labels.setdefault(label, [previous_cluster]).append(cluster_label)
        cluster_lookup_dict[label] = cluster_label

for label, owners in duplicate_labels.items():
    print(
        f'WARNING: label "{label}" is listed under clusters {owners}; '
        f'using "{cluster_lookup_dict[label]}".'
    )

# a label missing from `clusters` raises KeyError here
df_train_all["cluster_label"] = df_train_all["label"].apply(lambda x: cluster_lookup_dict[x])
df_val_all["cluster_label"] = df_val_all["label"].apply(lambda x: cluster_lookup_dict[x])

In [None]:
# Split both datasets around the cluster being modelled and derive "new_label".
cluster_name = "order related and payments"
minority_labels = other_dict[cluster_name]

mask = df_train_all["cluster_label"] == cluster_name
df_train_clus, df_train_nonclus = df_train_all[mask].copy(), df_train_all[~mask].copy()

mask = df_val_all["cluster_label"] == cluster_name
df_val_clus, df_val_nonclus = df_val_all[mask].copy(), df_val_all[~mask].copy()

# rows that belong to any other cluster share one catch-all class
for outside in (df_train_nonclus, df_val_nonclus):
    outside["new_label"] = "wrong_cluster"

# inside the cluster, the listed minority labels collapse into "other";
# every remaining label is kept as it is
for inside in (df_train_clus, df_val_clus):
    inside["new_label"] = inside["label"].where(
        ~inside["label"].isin(minority_labels), "other"
    )

In [None]:
# Inspect the training rows outside the current cluster; each of them was
# given new_label "wrong_cluster" in the previous cell.
df_train_nonclus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,432,433,434,435,436,437,438,label,cluster_label,new_label
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,troubleshooting,warranty,wrong_cluster
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,schedule installation,warranty,wrong_cluster
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.069441,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,account security,authorization,wrong_cluster
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,product availability and stock,product queries,wrong_cluster
5,0.0,0.0,0.0,0.000000,0.0,0.0,0.043537,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,schedule repair,warranty,wrong_cluster
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379322,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.047245,0.0,0.0,0.0,0.0,0.0,0.0,schedule repair,warranty,wrong_cluster
379324,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,product details inquiry,product queries,wrong_cluster
379325,0.0,0.0,0.0,0.090768,0.0,0.0,0.056131,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,schedule repair,warranty,wrong_cluster
379326,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,troubleshooting,warranty,wrong_cluster


## Balance data and some preprocessing

In [None]:
# Class counts inside the current cluster, after the minority labels were
# merged into "other".
df_train_clus["new_label"].value_counts()

new_label
change or update order                               24297
renewal of a plan subscription or membership         12402
return request                                       10634
schedule order pickup                                 8601
cancel order                                          7855
reschedule delivery                                   6817
rewards or discounts                                  6458
cancellation of a plan subscription or membership     6369
unauthorized charge or payment                        5536
delivery tracking                                     5513
change payment method                                 3774
schedule delivery                                     3647
trade in inquiry                                      3321
other                                                 3090
refund request                                        3039
billing or charge disputes                            2998
change shipping address                       

In [None]:
# Number of training rows that belong to the current cluster.
len(df_train_clus)

126932

In [None]:
# Sanity check: out-of-cluster rows should all carry the single
# "wrong_cluster" label.
df_train_nonclus["new_label"].value_counts()

new_label
wrong_cluster    252396
Name: count, dtype: int64

In [None]:
# No class balancing is done here. The cluster's own rows are only topped up
# with a random sample of "wrong_cluster" rows, sized as a fixed fraction of
# the number of in-cluster rows.
wrong_percent = 10 / 100
wrong_cluster_size_train = int(wrong_percent * len(df_train_clus))
wrong_cluster_size_val = int(wrong_percent * len(df_val_clus))

wrong_train_sample = df_train_nonclus.sample(
    n=wrong_cluster_size_train, random_state=seed
)
wrong_val_sample = df_val_nonclus.sample(n=wrong_cluster_size_val, random_state=seed)

# in-cluster rows first, sampled out-of-cluster rows appended after them
df_train = pd.concat([df_train_clus, wrong_train_sample])
df_val = pd.concat([df_val_clus, wrong_val_sample])

12693


In [None]:
# Separate targets from features: "new_label" is the target and every column
# other than the three label columns is a feature.
label_columns = ["label", "cluster_label", "new_label"]

y_train = df_train["new_label"].values
y_val = df_val["new_label"].values

X_train = df_train.drop(columns=label_columns).to_numpy()
X_val = df_val.drop(columns=label_columns).to_numpy()

In [None]:
# Turn the string class names into integer ids. The encoder learns its classes
# from the training labels; validation labels are mapped with the same encoder.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# aliases consumed by the training cells
X_final = X_train
y_final = y_train

In [None]:
# Distribution of the encoded classes in the final training targets.
pd.Series(y_final).value_counts()

3     24297
22    12693
14    12402
16    10634
19     8601
1      7855
15     6817
17     6458
2      6369
21     5536
8      5513
4      3774
18     3647
20     3321
10     3090
12     3039
0      2998
5      2992
11     2626
6      2066
9      1859
7      1526
13     1512
Name: count, dtype: int64

## Train classifier

#### Naive Bayes

In [None]:
# Hyperparameter grid for MultinomialNB: one single-key dict per candidate
# `alpha`, ordered from the largest value to the smallest.
alpha_candidates = (10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.001)
param_grid = [dict(alpha=alpha) for alpha in alpha_candidates]

In [None]:
# hyperparameter tuning
# Fits one MultinomialNB per entry of `param_grid`, scores it on the validation
# split and collects one result dict per parameter set in `tuning_results`.
tuning_results = []
num_params = len(param_grid)

# start=1 so the progress line reads "1/N" ... "N/N" rather than "0/N" ... "N-1/N"
for i, params in enumerate(param_grid, start=1):
    # training (only the fit is timed)
    start_time = time.time()

    clf = MultinomialNB(**params)
    clf.fit(X_final, y_final)

    end_time = time.time()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    # zero_division=0 reports the same 0.0 for classes that are never predicted
    # but without emitting an UndefinedMetricWarning for each of them
    clf_report = classification_report(
        y_val, y_pred, output_dict=True, zero_division=0
    )

    # save metrics
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,  # fit duration in minutes
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter set 0/7 completed.
{'params': {'alpha': 10.0}, 'time': 0.0037093480428059896, 'micro_f1_score': 0.5024639009855604, 'classification_report': {'0': {'precision': 0.7272727272727273, 'recall': 0.0213903743315508, 'f1-score': 0.04155844155844156, 'support': 374}, '1': {'precision': 0.7736389684813754, 'recall': 0.27494908350305497, 'f1-score': 0.40570999248685197, 'support': 982}, '2': {'precision': 0.5958904109589042, 'recall': 0.6557788944723618, 'f1-score': 0.6244019138755982, 'support': 796}, '3': {'precision': 0.31994595922377794, 'recall': 0.8577543628580836, 'f1-score': 0.466052419715538, 'support': 3037}, '4': {'precision': 0.5714285714285714, 'recall': 0.1440677966101695, 'f1-score': 0.23011844331641285, 'support': 472}, '5': {'precision': 1.0, 'recall': 0.00267379679144385, 'f1-score': 0.005333333333333333, 'support': 374}, '6': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 258}, '7': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 19

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter set 1/7 completed.
{'params': {'alpha': 5.0}, 'time': 0.0033386627833048503, 'micro_f1_score': 0.5138666055466422, 'classification_report': {'0': {'precision': 0.7727272727272727, 'recall': 0.045454545454545456, 'f1-score': 0.08585858585858587, 'support': 374}, '1': {'precision': 0.7817679558011049, 'recall': 0.28818737270875766, 'f1-score': 0.4211309523809524, 'support': 982}, '2': {'precision': 0.5960044395116537, 'recall': 0.6746231155778895, 'f1-score': 0.6328815556865056, 'support': 796}, '3': {'precision': 0.33140729327170004, 'recall': 0.8498518274613105, 'f1-score': 0.4768591224018476, 'support': 3037}, '4': {'precision': 0.535031847133758, 'recall': 0.17796610169491525, 'f1-score': 0.2670906200317965, 'support': 472}, '5': {'precision': 1.0, 'recall': 0.008021390374331552, 'f1-score': 0.015915119363395226, 'support': 374}, '6': {'precision': 1.0, 'recall': 0.003875968992248062, 'f1-score': 0.007722007722007722, 'support': 258}, '7': {'precision': 0.0, 'recall': 

#### XGBoost

In [None]:
# Discrete search space for XGBoost. Only a random subset of the combinations
# is tried; ParameterSampler draws them reproducibly via `seed`.
param_grid = {
    "learning_rate": [1.0, 0.7, 0.3],
    "n_estimators": [50, 75, 100],  # more means overfitting
    "max_depth": [5, 6, 7],  # more means overfitting
    "lambda": [0.01, 0.1, 1.0],  # more means underfitting
    "alpha": [0.01, 0.1, 1.0],  # more means underfitting
    "subsample": [1.0, 0.5, 0.3, 0.1],
}

n_hyper_combinations = 100
param_sampler = ParameterSampler(
    param_grid,
    n_iter=n_hyper_combinations,
    random_state=seed,
)
param_list = list(param_sampler)

In [None]:
# Keep only the first 20 sampled combinations for tuning, then display them.
param_list = param_list[:20]
param_list

[{'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.1},
 {'subsample': 1.0,
  'n_estimators': 100,
  'max_depth': 6,
  'learning_rate': 1.0,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 7,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.01},
 {'subsample': 0.3,
  'n_estimators': 50,
  'max_depth': 6,
  'learning_rate': 0.7,
  'lambda': 0.01,
  'alpha': 1.0},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 7,
  'learning_rate': 1.0,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 1.0,
  'n_estimators': 100,
  'max_depth': 5,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 1.0},
 {'subsample': 0.1,
  'n_estimators': 100,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 0.5,
  'n_estimators': 75,
  'max_depth': 5,
  'learning_rate': 1.0,
  'lambda': 0.01,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 100,
  'ma

In [None]:
# hyperparameter tuning
# Fits one XGBClassifier per entry of `param_list`, scores it on the validation
# split and collects one result dict per parameter set in `tuning_results`.
# Results are appended as each fit finishes, so interrupting the loop keeps the
# results of the parameter sets that already completed.
tuning_results = []
num_params = len(param_list)

# start=1 so the progress line reads "1/N" ... "N/N" rather than "0/N" ... "N-1/N"
for i, params in enumerate(param_list, start=1):
    # training (only the fit is timed)
    start_time = time.time()

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_final, y_final)

    end_time = time.time()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # emitting an UndefinedMetricWarning for each of them
    clf_report = classification_report(
        y_val, y_pred, output_dict=True, zero_division=0
    )

    # save metrics
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,  # fit duration in minutes
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

Hyperparameter set 0/20 completed.
{'params': {'subsample': 0.5, 'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.3, 'lambda': 0.1, 'alpha': 0.1}, 'time': 16.384627075990043, 'micro_f1_score': 0.6340820536328214, 'classification_report': {'0': {'precision': 0.42696629213483145, 'recall': 0.3048128342245989, 'f1-score': 0.35569422776911075, 'support': 374}, '1': {'precision': 0.6106639839034205, 'recall': 0.6181262729124236, 'f1-score': 0.6143724696356274, 'support': 982}, '2': {'precision': 0.6178343949044586, 'recall': 0.7311557788944724, 'f1-score': 0.669735327963176, 'support': 796}, '3': {'precision': 0.5648229441030144, 'recall': 0.6354955548238393, 'f1-score': 0.5980787108769755, 'support': 3037}, '4': {'precision': 0.5138888888888888, 'recall': 0.3919491525423729, 'f1-score': 0.4447115384615385, 'support': 472}, '5': {'precision': 0.5400696864111498, 'recall': 0.4144385026737968, 'f1-score': 0.4689863842662632, 'support': 374}, '6': {'precision': 0.40606060606060607, 'reca

KeyboardInterrupt: 

In [None]:
# Persist the tuning results collected so far as one JSON document in the
# current working directory.
fname = "exp_divij_results0_cluster1.json"
with open(fname, "w") as results_file:
    json.dump(tuning_results, results_file)

### Save the best model

In [None]:
# Order the results in place, best micro F1 first.
tuning_results.sort(key=lambda result: result["micro_f1_score"], reverse=True)

In [None]:
# Show the tuning results (sorted by micro F1, best first, in the cell above).
tuning_results

[{'params': {'subsample': 1.0,
   'n_estimators': 30,
   'max_depth': 7,
   'learning_rate': 0.3,
   'lambda': 1.0,
   'alpha': 0.1},
  'time': 0.3365378816922506,
  'micro_f1_score': 0.7217268887846081,
  'classification_report': {'0': {'precision': 0.6374045801526718,
    'recall': 0.48405797101449277,
    'f1-score': 0.5502471169686985,
    'support': 345},
   '1': {'precision': 0.5333333333333333,
    'recall': 0.25,
    'f1-score': 0.3404255319148936,
    'support': 128},
   '2': {'precision': 0.8262032085561497,
    'recall': 0.9537037037037037,
    'f1-score': 0.8853868194842406,
    'support': 324},
   '3': {'precision': 0.7274368231046932,
    'recall': 0.8770402611534276,
    'f1-score': 0.7952639368524913,
    'support': 919},
   '4': {'precision': 0.738562091503268,
    'recall': 0.509009009009009,
    'f1-score': 0.6026666666666668,
    'support': 222},
   '5': {'precision': 0.6379310344827587,
    'recall': 0.5751295336787565,
    'f1-score': 0.6049046321525886,
    'supp

In [None]:
# Refit the model that will be saved.
# `params` may be filled in by hand to force a specific configuration. When it
# is left empty, the parameter set with the highest micro F1 in
# `tuning_results` is used, so the "best model" is not silently trained with
# library defaults.
params = {
    # put params here to override the automatically selected best set
}

if not params:
    if tuning_results:
        best_result = max(tuning_results, key=lambda result: result["micro_f1_score"])
        # NOTE(review): assumes tuning_results holds the XGBoost runs; confirm
        # the XGBoost tuning cell was the last one to fill it.
        params = dict(best_result["params"])
        print(f"Using best tuned params: {params}")
    else:
        print("WARNING: tuning_results is empty; fitting XGBClassifier with default params.")

clf = xgb.XGBClassifier(**params)

clf.fit(X_final, y_final)

# calculate metrics
y_pred = clf.predict(X_val)
micro_f1_score = f1_score(y_val, y_pred, average="micro")

In [None]:
# Validation micro F1 of the refitted classifier.
micro_f1_score

0.7217268887846081

In [None]:
# Serialise the fitted classifier to disk.
# `os` and `pickle` are imported in the notebook's import cell.
save_path = "../data/results/models/"

# open() does not create missing parent directories, so make sure it exists
os.makedirs(save_path, exist_ok=True)

# NOTE: replace the placeholder file name before running. Pickle files must
# only ever be loaded back from a trusted source.
with open(save_path + "put_file_name_here", "wb") as model_file:
    pickle.dump(clf, model_file, protocol=3)