In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
import time
import json

# sklearn imports
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# classifier imports
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

# Single random seed reused below for row sampling (DataFrame.sample) and
# for hyperparameter sampling (ParameterSampler).
seed = 1

## Load training and validation data

In [None]:
# Read the labelled training / validation spreadsheets; their "label" column
# is attached to the TF-IDF features further down.
load_path = "../data/clean/"

train_labels_file = load_path + "training_labels.xlsx"
val_labels_file = load_path + "validation_labels.xlsx"

# The first spreadsheet column is read as the index and then turned back into
# a regular column, which leaves a fresh 0..n-1 index on each frame.
df_train_temp = pd.read_excel(train_labels_file, index_col=0)
df_train_temp = df_train_temp.reset_index()

df_val_temp = pd.read_excel(val_labels_file, index_col=0)
df_val_temp = df_val_temp.reset_index()

In [None]:
# Pick the vocabulary / TF-IDF artefacts for the current task through `code`,
# which is embedded in every file name below.
code = "warranty_1percent_433words"
# code = "full_5352words"

vocab_path = f"../data/interim/vocabs/vocab_{code}.json"
tfidf_train_path = f"../data/interim/tfidfs/tfidf_{code}_train.npz"
tfidf_val_path = f"../data/interim/tfidfs/tfidf_{code}_val.npz"

# vocabulary (JSON) first, then the two sparse TF-IDF matrices (.npz)
with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

tfidf_train = load_npz(tfidf_train_path)
tfidf_val = load_npz(tfidf_val_path)

In [None]:
# Show the summary repr of the training TF-IDF matrix (shape, dtype, stored elements).
tfidf_train

<379328x433 sparse matrix of type '<class 'numpy.float64'>'
	with 15185230 stored elements in Compressed Sparse Row format>

In [None]:
# convert tfidf to DataFrame with labels
#
# Each TF-IDF row is paired with the label found at the same row position.
# The column assignment below aligns on the index, so a row-count mismatch
# would not raise: it would silently leave NaN labels (too few labels) or
# drop label rows (too many labels). Check the counts explicitly first.
assert tfidf_train.shape[0] == len(df_train_temp), (
    f"training TF-IDF has {tfidf_train.shape[0]} rows "
    f"but {len(df_train_temp)} training labels were loaded"
)
assert tfidf_val.shape[0] == len(df_val_temp), (
    f"validation TF-IDF has {tfidf_val.shape[0]} rows "
    f"but {len(df_val_temp)} validation labels were loaded"
)

# one sparse-backed column per vocabulary term (integer column names)
df_train_all = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
df_val_all = pd.DataFrame.sparse.from_spmatrix(tfidf_val)

df_train_all["label"] = df_train_temp["label"]
df_val_all["label"] = df_val_temp["label"]

In [None]:
# assign cluster labels (remove this step once clusters are finalized)
#
# clusters: cluster name -> list of fine-grained labels that belong to it.
# note: cluster names are not representative of the classes they contain
#
# Every label must appear in exactly ONE cluster, because the reverse lookup
# built at the end of this cell can hold only one cluster per label. A label
# listed twice would silently end up in whichever cluster comes last.
clusters = {
    "authorization": [
        # "account cancellation" lives in "order related and payments"
        "account security",
        "login issues",
        "forgot my password",
        "software update",
    ],
    "order related and payments": [
        "best buy credit card",
        "payment failed",
        "billing or charge disputes",
        "cancel order",
        "unauthorized charge or payment",
        "refund request",
        "fraud concerns",
        "return request",
        "cancellation of a plan subscription or membership",
        "account cancellation",
        "change or update order",
        "schedule order pickup",
        "change shipping time",
        "delivery tracking",
        "refund status",
        "change payment method",
        "payment method",
        "change shipping address",
        "delivery or parts of delivery items missing",
        "renewal of a plan subscription or membership",
        "reschedule delivery",
        "reschedule order pickup",
        "rewards or discounts",
        "schedule delivery",
        "trade in inquiry",
        "delivery delays",
    ],
    "warranty": [
        "check warranty coverage",
        "damaged product",
        "warranty claim",
        "reschedule repair",
        "device damaged",
        # "incomplete installation" lives in "queries regarding website"
        "lost or forgot items",
        "reschedule installation",
        "schedule repair",
        "screen issues",
        "software error",
        "software installation",
        "schedule installation",
        "troubleshooting",
        "performance issues",
        "defective product",
    ],
    "queries regarding website": [
        "employment or career inquiries",
        "website or app complaints",
        "incomplete installation",
        "miscellaneous inquiries",
        "network or connectivity issues",
        "customer feedback",
        "bad customer service",
    ],
    "product queries": [
        "price match",  # 6759
        "product availability and stock",  # 37972
        "product compatibility",  # 10897
        "product details inquiry",  # 42698
        "transfer call to the right department or store",  # 5869
    ],
}

# other dict: per cluster, the minority labels that are later merged into a
# single "other" class when a classifier is trained for that cluster.
other_dict = {
    "warranty": [
        "screen issues",
        "device damaged",
        "check warranty coverage",
        "lost or forgot items",
        "reschedule installation",
        "performance issues",
    ],
    "order related and payments": [
        "change shipping time",
        "best buy credit card",
        "payment failed",
        "account cancellation",
        "reschedule order pickup",
    ],
    "product queries": [],
    "queries regarding website": [
        "website or app complaints",
        "incomplete installation",
        "network or connectivity issues",
    ],
    "authorization": [],
}

# create a reverse lookup dict,
# i.e. keys are labels and values are cluster labels
cluster_lookup_dict = {}
for cluster_label, cluster_members in clusters.items():
    for label in cluster_members:
        # refuse ambiguous definitions instead of letting the last one win
        if label in cluster_lookup_dict:
            raise ValueError(
                f"label {label!r} is assigned to both "
                f"{cluster_lookup_dict[label]!r} and {cluster_label!r}"
            )
        cluster_lookup_dict[label] = cluster_label

# every "other" label must belong to the cluster it is listed under,
# otherwise it could never be matched when that cluster is processed
for cluster_label, minority_labels in other_dict.items():
    misplaced = [
        label
        for label in minority_labels
        if cluster_lookup_dict.get(label) != cluster_label
    ]
    if misplaced:
        raise ValueError(
            f"other_dict[{cluster_label!r}] contains labels outside that cluster: {misplaced}"
        )

# Tag every row with the cluster its label belongs to.
# A label missing from cluster_lookup_dict raises KeyError.
for frame in (df_train_all, df_val_all):
    frame["cluster_label"] = frame["label"].apply(cluster_lookup_dict.__getitem__)

In [None]:
# Build the training target "new_label" for the classifier of one cluster:
#   * rows whose cluster is not `cluster_name`          -> "wrong_cluster"
#   * rows in the cluster with a minority label
#     (listed in other_dict[cluster_name])               -> "other"
#   * remaining rows in the cluster                      -> original label
cluster_name = "warranty"

# split the training rows into in-cluster / out-of-cluster copies
in_cluster_train = df_train_all["cluster_label"].eq(cluster_name)
df_train_clus = df_train_all[in_cluster_train].copy()
df_train_nonclus = df_train_all[~in_cluster_train].copy()

# same split for the validation rows
in_cluster_val = df_val_all["cluster_label"].eq(cluster_name)
df_val_clus = df_val_all[in_cluster_val].copy()
df_val_nonclus = df_val_all[~in_cluster_val].copy()

# rows from any other cluster share one catch-all class
for outside_frame in (df_train_nonclus, df_val_nonclus):
    outside_frame["new_label"] = "wrong_cluster"


def _merge_minority(label):
    """Return "other" for labels listed in other_dict[cluster_name], else the label unchanged."""
    if label in other_dict[cluster_name]:
        return "other"
    return label


# inside the current cluster, collapse the minority classes
for inside_frame in (df_train_clus, df_val_clus):
    inside_frame["new_label"] = inside_frame["label"].apply(_merge_minority)

In [None]:
# Inspect the training rows outside the selected cluster (all tagged "wrong_cluster").
df_train_nonclus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,426,427,428,429,430,431,432,label,cluster_label,new_label
2,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,trade in inquiry,order related and payments,wrong_cluster
3,0.000000,0.0,0.0,0.069683,0.0,0.000000,0.0,0.0,0.241878,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,account security,authorization,wrong_cluster
4,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,product availability and stock,product queries,wrong_cluster
7,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.085608,0.0,...,0.402373,0.0,0.0,0.0,0.0,0.000000,0.0,product details inquiry,product queries,wrong_cluster
8,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,product availability and stock,product queries,wrong_cluster
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379316,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,product compatibility,product queries,wrong_cluster
379317,0.130903,0.0,0.0,0.000000,0.0,0.058201,0.0,0.0,0.037639,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.150570,0.0,change payment method,order related and payments,wrong_cluster
379318,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,transfer call to the right department or store,product queries,wrong_cluster
379323,0.000000,0.0,0.0,0.023049,0.0,0.000000,0.0,0.0,0.060005,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.240041,0.0,rewards or discounts,order related and payments,wrong_cluster


## Balance data and some preprocessing

In [None]:
# Class distribution of the in-cluster training rows after the minority
# labels were merged into "other".
df_train_clus["new_label"].value_counts()

new_label
schedule repair          35386
defective product        19269
schedule installation    14616
troubleshooting          14242
other                    11097
damaged product           9065
software error            7502
software installation     6156
reschedule repair         3759
warranty claim            3006
Name: count, dtype: int64

In [None]:
# Number of training rows that belong to the selected cluster.
len(df_train_clus)

124098

In [None]:
# Sanity check: every out-of-cluster training row carries the single
# "wrong_cluster" label.
df_train_nonclus["new_label"].value_counts()

new_label
wrong_cluster    255230
Name: count, dtype: int64

In [None]:
# No class re-balancing: the in-cluster rows are kept as they are, and a random
# sample of out-of-cluster ("wrong_cluster") rows is appended to each split.
# The sample size is a fixed fraction of that split's in-cluster row count.
wrong_percent = 10 / 100
wrong_cluster_size_train, wrong_cluster_size_val = (
    int(wrong_percent * len(in_cluster_rows))
    for in_cluster_rows in (df_train_clus, df_val_clus)
)

# seeded draws so the augmented sets are reproducible
train_wrong_sample = df_train_nonclus.sample(
    n=wrong_cluster_size_train, random_state=seed
)
val_wrong_sample = df_val_nonclus.sample(
    n=wrong_cluster_size_val, random_state=seed
)

df_train = pd.concat([df_train_clus, train_wrong_sample])
df_val = pd.concat([df_val_clus, val_wrong_sample])

In [None]:
# Separate each frame into the target vector (y) and a NumPy feature matrix (X).
# Everything except the three label-related columns is a feature.
non_feature_cols = ["label", "cluster_label", "new_label"]

y_train = df_train["new_label"].values
y_val = df_val["new_label"].values

X_train = df_train.drop(columns=non_feature_cols).to_numpy()
X_val = df_val.drop(columns=non_feature_cols).to_numpy()

In [None]:
# Turn the string class names into integer ids. The encoder learns its classes
# from the training labels only and is then applied to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

# arrays that the classifiers below are fitted on
X_final = X_train
y_final = y_train

In [None]:
# Class distribution of the encoded training targets (integer ids from label_encoder).
pd.Series(y_final).value_counts()

5     35386
1     19269
4     14616
8     14242
10    12409
2     11097
0      9065
6      7502
7      6156
3      3759
9      3006
Name: count, dtype: int64

## Train classifier

#### Naive Bayes

In [None]:
# Hyperparameter candidates for MultinomialNB: one single-key dict per run.
# (for a quicker pass, a shorter list such as [1.0, 0.5] can be used instead)
alpha_candidates = (10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.001)
param_grid = [dict(alpha=alpha) for alpha in alpha_candidates]

In [None]:
# hyperparameter tuning
#
# Fit one MultinomialNB per candidate in `param_grid`, score it on the
# validation split and collect one result dict per run in `tuning_results`
# (keys: "params", "time" in minutes, "micro_f1_score", "classification_report").
tuning_results = []
num_params = len(param_grid)

# start=1 so the progress line counts 1..num_params instead of 0..num_params-1
for i, params in enumerate(param_grid, start=1):
    # training (only the fit itself is timed)
    start_time = time.time()

    clf = MultinomialNB(**params)
    clf.fit(X_final, y_final)

    end_time = time.time()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    # zero_division=0 keeps the reported values (0 for classes that are never
    # predicted) but states that choice explicitly, so scikit-learn does not
    # emit an UndefinedMetricWarning on every iteration
    clf_report = classification_report(
        y_val, y_pred, output_dict=True, zero_division=0
    )

    # save metrics
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,  # seconds -> minutes
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)

    # compact progress line; the full per-class report stays in tuning_results
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(f"  params={params}, micro_f1_score={micro_f1_score:.4f}")

Hyperparameter set 0/7 completed.
{'params': {'alpha': 10.0}, 'time': 0.003301533063252767, 'micro_f1_score': 0.4453495868252945, 'classification_report': {'0': {'precision': 0.7944664031620553, 'recall': 0.17740511915269197, 'f1-score': 0.29004329004329005, 'support': 1133}, '1': {'precision': 0.5051975051975052, 'recall': 0.5045681063122923, 'f1-score': 0.5048826095990027, 'support': 2408}, '2': {'precision': 0.3723404255319149, 'recall': 0.05043227665706052, 'f1-score': 0.08883248730964469, 'support': 1388}, '3': {'precision': 1.0, 'recall': 0.002127659574468085, 'f1-score': 0.004246284501061571, 'support': 470}, '4': {'precision': 0.8190954773869347, 'recall': 0.35686918445539134, 'f1-score': 0.49714067861227595, 'support': 1827}, '5': {'precision': 0.3731115107913669, 'recall': 0.9380510965408094, 'f1-score': 0.5338737695425594, 'support': 4423}, '6': {'precision': 0.5185873605947955, 'recall': 0.29775880469583776, 'f1-score': 0.37830508474576274, 'support': 937}, '7': {'precision

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter set 2/7 completed.
{'params': {'alpha': 1.0}, 'time': 0.002692262331644694, 'micro_f1_score': 0.45976674676199963, 'classification_report': {'0': {'precision': 0.7854889589905363, 'recall': 0.21977052074139453, 'f1-score': 0.343448275862069, 'support': 1133}, '1': {'precision': 0.5163425734381465, 'recall': 0.5182724252491694, 'f1-score': 0.5173056994818652, 'support': 2408}, '2': {'precision': 0.37962962962962965, 'recall': 0.059077809798270896, 'f1-score': 0.10224438902743144, 'support': 1388}, '3': {'precision': 0.875, 'recall': 0.014893617021276596, 'f1-score': 0.029288702928870296, 'support': 470}, '4': {'precision': 0.8190364277320799, 'recall': 0.38149972632731255, 'f1-score': 0.520537714712472, 'support': 1827}, '5': {'precision': 0.3831374743518, 'recall': 0.9287813701107845, 'f1-score': 0.542489270386266, 'support': 4423}, '6': {'precision': 0.5180921052631579, 'recall': 0.33617929562433296, 'f1-score': 0.4077669902912621, 'support': 937}, '7': {'precision': 0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter set 4/7 completed.
{'params': {'alpha': 0.1}, 'time': 0.002673697471618652, 'micro_f1_score': 0.4608802672449159, 'classification_report': {'0': {'precision': 0.7832817337461301, 'recall': 0.22330097087378642, 'f1-score': 0.3475274725274725, 'support': 1133}, '1': {'precision': 0.5159025196199918, 'recall': 0.518687707641196, 'f1-score': 0.5172913646717746, 'support': 2408}, '2': {'precision': 0.38636363636363635, 'recall': 0.061239193083573486, 'f1-score': 0.10572139303482588, 'support': 1388}, '3': {'precision': 0.875, 'recall': 0.014893617021276596, 'f1-score': 0.029288702928870296, 'support': 470}, '4': {'precision': 0.8222222222222222, 'recall': 0.38478379857690204, 'f1-score': 0.5242356450410142, 'support': 1827}, '5': {'precision': 0.3839319117096895, 'recall': 0.9281030974451729, 'f1-score': 0.543169037380086, 'support': 4423}, '6': {'precision': 0.5179153094462541, 'recall': 0.33938100320170755, 'f1-score': 0.4100580270793037, 'support': 937}, '7': {'precision':

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter set 5/7 completed.
{'params': {'alpha': 0.05}, 'time': 0.0025612791379292805, 'micro_f1_score': 0.4610560862685343, 'classification_report': {'0': {'precision': 0.7832817337461301, 'recall': 0.22330097087378642, 'f1-score': 0.3475274725274725, 'support': 1133}, '1': {'precision': 0.5161023947151114, 'recall': 0.5191029900332226, 'f1-score': 0.5175983436853002, 'support': 2408}, '2': {'precision': 0.38636363636363635, 'recall': 0.061239193083573486, 'f1-score': 0.10572139303482588, 'support': 1388}, '3': {'precision': 0.875, 'recall': 0.014893617021276596, 'f1-score': 0.029288702928870296, 'support': 470}, '4': {'precision': 0.822637106184364, 'recall': 0.38587848932676516, 'f1-score': 0.5253353204172877, 'support': 1827}, '5': {'precision': 0.38403966694732905, 'recall': 0.9281030974451729, 'f1-score': 0.543276866066702, 'support': 4423}, '6': {'precision': 0.5179153094462541, 'recall': 0.33938100320170755, 'f1-score': 0.4100580270793037, 'support': 937}, '7': {'precisio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### XGBoost

In [None]:
# Search space for the XGBoost classifier. The trailing notes are the author's
# rules of thumb for each knob.
param_grid = {
    "learning_rate": [0.7, 0.3],
    "n_estimators": [30, 50, 70],  # more means overfitting
    "max_depth": [5, 6, 7],  # more means overfitting
    "lambda": [0.1, 1.0],  # more means underfitting
    "alpha": [0.1, 1.0],  # more means underfitting
    "subsample": [0.5, 0.3, 0.1],
}

# Draw a seeded random subset of combinations from the grid and materialise it
# as a list of parameter dicts.
n_hyper_combinations = 100
sampler = ParameterSampler(
    param_grid, n_iter=n_hyper_combinations, random_state=seed
)
param_list = [combination for combination in sampler]

In [None]:
# Keep only the first 10 sampled combinations for tuning, then show them.
param_list = param_list[:10]
param_list

[{'subsample': 0.5,
  'n_estimators': 70,
  'max_depth': 7,
  'learning_rate': 0.7,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.5,
  'n_estimators': 70,
  'max_depth': 5,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 70,
  'max_depth': 5,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.1},
 {'subsample': 0.1,
  'n_estimators': 70,
  'max_depth': 5,
  'learning_rate': 0.7,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 70,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.1,
  'n_estimators': 50,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 30,
  'max_depth': 6,
  'learning_rate': 0.7,
  'lambda': 0.1,
  'alpha': 1.0},
 {'subsample': 0.3,
  'n_estimators': 50,
  'max_depth': 5,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 1.0},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth':

In [None]:
# hyperparameter tuning
#
# Fit one XGBClassifier per sampled combination in `param_list`, score it on
# the validation split and collect one result dict per run in `tuning_results`
# (keys: "params", "time" in minutes, "micro_f1_score", "classification_report").
# Results are appended as each run finishes, so an interrupted loop still
# leaves the completed runs in `tuning_results`.
tuning_results = []
num_params = len(param_list)

# start=1 so the progress line counts 1..num_params instead of 0..num_params-1
for i, params in enumerate(param_list, start=1):
    # training (only the fit itself is timed)
    start_time = time.time()

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_final, y_final)

    end_time = time.time()

    # calculate metrics
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    # zero_division=0 keeps the reported values (0 for classes that are never
    # predicted) but states that choice explicitly, so scikit-learn does not
    # emit an UndefinedMetricWarning on every iteration
    clf_report = classification_report(
        y_val, y_pred, output_dict=True, zero_division=0
    )

    # save metrics
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,  # seconds -> minutes
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)

    # compact progress line; the full per-class report stays in tuning_results
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(f"  params={params}, micro_f1_score={micro_f1_score:.4f}")

KeyboardInterrupt: 

In [None]:
# Persist every tuning run (params, timing, metrics) as JSON in the current
# working directory.
fname = "exp_divij_results0_cluster1.json"
with open(fname, "w") as results_file:
    results_file.write(json.dumps(tuning_results))

### Save the best model

In [None]:
def _validation_micro_f1(run):
    """Sort key: the validation micro-F1 stored with a tuning run."""
    return run["micro_f1_score"]


# order the runs in place, best validation score first
tuning_results.sort(key=_validation_micro_f1, reverse=True)

In [None]:
# Inspect the collected tuning results.
tuning_results

[{'params': {'subsample': 1.0,
   'n_estimators': 30,
   'max_depth': 7,
   'learning_rate': 0.3,
   'lambda': 1.0,
   'alpha': 0.1},
  'time': 0.3365378816922506,
  'micro_f1_score': 0.7217268887846081,
  'classification_report': {'0': {'precision': 0.6374045801526718,
    'recall': 0.48405797101449277,
    'f1-score': 0.5502471169686985,
    'support': 345},
   '1': {'precision': 0.5333333333333333,
    'recall': 0.25,
    'f1-score': 0.3404255319148936,
    'support': 128},
   '2': {'precision': 0.8262032085561497,
    'recall': 0.9537037037037037,
    'f1-score': 0.8853868194842406,
    'support': 324},
   '3': {'precision': 0.7274368231046932,
    'recall': 0.8770402611534276,
    'f1-score': 0.7952639368524913,
    'support': 919},
   '4': {'precision': 0.738562091503268,
    'recall': 0.509009009009009,
    'f1-score': 0.6026666666666668,
    'support': 222},
   '5': {'precision': 0.6379310344827587,
    'recall': 0.5751295336787565,
    'f1-score': 0.6049046321525886,
    'supp

In [None]:
# Final model: refit XGBoost with the hyperparameters to be saved.
#
# By default the parameters of the best-scoring run in `tuning_results`
# (highest validation micro-F1) are used, so a plain "Run All" trains the tuned
# configuration rather than an untuned default model.
# - Assumes `tuning_results` holds the XGBoost runs from the tuning cell above.
# - To pin a specific configuration, replace `params` with an explicit dict.
# - With no tuning results available, this falls back to the library defaults.
best_run = max(
    tuning_results, key=lambda run: run["micro_f1_score"], default=None
)
# copy so that later edits to `params` do not alter the stored tuning result
params = dict(best_run["params"]) if best_run is not None else {}

clf = xgb.XGBClassifier(**params)

clf.fit(X_final, y_final)

# calculate metrics
y_pred = clf.predict(X_val)
micro_f1_score = f1_score(y_val, y_pred, average="micro")
In [None]:
# Validation micro-F1 of the refitted model.
micro_f1_score

0.7217268887846081

In [None]:
import pickle

# Serialise the fitted classifier to disk with pickle protocol 3.
# NOTE: "put_file_name_here" is a placeholder file name.
save_path = "../data/results/models/"
model_file_path = save_path + "put_file_name_here"
with open(model_file_path, "wb") as model_file:
    pickle.dump(clf, model_file, protocol=3)