In [None]:
# standard library
import json
import os
import pickle
import time

# third-party
import imblearn
import numpy as np
import pandas as pd
from scipy.sparse import load_npz

# sklearn / imblearn imports
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import LabelEncoder

# classifier imports
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

seed = 1

## Load training and validation data

In [None]:
# Read the label column for the training and validation splits.
# Only the second spreadsheet column (usecols=[1]) is read; it is loaded as the
# index and then reset, so it becomes an ordinary column on a fresh 0..n-1
# index (a later cell reads it as "label").
load_path = "../data/clean/"

df_train_temp = pd.read_excel(
    load_path + "training_data_cleaned.xlsx",
    usecols=[1],
    index_col=0,
)
df_train_temp = df_train_temp.reset_index()

df_val_temp = pd.read_excel(
    load_path + "validation_data_cleaned.xlsx",
    usecols=[1],
    index_col=0,
)
df_val_temp = df_val_temp.reset_index()

In [None]:
# Pick the artifact set for this run; `code` is the suffix shared by the
# vocabulary file and the two tf-idf matrices.
# Alternative artifact set: code = "full_5352words"
code = "350k_1percent_422words"

vocab_path = "../data/interim/vocabs/vocab_" + code + ".json"
tfidf_train_path = "../data/interim/tfidfs/tfidf_" + code + "_train.npz"
tfidf_val_path = "../data/interim/tfidfs/tfidf_" + code + "_val.npz"

# vocabulary: parsed JSON document
with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

# tf-idf features: scipy sparse matrices, one per split
tfidf_train, tfidf_val = load_npz(tfidf_train_path), load_npz(tfidf_val_path)

In [None]:
# Wrap each sparse tf-idf matrix in a sparse-backed DataFrame
# (one column per matrix column, named 0..n_features-1).
df_train = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
df_val = pd.DataFrame.sparse.from_spmatrix(tfidf_val)

# Attach the labels read from the spreadsheets. Column assignment aligns on the
# row index, so this presumes the tf-idf rows are in the same order as the
# spreadsheet rows -- TODO confirm against the cell that built the matrices.
df_train["label"] = df_train_temp["label"]
df_val["label"] = df_val_temp["label"]

In [None]:
# Assign cluster labels (remove this step once clusters are finalized).
# `clusters` maps a cluster name to the fine-grained labels it contains.
# note: cluster names are not representative of the classes they contain
clusters = {
    "authorization": [
        "account cancellation",
        "account security",
        "login issues",
        "forgot my password",
        "software update",
    ],
    "order related and payments": [
        "best buy credit card",
        "payment failed",
        "billing or charge disputes",
        "cancel order",
        "unauthorized charge or payment",
        "refund request",
        "fraud concerns",
        "return request",
        "cancellation of a plan subscription or membership",
        "account cancellation",
        "change or update order",
        "schedule order pickup",
        "change shipping time",
        "delivery tracking",
        "refund status",
        "change payment method",
        "payment method",
        "change shipping address",
        "delivery or parts of delivery items missing",
        "renewal of a plan subscription or membership",
        "reschedule delivery",
        "reschedule order pickup",
        "rewards or discounts",
        "schedule delivery",
        "trade in inquiry",
        "delivery delays",
    ],
    "warranty": [
        "check warranty coverage",
        "damaged product",
        "warranty claim",
        "reschedule repair",
        "device damaged",
        "incomplete installation",
        "lost or forgot items",
        "reschedule installation",
        "schedule repair",
        "screen issues",
        "software error",
        "software installation",
        "schedule installation",
        "troubleshooting",
        "performance issues",
        "defective product",
    ],
    "queries regarding website": [
        "employment or career inquiries",
        "website or app complaints",
        "incomplete installation",
        "miscellaneous inquiries",
        "network or connectivity issues",
        "customer feedback",
        "bad customer service",
    ],
    # trailing numbers below: presumably per-label sample counts -- TODO confirm
    "product queries": [
        "price match",  # 6759
        "product availability and stock",  # 37972
        "product compatibility",  # 10897
        "product details inquiry",  # 42698
        "transfer call to the right department or store",  # 5869
    ],
}

# Build the reverse lookup: fine-grained label -> cluster label.
# Some labels are listed under more than one cluster. The cluster listed last
# wins (the same mapping the plain overwrite produced), but every collision is
# recorded and reported so it can be resolved when the clusters are finalized.
cluster_lookup_dict = {}
duplicate_labels = {}
for cluster_label, cluster_members in clusters.items():
    for label in cluster_members:
        previous_cluster = cluster_lookup_dict.get(label)
        if previous_cluster is not None and previous_cluster != cluster_label:
            duplicate_labels.setdefault(label, [previous_cluster]).append(cluster_label)
        cluster_lookup_dict[label] = cluster_label

for label, owners in duplicate_labels.items():
    print(f"WARNING: label {label!r} appears in clusters {owners}; keeping {owners[-1]!r}.")

# Map every row to its cluster. Unknown labels (including missing values) are
# reported together up front rather than failing on the first bad row.
for split_name, frame in (("train", df_train), ("val", df_val)):
    unknown = [lab for lab in frame["label"].unique() if lab not in cluster_lookup_dict]
    if unknown:
        raise KeyError(f"{split_name} labels missing from clusters: {unknown}")
    frame["cluster_label"] = frame["label"].map(cluster_lookup_dict)

# remove "label" column for now
df_train = df_train.drop(columns=["label"])
df_val = df_val.drop(columns=["label"])

## Preprocess and balance data

In [None]:
# Split each frame into a target vector (the cluster label column) and a
# feature matrix (every remaining column).
y_train = df_train["cluster_label"].values
y_val = df_val["cluster_label"].values

# to_numpy() returns a regular ndarray, so the sparse-backed feature columns
# are materialized densely here.
X_train = df_train.drop("cluster_label", axis="columns").to_numpy()
X_val = df_val.drop("cluster_label", axis="columns").to_numpy()

In [None]:
# Turn the cluster names into integer class ids. The encoder is fitted on the
# training targets only and then applied to both splits. LabelEncoder numbers
# classes in sorted order of their names, which is what the hard-coded class
# ids in the balancing cell refer to.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

# TODO: add code to save the fitted label encoder

In [None]:
# Balance the training data: undersample the majority classes, then oversample
# the minority classes, so every class ends up with 30000 rows.
# Both samplers are seeded with the notebook-level `seed` so that the balanced
# set (and every score computed from it) is reproducible across runs.

# Encoded class ids treated as majority classes.
selected_values = np.array([1, 2, 4])

# One boolean mask, reused for both X and y, to split majority from minority.
is_major = np.isin(y_train, selected_values)
y_major = y_train[is_major]
y_minor = y_train[~is_major]
X_major = X_train[is_major]
X_minor = X_train[~is_major]

#### Undersampling of majority class
sampling_strategy = {1: 30000, 2: 30000, 4: 30000}
# RandomUnderSampler cuts each listed class down to the requested count
under = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=seed)
X_resampled, y_resampled = under.fit_resample(X_major, y_major)

#### Merging with minor class
X_int = np.vstack((X_resampled, X_minor))
y_int = np.concatenate((y_resampled, y_minor), axis=0)

#### Oversampling the minority class
sampling_strategy = {1: 30000, 2: 30000, 4: 30000, 0: 30000, 3: 30000}
# RandomOverSampler duplicates rows of the smaller classes up to the requested count
over = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=seed)
X_final, y_final = over.fit_resample(X_int, y_int)

## Train classifier

#### Naive Bayes

In [None]:
# Candidate `alpha` values for the Naive Bayes search, one single-key dict per
# candidate, listed from largest to smallest.
# (For a quick smoke run, shrink the tuple to e.g. (1.0, 0.5).)
param_grid = [{"alpha": alpha} for alpha in (10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.001)]

In [None]:
# Hyperparameter tuning for Multinomial Naive Bayes.
# Each candidate in `param_grid` is fitted on the balanced training set and
# scored on the validation set; one result dict per candidate is collected in
# `tuning_results`.
tuning_results = []
num_params = len(param_grid)

# Count from 1 so the progress line reads "1/7 ... 7/7" instead of "0/7 ... 6/7".
for i, params in enumerate(param_grid, start=1):
    # training (only the fit is timed; perf_counter is a monotonic timer)
    start_time = time.perf_counter()

    clf = MultinomialNB(**params)
    clf.fit(X_final, y_final)

    end_time = time.perf_counter()

    # calculate metrics on the validation split
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    clf_report = classification_report(y_val, y_pred, output_dict=True)

    # save metrics ("time" is the fit duration in minutes)
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

Hyperparameter set 0/7 completed.
{'params': {'alpha': 10.0}, 'time': 0.003846466541290283, 'micro_f1_score': 0.6898093470558462, 'classification_report': {'0': {'precision': 0.30328441649196364, 'recall': 0.8066914498141264, 'f1-score': 0.4408329101066531, 'support': 1076}, '1': {'precision': 0.8316709137240627, 'recall': 0.7199672255136771, 'f1-score': 0.7717982500591197, 'support': 15866}, '2': {'precision': 0.7477657349305952, 'recall': 0.6038851351351351, 'f1-score': 0.6681675303712513, 'support': 13024}, '3': {'precision': 0.27155854430379744, 'recall': 0.7084623323013416, 'f1-score': 0.392622247640835, 'support': 1938}, '4': {'precision': 0.7332896031485733, 'recall': 0.7206678700361011, 'f1-score': 0.7269239522710278, 'support': 15512}, 'accuracy': 0.6898093470558462, 'macro avg': {'precision': 0.5775138425197984, 'recall': 0.7119348025600762, 'f1-score': 0.6000689780897774, 'support': 47416}, 'weighted avg': {'precision': 0.7415554762934206, 'recall': 0.6898093470558462, 'f1-s

#### XGBoost

In [None]:
# Search space for XGBoost. Note that `param_grid` is rebound here: it is now a
# dict of candidate lists, no longer the list of dicts used for Naive Bayes.
# The per-parameter remarks are the author's rules of thumb.
param_grid = {
    "learning_rate": [1.0, 0.7, 0.3],
    # larger values -> tends to overfit
    "n_estimators": [50, 75, 100],
    # larger values -> tends to overfit
    "max_depth": [5, 6, 7],
    # larger values -> tends to underfit
    "lambda": [0.01, 0.1, 1.0],
    # larger values -> tends to underfit
    "alpha": [0.01, 0.1, 1.0],
    "subsample": [1.0, 0.5, 0.3, 0.1],
}

# Draw a seeded random sample of combinations from the grid.
n_hyper_combinations = 100
param_list = [
    *ParameterSampler(param_grid, n_iter=n_hyper_combinations, random_state=seed)
]

In [None]:
# Keep only the first 20 sampled combinations for this run, then display them.
param_list = param_list[:20]
param_list

[{'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.1},
 {'subsample': 1.0,
  'n_estimators': 100,
  'max_depth': 6,
  'learning_rate': 1.0,
  'lambda': 1.0,
  'alpha': 0.1},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 7,
  'learning_rate': 0.3,
  'lambda': 0.1,
  'alpha': 0.01},
 {'subsample': 0.3,
  'n_estimators': 50,
  'max_depth': 6,
  'learning_rate': 0.7,
  'lambda': 0.01,
  'alpha': 1.0},
 {'subsample': 0.5,
  'n_estimators': 50,
  'max_depth': 7,
  'learning_rate': 1.0,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 1.0,
  'n_estimators': 100,
  'max_depth': 5,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 1.0},
 {'subsample': 0.1,
  'n_estimators': 100,
  'max_depth': 6,
  'learning_rate': 0.3,
  'lambda': 1.0,
  'alpha': 0.01},
 {'subsample': 0.5,
  'n_estimators': 75,
  'max_depth': 5,
  'learning_rate': 1.0,
  'lambda': 0.01,
  'alpha': 0.1},
 {'subsample': 0.3,
  'n_estimators': 100,
  'ma

In [None]:
# Hyperparameter tuning for XGBoost.
# Each parameter set in `param_list` is fitted on the balanced training set and
# scored on the validation set; one result dict per set is collected in
# `tuning_results` (which is reset here).
tuning_results = []
num_params = len(param_list)

# Count from 1 so the progress line reads "1/N ... N/N" instead of "0/N ... N-1/N".
for i, params in enumerate(param_list, start=1):
    # training (only the fit is timed; perf_counter is a monotonic timer)
    start_time = time.perf_counter()

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_final, y_final)

    end_time = time.perf_counter()

    # calculate metrics on the validation split
    y_pred = clf.predict(X_val)
    micro_f1_score = f1_score(y_val, y_pred, average="micro")
    clf_report = classification_report(y_val, y_pred, output_dict=True)

    # save metrics ("time" is the fit duration in minutes)
    result = {
        "params": params,
        "time": (end_time - start_time) / 60,
        "micro_f1_score": micro_f1_score,
        "classification_report": clf_report,
    }
    tuning_results.append(result)
    print(f"Hyperparameter set {i}/{num_params} completed.")
    print(result)

In [None]:
# Write the collected tuning results to disk as a single JSON document.
fname = "exp_divij_results1.json"
with open(fname, mode="w") as results_file:
    json.dump(tuning_results, results_file)

In [None]:
# When cells are run top to bottom, this is the score left over from the final
# iteration of the tuning loop (the last hyperparameter set evaluated), not the
# best score across the search.
micro_f1_score

0.7217268887846081

### Save the best model

In [None]:
# Order the results in place, highest micro-F1 first.
tuning_results.sort(
    key=lambda result: result["micro_f1_score"],
    reverse=True,
)

In [None]:
# Display every collected result (params, fit time in minutes, micro-F1 and the
# full classification report for each hyperparameter set).
tuning_results

[{'params': {'subsample': 1.0,
   'n_estimators': 30,
   'max_depth': 7,
   'learning_rate': 0.3,
   'lambda': 1.0,
   'alpha': 0.1},
  'time': 0.3365378816922506,
  'micro_f1_score': 0.7217268887846081,
  'classification_report': {'0': {'precision': 0.6374045801526718,
    'recall': 0.48405797101449277,
    'f1-score': 0.5502471169686985,
    'support': 345},
   '1': {'precision': 0.5333333333333333,
    'recall': 0.25,
    'f1-score': 0.3404255319148936,
    'support': 128},
   '2': {'precision': 0.8262032085561497,
    'recall': 0.9537037037037037,
    'f1-score': 0.8853868194842406,
    'support': 324},
   '3': {'precision': 0.7274368231046932,
    'recall': 0.8770402611534276,
    'f1-score': 0.7952639368524913,
    'support': 919},
   '4': {'precision': 0.738562091503268,
    'recall': 0.509009009009009,
    'f1-score': 0.6026666666666668,
    'support': 222},
   '5': {'precision': 0.6379310344827587,
    'recall': 0.5751295336787565,
    'f1-score': 0.6049046321525886,
    'supp

In [None]:
# Refit the best configuration found by the search.
# The parameters are taken from the highest-scoring entry of `tuning_results`
# rather than from a hand-filled dict: an empty dict would silently train a
# default XGBoost model and present it as the "best" one.
if not tuning_results:
    raise ValueError("tuning_results is empty; run the hyperparameter search first")

# max() does not rely on `tuning_results` being sorted beforehand
best_result = max(tuning_results, key=lambda result: result["micro_f1_score"])
params = best_result["params"]

clf = xgb.XGBClassifier(**params)

clf.fit(X_final, y_final)

# calculate metrics on the validation split
y_pred = clf.predict(X_val)
micro_f1_score = f1_score(y_val, y_pred, average="micro")

In [None]:
# Validation micro-F1 of the classifier fitted in the previous cell.
micro_f1_score

0.7217268887846081

In [None]:
# Persist the fitted classifier with pickle (protocol 3).
# `os` and `pickle` are imported in the notebook's top import cell.
# TODO: replace the placeholder file name before running.
# NOTE: pickle files can execute arbitrary code when loaded; only load model
# files that come from a trusted source.
save_path = "../data/results/models/"

# Create the output folder if needed so a finished training run is not lost to
# a FileNotFoundError at save time.
os.makedirs(save_path, exist_ok=True)

with open(save_path + "put_file_name_here", "wb") as model_file:
    pickle.dump(clf, model_file, protocol=3)